{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.46498687574543207, "eval_steps": 500, "global_step": 150000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.3426266074180604, "epoch": 3.099912504969547e-05, "grad_norm": 15.919363975524902, "learning_rate": 4.359197907585005e-09, "loss": 0.904, "mean_token_accuracy": 0.7392466813325882, "num_tokens": 11974.0, "step": 10 }, { "entropy": 2.357245463132858, "epoch": 6.199825009939094e-05, "grad_norm": 17.161699295043945, "learning_rate": 9.202751138235009e-09, "loss": 0.9299, "mean_token_accuracy": 0.7459014117717743, "num_tokens": 23117.0, "step": 20 }, { "entropy": 2.2194840207695963, "epoch": 9.299737514908641e-05, "grad_norm": 5.266523361206055, "learning_rate": 1.4046304368885014e-08, "loss": 0.7805, "mean_token_accuracy": 0.7642201945185662, "num_tokens": 36481.0, "step": 30 }, { "entropy": 2.316211926937103, "epoch": 0.00012399650019878189, "grad_norm": 17.800691604614258, "learning_rate": 1.8889857599535022e-08, "loss": 0.901, "mean_token_accuracy": 0.756186394393444, "num_tokens": 48700.0, "step": 40 }, { "entropy": 2.3188992157578467, "epoch": 0.00015499562524847737, "grad_norm": 16.84282875061035, "learning_rate": 2.3733410830185025e-08, "loss": 0.8694, "mean_token_accuracy": 0.7459902837872505, "num_tokens": 60860.0, "step": 50 }, { "entropy": 2.334391838312149, "epoch": 0.00018599475029817283, "grad_norm": 17.42536735534668, "learning_rate": 2.857696406083503e-08, "loss": 0.9114, "mean_token_accuracy": 0.744971951842308, "num_tokens": 72100.0, "step": 60 }, { "entropy": 2.3774717271327974, "epoch": 0.00021699387534786831, "grad_norm": 13.821733474731445, "learning_rate": 3.3420517291485034e-08, "loss": 0.9343, "mean_token_accuracy": 0.7515697747468948, "num_tokens": 83144.0, "step": 70 }, { "entropy": 2.2282222718000413, "epoch": 0.00024799300039756377, "grad_norm": 16.813045501708984, "learning_rate": 3.8264070522135044e-08, "loss": 0.7473, "mean_token_accuracy": 0.7650388479232788, "num_tokens": 95878.0, "step": 80 }, { "entropy": 2.308245059847832, "epoch": 0.00027899212544725923, "grad_norm": 9.254444122314453, "learning_rate": 4.310762375278505e-08, "loss": 0.894, "mean_token_accuracy": 0.7462251231074333, "num_tokens": 107556.0, "step": 90 }, { "entropy": 2.1781405553221704, "epoch": 0.00030999125049695474, "grad_norm": 8.745959281921387, "learning_rate": 4.795117698343506e-08, "loss": 0.7802, "mean_token_accuracy": 0.7630234107375145, "num_tokens": 121234.0, "step": 100 }, { "entropy": 2.2521888092160225, "epoch": 0.0003409903755466502, "grad_norm": 9.263279914855957, "learning_rate": 5.2794730214085053e-08, "loss": 0.8459, "mean_token_accuracy": 0.7538067430257798, "num_tokens": 133561.0, "step": 110 }, { "entropy": 2.208202359080315, "epoch": 0.00037198950059634566, "grad_norm": 14.294042587280273, "learning_rate": 5.763828344473506e-08, "loss": 0.7828, "mean_token_accuracy": 0.7738959670066834, "num_tokens": 146427.0, "step": 120 }, { "entropy": 2.3306238532066343, "epoch": 0.0004029886256460411, "grad_norm": 18.315011978149414, "learning_rate": 6.248183667538507e-08, "loss": 0.935, "mean_token_accuracy": 0.7471725791692734, "num_tokens": 157320.0, "step": 130 }, { "entropy": 2.1904593795537948, "epoch": 0.00043398775069573663, "grad_norm": 9.24004077911377, "learning_rate": 6.732538990603508e-08, "loss": 0.7454, "mean_token_accuracy": 0.7584904819726944, "num_tokens": 170758.0, "step": 140 }, { "entropy": 2.2158657625317573, "epoch": 0.0004649868757454321, "grad_norm": 15.915936470031738, "learning_rate": 7.216894313668508e-08, "loss": 0.8056, "mean_token_accuracy": 0.7633484750986099, "num_tokens": 183462.0, "step": 150 }, { "entropy": 2.1613963916897774, "epoch": 0.0004959860007951275, "grad_norm": 16.494924545288086, "learning_rate": 7.701249636733508e-08, "loss": 0.7496, "mean_token_accuracy": 0.7714725866913795, "num_tokens": 196484.0, "step": 160 }, { "entropy": 2.2984951809048653, "epoch": 0.0005269851258448231, "grad_norm": 14.924416542053223, "learning_rate": 8.18560495979851e-08, "loss": 0.8421, "mean_token_accuracy": 0.749389934539795, "num_tokens": 208243.0, "step": 170 }, { "entropy": 2.2832500755786898, "epoch": 0.0005579842508945185, "grad_norm": 15.250982284545898, "learning_rate": 8.66996028286351e-08, "loss": 0.8231, "mean_token_accuracy": 0.7653956174850464, "num_tokens": 219830.0, "step": 180 }, { "entropy": 2.3017154157161714, "epoch": 0.000588983375944214, "grad_norm": 9.919282913208008, "learning_rate": 9.154315605928509e-08, "loss": 0.8631, "mean_token_accuracy": 0.7567846044898033, "num_tokens": 231756.0, "step": 190 }, { "entropy": 2.2674851924180985, "epoch": 0.0006199825009939095, "grad_norm": 15.856183052062988, "learning_rate": 9.63867092899351e-08, "loss": 0.8038, "mean_token_accuracy": 0.7690823331475258, "num_tokens": 244149.0, "step": 200 }, { "entropy": 2.2884296044707297, "epoch": 0.0006509816260436049, "grad_norm": 15.011128425598145, "learning_rate": 1.0123026252058511e-07, "loss": 0.8224, "mean_token_accuracy": 0.7501513630151748, "num_tokens": 256276.0, "step": 210 }, { "entropy": 2.3350342929363253, "epoch": 0.0006819807510933004, "grad_norm": 17.258758544921875, "learning_rate": 1.0607381575123511e-07, "loss": 0.904, "mean_token_accuracy": 0.7543755397200584, "num_tokens": 267504.0, "step": 220 }, { "entropy": 2.2519437611103057, "epoch": 0.0007129798761429959, "grad_norm": 13.8502197265625, "learning_rate": 1.1091736898188512e-07, "loss": 0.8167, "mean_token_accuracy": 0.7757848709821701, "num_tokens": 278715.0, "step": 230 }, { "entropy": 2.2798040971159934, "epoch": 0.0007439790011926913, "grad_norm": 18.540184020996094, "learning_rate": 1.1576092221253512e-07, "loss": 0.9195, "mean_token_accuracy": 0.7466915145516395, "num_tokens": 290352.0, "step": 240 }, { "entropy": 2.1401579171419143, "epoch": 0.0007749781262423868, "grad_norm": 13.905881881713867, "learning_rate": 1.2060447544318512e-07, "loss": 0.7208, "mean_token_accuracy": 0.7657153263688088, "num_tokens": 304153.0, "step": 250 }, { "entropy": 2.27926591783762, "epoch": 0.0008059772512920822, "grad_norm": 15.406006813049316, "learning_rate": 1.2544802867383514e-07, "loss": 0.8348, "mean_token_accuracy": 0.7559728816151619, "num_tokens": 316234.0, "step": 260 }, { "entropy": 2.285950776934624, "epoch": 0.0008369763763417777, "grad_norm": 7.619900703430176, "learning_rate": 1.3029158190448516e-07, "loss": 0.7548, "mean_token_accuracy": 0.7609011575579643, "num_tokens": 329019.0, "step": 270 }, { "entropy": 2.331369215250015, "epoch": 0.0008679755013914733, "grad_norm": 21.633079528808594, "learning_rate": 1.3513513513513515e-07, "loss": 0.8983, "mean_token_accuracy": 0.7470938101410866, "num_tokens": 340699.0, "step": 280 }, { "entropy": 2.3679069608449934, "epoch": 0.0008989746264411687, "grad_norm": 15.301081657409668, "learning_rate": 1.3997868836578514e-07, "loss": 0.8797, "mean_token_accuracy": 0.7614053919911384, "num_tokens": 351539.0, "step": 290 }, { "entropy": 2.349914161860943, "epoch": 0.0009299737514908642, "grad_norm": 13.113327026367188, "learning_rate": 1.4482224159643515e-07, "loss": 0.8534, "mean_token_accuracy": 0.7527757495641708, "num_tokens": 362660.0, "step": 300 }, { "entropy": 2.274288383126259, "epoch": 0.0009609728765405597, "grad_norm": 16.123380661010742, "learning_rate": 1.4966579482708517e-07, "loss": 0.8189, "mean_token_accuracy": 0.7668241068720818, "num_tokens": 374530.0, "step": 310 }, { "entropy": 2.2432350262999536, "epoch": 0.000991972001590255, "grad_norm": 9.562804222106934, "learning_rate": 1.5450934805773516e-07, "loss": 0.8106, "mean_token_accuracy": 0.7698193415999413, "num_tokens": 387392.0, "step": 320 }, { "entropy": 2.311431211233139, "epoch": 0.0010229711266399506, "grad_norm": 16.730819702148438, "learning_rate": 1.593529012883852e-07, "loss": 0.8178, "mean_token_accuracy": 0.7644891291856766, "num_tokens": 399180.0, "step": 330 }, { "entropy": 2.349755284190178, "epoch": 0.0010539702516896461, "grad_norm": 16.725875854492188, "learning_rate": 1.641964545190352e-07, "loss": 0.8588, "mean_token_accuracy": 0.7604990780353547, "num_tokens": 410702.0, "step": 340 }, { "entropy": 2.279722733795643, "epoch": 0.0010849693767393416, "grad_norm": 14.276802062988281, "learning_rate": 1.6904000774968518e-07, "loss": 0.7989, "mean_token_accuracy": 0.7695920899510383, "num_tokens": 422591.0, "step": 350 }, { "entropy": 2.1071667522192, "epoch": 0.001115968501789037, "grad_norm": 12.381896018981934, "learning_rate": 1.7388356098033517e-07, "loss": 0.6475, "mean_token_accuracy": 0.7926038712263107, "num_tokens": 436846.0, "step": 360 }, { "entropy": 2.3276678711175918, "epoch": 0.0011469676268387324, "grad_norm": 17.90573501586914, "learning_rate": 1.7872711421098519e-07, "loss": 0.8036, "mean_token_accuracy": 0.7576885908842087, "num_tokens": 448403.0, "step": 370 }, { "entropy": 2.2606518104672433, "epoch": 0.001177966751888428, "grad_norm": 12.439233779907227, "learning_rate": 1.8357066744163518e-07, "loss": 0.7721, "mean_token_accuracy": 0.7698314875364304, "num_tokens": 460244.0, "step": 380 }, { "entropy": 2.338596761226654, "epoch": 0.0012089658769381235, "grad_norm": 14.934176445007324, "learning_rate": 1.884142206722852e-07, "loss": 0.8393, "mean_token_accuracy": 0.7656854018568993, "num_tokens": 470678.0, "step": 390 }, { "entropy": 2.332528692483902, "epoch": 0.001239965001987819, "grad_norm": 16.811922073364258, "learning_rate": 1.932577739029352e-07, "loss": 0.8758, "mean_token_accuracy": 0.7543434649705887, "num_tokens": 482035.0, "step": 400 }, { "entropy": 2.282673454284668, "epoch": 0.0012709641270375143, "grad_norm": 15.464518547058105, "learning_rate": 1.981013271335852e-07, "loss": 0.8126, "mean_token_accuracy": 0.7647529050707818, "num_tokens": 494000.0, "step": 410 }, { "entropy": 2.306594988703728, "epoch": 0.0013019632520872098, "grad_norm": 16.271461486816406, "learning_rate": 2.0294488036423522e-07, "loss": 0.8339, "mean_token_accuracy": 0.755289240181446, "num_tokens": 505996.0, "step": 420 }, { "entropy": 2.3733946830034256, "epoch": 0.0013329623771369053, "grad_norm": 16.00725746154785, "learning_rate": 2.0778843359488523e-07, "loss": 0.8955, "mean_token_accuracy": 0.7515278026461601, "num_tokens": 517285.0, "step": 430 }, { "entropy": 2.2198241636157037, "epoch": 0.0013639615021866008, "grad_norm": 15.578631401062012, "learning_rate": 2.1263198682553522e-07, "loss": 0.746, "mean_token_accuracy": 0.7685888081789016, "num_tokens": 529581.0, "step": 440 }, { "entropy": 2.352111691981554, "epoch": 0.0013949606272362963, "grad_norm": 14.224849700927734, "learning_rate": 2.1747554005618524e-07, "loss": 0.859, "mean_token_accuracy": 0.7565658301115036, "num_tokens": 541039.0, "step": 450 }, { "entropy": 2.17586030960083, "epoch": 0.0014259597522859918, "grad_norm": 15.60697078704834, "learning_rate": 2.2231909328683525e-07, "loss": 0.7372, "mean_token_accuracy": 0.775518947839737, "num_tokens": 554099.0, "step": 460 }, { "entropy": 2.3119050413370132, "epoch": 0.0014569588773356871, "grad_norm": 14.367731094360352, "learning_rate": 2.2716264651748524e-07, "loss": 0.8381, "mean_token_accuracy": 0.7697366833686828, "num_tokens": 564893.0, "step": 470 }, { "entropy": 2.2108234629034995, "epoch": 0.0014879580023853826, "grad_norm": 17.166481018066406, "learning_rate": 2.3200619974813526e-07, "loss": 0.7081, "mean_token_accuracy": 0.7743594750761986, "num_tokens": 577998.0, "step": 480 }, { "entropy": 2.2766464233398436, "epoch": 0.0015189571274350781, "grad_norm": 16.3574275970459, "learning_rate": 2.3684975297878528e-07, "loss": 0.7699, "mean_token_accuracy": 0.7645501315593719, "num_tokens": 589715.0, "step": 490 }, { "entropy": 2.264049357175827, "epoch": 0.0015499562524847737, "grad_norm": 6.618349075317383, "learning_rate": 2.4169330620943527e-07, "loss": 0.78, "mean_token_accuracy": 0.7658553898334504, "num_tokens": 602659.0, "step": 500 }, { "entropy": 2.268606995046139, "epoch": 0.0015809553775344692, "grad_norm": 15.35413646697998, "learning_rate": 2.4653685944008526e-07, "loss": 0.8016, "mean_token_accuracy": 0.7688265934586525, "num_tokens": 615422.0, "step": 510 }, { "entropy": 2.181258149445057, "epoch": 0.0016119545025841645, "grad_norm": 16.496660232543945, "learning_rate": 2.513804126707353e-07, "loss": 0.6568, "mean_token_accuracy": 0.7991862148046494, "num_tokens": 627480.0, "step": 520 }, { "entropy": 2.207347111403942, "epoch": 0.00164295362763386, "grad_norm": 15.786961555480957, "learning_rate": 2.562239659013853e-07, "loss": 0.7347, "mean_token_accuracy": 0.7753358393907547, "num_tokens": 640090.0, "step": 530 }, { "entropy": 2.328162580728531, "epoch": 0.0016739527526835555, "grad_norm": 16.40909767150879, "learning_rate": 2.610675191320353e-07, "loss": 0.8802, "mean_token_accuracy": 0.7577004536986351, "num_tokens": 651238.0, "step": 540 }, { "entropy": 2.285073181986809, "epoch": 0.001704951877733251, "grad_norm": 15.64692211151123, "learning_rate": 2.659110723626853e-07, "loss": 0.808, "mean_token_accuracy": 0.7738980025053024, "num_tokens": 663483.0, "step": 550 }, { "entropy": 2.221916152536869, "epoch": 0.0017359510027829465, "grad_norm": 16.929515838623047, "learning_rate": 2.7075462559333526e-07, "loss": 0.7659, "mean_token_accuracy": 0.7734002724289895, "num_tokens": 676094.0, "step": 560 }, { "entropy": 2.207345911860466, "epoch": 0.001766950127832642, "grad_norm": 16.074033737182617, "learning_rate": 2.755981788239853e-07, "loss": 0.6923, "mean_token_accuracy": 0.7804467469453812, "num_tokens": 688999.0, "step": 570 }, { "entropy": 2.3080645158886908, "epoch": 0.0017979492528823373, "grad_norm": 14.152597427368164, "learning_rate": 2.804417320546353e-07, "loss": 0.8044, "mean_token_accuracy": 0.7673657357692718, "num_tokens": 700787.0, "step": 580 }, { "entropy": 2.3365137010812758, "epoch": 0.0018289483779320328, "grad_norm": 8.505473136901855, "learning_rate": 2.852852852852853e-07, "loss": 0.8693, "mean_token_accuracy": 0.7561125203967094, "num_tokens": 712382.0, "step": 590 }, { "entropy": 2.26595401763916, "epoch": 0.0018599475029817283, "grad_norm": 15.482884407043457, "learning_rate": 2.901288385159353e-07, "loss": 0.7499, "mean_token_accuracy": 0.7823521599173546, "num_tokens": 724324.0, "step": 600 }, { "entropy": 2.2956106424331666, "epoch": 0.0018909466280314239, "grad_norm": 17.781259536743164, "learning_rate": 2.949723917465853e-07, "loss": 0.8367, "mean_token_accuracy": 0.7637320205569267, "num_tokens": 736090.0, "step": 610 }, { "entropy": 2.380499321222305, "epoch": 0.0019219457530811194, "grad_norm": 13.911691665649414, "learning_rate": 2.998159449772353e-07, "loss": 0.8561, "mean_token_accuracy": 0.7514171093702317, "num_tokens": 746693.0, "step": 620 }, { "entropy": 2.3080674216151236, "epoch": 0.0019529448781308147, "grad_norm": 18.39996337890625, "learning_rate": 3.0465949820788535e-07, "loss": 0.7974, "mean_token_accuracy": 0.7719434529542923, "num_tokens": 758380.0, "step": 630 }, { "entropy": 2.33802735209465, "epoch": 0.00198394400318051, "grad_norm": 14.702959060668945, "learning_rate": 3.0950305143853534e-07, "loss": 0.8612, "mean_token_accuracy": 0.749081814289093, "num_tokens": 770107.0, "step": 640 }, { "entropy": 2.3233169302344323, "epoch": 0.002014943128230206, "grad_norm": 15.600394248962402, "learning_rate": 3.143466046691854e-07, "loss": 0.8358, "mean_token_accuracy": 0.7564076974987983, "num_tokens": 781888.0, "step": 650 }, { "entropy": 2.3429936945438383, "epoch": 0.002045942253279901, "grad_norm": 15.831826210021973, "learning_rate": 3.1919015789983537e-07, "loss": 0.8509, "mean_token_accuracy": 0.7663177505135537, "num_tokens": 792407.0, "step": 660 }, { "entropy": 2.2723167344927786, "epoch": 0.0020769413783295965, "grad_norm": 14.536561012268066, "learning_rate": 3.240337111304853e-07, "loss": 0.7932, "mean_token_accuracy": 0.7709016263484955, "num_tokens": 804473.0, "step": 670 }, { "entropy": 2.317927873134613, "epoch": 0.0021079405033792922, "grad_norm": 15.074233055114746, "learning_rate": 3.2887726436113535e-07, "loss": 0.7876, "mean_token_accuracy": 0.7756809115409851, "num_tokens": 815615.0, "step": 680 }, { "entropy": 2.279827579855919, "epoch": 0.0021389396284289875, "grad_norm": 15.745168685913086, "learning_rate": 3.3372081759178534e-07, "loss": 0.8729, "mean_token_accuracy": 0.7602526545524597, "num_tokens": 827866.0, "step": 690 }, { "entropy": 2.2185587152838706, "epoch": 0.0021699387534786833, "grad_norm": 13.561483383178711, "learning_rate": 3.385643708224354e-07, "loss": 0.748, "mean_token_accuracy": 0.779901297390461, "num_tokens": 839797.0, "step": 700 }, { "entropy": 2.234003722667694, "epoch": 0.0022009378785283785, "grad_norm": 13.80466079711914, "learning_rate": 3.4340792405308537e-07, "loss": 0.7714, "mean_token_accuracy": 0.7791284009814262, "num_tokens": 851460.0, "step": 710 }, { "entropy": 2.242032915353775, "epoch": 0.002231937003578074, "grad_norm": 8.220836639404297, "learning_rate": 3.482514772837354e-07, "loss": 0.7225, "mean_token_accuracy": 0.7788921162486077, "num_tokens": 863582.0, "step": 720 }, { "entropy": 2.2308412209153174, "epoch": 0.0022629361286277696, "grad_norm": 15.163178443908691, "learning_rate": 3.5309503051438535e-07, "loss": 0.7775, "mean_token_accuracy": 0.771105645596981, "num_tokens": 875961.0, "step": 730 }, { "entropy": 2.1884254679083823, "epoch": 0.002293935253677465, "grad_norm": 7.352316856384277, "learning_rate": 3.579385837450354e-07, "loss": 0.7359, "mean_token_accuracy": 0.7716657683253288, "num_tokens": 887838.0, "step": 740 }, { "entropy": 2.2045844838023188, "epoch": 0.0023249343787271606, "grad_norm": 14.715704917907715, "learning_rate": 3.627821369756854e-07, "loss": 0.7344, "mean_token_accuracy": 0.7731250256299973, "num_tokens": 900117.0, "step": 750 }, { "entropy": 2.261673641204834, "epoch": 0.002355933503776856, "grad_norm": 14.88746166229248, "learning_rate": 3.676256902063354e-07, "loss": 0.7627, "mean_token_accuracy": 0.7688748240470886, "num_tokens": 912552.0, "step": 760 }, { "entropy": 2.318584197759628, "epoch": 0.002386932628826551, "grad_norm": 15.337506294250488, "learning_rate": 3.7246924343698536e-07, "loss": 0.8543, "mean_token_accuracy": 0.7685193121433258, "num_tokens": 923351.0, "step": 770 }, { "entropy": 2.198628255724907, "epoch": 0.002417931753876247, "grad_norm": 16.160091400146484, "learning_rate": 3.773127966676354e-07, "loss": 0.7398, "mean_token_accuracy": 0.7804590031504631, "num_tokens": 935777.0, "step": 780 }, { "entropy": 2.291598491370678, "epoch": 0.002448930878925942, "grad_norm": 7.9013800621032715, "learning_rate": 3.821563498982854e-07, "loss": 0.7758, "mean_token_accuracy": 0.7750082641839982, "num_tokens": 947492.0, "step": 790 }, { "entropy": 2.3038537070155143, "epoch": 0.002479930003975638, "grad_norm": 16.498620986938477, "learning_rate": 3.8699990312893544e-07, "loss": 0.7678, "mean_token_accuracy": 0.7833232745528221, "num_tokens": 958770.0, "step": 800 }, { "entropy": 2.2682218730449675, "epoch": 0.0025109291290253332, "grad_norm": 15.202181816101074, "learning_rate": 3.9184345635958543e-07, "loss": 0.8518, "mean_token_accuracy": 0.7557914197444916, "num_tokens": 970777.0, "step": 810 }, { "entropy": 2.199041871726513, "epoch": 0.0025419282540750285, "grad_norm": 13.799867630004883, "learning_rate": 3.9668700959023547e-07, "loss": 0.7567, "mean_token_accuracy": 0.7677057057619094, "num_tokens": 982601.0, "step": 820 }, { "entropy": 2.2466016352176665, "epoch": 0.0025729273791247243, "grad_norm": 14.757229804992676, "learning_rate": 4.015305628208854e-07, "loss": 0.776, "mean_token_accuracy": 0.7803550854325294, "num_tokens": 994451.0, "step": 830 }, { "entropy": 2.304102659225464, "epoch": 0.0026039265041744196, "grad_norm": 16.70046615600586, "learning_rate": 4.0637411605153545e-07, "loss": 0.8395, "mean_token_accuracy": 0.7707262724637985, "num_tokens": 1005368.0, "step": 840 }, { "entropy": 2.2025340750813482, "epoch": 0.0026349256292241153, "grad_norm": 13.70840835571289, "learning_rate": 4.1121766928218544e-07, "loss": 0.722, "mean_token_accuracy": 0.7827607110142708, "num_tokens": 1017690.0, "step": 850 }, { "entropy": 2.1886146038770677, "epoch": 0.0026659247542738106, "grad_norm": 13.6995849609375, "learning_rate": 4.160612225128355e-07, "loss": 0.6548, "mean_token_accuracy": 0.7894467368721962, "num_tokens": 1030737.0, "step": 860 }, { "entropy": 2.119298684597015, "epoch": 0.002696923879323506, "grad_norm": 16.397750854492188, "learning_rate": 4.2090477574348547e-07, "loss": 0.6458, "mean_token_accuracy": 0.7952458307147026, "num_tokens": 1044784.0, "step": 870 }, { "entropy": 2.243351861834526, "epoch": 0.0027279230043732016, "grad_norm": 14.533041954040527, "learning_rate": 4.257483289741354e-07, "loss": 0.7294, "mean_token_accuracy": 0.7760630249977112, "num_tokens": 1056942.0, "step": 880 }, { "entropy": 2.3225993037223818, "epoch": 0.002758922129422897, "grad_norm": 16.506406784057617, "learning_rate": 4.3059188220478545e-07, "loss": 0.8633, "mean_token_accuracy": 0.7579209730029106, "num_tokens": 1067838.0, "step": 890 }, { "entropy": 2.0741579085588455, "epoch": 0.0027899212544725926, "grad_norm": 11.532833099365234, "learning_rate": 4.3543543543543544e-07, "loss": 0.6632, "mean_token_accuracy": 0.7945702284574508, "num_tokens": 1082957.0, "step": 900 }, { "entropy": 2.2643200919032096, "epoch": 0.002820920379522288, "grad_norm": 15.212760925292969, "learning_rate": 4.402789886660855e-07, "loss": 0.8228, "mean_token_accuracy": 0.76753601282835, "num_tokens": 1093997.0, "step": 910 }, { "entropy": 2.2407288014888764, "epoch": 0.0028519195045719837, "grad_norm": 16.18030548095703, "learning_rate": 4.451225418967355e-07, "loss": 0.8187, "mean_token_accuracy": 0.7693384140729904, "num_tokens": 1106072.0, "step": 920 }, { "entropy": 2.219868388772011, "epoch": 0.002882918629621679, "grad_norm": 16.22972297668457, "learning_rate": 4.499660951273855e-07, "loss": 0.7272, "mean_token_accuracy": 0.7875460609793663, "num_tokens": 1118504.0, "step": 930 }, { "entropy": 2.300278900563717, "epoch": 0.0029139177546713742, "grad_norm": 16.145498275756836, "learning_rate": 4.5480964835803545e-07, "loss": 0.7712, "mean_token_accuracy": 0.7815862789750099, "num_tokens": 1129720.0, "step": 940 }, { "entropy": 2.3173892736434936, "epoch": 0.00294491687972107, "grad_norm": 14.980684280395508, "learning_rate": 4.596532015886855e-07, "loss": 0.9265, "mean_token_accuracy": 0.7435617208480835, "num_tokens": 1140306.0, "step": 950 }, { "entropy": 2.172358636558056, "epoch": 0.0029759160047707653, "grad_norm": 14.839875221252441, "learning_rate": 4.644967548193355e-07, "loss": 0.7082, "mean_token_accuracy": 0.7885784819722176, "num_tokens": 1152921.0, "step": 960 }, { "entropy": 2.2328238427639007, "epoch": 0.003006915129820461, "grad_norm": 13.23880386352539, "learning_rate": 4.6934030804998553e-07, "loss": 0.7795, "mean_token_accuracy": 0.7753711074590683, "num_tokens": 1164657.0, "step": 970 }, { "entropy": 2.114644192159176, "epoch": 0.0030379142548701563, "grad_norm": 11.22083854675293, "learning_rate": 4.741838612806355e-07, "loss": 0.6744, "mean_token_accuracy": 0.7775777190923691, "num_tokens": 1177945.0, "step": 980 }, { "entropy": 2.1031323984265327, "epoch": 0.0030689133799198516, "grad_norm": 4.399860382080078, "learning_rate": 4.790274145112855e-07, "loss": 0.668, "mean_token_accuracy": 0.7964637473225593, "num_tokens": 1190989.0, "step": 990 }, { "entropy": 2.2336785733699798, "epoch": 0.0030999125049695473, "grad_norm": 15.71125602722168, "learning_rate": 4.838709677419355e-07, "loss": 0.772, "mean_token_accuracy": 0.7712054967880249, "num_tokens": 1201872.0, "step": 1000 }, { "entropy": 2.2714061349630357, "epoch": 0.0031309116300192426, "grad_norm": 15.550898551940918, "learning_rate": 4.887145209725856e-07, "loss": 0.795, "mean_token_accuracy": 0.7752420023083687, "num_tokens": 1213279.0, "step": 1010 }, { "entropy": 2.111206144094467, "epoch": 0.0031619107550689383, "grad_norm": 14.369123458862305, "learning_rate": 4.935580742032355e-07, "loss": 0.6963, "mean_token_accuracy": 0.7931261286139488, "num_tokens": 1226500.0, "step": 1020 }, { "entropy": 2.17624341994524, "epoch": 0.0031929098801186336, "grad_norm": 18.43202781677246, "learning_rate": 4.984016274338856e-07, "loss": 0.7139, "mean_token_accuracy": 0.7850997045636177, "num_tokens": 1238540.0, "step": 1030 }, { "entropy": 2.209955517947674, "epoch": 0.003223909005168329, "grad_norm": 16.401142120361328, "learning_rate": 5.032451806645356e-07, "loss": 0.7126, "mean_token_accuracy": 0.7926564201712608, "num_tokens": 1250753.0, "step": 1040 }, { "entropy": 2.267963781952858, "epoch": 0.0032549081302180247, "grad_norm": 15.670921325683594, "learning_rate": 5.080887338951856e-07, "loss": 0.8273, "mean_token_accuracy": 0.7623932763934136, "num_tokens": 1262037.0, "step": 1050 }, { "entropy": 1.9811606004834175, "epoch": 0.00328590725526772, "grad_norm": 14.190478324890137, "learning_rate": 5.129322871258355e-07, "loss": 0.5722, "mean_token_accuracy": 0.7981273889541626, "num_tokens": 1276993.0, "step": 1060 }, { "entropy": 2.1613825380802156, "epoch": 0.0033169063803174157, "grad_norm": 7.023738861083984, "learning_rate": 5.177758403564856e-07, "loss": 0.6904, "mean_token_accuracy": 0.7866356909275055, "num_tokens": 1289051.0, "step": 1070 }, { "entropy": 2.25160670876503, "epoch": 0.003347905505367111, "grad_norm": 14.642976760864258, "learning_rate": 5.226193935871355e-07, "loss": 0.8145, "mean_token_accuracy": 0.7689608931541443, "num_tokens": 1300620.0, "step": 1080 }, { "entropy": 2.2016865596175195, "epoch": 0.0033789046304168063, "grad_norm": 15.589518547058105, "learning_rate": 5.274629468177855e-07, "loss": 0.7511, "mean_token_accuracy": 0.7763894140720368, "num_tokens": 1312252.0, "step": 1090 }, { "entropy": 2.2270815491676332, "epoch": 0.003409903755466502, "grad_norm": 14.04956340789795, "learning_rate": 5.323065000484356e-07, "loss": 0.7445, "mean_token_accuracy": 0.7833787277340889, "num_tokens": 1323950.0, "step": 1100 }, { "entropy": 2.1960760056972504, "epoch": 0.0034409028805161973, "grad_norm": 17.72970199584961, "learning_rate": 5.371500532790856e-07, "loss": 0.7735, "mean_token_accuracy": 0.7613553330302238, "num_tokens": 1336895.0, "step": 1110 }, { "entropy": 2.0831907019019127, "epoch": 0.003471902005565893, "grad_norm": 10.339810371398926, "learning_rate": 5.419936065097356e-07, "loss": 0.6841, "mean_token_accuracy": 0.7895815879106521, "num_tokens": 1350813.0, "step": 1120 }, { "entropy": 2.1665103510022163, "epoch": 0.0035029011306155883, "grad_norm": 14.524060249328613, "learning_rate": 5.468371597403856e-07, "loss": 0.7598, "mean_token_accuracy": 0.7754980430006981, "num_tokens": 1362111.0, "step": 1130 }, { "entropy": 2.217361180484295, "epoch": 0.003533900255665284, "grad_norm": 14.30985164642334, "learning_rate": 5.516807129710356e-07, "loss": 0.7568, "mean_token_accuracy": 0.782513102889061, "num_tokens": 1373317.0, "step": 1140 }, { "entropy": 2.1852347582578657, "epoch": 0.0035648993807149794, "grad_norm": 16.35944938659668, "learning_rate": 5.565242662016856e-07, "loss": 0.7744, "mean_token_accuracy": 0.7764955341815949, "num_tokens": 1384737.0, "step": 1150 }, { "entropy": 2.2084068596363067, "epoch": 0.0035958985057646746, "grad_norm": 18.96138572692871, "learning_rate": 5.613678194323357e-07, "loss": 0.8437, "mean_token_accuracy": 0.7699245184659957, "num_tokens": 1396007.0, "step": 1160 }, { "entropy": 2.1020758375525475, "epoch": 0.0036268976308143704, "grad_norm": 7.519232749938965, "learning_rate": 5.662113726629856e-07, "loss": 0.6637, "mean_token_accuracy": 0.7951419726014137, "num_tokens": 1408380.0, "step": 1170 }, { "entropy": 2.157642234861851, "epoch": 0.0036578967558640657, "grad_norm": 15.029388427734375, "learning_rate": 5.710549258936356e-07, "loss": 0.7325, "mean_token_accuracy": 0.785622188448906, "num_tokens": 1420661.0, "step": 1180 }, { "entropy": 2.037250469624996, "epoch": 0.0036888958809137614, "grad_norm": 15.331890106201172, "learning_rate": 5.758984791242856e-07, "loss": 0.6885, "mean_token_accuracy": 0.7827024027705193, "num_tokens": 1434538.0, "step": 1190 }, { "entropy": 2.085458919405937, "epoch": 0.0037198950059634567, "grad_norm": 17.225831985473633, "learning_rate": 5.807420323549356e-07, "loss": 0.6795, "mean_token_accuracy": 0.7806023344397545, "num_tokens": 1448017.0, "step": 1200 }, { "entropy": 2.104422791302204, "epoch": 0.003750894131013152, "grad_norm": 16.510398864746094, "learning_rate": 5.855855855855856e-07, "loss": 0.7358, "mean_token_accuracy": 0.7740836933255195, "num_tokens": 1460208.0, "step": 1210 }, { "entropy": 2.1881393998861314, "epoch": 0.0037818932560628477, "grad_norm": 16.095951080322266, "learning_rate": 5.904291388162357e-07, "loss": 0.762, "mean_token_accuracy": 0.7820860460400582, "num_tokens": 1471627.0, "step": 1220 }, { "entropy": 2.092408487200737, "epoch": 0.003812892381112543, "grad_norm": 16.695289611816406, "learning_rate": 5.952726920468856e-07, "loss": 0.7157, "mean_token_accuracy": 0.787506853044033, "num_tokens": 1483004.0, "step": 1230 }, { "entropy": 2.0120772644877434, "epoch": 0.0038438915061622387, "grad_norm": 15.932051658630371, "learning_rate": 6.001162452775357e-07, "loss": 0.6887, "mean_token_accuracy": 0.7973495230078698, "num_tokens": 1496481.0, "step": 1240 }, { "entropy": 2.160550518333912, "epoch": 0.003874890631211934, "grad_norm": 15.765700340270996, "learning_rate": 6.049597985081857e-07, "loss": 0.7399, "mean_token_accuracy": 0.7820422008633614, "num_tokens": 1508377.0, "step": 1250 }, { "entropy": 2.1905842304229735, "epoch": 0.0039058897562616293, "grad_norm": 16.464012145996094, "learning_rate": 6.098033517388357e-07, "loss": 0.8825, "mean_token_accuracy": 0.7656736433506012, "num_tokens": 1519413.0, "step": 1260 }, { "entropy": 2.1988802313804627, "epoch": 0.003936888881311325, "grad_norm": 13.513689041137695, "learning_rate": 6.146469049694856e-07, "loss": 0.8144, "mean_token_accuracy": 0.7779660537838936, "num_tokens": 1530217.0, "step": 1270 }, { "entropy": 2.183208304643631, "epoch": 0.00396788800636102, "grad_norm": 7.471315383911133, "learning_rate": 6.194904582001357e-07, "loss": 0.7551, "mean_token_accuracy": 0.7802443280816078, "num_tokens": 1542024.0, "step": 1280 }, { "entropy": 2.0785870283842085, "epoch": 0.003998887131410716, "grad_norm": 17.728321075439453, "learning_rate": 6.243340114307856e-07, "loss": 0.6849, "mean_token_accuracy": 0.7881320580840111, "num_tokens": 1555038.0, "step": 1290 }, { "entropy": 2.0753221943974496, "epoch": 0.004029886256460412, "grad_norm": 13.686918258666992, "learning_rate": 6.291775646614356e-07, "loss": 0.6741, "mean_token_accuracy": 0.7906556352972984, "num_tokens": 1567185.0, "step": 1300 }, { "entropy": 2.083100973069668, "epoch": 0.004060885381510107, "grad_norm": 7.613674163818359, "learning_rate": 6.340211178920856e-07, "loss": 0.6265, "mean_token_accuracy": 0.7916320934891701, "num_tokens": 1580688.0, "step": 1310 }, { "entropy": 2.227837671339512, "epoch": 0.004091884506559802, "grad_norm": 17.518901824951172, "learning_rate": 6.388646711227357e-07, "loss": 0.7923, "mean_token_accuracy": 0.7738375559449195, "num_tokens": 1592038.0, "step": 1320 }, { "entropy": 2.1594600453972816, "epoch": 0.004122883631609498, "grad_norm": 15.655396461486816, "learning_rate": 6.437082243533857e-07, "loss": 0.7167, "mean_token_accuracy": 0.7852950558066368, "num_tokens": 1603983.0, "step": 1330 }, { "entropy": 2.0584792092442514, "epoch": 0.004153882756659193, "grad_norm": 12.117267608642578, "learning_rate": 6.485517775840357e-07, "loss": 0.6728, "mean_token_accuracy": 0.7903219699859619, "num_tokens": 1616249.0, "step": 1340 }, { "entropy": 2.088547757267952, "epoch": 0.004184881881708889, "grad_norm": 16.858585357666016, "learning_rate": 6.533953308146857e-07, "loss": 0.7391, "mean_token_accuracy": 0.778919804096222, "num_tokens": 1628549.0, "step": 1350 }, { "entropy": 2.1754756391048433, "epoch": 0.0042158810067585845, "grad_norm": 15.385615348815918, "learning_rate": 6.582388840453358e-07, "loss": 0.7799, "mean_token_accuracy": 0.7731549352407455, "num_tokens": 1639876.0, "step": 1360 }, { "entropy": 2.127278658747673, "epoch": 0.004246880131808279, "grad_norm": 8.056036949157715, "learning_rate": 6.630824372759858e-07, "loss": 0.7034, "mean_token_accuracy": 0.7827352419495582, "num_tokens": 1652097.0, "step": 1370 }, { "entropy": 2.0123574048280717, "epoch": 0.004277879256857975, "grad_norm": 6.172573566436768, "learning_rate": 6.679259905066357e-07, "loss": 0.6069, "mean_token_accuracy": 0.7994264915585518, "num_tokens": 1665043.0, "step": 1380 }, { "entropy": 2.096929004788399, "epoch": 0.004308878381907671, "grad_norm": 8.49801254272461, "learning_rate": 6.727695437372856e-07, "loss": 0.7136, "mean_token_accuracy": 0.7863008975982666, "num_tokens": 1677725.0, "step": 1390 }, { "entropy": 2.0824546411633493, "epoch": 0.0043398775069573665, "grad_norm": 14.584894180297852, "learning_rate": 6.776130969679358e-07, "loss": 0.7036, "mean_token_accuracy": 0.7845976158976555, "num_tokens": 1690896.0, "step": 1400 }, { "entropy": 2.13474909812212, "epoch": 0.004370876632007061, "grad_norm": 13.656248092651367, "learning_rate": 6.824566501985857e-07, "loss": 0.732, "mean_token_accuracy": 0.7771820738911629, "num_tokens": 1702660.0, "step": 1410 }, { "entropy": 2.0404009863734247, "epoch": 0.004401875757056757, "grad_norm": 13.051438331604004, "learning_rate": 6.873002034292357e-07, "loss": 0.6294, "mean_token_accuracy": 0.7985868468880654, "num_tokens": 1714725.0, "step": 1420 }, { "entropy": 2.0748074553906917, "epoch": 0.004432874882106453, "grad_norm": 13.570876121520996, "learning_rate": 6.921437566598857e-07, "loss": 0.6639, "mean_token_accuracy": 0.7878038331866264, "num_tokens": 1728017.0, "step": 1430 }, { "entropy": 2.072512838244438, "epoch": 0.004463874007156148, "grad_norm": 7.851343154907227, "learning_rate": 6.969873098905358e-07, "loss": 0.7054, "mean_token_accuracy": 0.7887277230620384, "num_tokens": 1740469.0, "step": 1440 }, { "entropy": 2.15796595364809, "epoch": 0.004494873132205843, "grad_norm": 13.530000686645508, "learning_rate": 7.018308631211858e-07, "loss": 0.7452, "mean_token_accuracy": 0.7796575650572777, "num_tokens": 1751979.0, "step": 1450 }, { "entropy": 2.0756077498197554, "epoch": 0.004525872257255539, "grad_norm": 14.499431610107422, "learning_rate": 7.066744163518358e-07, "loss": 0.7363, "mean_token_accuracy": 0.7842569574713707, "num_tokens": 1764493.0, "step": 1460 }, { "entropy": 1.988140694797039, "epoch": 0.004556871382305234, "grad_norm": 6.652562618255615, "learning_rate": 7.115179695824858e-07, "loss": 0.6319, "mean_token_accuracy": 0.7997438430786132, "num_tokens": 1777296.0, "step": 1470 }, { "entropy": 2.0755784660577774, "epoch": 0.00458787050735493, "grad_norm": 13.8985013961792, "learning_rate": 7.163615228131359e-07, "loss": 0.7402, "mean_token_accuracy": 0.7709438994526863, "num_tokens": 1789171.0, "step": 1480 }, { "entropy": 2.0258029848337173, "epoch": 0.0046188696324046255, "grad_norm": 7.679947376251221, "learning_rate": 7.212050760437858e-07, "loss": 0.6942, "mean_token_accuracy": 0.785995414853096, "num_tokens": 1801926.0, "step": 1490 }, { "entropy": 2.0404342114925385, "epoch": 0.004649868757454321, "grad_norm": 6.81480598449707, "learning_rate": 7.260486292744357e-07, "loss": 0.6672, "mean_token_accuracy": 0.7987658172845841, "num_tokens": 1814939.0, "step": 1500 }, { "entropy": 2.071673668920994, "epoch": 0.004680867882504016, "grad_norm": 15.106314659118652, "learning_rate": 7.308921825050857e-07, "loss": 0.6894, "mean_token_accuracy": 0.7804762050509453, "num_tokens": 1827340.0, "step": 1510 }, { "entropy": 2.0692993998527527, "epoch": 0.004711867007553712, "grad_norm": 15.853338241577148, "learning_rate": 7.357357357357357e-07, "loss": 0.7129, "mean_token_accuracy": 0.7840728506445884, "num_tokens": 1839510.0, "step": 1520 }, { "entropy": 2.135234770178795, "epoch": 0.0047428661326034075, "grad_norm": 6.517123699188232, "learning_rate": 7.405792889663858e-07, "loss": 0.8329, "mean_token_accuracy": 0.7573605760931968, "num_tokens": 1851013.0, "step": 1530 }, { "entropy": 2.036342756450176, "epoch": 0.004773865257653102, "grad_norm": 13.540575981140137, "learning_rate": 7.454228421970358e-07, "loss": 0.6977, "mean_token_accuracy": 0.7853451654314995, "num_tokens": 1863818.0, "step": 1540 }, { "entropy": 2.087058800458908, "epoch": 0.004804864382702798, "grad_norm": 8.103325843811035, "learning_rate": 7.502663954276858e-07, "loss": 0.7266, "mean_token_accuracy": 0.7897397369146347, "num_tokens": 1875751.0, "step": 1550 }, { "entropy": 2.091054494678974, "epoch": 0.004835863507752494, "grad_norm": 16.008771896362305, "learning_rate": 7.551099486583358e-07, "loss": 0.7302, "mean_token_accuracy": 0.7836149573326111, "num_tokens": 1887588.0, "step": 1560 }, { "entropy": 2.0431090027093886, "epoch": 0.0048668626328021896, "grad_norm": 8.24061393737793, "learning_rate": 7.599535018889859e-07, "loss": 0.6644, "mean_token_accuracy": 0.8029666885733604, "num_tokens": 1899870.0, "step": 1570 }, { "entropy": 1.8285046428442002, "epoch": 0.004897861757851884, "grad_norm": 7.983487606048584, "learning_rate": 7.647970551196359e-07, "loss": 0.5009, "mean_token_accuracy": 0.8131875425577164, "num_tokens": 1915664.0, "step": 1580 }, { "entropy": 2.086928182840347, "epoch": 0.00492886088290158, "grad_norm": 13.69112777709961, "learning_rate": 7.696406083502859e-07, "loss": 0.6791, "mean_token_accuracy": 0.7944577828049659, "num_tokens": 1927081.0, "step": 1590 }, { "entropy": 2.0402188524603844, "epoch": 0.004959860007951276, "grad_norm": 15.185161590576172, "learning_rate": 7.744841615809357e-07, "loss": 0.7159, "mean_token_accuracy": 0.7936604365706443, "num_tokens": 1940073.0, "step": 1600 }, { "entropy": 2.149085035920143, "epoch": 0.004990859133000971, "grad_norm": 16.92563247680664, "learning_rate": 7.793277148115859e-07, "loss": 0.7675, "mean_token_accuracy": 0.7764922738075256, "num_tokens": 1951448.0, "step": 1610 }, { "entropy": 2.099419781565666, "epoch": 0.0050218582580506665, "grad_norm": 15.367118835449219, "learning_rate": 7.841712680422358e-07, "loss": 0.7569, "mean_token_accuracy": 0.7802120968699455, "num_tokens": 1962645.0, "step": 1620 }, { "entropy": 2.0213639751076697, "epoch": 0.005052857383100362, "grad_norm": 13.852250099182129, "learning_rate": 7.890148212728858e-07, "loss": 0.6431, "mean_token_accuracy": 0.7958111792802811, "num_tokens": 1974941.0, "step": 1630 }, { "entropy": 2.1066105023026465, "epoch": 0.005083856508150057, "grad_norm": 15.502884864807129, "learning_rate": 7.938583745035358e-07, "loss": 0.6782, "mean_token_accuracy": 0.7936943307518959, "num_tokens": 1987185.0, "step": 1640 }, { "entropy": 2.0182456478476523, "epoch": 0.005114855633199753, "grad_norm": 7.306271076202393, "learning_rate": 7.987019277341859e-07, "loss": 0.5772, "mean_token_accuracy": 0.8059568896889686, "num_tokens": 2000487.0, "step": 1650 }, { "entropy": 2.147824813425541, "epoch": 0.0051458547582494485, "grad_norm": 18.491527557373047, "learning_rate": 8.035454809648359e-07, "loss": 0.8096, "mean_token_accuracy": 0.7748027428984642, "num_tokens": 2011914.0, "step": 1660 }, { "entropy": 1.9622011199593543, "epoch": 0.005176853883299144, "grad_norm": 13.780349731445312, "learning_rate": 8.083890341954859e-07, "loss": 0.6272, "mean_token_accuracy": 0.8053760439157486, "num_tokens": 2025757.0, "step": 1670 }, { "entropy": 2.098005874454975, "epoch": 0.005207853008348839, "grad_norm": 14.81534194946289, "learning_rate": 8.132325874261359e-07, "loss": 0.7484, "mean_token_accuracy": 0.7781560063362122, "num_tokens": 2037711.0, "step": 1680 }, { "entropy": 2.052612027525902, "epoch": 0.005238852133398535, "grad_norm": 8.133528709411621, "learning_rate": 8.18076140656786e-07, "loss": 0.6822, "mean_token_accuracy": 0.79173034876585, "num_tokens": 2049738.0, "step": 1690 }, { "entropy": 1.9735546708106995, "epoch": 0.005269851258448231, "grad_norm": 16.37797737121582, "learning_rate": 8.22919693887436e-07, "loss": 0.6594, "mean_token_accuracy": 0.7950135871767998, "num_tokens": 2062316.0, "step": 1700 }, { "entropy": 2.029072532057762, "epoch": 0.0053008503834979254, "grad_norm": 5.160717010498047, "learning_rate": 8.277632471180859e-07, "loss": 0.6559, "mean_token_accuracy": 0.7948900654911994, "num_tokens": 2075028.0, "step": 1710 }, { "entropy": 2.0763241961598395, "epoch": 0.005331849508547621, "grad_norm": 15.46190357208252, "learning_rate": 8.326068003487358e-07, "loss": 0.684, "mean_token_accuracy": 0.7929447039961814, "num_tokens": 2086702.0, "step": 1720 }, { "entropy": 2.141001485288143, "epoch": 0.005362848633597317, "grad_norm": 13.897162437438965, "learning_rate": 8.374503535793858e-07, "loss": 0.7299, "mean_token_accuracy": 0.7951222896575928, "num_tokens": 2097706.0, "step": 1730 }, { "entropy": 2.0640996396541595, "epoch": 0.005393847758647012, "grad_norm": 13.737348556518555, "learning_rate": 8.422939068100359e-07, "loss": 0.7175, "mean_token_accuracy": 0.7896956667304039, "num_tokens": 2108391.0, "step": 1740 }, { "entropy": 1.9356930390000344, "epoch": 0.0054248468836967075, "grad_norm": 8.433479309082031, "learning_rate": 8.471374600406859e-07, "loss": 0.631, "mean_token_accuracy": 0.7932530760765075, "num_tokens": 2120471.0, "step": 1750 }, { "entropy": 2.0442377462983132, "epoch": 0.005455846008746403, "grad_norm": 18.229915618896484, "learning_rate": 8.519810132713359e-07, "loss": 0.7221, "mean_token_accuracy": 0.7952359080314636, "num_tokens": 2132167.0, "step": 1760 }, { "entropy": 2.04462625682354, "epoch": 0.005486845133796099, "grad_norm": 14.919930458068848, "learning_rate": 8.568245665019859e-07, "loss": 0.699, "mean_token_accuracy": 0.7886338919401169, "num_tokens": 2143973.0, "step": 1770 }, { "entropy": 2.0158157765865328, "epoch": 0.005517844258845794, "grad_norm": 19.1096248626709, "learning_rate": 8.61668119732636e-07, "loss": 0.7034, "mean_token_accuracy": 0.7857451155781746, "num_tokens": 2156439.0, "step": 1780 }, { "entropy": 2.1503632470965384, "epoch": 0.0055488433838954895, "grad_norm": 17.376968383789062, "learning_rate": 8.66511672963286e-07, "loss": 0.742, "mean_token_accuracy": 0.7804848656058312, "num_tokens": 2167869.0, "step": 1790 }, { "entropy": 2.0076410725712774, "epoch": 0.005579842508945185, "grad_norm": 18.342008590698242, "learning_rate": 8.71355226193936e-07, "loss": 0.642, "mean_token_accuracy": 0.8009541600942611, "num_tokens": 2180188.0, "step": 1800 }, { "entropy": 2.0049708664417265, "epoch": 0.00561084163399488, "grad_norm": 8.827497482299805, "learning_rate": 8.761987794245858e-07, "loss": 0.6828, "mean_token_accuracy": 0.7910264268517494, "num_tokens": 2192579.0, "step": 1810 }, { "entropy": 2.085582806169987, "epoch": 0.005641840759044576, "grad_norm": 17.707990646362305, "learning_rate": 8.81042332655236e-07, "loss": 0.7866, "mean_token_accuracy": 0.7726237401366234, "num_tokens": 2204679.0, "step": 1820 }, { "entropy": 2.0235436514019964, "epoch": 0.005672839884094272, "grad_norm": 9.473557472229004, "learning_rate": 8.858858858858859e-07, "loss": 0.7397, "mean_token_accuracy": 0.7853599205613137, "num_tokens": 2216727.0, "step": 1830 }, { "entropy": 2.0955518752336504, "epoch": 0.005703839009143967, "grad_norm": 14.616801261901855, "learning_rate": 8.907294391165359e-07, "loss": 0.7601, "mean_token_accuracy": 0.7755588620901108, "num_tokens": 2228738.0, "step": 1840 }, { "entropy": 2.0462396532297134, "epoch": 0.005734838134193662, "grad_norm": 12.282236099243164, "learning_rate": 8.955729923471859e-07, "loss": 0.7061, "mean_token_accuracy": 0.7882364228367805, "num_tokens": 2240796.0, "step": 1850 }, { "entropy": 2.0632716536521913, "epoch": 0.005765837259243358, "grad_norm": 18.294025421142578, "learning_rate": 9.00416545577836e-07, "loss": 0.72, "mean_token_accuracy": 0.7841200634837151, "num_tokens": 2252408.0, "step": 1860 }, { "entropy": 2.0464861541986465, "epoch": 0.005796836384293054, "grad_norm": 13.904800415039062, "learning_rate": 9.05260098808486e-07, "loss": 0.7355, "mean_token_accuracy": 0.7857467010617256, "num_tokens": 2263031.0, "step": 1870 }, { "entropy": 1.9634292259812356, "epoch": 0.0058278355093427485, "grad_norm": 7.740158557891846, "learning_rate": 9.10103652039136e-07, "loss": 0.6566, "mean_token_accuracy": 0.7917238727211953, "num_tokens": 2275474.0, "step": 1880 }, { "entropy": 1.934257398545742, "epoch": 0.005858834634392444, "grad_norm": 7.317460060119629, "learning_rate": 9.14947205269786e-07, "loss": 0.6318, "mean_token_accuracy": 0.8091923862695694, "num_tokens": 2287535.0, "step": 1890 }, { "entropy": 2.0293245404958724, "epoch": 0.00588983375944214, "grad_norm": 13.60265064239502, "learning_rate": 9.197907585004361e-07, "loss": 0.6919, "mean_token_accuracy": 0.7935610696673393, "num_tokens": 2299006.0, "step": 1900 }, { "entropy": 1.9936009973287583, "epoch": 0.005920832884491835, "grad_norm": 7.678701400756836, "learning_rate": 9.246343117310861e-07, "loss": 0.6424, "mean_token_accuracy": 0.7983625203371048, "num_tokens": 2310660.0, "step": 1910 }, { "entropy": 1.910171502828598, "epoch": 0.0059518320095415305, "grad_norm": 4.376688480377197, "learning_rate": 9.29477864961736e-07, "loss": 0.6749, "mean_token_accuracy": 0.791431900858879, "num_tokens": 2323660.0, "step": 1920 }, { "entropy": 1.9754113674163818, "epoch": 0.005982831134591226, "grad_norm": 16.712688446044922, "learning_rate": 9.343214181923859e-07, "loss": 0.6853, "mean_token_accuracy": 0.7904626414179802, "num_tokens": 2335479.0, "step": 1930 }, { "entropy": 1.9469898149371148, "epoch": 0.006013830259640922, "grad_norm": 7.1344733238220215, "learning_rate": 9.391649714230359e-07, "loss": 0.6714, "mean_token_accuracy": 0.7852405294775963, "num_tokens": 2347294.0, "step": 1940 }, { "entropy": 2.031823492050171, "epoch": 0.006044829384690617, "grad_norm": 14.974668502807617, "learning_rate": 9.44008524653686e-07, "loss": 0.7696, "mean_token_accuracy": 0.7771394848823547, "num_tokens": 2358885.0, "step": 1950 }, { "entropy": 2.001576192677021, "epoch": 0.006075828509740313, "grad_norm": 14.197503089904785, "learning_rate": 9.48852077884336e-07, "loss": 0.7037, "mean_token_accuracy": 0.7951447427272796, "num_tokens": 2370132.0, "step": 1960 }, { "entropy": 1.9794816732406617, "epoch": 0.006106827634790008, "grad_norm": 14.473783493041992, "learning_rate": 9.53695631114986e-07, "loss": 0.7147, "mean_token_accuracy": 0.7915472134947776, "num_tokens": 2381547.0, "step": 1970 }, { "entropy": 2.093907243013382, "epoch": 0.006137826759839703, "grad_norm": 14.241476058959961, "learning_rate": 9.585391843456359e-07, "loss": 0.8013, "mean_token_accuracy": 0.7786402046680451, "num_tokens": 2392334.0, "step": 1980 }, { "entropy": 1.9560458779335022, "epoch": 0.006168825884889399, "grad_norm": 13.721399307250977, "learning_rate": 9.633827375762862e-07, "loss": 0.6542, "mean_token_accuracy": 0.7954897537827492, "num_tokens": 2405128.0, "step": 1990 }, { "entropy": 2.0026605874300003, "epoch": 0.006199825009939095, "grad_norm": 16.695531845092773, "learning_rate": 9.68226290806936e-07, "loss": 0.7113, "mean_token_accuracy": 0.7941533356904984, "num_tokens": 2416699.0, "step": 2000 }, { "entropy": 1.9149562925100327, "epoch": 0.00623082413498879, "grad_norm": 13.72096061706543, "learning_rate": 9.73069844037586e-07, "loss": 0.6126, "mean_token_accuracy": 0.7917631477117538, "num_tokens": 2429162.0, "step": 2010 }, { "entropy": 1.981172299385071, "epoch": 0.006261823260038485, "grad_norm": 15.749645233154297, "learning_rate": 9.77913397268236e-07, "loss": 0.7357, "mean_token_accuracy": 0.7887249678373337, "num_tokens": 2440565.0, "step": 2020 }, { "entropy": 1.9732341334223746, "epoch": 0.006292822385088181, "grad_norm": 16.25255012512207, "learning_rate": 9.827569504988861e-07, "loss": 0.6904, "mean_token_accuracy": 0.7926795452833175, "num_tokens": 2452318.0, "step": 2030 }, { "entropy": 1.9686682283878327, "epoch": 0.006323821510137877, "grad_norm": 7.215946197509766, "learning_rate": 9.87600503729536e-07, "loss": 0.6178, "mean_token_accuracy": 0.7987834200263023, "num_tokens": 2464480.0, "step": 2040 }, { "entropy": 2.0191121250391006, "epoch": 0.0063548206351875716, "grad_norm": 7.287489891052246, "learning_rate": 9.924440569601861e-07, "loss": 0.751, "mean_token_accuracy": 0.7695384725928307, "num_tokens": 2477286.0, "step": 2050 }, { "entropy": 1.9891038194298745, "epoch": 0.006385819760237267, "grad_norm": 15.255844116210938, "learning_rate": 9.97287610190836e-07, "loss": 0.6402, "mean_token_accuracy": 0.7988035991787911, "num_tokens": 2489470.0, "step": 2060 }, { "entropy": 2.0310388937592507, "epoch": 0.006416818885286963, "grad_norm": 18.825220108032227, "learning_rate": 1.0021311634214861e-06, "loss": 0.6939, "mean_token_accuracy": 0.790933045744896, "num_tokens": 2500898.0, "step": 2070 }, { "entropy": 1.9929038733243942, "epoch": 0.006447818010336658, "grad_norm": 12.69325065612793, "learning_rate": 1.0069747166521362e-06, "loss": 0.7442, "mean_token_accuracy": 0.7899109661579132, "num_tokens": 2511757.0, "step": 2080 }, { "entropy": 1.956763118505478, "epoch": 0.006478817135386354, "grad_norm": 11.434943199157715, "learning_rate": 1.011818269882786e-06, "loss": 0.7163, "mean_token_accuracy": 0.7898407876491547, "num_tokens": 2522888.0, "step": 2090 }, { "entropy": 1.97008508592844, "epoch": 0.006509816260436049, "grad_norm": 15.685938835144043, "learning_rate": 1.016661823113436e-06, "loss": 0.7396, "mean_token_accuracy": 0.785623998939991, "num_tokens": 2534453.0, "step": 2100 }, { "entropy": 1.9547904863953591, "epoch": 0.006540815385485745, "grad_norm": 13.821385383605957, "learning_rate": 1.021505376344086e-06, "loss": 0.7402, "mean_token_accuracy": 0.785305617749691, "num_tokens": 2546617.0, "step": 2110 }, { "entropy": 1.8934344440698623, "epoch": 0.00657181451053544, "grad_norm": 13.396963119506836, "learning_rate": 1.0263489295747362e-06, "loss": 0.627, "mean_token_accuracy": 0.8024709656834602, "num_tokens": 2559429.0, "step": 2120 }, { "entropy": 1.9755502566695213, "epoch": 0.006602813635585136, "grad_norm": 13.538336753845215, "learning_rate": 1.031192482805386e-06, "loss": 0.7059, "mean_token_accuracy": 0.7905629277229309, "num_tokens": 2570593.0, "step": 2130 }, { "entropy": 1.949264821410179, "epoch": 0.006633812760634831, "grad_norm": 14.021260261535645, "learning_rate": 1.0360360360360361e-06, "loss": 0.6666, "mean_token_accuracy": 0.7925188347697258, "num_tokens": 2582602.0, "step": 2140 }, { "entropy": 1.9639265209436416, "epoch": 0.006664811885684526, "grad_norm": 14.326904296875, "learning_rate": 1.040879589266686e-06, "loss": 0.7264, "mean_token_accuracy": 0.7851147443056107, "num_tokens": 2594982.0, "step": 2150 }, { "entropy": 1.908056390285492, "epoch": 0.006695811010734222, "grad_norm": 15.651636123657227, "learning_rate": 1.0457231424973361e-06, "loss": 0.6616, "mean_token_accuracy": 0.7910144224762916, "num_tokens": 2607900.0, "step": 2160 }, { "entropy": 2.064947435259819, "epoch": 0.006726810135783918, "grad_norm": 13.445158004760742, "learning_rate": 1.0505666957279862e-06, "loss": 0.7854, "mean_token_accuracy": 0.7805202215909958, "num_tokens": 2618173.0, "step": 2170 }, { "entropy": 1.9829859480261802, "epoch": 0.0067578092608336126, "grad_norm": 17.02464485168457, "learning_rate": 1.055410248958636e-06, "loss": 0.7521, "mean_token_accuracy": 0.7822321712970733, "num_tokens": 2629832.0, "step": 2180 }, { "entropy": 1.9977169573307036, "epoch": 0.006788808385883308, "grad_norm": 16.560504913330078, "learning_rate": 1.060253802189286e-06, "loss": 0.6964, "mean_token_accuracy": 0.793058268725872, "num_tokens": 2641302.0, "step": 2190 }, { "entropy": 2.0350762784481047, "epoch": 0.006819807510933004, "grad_norm": 16.779306411743164, "learning_rate": 1.0650973554199363e-06, "loss": 0.754, "mean_token_accuracy": 0.7842149198055267, "num_tokens": 2651948.0, "step": 2200 }, { "entropy": 1.9923300370573997, "epoch": 0.0068508066359827, "grad_norm": 16.477792739868164, "learning_rate": 1.0699409086505862e-06, "loss": 0.7457, "mean_token_accuracy": 0.7842650130391121, "num_tokens": 2663131.0, "step": 2210 }, { "entropy": 1.7988253235816956, "epoch": 0.006881805761032395, "grad_norm": 6.730621814727783, "learning_rate": 1.074784461881236e-06, "loss": 0.5973, "mean_token_accuracy": 0.80888991355896, "num_tokens": 2676405.0, "step": 2220 }, { "entropy": 1.9884469777345657, "epoch": 0.00691280488608209, "grad_norm": 15.052291870117188, "learning_rate": 1.0796280151118862e-06, "loss": 0.6849, "mean_token_accuracy": 0.7967025130987168, "num_tokens": 2688102.0, "step": 2230 }, { "entropy": 2.003503252565861, "epoch": 0.006943804011131786, "grad_norm": 7.750051498413086, "learning_rate": 1.0844715683425363e-06, "loss": 0.7153, "mean_token_accuracy": 0.7897115662693978, "num_tokens": 2699848.0, "step": 2240 }, { "entropy": 1.8995970517396927, "epoch": 0.006974803136181481, "grad_norm": 7.199272632598877, "learning_rate": 1.0893151215731861e-06, "loss": 0.5577, "mean_token_accuracy": 0.8024276942014694, "num_tokens": 2713648.0, "step": 2250 }, { "entropy": 2.017641696333885, "epoch": 0.007005802261231177, "grad_norm": 15.489433288574219, "learning_rate": 1.0941586748038362e-06, "loss": 0.7434, "mean_token_accuracy": 0.7756871730089188, "num_tokens": 2725047.0, "step": 2260 }, { "entropy": 1.895441135764122, "epoch": 0.007036801386280872, "grad_norm": 17.170063018798828, "learning_rate": 1.0990022280344861e-06, "loss": 0.6771, "mean_token_accuracy": 0.7931119576096535, "num_tokens": 2737777.0, "step": 2270 }, { "entropy": 1.8989380843937398, "epoch": 0.007067800511330568, "grad_norm": 13.440524101257324, "learning_rate": 1.1038457812651362e-06, "loss": 0.6491, "mean_token_accuracy": 0.806152519583702, "num_tokens": 2749458.0, "step": 2280 }, { "entropy": 1.9610642537474632, "epoch": 0.007098799636380263, "grad_norm": 17.207420349121094, "learning_rate": 1.1086893344957863e-06, "loss": 0.7137, "mean_token_accuracy": 0.7841380387544632, "num_tokens": 2760376.0, "step": 2290 }, { "entropy": 1.9233600437641143, "epoch": 0.007129798761429959, "grad_norm": 14.002983093261719, "learning_rate": 1.1135328877264362e-06, "loss": 0.6691, "mean_token_accuracy": 0.7973776832222939, "num_tokens": 2771954.0, "step": 2300 }, { "entropy": 1.9587767377495766, "epoch": 0.0071607978864796544, "grad_norm": 14.966231346130371, "learning_rate": 1.118376440957086e-06, "loss": 0.6645, "mean_token_accuracy": 0.7941723063588142, "num_tokens": 2784375.0, "step": 2310 }, { "entropy": 1.994602382183075, "epoch": 0.007191797011529349, "grad_norm": 15.076648712158203, "learning_rate": 1.1232199941877364e-06, "loss": 0.7767, "mean_token_accuracy": 0.7866624847054482, "num_tokens": 2795409.0, "step": 2320 }, { "entropy": 1.871568574011326, "epoch": 0.007222796136579045, "grad_norm": 15.472980499267578, "learning_rate": 1.1280635474183863e-06, "loss": 0.6379, "mean_token_accuracy": 0.7967082962393761, "num_tokens": 2807885.0, "step": 2330 }, { "entropy": 1.9470938667654991, "epoch": 0.007253795261628741, "grad_norm": 8.13938045501709, "learning_rate": 1.1329071006490362e-06, "loss": 0.6471, "mean_token_accuracy": 0.796507653594017, "num_tokens": 2820252.0, "step": 2340 }, { "entropy": 1.92869935631752, "epoch": 0.007284794386678436, "grad_norm": 15.049967765808105, "learning_rate": 1.1377506538796862e-06, "loss": 0.705, "mean_token_accuracy": 0.7862442404031753, "num_tokens": 2832038.0, "step": 2350 }, { "entropy": 1.914162775874138, "epoch": 0.007315793511728131, "grad_norm": 14.12210750579834, "learning_rate": 1.1425942071103361e-06, "loss": 0.6729, "mean_token_accuracy": 0.7895276993513107, "num_tokens": 2844886.0, "step": 2360 }, { "entropy": 1.967710091173649, "epoch": 0.007346792636777827, "grad_norm": 7.458662033081055, "learning_rate": 1.1474377603409862e-06, "loss": 0.7499, "mean_token_accuracy": 0.7771386727690697, "num_tokens": 2857547.0, "step": 2370 }, { "entropy": 1.998754534125328, "epoch": 0.007377791761827523, "grad_norm": 14.489635467529297, "learning_rate": 1.1522813135716363e-06, "loss": 0.7481, "mean_token_accuracy": 0.7853496834635735, "num_tokens": 2868673.0, "step": 2380 }, { "entropy": 1.9616554602980614, "epoch": 0.007408790886877218, "grad_norm": 12.88488483428955, "learning_rate": 1.1571248668022862e-06, "loss": 0.7012, "mean_token_accuracy": 0.7969075560569763, "num_tokens": 2880536.0, "step": 2390 }, { "entropy": 2.0036663502454757, "epoch": 0.007439790011926913, "grad_norm": 13.311254501342773, "learning_rate": 1.161968420032936e-06, "loss": 0.7523, "mean_token_accuracy": 0.7829051271080971, "num_tokens": 2891783.0, "step": 2400 }, { "entropy": 1.9757195591926575, "epoch": 0.007470789136976609, "grad_norm": 16.514055252075195, "learning_rate": 1.1668119732635864e-06, "loss": 0.716, "mean_token_accuracy": 0.7831418976187706, "num_tokens": 2903383.0, "step": 2410 }, { "entropy": 1.897585642337799, "epoch": 0.007501788262026304, "grad_norm": 14.471832275390625, "learning_rate": 1.1716555264942363e-06, "loss": 0.6846, "mean_token_accuracy": 0.7894420132040978, "num_tokens": 2915336.0, "step": 2420 }, { "entropy": 2.0040734350681304, "epoch": 0.007532787387076, "grad_norm": 15.47757339477539, "learning_rate": 1.1764990797248862e-06, "loss": 0.7554, "mean_token_accuracy": 0.7723087310791016, "num_tokens": 2926824.0, "step": 2430 }, { "entropy": 1.9852148294448853, "epoch": 0.0075637865121256954, "grad_norm": 14.812381744384766, "learning_rate": 1.1813426329555363e-06, "loss": 0.7039, "mean_token_accuracy": 0.7904437810182572, "num_tokens": 2938564.0, "step": 2440 }, { "entropy": 1.9800173118710518, "epoch": 0.00759478563717539, "grad_norm": 7.551329612731934, "learning_rate": 1.1861861861861864e-06, "loss": 0.7097, "mean_token_accuracy": 0.7884911954402923, "num_tokens": 2950233.0, "step": 2450 }, { "entropy": 1.9487542375922202, "epoch": 0.007625784762225086, "grad_norm": 15.525565147399902, "learning_rate": 1.1910297394168362e-06, "loss": 0.706, "mean_token_accuracy": 0.7928291246294975, "num_tokens": 2962418.0, "step": 2460 }, { "entropy": 1.9103297770023346, "epoch": 0.007656783887274782, "grad_norm": 16.958786010742188, "learning_rate": 1.1958732926474863e-06, "loss": 0.6459, "mean_token_accuracy": 0.7916588813066483, "num_tokens": 2974375.0, "step": 2470 }, { "entropy": 1.8603628262877465, "epoch": 0.0076877830123244775, "grad_norm": 15.165175437927246, "learning_rate": 1.2007168458781362e-06, "loss": 0.6034, "mean_token_accuracy": 0.8035720109939575, "num_tokens": 2986722.0, "step": 2480 }, { "entropy": 1.8906757444143296, "epoch": 0.007718782137374172, "grad_norm": 14.799652099609375, "learning_rate": 1.2055603991087863e-06, "loss": 0.721, "mean_token_accuracy": 0.7916418060660362, "num_tokens": 2998939.0, "step": 2490 }, { "entropy": 1.8013859555125236, "epoch": 0.007749781262423868, "grad_norm": 6.574148654937744, "learning_rate": 1.2104039523394364e-06, "loss": 0.591, "mean_token_accuracy": 0.8007106646895409, "num_tokens": 3011678.0, "step": 2500 }, { "entropy": 1.9229096472263336, "epoch": 0.007780780387473564, "grad_norm": 15.573455810546875, "learning_rate": 1.2152475055700863e-06, "loss": 0.6444, "mean_token_accuracy": 0.7954755112528801, "num_tokens": 3024579.0, "step": 2510 }, { "entropy": 1.938393659889698, "epoch": 0.007811779512523259, "grad_norm": 14.614435195922852, "learning_rate": 1.2200910588007362e-06, "loss": 0.7364, "mean_token_accuracy": 0.7778101339936256, "num_tokens": 3036102.0, "step": 2520 }, { "entropy": 2.013949643075466, "epoch": 0.007842778637572954, "grad_norm": 8.017887115478516, "learning_rate": 1.2249346120313865e-06, "loss": 0.7157, "mean_token_accuracy": 0.7902801647782326, "num_tokens": 3047786.0, "step": 2530 }, { "entropy": 1.9791011467576027, "epoch": 0.00787377776262265, "grad_norm": 13.628707885742188, "learning_rate": 1.2297781652620364e-06, "loss": 0.7443, "mean_token_accuracy": 0.7846587881445884, "num_tokens": 3059102.0, "step": 2540 }, { "entropy": 1.9235740274190902, "epoch": 0.007904776887672346, "grad_norm": 14.778630256652832, "learning_rate": 1.2346217184926863e-06, "loss": 0.6579, "mean_token_accuracy": 0.7987881228327751, "num_tokens": 3071029.0, "step": 2550 }, { "entropy": 1.873728483915329, "epoch": 0.00793577601272204, "grad_norm": 5.178359508514404, "learning_rate": 1.2394652717233364e-06, "loss": 0.6452, "mean_token_accuracy": 0.7905326500535012, "num_tokens": 3083691.0, "step": 2560 }, { "entropy": 1.9002502277493476, "epoch": 0.007966775137771736, "grad_norm": 6.418622016906738, "learning_rate": 1.2443088249539862e-06, "loss": 0.6569, "mean_token_accuracy": 0.7951910138130188, "num_tokens": 3096979.0, "step": 2570 }, { "entropy": 2.094506660103798, "epoch": 0.007997774262821432, "grad_norm": 14.790390968322754, "learning_rate": 1.2491523781846363e-06, "loss": 0.7683, "mean_token_accuracy": 0.7839270010590553, "num_tokens": 3107589.0, "step": 2580 }, { "entropy": 1.9785408988595008, "epoch": 0.008028773387871127, "grad_norm": 13.493193626403809, "learning_rate": 1.2539959314152864e-06, "loss": 0.6176, "mean_token_accuracy": 0.7813256174325943, "num_tokens": 3120357.0, "step": 2590 }, { "entropy": 1.8982400611042975, "epoch": 0.008059772512920824, "grad_norm": 8.22580623626709, "learning_rate": 1.2588394846459365e-06, "loss": 0.533, "mean_token_accuracy": 0.8012925937771798, "num_tokens": 3134146.0, "step": 2600 }, { "entropy": 1.9128680050373077, "epoch": 0.008090771637970519, "grad_norm": 12.298065185546875, "learning_rate": 1.2636830378765864e-06, "loss": 0.6659, "mean_token_accuracy": 0.7981240004301071, "num_tokens": 3146528.0, "step": 2610 }, { "entropy": 1.8557716280221939, "epoch": 0.008121770763020213, "grad_norm": 13.299572944641113, "learning_rate": 1.2685265911072365e-06, "loss": 0.5324, "mean_token_accuracy": 0.811582088470459, "num_tokens": 3160369.0, "step": 2620 }, { "entropy": 1.9083554148674011, "epoch": 0.00815276988806991, "grad_norm": 12.815226554870605, "learning_rate": 1.2733701443378862e-06, "loss": 0.7, "mean_token_accuracy": 0.7826020732522011, "num_tokens": 3171895.0, "step": 2630 }, { "entropy": 1.858922117948532, "epoch": 0.008183769013119605, "grad_norm": 7.371999740600586, "learning_rate": 1.2782136975685363e-06, "loss": 0.6308, "mean_token_accuracy": 0.7977426454424859, "num_tokens": 3184167.0, "step": 2640 }, { "entropy": 1.9080826759338378, "epoch": 0.0082147681381693, "grad_norm": 15.016209602355957, "learning_rate": 1.2830572507991866e-06, "loss": 0.6998, "mean_token_accuracy": 0.7945051088929176, "num_tokens": 3196310.0, "step": 2650 }, { "entropy": 2.000731149315834, "epoch": 0.008245767263218996, "grad_norm": 7.419742107391357, "learning_rate": 1.2879008040298362e-06, "loss": 0.7669, "mean_token_accuracy": 0.7797714114189148, "num_tokens": 3207853.0, "step": 2660 }, { "entropy": 1.9436663463711739, "epoch": 0.008276766388268691, "grad_norm": 14.453858375549316, "learning_rate": 1.2927443572604863e-06, "loss": 0.7125, "mean_token_accuracy": 0.7874180287122726, "num_tokens": 3219704.0, "step": 2670 }, { "entropy": 1.8370084151625634, "epoch": 0.008307765513318386, "grad_norm": 13.891809463500977, "learning_rate": 1.2975879104911364e-06, "loss": 0.6604, "mean_token_accuracy": 0.7957833841443062, "num_tokens": 3232139.0, "step": 2680 }, { "entropy": 1.8790874809026719, "epoch": 0.008338764638368083, "grad_norm": 6.238895893096924, "learning_rate": 1.3024314637217863e-06, "loss": 0.6469, "mean_token_accuracy": 0.8013461381196976, "num_tokens": 3244302.0, "step": 2690 }, { "entropy": 1.8268845871090889, "epoch": 0.008369763763417777, "grad_norm": 8.341919898986816, "learning_rate": 1.3072750169524364e-06, "loss": 0.5651, "mean_token_accuracy": 0.8057503595948219, "num_tokens": 3258280.0, "step": 2700 }, { "entropy": 1.9424984157085419, "epoch": 0.008400762888467472, "grad_norm": 14.885581970214844, "learning_rate": 1.3121185701830863e-06, "loss": 0.6694, "mean_token_accuracy": 0.7825981408357621, "num_tokens": 3270381.0, "step": 2710 }, { "entropy": 1.954266221821308, "epoch": 0.008431762013517169, "grad_norm": 14.373212814331055, "learning_rate": 1.3169621234137364e-06, "loss": 0.6892, "mean_token_accuracy": 0.7864295184612274, "num_tokens": 3282087.0, "step": 2720 }, { "entropy": 1.9162860542535782, "epoch": 0.008462761138566864, "grad_norm": 12.995490074157715, "learning_rate": 1.3218056766443865e-06, "loss": 0.7248, "mean_token_accuracy": 0.7878184035420418, "num_tokens": 3295059.0, "step": 2730 }, { "entropy": 1.8371574386954308, "epoch": 0.008493760263616559, "grad_norm": 14.610986709594727, "learning_rate": 1.3266492298750364e-06, "loss": 0.6114, "mean_token_accuracy": 0.7994052410125733, "num_tokens": 3307978.0, "step": 2740 }, { "entropy": 1.850772686302662, "epoch": 0.008524759388666255, "grad_norm": 14.59720516204834, "learning_rate": 1.3314927831056865e-06, "loss": 0.675, "mean_token_accuracy": 0.7939030557870865, "num_tokens": 3320392.0, "step": 2750 }, { "entropy": 1.8222134336829185, "epoch": 0.00855575851371595, "grad_norm": 5.238037109375, "learning_rate": 1.3363363363363366e-06, "loss": 0.5953, "mean_token_accuracy": 0.8046846255660057, "num_tokens": 3333208.0, "step": 2760 }, { "entropy": 1.9551292091608048, "epoch": 0.008586757638765647, "grad_norm": 16.98749351501465, "learning_rate": 1.3411798895669865e-06, "loss": 0.77, "mean_token_accuracy": 0.7961449101567268, "num_tokens": 3344004.0, "step": 2770 }, { "entropy": 1.9047151386737824, "epoch": 0.008617756763815342, "grad_norm": 15.543542861938477, "learning_rate": 1.3460234427976366e-06, "loss": 0.7398, "mean_token_accuracy": 0.7901824861764908, "num_tokens": 3354724.0, "step": 2780 }, { "entropy": 1.9245003148913384, "epoch": 0.008648755888865036, "grad_norm": 12.885390281677246, "learning_rate": 1.3508669960282864e-06, "loss": 0.7131, "mean_token_accuracy": 0.7908341690897942, "num_tokens": 3366242.0, "step": 2790 }, { "entropy": 1.873139852285385, "epoch": 0.008679755013914733, "grad_norm": 12.843483924865723, "learning_rate": 1.3557105492589365e-06, "loss": 0.6774, "mean_token_accuracy": 0.8066575720906257, "num_tokens": 3377837.0, "step": 2800 }, { "entropy": 1.8930615320801736, "epoch": 0.008710754138964428, "grad_norm": 10.335051536560059, "learning_rate": 1.3605541024895866e-06, "loss": 0.6729, "mean_token_accuracy": 0.789339256286621, "num_tokens": 3390578.0, "step": 2810 }, { "entropy": 1.9254612401127815, "epoch": 0.008741753264014123, "grad_norm": 13.433055877685547, "learning_rate": 1.3653976557202365e-06, "loss": 0.7126, "mean_token_accuracy": 0.7813933417201042, "num_tokens": 3401808.0, "step": 2820 }, { "entropy": 1.8774896636605263, "epoch": 0.00877275238906382, "grad_norm": 15.659320831298828, "learning_rate": 1.3702412089508866e-06, "loss": 0.6582, "mean_token_accuracy": 0.7960716530680656, "num_tokens": 3412922.0, "step": 2830 }, { "entropy": 1.8881728678941727, "epoch": 0.008803751514113514, "grad_norm": 14.669523239135742, "learning_rate": 1.3750847621815363e-06, "loss": 0.6416, "mean_token_accuracy": 0.8034508541226387, "num_tokens": 3425285.0, "step": 2840 }, { "entropy": 1.907693549990654, "epoch": 0.008834750639163209, "grad_norm": 17.293590545654297, "learning_rate": 1.3799283154121864e-06, "loss": 0.6792, "mean_token_accuracy": 0.797477675974369, "num_tokens": 3436697.0, "step": 2850 }, { "entropy": 1.9830518007278441, "epoch": 0.008865749764212906, "grad_norm": 15.576813697814941, "learning_rate": 1.3847718686428367e-06, "loss": 0.7526, "mean_token_accuracy": 0.779937069118023, "num_tokens": 3447898.0, "step": 2860 }, { "entropy": 2.013961046934128, "epoch": 0.0088967488892626, "grad_norm": 19.190113067626953, "learning_rate": 1.3896154218734864e-06, "loss": 0.7515, "mean_token_accuracy": 0.7880441144108772, "num_tokens": 3458894.0, "step": 2870 }, { "entropy": 1.9314459562301636, "epoch": 0.008927748014312295, "grad_norm": 12.630243301391602, "learning_rate": 1.3944589751041364e-06, "loss": 0.6666, "mean_token_accuracy": 0.7965318202972412, "num_tokens": 3471083.0, "step": 2880 }, { "entropy": 1.9051019206643105, "epoch": 0.008958747139361992, "grad_norm": 12.55810546875, "learning_rate": 1.3993025283347865e-06, "loss": 0.678, "mean_token_accuracy": 0.7915308341383934, "num_tokens": 3483322.0, "step": 2890 }, { "entropy": 1.889833214879036, "epoch": 0.008989746264411687, "grad_norm": 11.794564247131348, "learning_rate": 1.4041460815654364e-06, "loss": 0.5963, "mean_token_accuracy": 0.7985853642225266, "num_tokens": 3496188.0, "step": 2900 }, { "entropy": 1.8793418243527413, "epoch": 0.009020745389461382, "grad_norm": 6.941933631896973, "learning_rate": 1.4089896347960865e-06, "loss": 0.6065, "mean_token_accuracy": 0.8100902363657951, "num_tokens": 3508412.0, "step": 2910 }, { "entropy": 1.7804090306162834, "epoch": 0.009051744514511078, "grad_norm": 7.316125392913818, "learning_rate": 1.4138331880267364e-06, "loss": 0.5352, "mean_token_accuracy": 0.8099708408117294, "num_tokens": 3522455.0, "step": 2920 }, { "entropy": 1.9128621608018874, "epoch": 0.009082743639560773, "grad_norm": 17.679824829101562, "learning_rate": 1.4186767412573865e-06, "loss": 0.684, "mean_token_accuracy": 0.7950308054685593, "num_tokens": 3534077.0, "step": 2930 }, { "entropy": 1.951967991888523, "epoch": 0.009113742764610468, "grad_norm": 15.615214347839355, "learning_rate": 1.4235202944880366e-06, "loss": 0.6822, "mean_token_accuracy": 0.7895113542675972, "num_tokens": 3545717.0, "step": 2940 }, { "entropy": 1.993370993435383, "epoch": 0.009144741889660165, "grad_norm": 6.757071018218994, "learning_rate": 1.4283638477186865e-06, "loss": 0.7091, "mean_token_accuracy": 0.7836599707603454, "num_tokens": 3556645.0, "step": 2950 }, { "entropy": 2.0306035369634627, "epoch": 0.00917574101470986, "grad_norm": 14.662017822265625, "learning_rate": 1.4332074009493366e-06, "loss": 0.7833, "mean_token_accuracy": 0.7853764742612839, "num_tokens": 3568121.0, "step": 2960 }, { "entropy": 1.981213203072548, "epoch": 0.009206740139759556, "grad_norm": 15.54949951171875, "learning_rate": 1.4380509541799867e-06, "loss": 0.7714, "mean_token_accuracy": 0.7741693139076233, "num_tokens": 3578617.0, "step": 2970 }, { "entropy": 1.872342900186777, "epoch": 0.009237739264809251, "grad_norm": 17.143173217773438, "learning_rate": 1.4428945074106366e-06, "loss": 0.6521, "mean_token_accuracy": 0.7934080705046653, "num_tokens": 3590839.0, "step": 2980 }, { "entropy": 1.9597981229424477, "epoch": 0.009268738389858946, "grad_norm": 14.204910278320312, "learning_rate": 1.4477380606412867e-06, "loss": 0.7372, "mean_token_accuracy": 0.7796640574932099, "num_tokens": 3602197.0, "step": 2990 }, { "entropy": 1.9134353682398797, "epoch": 0.009299737514908642, "grad_norm": 15.377642631530762, "learning_rate": 1.4525816138719365e-06, "loss": 0.5693, "mean_token_accuracy": 0.80574119836092, "num_tokens": 3614912.0, "step": 3000 }, { "entropy": 1.978382122516632, "epoch": 0.009330736639958337, "grad_norm": 7.089384078979492, "learning_rate": 1.4574251671025866e-06, "loss": 0.7043, "mean_token_accuracy": 0.7872982352972031, "num_tokens": 3626738.0, "step": 3010 }, { "entropy": 1.9824548229575156, "epoch": 0.009361735765008032, "grad_norm": 7.4606781005859375, "learning_rate": 1.4622687203332367e-06, "loss": 0.7379, "mean_token_accuracy": 0.7830901652574539, "num_tokens": 3638714.0, "step": 3020 }, { "entropy": 1.8985080510377883, "epoch": 0.009392734890057729, "grad_norm": 13.994378089904785, "learning_rate": 1.4671122735638866e-06, "loss": 0.6141, "mean_token_accuracy": 0.8004487827420235, "num_tokens": 3651514.0, "step": 3030 }, { "entropy": 1.9109556749463081, "epoch": 0.009423734015107424, "grad_norm": 11.190482139587402, "learning_rate": 1.4719558267945367e-06, "loss": 0.6391, "mean_token_accuracy": 0.8045189529657364, "num_tokens": 3662697.0, "step": 3040 }, { "entropy": 1.8760196268558502, "epoch": 0.009454733140157118, "grad_norm": 12.307433128356934, "learning_rate": 1.4767993800251864e-06, "loss": 0.6314, "mean_token_accuracy": 0.8026507899165154, "num_tokens": 3674503.0, "step": 3050 }, { "entropy": 1.8742030158638954, "epoch": 0.009485732265206815, "grad_norm": 7.280045509338379, "learning_rate": 1.4816429332558365e-06, "loss": 0.6784, "mean_token_accuracy": 0.7885152325034142, "num_tokens": 3686269.0, "step": 3060 }, { "entropy": 1.7706302866339683, "epoch": 0.00951673139025651, "grad_norm": 16.125728607177734, "learning_rate": 1.4864864864864868e-06, "loss": 0.6367, "mean_token_accuracy": 0.8014728352427483, "num_tokens": 3699713.0, "step": 3070 }, { "entropy": 1.9473395302891732, "epoch": 0.009547730515306205, "grad_norm": 13.948101997375488, "learning_rate": 1.4913300397171365e-06, "loss": 0.7302, "mean_token_accuracy": 0.7855488672852516, "num_tokens": 3710852.0, "step": 3080 }, { "entropy": 1.9612311527132988, "epoch": 0.009578729640355901, "grad_norm": 13.97789192199707, "learning_rate": 1.4961735929477866e-06, "loss": 0.6629, "mean_token_accuracy": 0.7972382113337517, "num_tokens": 3722160.0, "step": 3090 }, { "entropy": 1.960909178853035, "epoch": 0.009609728765405596, "grad_norm": 17.231172561645508, "learning_rate": 1.5010171461784366e-06, "loss": 0.754, "mean_token_accuracy": 0.7888594791293144, "num_tokens": 3732729.0, "step": 3100 }, { "entropy": 1.8868347622454167, "epoch": 0.009640727890455291, "grad_norm": 15.940637588500977, "learning_rate": 1.5058606994090865e-06, "loss": 0.5887, "mean_token_accuracy": 0.8036533623933793, "num_tokens": 3746194.0, "step": 3110 }, { "entropy": 2.00607987344265, "epoch": 0.009671727015504988, "grad_norm": 16.22367286682129, "learning_rate": 1.5107042526397366e-06, "loss": 0.7286, "mean_token_accuracy": 0.7928143292665482, "num_tokens": 3756751.0, "step": 3120 }, { "entropy": 1.9029984012246133, "epoch": 0.009702726140554683, "grad_norm": 6.391202926635742, "learning_rate": 1.5155478058703865e-06, "loss": 0.6529, "mean_token_accuracy": 0.7976860404014587, "num_tokens": 3768774.0, "step": 3130 }, { "entropy": 1.900516900420189, "epoch": 0.009733725265604379, "grad_norm": 8.11662483215332, "learning_rate": 1.5203913591010366e-06, "loss": 0.6046, "mean_token_accuracy": 0.8091256007552147, "num_tokens": 3780820.0, "step": 3140 }, { "entropy": 1.9626210525631904, "epoch": 0.009764724390654074, "grad_norm": 12.520034790039062, "learning_rate": 1.5252349123316867e-06, "loss": 0.7038, "mean_token_accuracy": 0.7925601005554199, "num_tokens": 3792530.0, "step": 3150 }, { "entropy": 1.9671666517853736, "epoch": 0.009795723515703769, "grad_norm": 13.528938293457031, "learning_rate": 1.5300784655623366e-06, "loss": 0.6599, "mean_token_accuracy": 0.7918861374258995, "num_tokens": 3804625.0, "step": 3160 }, { "entropy": 2.005161625146866, "epoch": 0.009826722640753465, "grad_norm": 15.400350570678711, "learning_rate": 1.5349220187929867e-06, "loss": 0.7848, "mean_token_accuracy": 0.7739899069070816, "num_tokens": 3815444.0, "step": 3170 }, { "entropy": 1.9359246730804442, "epoch": 0.00985772176580316, "grad_norm": 12.780713081359863, "learning_rate": 1.5397655720236368e-06, "loss": 0.6371, "mean_token_accuracy": 0.7938413843512535, "num_tokens": 3827684.0, "step": 3180 }, { "entropy": 1.8368206307291985, "epoch": 0.009888720890852855, "grad_norm": 14.72050952911377, "learning_rate": 1.5446091252542867e-06, "loss": 0.6173, "mean_token_accuracy": 0.8116529732942581, "num_tokens": 3840934.0, "step": 3190 }, { "entropy": 1.9378165647387504, "epoch": 0.009919720015902552, "grad_norm": 14.93758487701416, "learning_rate": 1.5494526784849368e-06, "loss": 0.7211, "mean_token_accuracy": 0.7911474660038949, "num_tokens": 3852242.0, "step": 3200 }, { "entropy": 1.9542915537953376, "epoch": 0.009950719140952247, "grad_norm": 16.133121490478516, "learning_rate": 1.5542962317155866e-06, "loss": 0.7008, "mean_token_accuracy": 0.7842934697866439, "num_tokens": 3864015.0, "step": 3210 }, { "entropy": 1.9251963838934898, "epoch": 0.009981718266001941, "grad_norm": 15.101007461547852, "learning_rate": 1.5591397849462367e-06, "loss": 0.6018, "mean_token_accuracy": 0.8036449015140533, "num_tokens": 3876673.0, "step": 3220 }, { "entropy": 1.9787523686885833, "epoch": 0.010012717391051638, "grad_norm": 11.887308120727539, "learning_rate": 1.5639833381768868e-06, "loss": 0.7158, "mean_token_accuracy": 0.79226556122303, "num_tokens": 3888063.0, "step": 3230 }, { "entropy": 1.99595515280962, "epoch": 0.010043716516101333, "grad_norm": 12.0714693069458, "learning_rate": 1.5688268914075367e-06, "loss": 0.7203, "mean_token_accuracy": 0.7814503669738769, "num_tokens": 3899339.0, "step": 3240 }, { "entropy": 1.92583971619606, "epoch": 0.010074715641151028, "grad_norm": 14.31470775604248, "learning_rate": 1.5736704446381868e-06, "loss": 0.6961, "mean_token_accuracy": 0.7950286611914634, "num_tokens": 3911124.0, "step": 3250 }, { "entropy": 1.9035099178552628, "epoch": 0.010105714766200724, "grad_norm": 14.206976890563965, "learning_rate": 1.5785139978688365e-06, "loss": 0.6835, "mean_token_accuracy": 0.7935141950845719, "num_tokens": 3923110.0, "step": 3260 }, { "entropy": 1.8103605896234511, "epoch": 0.01013671389125042, "grad_norm": 11.845626831054688, "learning_rate": 1.5833575510994868e-06, "loss": 0.5957, "mean_token_accuracy": 0.8122652113437653, "num_tokens": 3936835.0, "step": 3270 }, { "entropy": 1.9789897188544274, "epoch": 0.010167713016300114, "grad_norm": 16.31435775756836, "learning_rate": 1.5882011043301369e-06, "loss": 0.6931, "mean_token_accuracy": 0.790991485118866, "num_tokens": 3948691.0, "step": 3280 }, { "entropy": 1.9635705202817917, "epoch": 0.01019871214134981, "grad_norm": 18.17036247253418, "learning_rate": 1.5930446575607866e-06, "loss": 0.7027, "mean_token_accuracy": 0.7976090654730796, "num_tokens": 3960588.0, "step": 3290 }, { "entropy": 1.8178722724318503, "epoch": 0.010229711266399506, "grad_norm": 4.42231559753418, "learning_rate": 1.5978882107914367e-06, "loss": 0.6036, "mean_token_accuracy": 0.8080889210104942, "num_tokens": 3974030.0, "step": 3300 }, { "entropy": 2.014702117443085, "epoch": 0.010260710391449202, "grad_norm": 13.163649559020996, "learning_rate": 1.6027317640220868e-06, "loss": 0.7217, "mean_token_accuracy": 0.7939134731888771, "num_tokens": 3984810.0, "step": 3310 }, { "entropy": 1.9950988233089446, "epoch": 0.010291709516498897, "grad_norm": 15.815617561340332, "learning_rate": 1.6075753172527366e-06, "loss": 0.7374, "mean_token_accuracy": 0.7863194987177848, "num_tokens": 3996191.0, "step": 3320 }, { "entropy": 1.9132106304168701, "epoch": 0.010322708641548592, "grad_norm": 15.737754821777344, "learning_rate": 1.6124188704833867e-06, "loss": 0.7396, "mean_token_accuracy": 0.7841628924012184, "num_tokens": 4007285.0, "step": 3330 }, { "entropy": 1.8114167541265487, "epoch": 0.010353707766598289, "grad_norm": 13.689631462097168, "learning_rate": 1.6172624237140366e-06, "loss": 0.5893, "mean_token_accuracy": 0.8101338252425194, "num_tokens": 4020382.0, "step": 3340 }, { "entropy": 1.8398823648691178, "epoch": 0.010384706891647983, "grad_norm": 14.286563873291016, "learning_rate": 1.6221059769446867e-06, "loss": 0.6389, "mean_token_accuracy": 0.8049512296915055, "num_tokens": 4032030.0, "step": 3350 }, { "entropy": 1.9326018333435058, "epoch": 0.010415706016697678, "grad_norm": 16.373798370361328, "learning_rate": 1.6269495301753368e-06, "loss": 0.7155, "mean_token_accuracy": 0.7966372415423393, "num_tokens": 4043118.0, "step": 3360 }, { "entropy": 1.905664649605751, "epoch": 0.010446705141747375, "grad_norm": 12.13680648803711, "learning_rate": 1.6317930834059867e-06, "loss": 0.7183, "mean_token_accuracy": 0.7907021954655647, "num_tokens": 4055085.0, "step": 3370 }, { "entropy": 1.7940703511238099, "epoch": 0.01047770426679707, "grad_norm": 6.879446983337402, "learning_rate": 1.6366366366366368e-06, "loss": 0.5276, "mean_token_accuracy": 0.8221948266029357, "num_tokens": 4067929.0, "step": 3380 }, { "entropy": 1.964170092344284, "epoch": 0.010508703391846765, "grad_norm": 17.012205123901367, "learning_rate": 1.6414801898672869e-06, "loss": 0.7205, "mean_token_accuracy": 0.7949927791953086, "num_tokens": 4079191.0, "step": 3390 }, { "entropy": 1.9278406336903573, "epoch": 0.010539702516896461, "grad_norm": 15.772573471069336, "learning_rate": 1.6463237430979368e-06, "loss": 0.6634, "mean_token_accuracy": 0.7951213136315346, "num_tokens": 4090571.0, "step": 3400 }, { "entropy": 1.894165250658989, "epoch": 0.010570701641946156, "grad_norm": 7.302316188812256, "learning_rate": 1.6511672963285869e-06, "loss": 0.6412, "mean_token_accuracy": 0.8126410812139511, "num_tokens": 4102478.0, "step": 3410 }, { "entropy": 1.8725802972912788, "epoch": 0.010601700766995851, "grad_norm": 12.873794555664062, "learning_rate": 1.6560108495592367e-06, "loss": 0.6578, "mean_token_accuracy": 0.808172582089901, "num_tokens": 4114349.0, "step": 3420 }, { "entropy": 1.9037815779447556, "epoch": 0.010632699892045547, "grad_norm": 6.517296314239502, "learning_rate": 1.6608544027898868e-06, "loss": 0.6275, "mean_token_accuracy": 0.8006631091237069, "num_tokens": 4126609.0, "step": 3430 }, { "entropy": 1.9055060788989067, "epoch": 0.010663699017095242, "grad_norm": 15.855356216430664, "learning_rate": 1.665697956020537e-06, "loss": 0.7089, "mean_token_accuracy": 0.7951321870088577, "num_tokens": 4138476.0, "step": 3440 }, { "entropy": 1.9123858138918877, "epoch": 0.010694698142144937, "grad_norm": 14.749032020568848, "learning_rate": 1.6705415092511868e-06, "loss": 0.6421, "mean_token_accuracy": 0.799940338730812, "num_tokens": 4150474.0, "step": 3450 }, { "entropy": 1.9116358309984207, "epoch": 0.010725697267194634, "grad_norm": 13.992262840270996, "learning_rate": 1.675385062481837e-06, "loss": 0.5853, "mean_token_accuracy": 0.8097973421216011, "num_tokens": 4162977.0, "step": 3460 }, { "entropy": 1.9737422183156013, "epoch": 0.010756696392244329, "grad_norm": 19.65887451171875, "learning_rate": 1.6802286157124866e-06, "loss": 0.7513, "mean_token_accuracy": 0.786674790084362, "num_tokens": 4174740.0, "step": 3470 }, { "entropy": 1.9084539324045182, "epoch": 0.010787695517294024, "grad_norm": 7.143436908721924, "learning_rate": 1.685072168943137e-06, "loss": 0.6752, "mean_token_accuracy": 0.8009995728731155, "num_tokens": 4187420.0, "step": 3480 }, { "entropy": 1.9033571183681488, "epoch": 0.01081869464234372, "grad_norm": 12.777674674987793, "learning_rate": 1.689915722173787e-06, "loss": 0.6786, "mean_token_accuracy": 0.8100510269403458, "num_tokens": 4198138.0, "step": 3490 }, { "entropy": 1.8302487596869468, "epoch": 0.010849693767393415, "grad_norm": 14.17345905303955, "learning_rate": 1.6947592754044367e-06, "loss": 0.6558, "mean_token_accuracy": 0.7974511593580246, "num_tokens": 4210639.0, "step": 3500 }, { "entropy": 1.8793094977736473, "epoch": 0.010880692892443112, "grad_norm": 15.137555122375488, "learning_rate": 1.6996028286350868e-06, "loss": 0.6217, "mean_token_accuracy": 0.8036959484219551, "num_tokens": 4222201.0, "step": 3510 }, { "entropy": 1.8393305256962775, "epoch": 0.010911692017492806, "grad_norm": 12.150157928466797, "learning_rate": 1.704446381865737e-06, "loss": 0.6265, "mean_token_accuracy": 0.8053354471921921, "num_tokens": 4234129.0, "step": 3520 }, { "entropy": 1.8129374995827674, "epoch": 0.010942691142542501, "grad_norm": 6.902397155761719, "learning_rate": 1.7092899350963867e-06, "loss": 0.5879, "mean_token_accuracy": 0.8074716746807098, "num_tokens": 4247360.0, "step": 3530 }, { "entropy": 1.847067180275917, "epoch": 0.010973690267592198, "grad_norm": 9.06917953491211, "learning_rate": 1.7141334883270368e-06, "loss": 0.6657, "mean_token_accuracy": 0.7977814689278603, "num_tokens": 4260605.0, "step": 3540 }, { "entropy": 1.935109367966652, "epoch": 0.011004689392641893, "grad_norm": 13.947689056396484, "learning_rate": 1.7189770415576867e-06, "loss": 0.6719, "mean_token_accuracy": 0.8019600152969361, "num_tokens": 4271987.0, "step": 3550 }, { "entropy": 1.8618975296616553, "epoch": 0.011035688517691588, "grad_norm": 7.604375839233398, "learning_rate": 1.7238205947883368e-06, "loss": 0.6746, "mean_token_accuracy": 0.7953737512230873, "num_tokens": 4283981.0, "step": 3560 }, { "entropy": 1.9873407423496245, "epoch": 0.011066687642741284, "grad_norm": 12.513638496398926, "learning_rate": 1.728664148018987e-06, "loss": 0.8001, "mean_token_accuracy": 0.7810007587075234, "num_tokens": 4295314.0, "step": 3570 }, { "entropy": 1.8661947965621948, "epoch": 0.011097686767790979, "grad_norm": 8.476912498474121, "learning_rate": 1.7335077012496368e-06, "loss": 0.6368, "mean_token_accuracy": 0.8108328223228455, "num_tokens": 4306919.0, "step": 3580 }, { "entropy": 1.9405100882053374, "epoch": 0.011128685892840674, "grad_norm": 11.374967575073242, "learning_rate": 1.7383512544802869e-06, "loss": 0.7597, "mean_token_accuracy": 0.7805906578898429, "num_tokens": 4318405.0, "step": 3590 }, { "entropy": 1.8536500945687293, "epoch": 0.01115968501789037, "grad_norm": 17.95821762084961, "learning_rate": 1.743194807710937e-06, "loss": 0.6538, "mean_token_accuracy": 0.7971635937690735, "num_tokens": 4330730.0, "step": 3600 }, { "entropy": 1.8885389596223832, "epoch": 0.011190684142940065, "grad_norm": 15.392064094543457, "learning_rate": 1.7480383609415869e-06, "loss": 0.6708, "mean_token_accuracy": 0.8034435287117958, "num_tokens": 4341732.0, "step": 3610 }, { "entropy": 1.9072374895215034, "epoch": 0.01122168326798976, "grad_norm": 15.804481506347656, "learning_rate": 1.752881914172237e-06, "loss": 0.7122, "mean_token_accuracy": 0.7888311371207237, "num_tokens": 4352944.0, "step": 3620 }, { "entropy": 1.8851828515529632, "epoch": 0.011252682393039457, "grad_norm": 14.782668113708496, "learning_rate": 1.7577254674028869e-06, "loss": 0.6998, "mean_token_accuracy": 0.7909768223762512, "num_tokens": 4363879.0, "step": 3630 }, { "entropy": 1.8380015999078751, "epoch": 0.011283681518089152, "grad_norm": 14.773683547973633, "learning_rate": 1.762569020633537e-06, "loss": 0.6264, "mean_token_accuracy": 0.8070567846298218, "num_tokens": 4375759.0, "step": 3640 }, { "entropy": 1.8058712944388389, "epoch": 0.011314680643138847, "grad_norm": 13.945756912231445, "learning_rate": 1.767412573864187e-06, "loss": 0.5706, "mean_token_accuracy": 0.8059628069400787, "num_tokens": 4389341.0, "step": 3650 }, { "entropy": 1.8813235729932785, "epoch": 0.011345679768188543, "grad_norm": 12.711499214172363, "learning_rate": 1.772256127094837e-06, "loss": 0.6448, "mean_token_accuracy": 0.8153657332062721, "num_tokens": 4400801.0, "step": 3660 }, { "entropy": 1.849845513701439, "epoch": 0.011376678893238238, "grad_norm": 15.230743408203125, "learning_rate": 1.777099680325487e-06, "loss": 0.5924, "mean_token_accuracy": 0.8115788295865058, "num_tokens": 4413458.0, "step": 3670 }, { "entropy": 1.9601149141788483, "epoch": 0.011407678018287935, "grad_norm": 13.785551071166992, "learning_rate": 1.7819432335561367e-06, "loss": 0.7572, "mean_token_accuracy": 0.7834953621029854, "num_tokens": 4424999.0, "step": 3680 }, { "entropy": 1.8368350595235825, "epoch": 0.01143867714333763, "grad_norm": 7.308245658874512, "learning_rate": 1.786786786786787e-06, "loss": 0.5917, "mean_token_accuracy": 0.8099960327148438, "num_tokens": 4436854.0, "step": 3690 }, { "entropy": 1.8301602736115457, "epoch": 0.011469676268387324, "grad_norm": 13.396066665649414, "learning_rate": 1.791630340017437e-06, "loss": 0.61, "mean_token_accuracy": 0.8089475125074387, "num_tokens": 4449388.0, "step": 3700 }, { "entropy": 1.9232326075434685, "epoch": 0.011500675393437021, "grad_norm": 14.820940971374512, "learning_rate": 1.7964738932480868e-06, "loss": 0.7029, "mean_token_accuracy": 0.7972525000572205, "num_tokens": 4460601.0, "step": 3710 }, { "entropy": 1.9275172814726829, "epoch": 0.011531674518486716, "grad_norm": 13.760834693908691, "learning_rate": 1.8013174464787369e-06, "loss": 0.67, "mean_token_accuracy": 0.7982899993658066, "num_tokens": 4472147.0, "step": 3720 }, { "entropy": 1.841733305156231, "epoch": 0.01156267364353641, "grad_norm": 4.695168495178223, "learning_rate": 1.8061609997093872e-06, "loss": 0.604, "mean_token_accuracy": 0.8182388171553612, "num_tokens": 4484477.0, "step": 3730 }, { "entropy": 1.8292774349451064, "epoch": 0.011593672768586107, "grad_norm": 5.965690612792969, "learning_rate": 1.8110045529400368e-06, "loss": 0.5746, "mean_token_accuracy": 0.8187378898262978, "num_tokens": 4497212.0, "step": 3740 }, { "entropy": 1.8096903830766677, "epoch": 0.011624671893635802, "grad_norm": 6.74788761138916, "learning_rate": 1.815848106170687e-06, "loss": 0.5768, "mean_token_accuracy": 0.8042061701416969, "num_tokens": 4509570.0, "step": 3750 }, { "entropy": 1.8733750000596046, "epoch": 0.011655671018685497, "grad_norm": 15.247547149658203, "learning_rate": 1.8206916594013368e-06, "loss": 0.6363, "mean_token_accuracy": 0.7944400921463967, "num_tokens": 4521995.0, "step": 3760 }, { "entropy": 1.9379871144890786, "epoch": 0.011686670143735194, "grad_norm": 5.622687816619873, "learning_rate": 1.825535212631987e-06, "loss": 0.6895, "mean_token_accuracy": 0.7910193100571632, "num_tokens": 4533546.0, "step": 3770 }, { "entropy": 1.8144460648298264, "epoch": 0.011717669268784888, "grad_norm": 7.539439678192139, "learning_rate": 1.830378765862637e-06, "loss": 0.6076, "mean_token_accuracy": 0.8145319998264313, "num_tokens": 4546173.0, "step": 3780 }, { "entropy": 1.914628729224205, "epoch": 0.011748668393834583, "grad_norm": 15.919760704040527, "learning_rate": 1.835222319093287e-06, "loss": 0.6686, "mean_token_accuracy": 0.8031175434589386, "num_tokens": 4557017.0, "step": 3790 }, { "entropy": 1.873679694533348, "epoch": 0.01177966751888428, "grad_norm": 15.300971031188965, "learning_rate": 1.840065872323937e-06, "loss": 0.6355, "mean_token_accuracy": 0.7994208186864853, "num_tokens": 4569294.0, "step": 3800 }, { "entropy": 1.9676134511828423, "epoch": 0.011810666643933975, "grad_norm": 14.174129486083984, "learning_rate": 1.844909425554587e-06, "loss": 0.709, "mean_token_accuracy": 0.7975045815110207, "num_tokens": 4580270.0, "step": 3810 }, { "entropy": 1.7934204161167144, "epoch": 0.01184166576898367, "grad_norm": 14.214337348937988, "learning_rate": 1.849752978785237e-06, "loss": 0.5367, "mean_token_accuracy": 0.8222482308745385, "num_tokens": 4593333.0, "step": 3820 }, { "entropy": 1.880036075413227, "epoch": 0.011872664894033366, "grad_norm": 13.970633506774902, "learning_rate": 1.854596532015887e-06, "loss": 0.6634, "mean_token_accuracy": 0.7958371922373771, "num_tokens": 4605903.0, "step": 3830 }, { "entropy": 1.8512480556964874, "epoch": 0.011903664019083061, "grad_norm": 11.63100814819336, "learning_rate": 1.859440085246537e-06, "loss": 0.5904, "mean_token_accuracy": 0.8205496445298195, "num_tokens": 4618539.0, "step": 3840 }, { "entropy": 1.9032649219036102, "epoch": 0.011934663144132758, "grad_norm": 14.786650657653809, "learning_rate": 1.864283638477187e-06, "loss": 0.6271, "mean_token_accuracy": 0.7999899938702584, "num_tokens": 4630679.0, "step": 3850 }, { "entropy": 1.8507294937968255, "epoch": 0.011965662269182453, "grad_norm": 15.505373001098633, "learning_rate": 1.8691271917078371e-06, "loss": 0.5308, "mean_token_accuracy": 0.8145019263029099, "num_tokens": 4644442.0, "step": 3860 }, { "entropy": 2.043610119819641, "epoch": 0.011996661394232147, "grad_norm": 17.056150436401367, "learning_rate": 1.873970744938487e-06, "loss": 0.719, "mean_token_accuracy": 0.7810810253024101, "num_tokens": 4654902.0, "step": 3870 }, { "entropy": 1.9894573852419852, "epoch": 0.012027660519281844, "grad_norm": 14.494179725646973, "learning_rate": 1.8788142981691371e-06, "loss": 0.6273, "mean_token_accuracy": 0.8132184654474258, "num_tokens": 4665669.0, "step": 3880 }, { "entropy": 2.0583216458559037, "epoch": 0.012058659644331539, "grad_norm": 12.334035873413086, "learning_rate": 1.883657851399787e-06, "loss": 0.7878, "mean_token_accuracy": 0.7764063253998756, "num_tokens": 4676308.0, "step": 3890 }, { "entropy": 1.8608665503561497, "epoch": 0.012089658769381234, "grad_norm": 6.846446514129639, "learning_rate": 1.8885014046304371e-06, "loss": 0.6183, "mean_token_accuracy": 0.8085096433758736, "num_tokens": 4689421.0, "step": 3900 }, { "entropy": 1.957814833521843, "epoch": 0.01212065789443093, "grad_norm": 12.504427909851074, "learning_rate": 1.8933449578610872e-06, "loss": 0.6852, "mean_token_accuracy": 0.8001207739114762, "num_tokens": 4701492.0, "step": 3910 }, { "entropy": 1.8347266063094139, "epoch": 0.012151657019480625, "grad_norm": 15.001429557800293, "learning_rate": 1.8981885110917369e-06, "loss": 0.5869, "mean_token_accuracy": 0.81041249781847, "num_tokens": 4714252.0, "step": 3920 }, { "entropy": 1.9000362247228622, "epoch": 0.01218265614453032, "grad_norm": 15.871277809143066, "learning_rate": 1.903032064322387e-06, "loss": 0.6574, "mean_token_accuracy": 0.8029514774680138, "num_tokens": 4725743.0, "step": 3930 }, { "entropy": 1.9085357084870338, "epoch": 0.012213655269580017, "grad_norm": 11.861626625061035, "learning_rate": 1.907875617553037e-06, "loss": 0.6232, "mean_token_accuracy": 0.8074530705809593, "num_tokens": 4737602.0, "step": 3940 }, { "entropy": 1.8550396144390107, "epoch": 0.012244654394629712, "grad_norm": 8.315388679504395, "learning_rate": 1.912719170783687e-06, "loss": 0.6392, "mean_token_accuracy": 0.7964627891778946, "num_tokens": 4749843.0, "step": 3950 }, { "entropy": 1.8868352055549622, "epoch": 0.012275653519679406, "grad_norm": 6.1222028732299805, "learning_rate": 1.9175627240143373e-06, "loss": 0.6339, "mean_token_accuracy": 0.8049155220389366, "num_tokens": 4762341.0, "step": 3960 }, { "entropy": 1.9090922087430955, "epoch": 0.012306652644729103, "grad_norm": 13.033318519592285, "learning_rate": 1.922406277244987e-06, "loss": 0.7009, "mean_token_accuracy": 0.7983160421252251, "num_tokens": 4774156.0, "step": 3970 }, { "entropy": 1.8667214959859848, "epoch": 0.012337651769778798, "grad_norm": 13.20643424987793, "learning_rate": 1.927249830475637e-06, "loss": 0.6541, "mean_token_accuracy": 0.7958778321743012, "num_tokens": 4786644.0, "step": 3980 }, { "entropy": 1.8116755485534668, "epoch": 0.012368650894828493, "grad_norm": 4.642879962921143, "learning_rate": 1.9320933837062873e-06, "loss": 0.5928, "mean_token_accuracy": 0.8170793354511261, "num_tokens": 4799594.0, "step": 3990 }, { "entropy": 1.7941434875130653, "epoch": 0.01239965001987819, "grad_norm": 13.760114669799805, "learning_rate": 1.9369369369369372e-06, "loss": 0.5889, "mean_token_accuracy": 0.8124065786600113, "num_tokens": 4811994.0, "step": 4000 }, { "entropy": 1.9247236236929894, "epoch": 0.012430649144927884, "grad_norm": 13.147157669067383, "learning_rate": 1.941780490167587e-06, "loss": 0.7134, "mean_token_accuracy": 0.7897293627262115, "num_tokens": 4823158.0, "step": 4010 }, { "entropy": 1.853769588470459, "epoch": 0.01246164826997758, "grad_norm": 11.97884750366211, "learning_rate": 1.9466240433982374e-06, "loss": 0.6361, "mean_token_accuracy": 0.8066916093230247, "num_tokens": 4834679.0, "step": 4020 }, { "entropy": 1.8805758222937583, "epoch": 0.012492647395027276, "grad_norm": 13.70005989074707, "learning_rate": 1.951467596628887e-06, "loss": 0.684, "mean_token_accuracy": 0.7904590263962745, "num_tokens": 4846234.0, "step": 4030 }, { "entropy": 1.9497404143214225, "epoch": 0.01252364652007697, "grad_norm": 5.448077201843262, "learning_rate": 1.956311149859537e-06, "loss": 0.7173, "mean_token_accuracy": 0.7891417220234871, "num_tokens": 4857890.0, "step": 4040 }, { "entropy": 1.8685417965054512, "epoch": 0.012554645645126667, "grad_norm": 13.170268058776855, "learning_rate": 1.961154703090187e-06, "loss": 0.6227, "mean_token_accuracy": 0.8130503103137017, "num_tokens": 4869779.0, "step": 4050 }, { "entropy": 1.91714718490839, "epoch": 0.012585644770176362, "grad_norm": 13.95510196685791, "learning_rate": 1.965998256320837e-06, "loss": 0.6498, "mean_token_accuracy": 0.7988581836223603, "num_tokens": 4881872.0, "step": 4060 }, { "entropy": 1.8488874793052674, "epoch": 0.012616643895226057, "grad_norm": 15.533134460449219, "learning_rate": 1.9708418095514873e-06, "loss": 0.6223, "mean_token_accuracy": 0.8086504280567169, "num_tokens": 4893930.0, "step": 4070 }, { "entropy": 1.827307391166687, "epoch": 0.012647643020275753, "grad_norm": 14.92998218536377, "learning_rate": 1.975685362782137e-06, "loss": 0.6516, "mean_token_accuracy": 0.7971120476722717, "num_tokens": 4905551.0, "step": 4080 }, { "entropy": 1.8286997452378273, "epoch": 0.012678642145325448, "grad_norm": 14.836496353149414, "learning_rate": 1.980528916012787e-06, "loss": 0.6449, "mean_token_accuracy": 0.7994356840848923, "num_tokens": 4917025.0, "step": 4090 }, { "entropy": 1.8299384236335754, "epoch": 0.012709641270375143, "grad_norm": 13.094915390014648, "learning_rate": 1.985372469243437e-06, "loss": 0.5917, "mean_token_accuracy": 0.8010865300893784, "num_tokens": 4930128.0, "step": 4100 }, { "entropy": 1.895867148041725, "epoch": 0.01274064039542484, "grad_norm": 13.526707649230957, "learning_rate": 1.990216022474087e-06, "loss": 0.7135, "mean_token_accuracy": 0.7984941571950912, "num_tokens": 4941517.0, "step": 4110 }, { "entropy": 1.89191782027483, "epoch": 0.012771639520474535, "grad_norm": 14.294553756713867, "learning_rate": 1.995059575704737e-06, "loss": 0.6931, "mean_token_accuracy": 0.7926407441496849, "num_tokens": 4952810.0, "step": 4120 }, { "entropy": 1.8057468429207801, "epoch": 0.01280263864552423, "grad_norm": 12.936646461486816, "learning_rate": 1.999903128935387e-06, "loss": 0.6215, "mean_token_accuracy": 0.806446696817875, "num_tokens": 4965259.0, "step": 4130 }, { "entropy": 1.8559681817889213, "epoch": 0.012833637770573926, "grad_norm": 14.09201431274414, "learning_rate": 2.0047466821660373e-06, "loss": 0.6436, "mean_token_accuracy": 0.8033310145139694, "num_tokens": 4976636.0, "step": 4140 }, { "entropy": 1.754169662296772, "epoch": 0.012864636895623621, "grad_norm": 6.45542573928833, "learning_rate": 2.009590235396687e-06, "loss": 0.5474, "mean_token_accuracy": 0.8193281710147857, "num_tokens": 4989310.0, "step": 4150 }, { "entropy": 1.755221924185753, "epoch": 0.012895636020673316, "grad_norm": 6.969062328338623, "learning_rate": 2.014433788627337e-06, "loss": 0.5726, "mean_token_accuracy": 0.8212425723671913, "num_tokens": 5001740.0, "step": 4160 }, { "entropy": 1.923591212928295, "epoch": 0.012926635145723012, "grad_norm": 16.79177474975586, "learning_rate": 2.0192773418579874e-06, "loss": 0.7115, "mean_token_accuracy": 0.7964399144053459, "num_tokens": 5012837.0, "step": 4170 }, { "entropy": 1.789121389389038, "epoch": 0.012957634270772707, "grad_norm": 6.560935974121094, "learning_rate": 2.0241208950886372e-06, "loss": 0.5094, "mean_token_accuracy": 0.8209047600626945, "num_tokens": 5026059.0, "step": 4180 }, { "entropy": 1.8681050762534142, "epoch": 0.012988633395822402, "grad_norm": 13.890046119689941, "learning_rate": 2.028964448319287e-06, "loss": 0.614, "mean_token_accuracy": 0.7979257851839066, "num_tokens": 5038464.0, "step": 4190 }, { "entropy": 1.844995491206646, "epoch": 0.013019632520872099, "grad_norm": 13.706587791442871, "learning_rate": 2.0338080015499374e-06, "loss": 0.655, "mean_token_accuracy": 0.8055862948298455, "num_tokens": 5051036.0, "step": 4200 }, { "entropy": 1.8614793449640274, "epoch": 0.013050631645921794, "grad_norm": 14.581295013427734, "learning_rate": 2.0386515547805873e-06, "loss": 0.6409, "mean_token_accuracy": 0.8066158786416053, "num_tokens": 5063162.0, "step": 4210 }, { "entropy": 1.8593030095100402, "epoch": 0.01308163077097149, "grad_norm": 13.553071022033691, "learning_rate": 2.043495108011237e-06, "loss": 0.5952, "mean_token_accuracy": 0.8146178603172303, "num_tokens": 5075545.0, "step": 4220 }, { "entropy": 1.8848890259861946, "epoch": 0.013112629896021185, "grad_norm": 17.947132110595703, "learning_rate": 2.0483386612418875e-06, "loss": 0.6514, "mean_token_accuracy": 0.7945050925016404, "num_tokens": 5087372.0, "step": 4230 }, { "entropy": 1.880325546860695, "epoch": 0.01314362902107088, "grad_norm": 15.84373950958252, "learning_rate": 2.053182214472537e-06, "loss": 0.6546, "mean_token_accuracy": 0.8056602075695991, "num_tokens": 5098820.0, "step": 4240 }, { "entropy": 1.9198369443416596, "epoch": 0.013174628146120576, "grad_norm": 14.227551460266113, "learning_rate": 2.0580257677031873e-06, "loss": 0.6509, "mean_token_accuracy": 0.8100201249122619, "num_tokens": 5109708.0, "step": 4250 }, { "entropy": 1.95897875726223, "epoch": 0.013205627271170271, "grad_norm": 15.566633224487305, "learning_rate": 2.062869320933837e-06, "loss": 0.6949, "mean_token_accuracy": 0.7956610158085823, "num_tokens": 5121454.0, "step": 4260 }, { "entropy": 1.8369690001010894, "epoch": 0.013236626396219966, "grad_norm": 16.143478393554688, "learning_rate": 2.067712874164487e-06, "loss": 0.5837, "mean_token_accuracy": 0.8159258916974068, "num_tokens": 5134141.0, "step": 4270 }, { "entropy": 1.912563818693161, "epoch": 0.013267625521269663, "grad_norm": 15.098668098449707, "learning_rate": 2.0725564273951374e-06, "loss": 0.6755, "mean_token_accuracy": 0.7963416025042533, "num_tokens": 5146353.0, "step": 4280 }, { "entropy": 1.9880844503641129, "epoch": 0.013298624646319358, "grad_norm": 14.18931770324707, "learning_rate": 2.0773999806257872e-06, "loss": 0.7678, "mean_token_accuracy": 0.7802196651697159, "num_tokens": 5157703.0, "step": 4290 }, { "entropy": 1.8342043563723565, "epoch": 0.013329623771369052, "grad_norm": 6.213775157928467, "learning_rate": 2.082243533856437e-06, "loss": 0.5523, "mean_token_accuracy": 0.8177143961191178, "num_tokens": 5170695.0, "step": 4300 }, { "entropy": 1.8414741292595864, "epoch": 0.013360622896418749, "grad_norm": 12.719599723815918, "learning_rate": 2.087087087087087e-06, "loss": 0.5698, "mean_token_accuracy": 0.8134493753314018, "num_tokens": 5183018.0, "step": 4310 }, { "entropy": 1.8449064493179321, "epoch": 0.013391622021468444, "grad_norm": 12.744540214538574, "learning_rate": 2.0919306403177373e-06, "loss": 0.6141, "mean_token_accuracy": 0.7993644192814827, "num_tokens": 5195549.0, "step": 4320 }, { "entropy": 1.9135398477315904, "epoch": 0.013422621146518139, "grad_norm": 15.205565452575684, "learning_rate": 2.096774193548387e-06, "loss": 0.6636, "mean_token_accuracy": 0.8040877833962441, "num_tokens": 5207595.0, "step": 4330 }, { "entropy": 1.892987634241581, "epoch": 0.013453620271567835, "grad_norm": 14.970274925231934, "learning_rate": 2.101617746779037e-06, "loss": 0.6144, "mean_token_accuracy": 0.8050429001450539, "num_tokens": 5220012.0, "step": 4340 }, { "entropy": 1.933681371808052, "epoch": 0.01348461939661753, "grad_norm": 16.078292846679688, "learning_rate": 2.1064613000096874e-06, "loss": 0.6802, "mean_token_accuracy": 0.7953403264284133, "num_tokens": 5232144.0, "step": 4350 }, { "entropy": 1.8837040960788727, "epoch": 0.013515618521667225, "grad_norm": 12.591126441955566, "learning_rate": 2.1113048532403373e-06, "loss": 0.6592, "mean_token_accuracy": 0.8096277773380279, "num_tokens": 5244523.0, "step": 4360 }, { "entropy": 1.8900502383708955, "epoch": 0.013546617646716922, "grad_norm": 7.1288604736328125, "learning_rate": 2.116148406470987e-06, "loss": 0.6995, "mean_token_accuracy": 0.797728767991066, "num_tokens": 5255816.0, "step": 4370 }, { "entropy": 1.8959971249103547, "epoch": 0.013577616771766617, "grad_norm": 8.134716987609863, "learning_rate": 2.1209919597016375e-06, "loss": 0.6631, "mean_token_accuracy": 0.7931605547666549, "num_tokens": 5268217.0, "step": 4380 }, { "entropy": 1.8685301005840302, "epoch": 0.013608615896816313, "grad_norm": 14.050363540649414, "learning_rate": 2.1258355129322874e-06, "loss": 0.5861, "mean_token_accuracy": 0.7998355850577354, "num_tokens": 5280627.0, "step": 4390 }, { "entropy": 1.9573094069957733, "epoch": 0.013639615021866008, "grad_norm": 12.361193656921387, "learning_rate": 2.1306790661629372e-06, "loss": 0.6558, "mean_token_accuracy": 0.797396968305111, "num_tokens": 5291443.0, "step": 4400 }, { "entropy": 1.8597141653299332, "epoch": 0.013670614146915703, "grad_norm": 16.280147552490234, "learning_rate": 2.1355226193935875e-06, "loss": 0.6441, "mean_token_accuracy": 0.7984641641378403, "num_tokens": 5303414.0, "step": 4410 }, { "entropy": 1.8121001735329627, "epoch": 0.0137016132719654, "grad_norm": 19.821773529052734, "learning_rate": 2.1403661726242374e-06, "loss": 0.5859, "mean_token_accuracy": 0.8085174813866616, "num_tokens": 5316024.0, "step": 4420 }, { "entropy": 1.8680142611265182, "epoch": 0.013732612397015094, "grad_norm": 14.235099792480469, "learning_rate": 2.1452097258548873e-06, "loss": 0.6586, "mean_token_accuracy": 0.7996479585766793, "num_tokens": 5327955.0, "step": 4430 }, { "entropy": 1.8850964441895486, "epoch": 0.01376361152206479, "grad_norm": 12.817394256591797, "learning_rate": 2.1500532790855376e-06, "loss": 0.6506, "mean_token_accuracy": 0.7959688842296601, "num_tokens": 5340123.0, "step": 4440 }, { "entropy": 1.9236231684684753, "epoch": 0.013794610647114486, "grad_norm": 13.93416976928711, "learning_rate": 2.1548968323161875e-06, "loss": 0.6866, "mean_token_accuracy": 0.8000624299049377, "num_tokens": 5351902.0, "step": 4450 }, { "entropy": 1.8246183797717095, "epoch": 0.01382560977216418, "grad_norm": 6.33852481842041, "learning_rate": 2.1597403855468374e-06, "loss": 0.5937, "mean_token_accuracy": 0.8101627513766289, "num_tokens": 5364681.0, "step": 4460 }, { "entropy": 1.896468523144722, "epoch": 0.013856608897213876, "grad_norm": 15.372577667236328, "learning_rate": 2.1645839387774873e-06, "loss": 0.6683, "mean_token_accuracy": 0.8046876505017281, "num_tokens": 5376647.0, "step": 4470 }, { "entropy": 1.8945748567581178, "epoch": 0.013887608022263572, "grad_norm": 14.416013717651367, "learning_rate": 2.169427492008137e-06, "loss": 0.6216, "mean_token_accuracy": 0.8085326358675957, "num_tokens": 5388517.0, "step": 4480 }, { "entropy": 1.9351640045642853, "epoch": 0.013918607147313267, "grad_norm": 11.383118629455566, "learning_rate": 2.1742710452387875e-06, "loss": 0.7617, "mean_token_accuracy": 0.7835462704300881, "num_tokens": 5399458.0, "step": 4490 }, { "entropy": 1.873738704621792, "epoch": 0.013949606272362962, "grad_norm": 14.66506576538086, "learning_rate": 2.1791145984694373e-06, "loss": 0.6014, "mean_token_accuracy": 0.8175080880522728, "num_tokens": 5411219.0, "step": 4500 }, { "entropy": 1.9110942378640174, "epoch": 0.013980605397412658, "grad_norm": 7.845452308654785, "learning_rate": 2.1839581517000872e-06, "loss": 0.6462, "mean_token_accuracy": 0.799024523794651, "num_tokens": 5423441.0, "step": 4510 }, { "entropy": 1.8728701308369637, "epoch": 0.014011604522462353, "grad_norm": 14.299894332885742, "learning_rate": 2.188801704930737e-06, "loss": 0.6333, "mean_token_accuracy": 0.7987611889839172, "num_tokens": 5435798.0, "step": 4520 }, { "entropy": 1.9454208359122276, "epoch": 0.014042603647512048, "grad_norm": 14.531904220581055, "learning_rate": 2.1936452581613874e-06, "loss": 0.6534, "mean_token_accuracy": 0.8002553582191467, "num_tokens": 5447898.0, "step": 4530 }, { "entropy": 1.93377585709095, "epoch": 0.014073602772561745, "grad_norm": 12.258691787719727, "learning_rate": 2.1984888113920373e-06, "loss": 0.6732, "mean_token_accuracy": 0.7952050551772117, "num_tokens": 5459752.0, "step": 4540 }, { "entropy": 1.9374273508787154, "epoch": 0.01410460189761144, "grad_norm": 6.370416641235352, "learning_rate": 2.203332364622687e-06, "loss": 0.6438, "mean_token_accuracy": 0.8051278084516525, "num_tokens": 5470921.0, "step": 4550 }, { "entropy": 1.9412765011191369, "epoch": 0.014135601022661136, "grad_norm": 6.380893230438232, "learning_rate": 2.2081759178533375e-06, "loss": 0.6152, "mean_token_accuracy": 0.811233189702034, "num_tokens": 5482832.0, "step": 4560 }, { "entropy": 1.954646387696266, "epoch": 0.014166600147710831, "grad_norm": 11.699299812316895, "learning_rate": 2.2130194710839874e-06, "loss": 0.6937, "mean_token_accuracy": 0.7911556661128998, "num_tokens": 5494419.0, "step": 4570 }, { "entropy": 1.8840202033519744, "epoch": 0.014197599272760526, "grad_norm": 14.447190284729004, "learning_rate": 2.2178630243146373e-06, "loss": 0.5864, "mean_token_accuracy": 0.8123130038380623, "num_tokens": 5507775.0, "step": 4580 }, { "entropy": 1.9366067603230477, "epoch": 0.014228598397810223, "grad_norm": 6.459699630737305, "learning_rate": 2.2227065775452876e-06, "loss": 0.6687, "mean_token_accuracy": 0.7912245571613312, "num_tokens": 5519641.0, "step": 4590 }, { "entropy": 1.8860922396183013, "epoch": 0.014259597522859917, "grad_norm": 7.023352146148682, "learning_rate": 2.2275501307759375e-06, "loss": 0.5427, "mean_token_accuracy": 0.8207052007317543, "num_tokens": 5532518.0, "step": 4600 }, { "entropy": 1.8792527481913566, "epoch": 0.014290596647909612, "grad_norm": 14.053731918334961, "learning_rate": 2.2323936840065873e-06, "loss": 0.6267, "mean_token_accuracy": 0.8102740809321404, "num_tokens": 5545434.0, "step": 4610 }, { "entropy": 1.8577488988637925, "epoch": 0.014321595772959309, "grad_norm": 13.818965911865234, "learning_rate": 2.2372372372372376e-06, "loss": 0.607, "mean_token_accuracy": 0.8134890556335449, "num_tokens": 5557659.0, "step": 4620 }, { "entropy": 1.8798778101801872, "epoch": 0.014352594898009004, "grad_norm": 15.251192092895508, "learning_rate": 2.2420807904678875e-06, "loss": 0.7015, "mean_token_accuracy": 0.8011739581823349, "num_tokens": 5569313.0, "step": 4630 }, { "entropy": 1.9401899367570876, "epoch": 0.014383594023058699, "grad_norm": 14.222334861755371, "learning_rate": 2.2469243436985374e-06, "loss": 0.6341, "mean_token_accuracy": 0.8067525029182434, "num_tokens": 5580286.0, "step": 4640 }, { "entropy": 1.791366559267044, "epoch": 0.014414593148108395, "grad_norm": 15.209399223327637, "learning_rate": 2.2517678969291877e-06, "loss": 0.621, "mean_token_accuracy": 0.8049216285347939, "num_tokens": 5593090.0, "step": 4650 }, { "entropy": 1.8601211935281754, "epoch": 0.01444559227315809, "grad_norm": 6.989866733551025, "learning_rate": 2.2566114501598376e-06, "loss": 0.6226, "mean_token_accuracy": 0.7975085467100144, "num_tokens": 5606291.0, "step": 4660 }, { "entropy": 1.8646612569689751, "epoch": 0.014476591398207785, "grad_norm": 6.809642314910889, "learning_rate": 2.2614550033904875e-06, "loss": 0.6267, "mean_token_accuracy": 0.801960551738739, "num_tokens": 5618542.0, "step": 4670 }, { "entropy": 1.8657331004738809, "epoch": 0.014507590523257482, "grad_norm": 15.695577621459961, "learning_rate": 2.2662985566211374e-06, "loss": 0.6266, "mean_token_accuracy": 0.8036523833870888, "num_tokens": 5630622.0, "step": 4680 }, { "entropy": 1.8683514580130578, "epoch": 0.014538589648307176, "grad_norm": 14.665810585021973, "learning_rate": 2.2711421098517873e-06, "loss": 0.6272, "mean_token_accuracy": 0.8168015897274017, "num_tokens": 5642626.0, "step": 4690 }, { "entropy": 1.910755640268326, "epoch": 0.014569588773356871, "grad_norm": 14.815983772277832, "learning_rate": 2.2759856630824376e-06, "loss": 0.6303, "mean_token_accuracy": 0.7996454507112503, "num_tokens": 5654704.0, "step": 4700 }, { "entropy": 1.9073377847671509, "epoch": 0.014600587898406568, "grad_norm": 12.390499114990234, "learning_rate": 2.2808292163130874e-06, "loss": 0.6777, "mean_token_accuracy": 0.7973004877567291, "num_tokens": 5666771.0, "step": 4710 }, { "entropy": 1.8886594474315643, "epoch": 0.014631587023456263, "grad_norm": 15.042806625366211, "learning_rate": 2.2856727695437373e-06, "loss": 0.666, "mean_token_accuracy": 0.7995603799819946, "num_tokens": 5679125.0, "step": 4720 }, { "entropy": 1.8458405777812004, "epoch": 0.014662586148505958, "grad_norm": 13.178945541381836, "learning_rate": 2.2905163227743872e-06, "loss": 0.6103, "mean_token_accuracy": 0.8050804138183594, "num_tokens": 5691263.0, "step": 4730 }, { "entropy": 1.8889022842049599, "epoch": 0.014693585273555654, "grad_norm": 13.8629150390625, "learning_rate": 2.2953598760050375e-06, "loss": 0.6005, "mean_token_accuracy": 0.8008949771523476, "num_tokens": 5703903.0, "step": 4740 }, { "entropy": 1.8346245408058166, "epoch": 0.014724584398605349, "grad_norm": 13.840920448303223, "learning_rate": 2.3002034292356874e-06, "loss": 0.6284, "mean_token_accuracy": 0.8062951177358627, "num_tokens": 5715573.0, "step": 4750 }, { "entropy": 1.797354480624199, "epoch": 0.014755583523655046, "grad_norm": 7.390272617340088, "learning_rate": 2.3050469824663373e-06, "loss": 0.5823, "mean_token_accuracy": 0.81457539498806, "num_tokens": 5728120.0, "step": 4760 }, { "entropy": 1.787947428226471, "epoch": 0.01478658264870474, "grad_norm": 13.413910865783691, "learning_rate": 2.3098905356969876e-06, "loss": 0.6103, "mean_token_accuracy": 0.8159710958600044, "num_tokens": 5739981.0, "step": 4770 }, { "entropy": 1.7457917869091033, "epoch": 0.014817581773754435, "grad_norm": 14.069392204284668, "learning_rate": 2.3147340889276375e-06, "loss": 0.5329, "mean_token_accuracy": 0.8245069414377213, "num_tokens": 5753136.0, "step": 4780 }, { "entropy": 1.8335485979914665, "epoch": 0.014848580898804132, "grad_norm": 12.345169067382812, "learning_rate": 2.3195776421582874e-06, "loss": 0.6593, "mean_token_accuracy": 0.8006348922848702, "num_tokens": 5765376.0, "step": 4790 }, { "entropy": 1.8139121234416962, "epoch": 0.014879580023853827, "grad_norm": 15.01768970489502, "learning_rate": 2.3244211953889377e-06, "loss": 0.6159, "mean_token_accuracy": 0.8078545331954956, "num_tokens": 5778012.0, "step": 4800 }, { "entropy": 1.8669386252760887, "epoch": 0.014910579148903522, "grad_norm": 14.147043228149414, "learning_rate": 2.3292647486195876e-06, "loss": 0.6281, "mean_token_accuracy": 0.8018081709742546, "num_tokens": 5789805.0, "step": 4810 }, { "entropy": 1.784064681828022, "epoch": 0.014941578273953218, "grad_norm": 13.715585708618164, "learning_rate": 2.3341083018502374e-06, "loss": 0.6004, "mean_token_accuracy": 0.8130191281437874, "num_tokens": 5802546.0, "step": 4820 }, { "entropy": 1.7632321387529373, "epoch": 0.014972577399002913, "grad_norm": 14.334385871887207, "learning_rate": 2.3389518550808878e-06, "loss": 0.6187, "mean_token_accuracy": 0.8080756813287735, "num_tokens": 5815760.0, "step": 4830 }, { "entropy": 1.826922358572483, "epoch": 0.015003576524052608, "grad_norm": 12.539878845214844, "learning_rate": 2.3437954083115376e-06, "loss": 0.6262, "mean_token_accuracy": 0.8105258822441102, "num_tokens": 5827438.0, "step": 4840 }, { "entropy": 1.788956308364868, "epoch": 0.015034575649102305, "grad_norm": 11.371108055114746, "learning_rate": 2.3486389615421875e-06, "loss": 0.5732, "mean_token_accuracy": 0.817362517118454, "num_tokens": 5839410.0, "step": 4850 }, { "entropy": 1.734141993522644, "epoch": 0.015065574774152, "grad_norm": 6.056836128234863, "learning_rate": 2.353482514772838e-06, "loss": 0.4693, "mean_token_accuracy": 0.8292117938399315, "num_tokens": 5852222.0, "step": 4860 }, { "entropy": 1.9274750858545304, "epoch": 0.015096573899201694, "grad_norm": 14.039139747619629, "learning_rate": 2.3583260680034877e-06, "loss": 0.6941, "mean_token_accuracy": 0.7952279299497604, "num_tokens": 5864055.0, "step": 4870 }, { "entropy": 1.8699121579527855, "epoch": 0.015127573024251391, "grad_norm": 15.34942626953125, "learning_rate": 2.3631696212341376e-06, "loss": 0.669, "mean_token_accuracy": 0.7950506001710892, "num_tokens": 5876040.0, "step": 4880 }, { "entropy": 1.9636721938848496, "epoch": 0.015158572149301086, "grad_norm": 16.529897689819336, "learning_rate": 2.3680131744647875e-06, "loss": 0.7517, "mean_token_accuracy": 0.7848048597574234, "num_tokens": 5887191.0, "step": 4890 }, { "entropy": 1.7576201945543288, "epoch": 0.01518957127435078, "grad_norm": 11.712950706481934, "learning_rate": 2.3728567276954374e-06, "loss": 0.5591, "mean_token_accuracy": 0.8087734043598175, "num_tokens": 5901313.0, "step": 4900 }, { "entropy": 1.8790557369589806, "epoch": 0.015220570399400477, "grad_norm": 13.049158096313477, "learning_rate": 2.3777002809260877e-06, "loss": 0.6831, "mean_token_accuracy": 0.7949941202998161, "num_tokens": 5913169.0, "step": 4910 }, { "entropy": 1.8304862260818482, "epoch": 0.015251569524450172, "grad_norm": 14.758519172668457, "learning_rate": 2.3825438341567376e-06, "loss": 0.6111, "mean_token_accuracy": 0.7983208373188972, "num_tokens": 5925901.0, "step": 4920 }, { "entropy": 1.7591045543551445, "epoch": 0.015282568649499869, "grad_norm": 7.14612340927124, "learning_rate": 2.3873873873873874e-06, "loss": 0.608, "mean_token_accuracy": 0.8068576440215111, "num_tokens": 5939642.0, "step": 4930 }, { "entropy": 1.9316822454333304, "epoch": 0.015313567774549564, "grad_norm": 11.606863021850586, "learning_rate": 2.3922309406180373e-06, "loss": 0.6747, "mean_token_accuracy": 0.7950692802667618, "num_tokens": 5951806.0, "step": 4940 }, { "entropy": 1.8316121339797973, "epoch": 0.015344566899599258, "grad_norm": 12.468024253845215, "learning_rate": 2.3970744938486876e-06, "loss": 0.6445, "mean_token_accuracy": 0.7969418242573738, "num_tokens": 5963560.0, "step": 4950 }, { "entropy": 1.8592291116714477, "epoch": 0.015375566024648955, "grad_norm": 10.784393310546875, "learning_rate": 2.4019180470793375e-06, "loss": 0.584, "mean_token_accuracy": 0.815529865026474, "num_tokens": 5975576.0, "step": 4960 }, { "entropy": 1.8554728999733925, "epoch": 0.01540656514969865, "grad_norm": 6.568962574005127, "learning_rate": 2.4067616003099874e-06, "loss": 0.6138, "mean_token_accuracy": 0.8106912553310395, "num_tokens": 5987431.0, "step": 4970 }, { "entropy": 1.9579525411128997, "epoch": 0.015437564274748345, "grad_norm": 15.040802001953125, "learning_rate": 2.4116051535406377e-06, "loss": 0.7144, "mean_token_accuracy": 0.7940159112215042, "num_tokens": 5998986.0, "step": 4980 }, { "entropy": 1.8558162599802017, "epoch": 0.015468563399798041, "grad_norm": 12.90155029296875, "learning_rate": 2.4164487067712876e-06, "loss": 0.5972, "mean_token_accuracy": 0.8087948963046074, "num_tokens": 6011316.0, "step": 4990 }, { "entropy": 1.9120135977864265, "epoch": 0.015499562524847736, "grad_norm": 13.731481552124023, "learning_rate": 2.4212922600019375e-06, "loss": 0.6358, "mean_token_accuracy": 0.808275742828846, "num_tokens": 6022791.0, "step": 5000 }, { "entropy": 1.9652427747845649, "epoch": 0.015530561649897431, "grad_norm": 10.624104499816895, "learning_rate": 2.4261358132325878e-06, "loss": 0.7283, "mean_token_accuracy": 0.7885913565754891, "num_tokens": 6034662.0, "step": 5010 }, { "entropy": 1.8572456985712051, "epoch": 0.015561560774947128, "grad_norm": 13.485920906066895, "learning_rate": 2.4309793664632377e-06, "loss": 0.6653, "mean_token_accuracy": 0.8142799854278564, "num_tokens": 6046637.0, "step": 5020 }, { "entropy": 1.8741802372038365, "epoch": 0.015592559899996822, "grad_norm": 13.382155418395996, "learning_rate": 2.4358229196938875e-06, "loss": 0.673, "mean_token_accuracy": 0.7978162422776223, "num_tokens": 6058910.0, "step": 5030 }, { "entropy": 1.8276728346943856, "epoch": 0.015623559025046517, "grad_norm": 13.904196739196777, "learning_rate": 2.440666472924538e-06, "loss": 0.6098, "mean_token_accuracy": 0.80892014503479, "num_tokens": 6071136.0, "step": 5040 }, { "entropy": 1.8636898145079612, "epoch": 0.015654558150096212, "grad_norm": 12.65003490447998, "learning_rate": 2.4455100261551877e-06, "loss": 0.6052, "mean_token_accuracy": 0.8113350749015809, "num_tokens": 6082834.0, "step": 5050 }, { "entropy": 1.9150115132331849, "epoch": 0.01568555727514591, "grad_norm": 12.964512825012207, "learning_rate": 2.4503535793858376e-06, "loss": 0.6974, "mean_token_accuracy": 0.7987506031990051, "num_tokens": 6093808.0, "step": 5060 }, { "entropy": 1.8618119135499, "epoch": 0.015716556400195605, "grad_norm": 5.460781097412109, "learning_rate": 2.455197132616488e-06, "loss": 0.6885, "mean_token_accuracy": 0.8034725025296211, "num_tokens": 6105718.0, "step": 5070 }, { "entropy": 1.9206832945346832, "epoch": 0.0157475555252453, "grad_norm": 6.1644134521484375, "learning_rate": 2.460040685847138e-06, "loss": 0.6132, "mean_token_accuracy": 0.8128495365381241, "num_tokens": 6117606.0, "step": 5080 }, { "entropy": 1.8482828214764595, "epoch": 0.015778554650294995, "grad_norm": 16.253582000732422, "learning_rate": 2.4648842390777877e-06, "loss": 0.5834, "mean_token_accuracy": 0.8172304585576058, "num_tokens": 6130415.0, "step": 5090 }, { "entropy": 1.998060804605484, "epoch": 0.01580955377534469, "grad_norm": 12.287386894226074, "learning_rate": 2.4697277923084376e-06, "loss": 0.7151, "mean_token_accuracy": 0.8040244802832603, "num_tokens": 6140984.0, "step": 5100 }, { "entropy": 1.8899462610483169, "epoch": 0.015840552900394385, "grad_norm": 14.631074905395508, "learning_rate": 2.4745713455390875e-06, "loss": 0.5975, "mean_token_accuracy": 0.8050005823373795, "num_tokens": 6152531.0, "step": 5110 }, { "entropy": 1.9535317480564118, "epoch": 0.01587155202544408, "grad_norm": 17.399723052978516, "learning_rate": 2.4794148987697378e-06, "loss": 0.714, "mean_token_accuracy": 0.7943957537412644, "num_tokens": 6163431.0, "step": 5120 }, { "entropy": 1.8350759640336036, "epoch": 0.015902551150493778, "grad_norm": 6.640737056732178, "learning_rate": 2.4842584520003877e-06, "loss": 0.5901, "mean_token_accuracy": 0.8108617261052131, "num_tokens": 6176031.0, "step": 5130 }, { "entropy": 1.8753447353839874, "epoch": 0.01593355027554347, "grad_norm": 12.087038040161133, "learning_rate": 2.4891020052310375e-06, "loss": 0.6479, "mean_token_accuracy": 0.8096778213977813, "num_tokens": 6187445.0, "step": 5140 }, { "entropy": 1.9582001134753226, "epoch": 0.015964549400593168, "grad_norm": 14.169228553771973, "learning_rate": 2.4939455584616874e-06, "loss": 0.6759, "mean_token_accuracy": 0.8033049464225769, "num_tokens": 6199080.0, "step": 5150 }, { "entropy": 1.9630192652344705, "epoch": 0.015995548525642864, "grad_norm": 15.533053398132324, "learning_rate": 2.4987891116923377e-06, "loss": 0.6722, "mean_token_accuracy": 0.7981782972812652, "num_tokens": 6210218.0, "step": 5160 }, { "entropy": 1.9423971146345138, "epoch": 0.016026547650692557, "grad_norm": 15.028423309326172, "learning_rate": 2.5036326649229876e-06, "loss": 0.6986, "mean_token_accuracy": 0.7927390620112419, "num_tokens": 6221926.0, "step": 5170 }, { "entropy": 1.9331501245498657, "epoch": 0.016057546775742254, "grad_norm": 14.322336196899414, "learning_rate": 2.5084762181536375e-06, "loss": 0.669, "mean_token_accuracy": 0.7981711998581886, "num_tokens": 6233615.0, "step": 5180 }, { "entropy": 1.920173704624176, "epoch": 0.01608854590079195, "grad_norm": 13.353188514709473, "learning_rate": 2.513319771384288e-06, "loss": 0.6021, "mean_token_accuracy": 0.804129633307457, "num_tokens": 6245365.0, "step": 5190 }, { "entropy": 1.9042732536792755, "epoch": 0.016119545025841647, "grad_norm": 6.624873161315918, "learning_rate": 2.5181633246149377e-06, "loss": 0.6251, "mean_token_accuracy": 0.8022510379552841, "num_tokens": 6257307.0, "step": 5200 }, { "entropy": 1.9409416317939758, "epoch": 0.01615054415089134, "grad_norm": 16.728931427001953, "learning_rate": 2.523006877845588e-06, "loss": 0.6694, "mean_token_accuracy": 0.8007285013794899, "num_tokens": 6268908.0, "step": 5210 }, { "entropy": 1.9607915192842484, "epoch": 0.016181543275941037, "grad_norm": 12.1609525680542, "learning_rate": 2.5278504310762375e-06, "loss": 0.6902, "mean_token_accuracy": 0.7996085345745086, "num_tokens": 6279792.0, "step": 5220 }, { "entropy": 1.9298648953437805, "epoch": 0.016212542400990734, "grad_norm": 14.782210350036621, "learning_rate": 2.5326939843068878e-06, "loss": 0.6543, "mean_token_accuracy": 0.7992920339107513, "num_tokens": 6291405.0, "step": 5230 }, { "entropy": 1.8961961045861244, "epoch": 0.016243541526040427, "grad_norm": 15.001334190368652, "learning_rate": 2.5375375375375377e-06, "loss": 0.619, "mean_token_accuracy": 0.8142734676599502, "num_tokens": 6303357.0, "step": 5240 }, { "entropy": 1.8733942970633506, "epoch": 0.016274540651090123, "grad_norm": 11.908313751220703, "learning_rate": 2.542381090768188e-06, "loss": 0.5783, "mean_token_accuracy": 0.8036831006407738, "num_tokens": 6316147.0, "step": 5250 }, { "entropy": 1.8637539759278297, "epoch": 0.01630553977613982, "grad_norm": 13.72813606262207, "learning_rate": 2.547224643998838e-06, "loss": 0.6272, "mean_token_accuracy": 0.8079806834459304, "num_tokens": 6328593.0, "step": 5260 }, { "entropy": 1.9503967970609666, "epoch": 0.016336538901189513, "grad_norm": 13.402142524719238, "learning_rate": 2.5520681972294873e-06, "loss": 0.7049, "mean_token_accuracy": 0.7919132471084595, "num_tokens": 6340259.0, "step": 5270 }, { "entropy": 1.8650514677166938, "epoch": 0.01636753802623921, "grad_norm": 17.254850387573242, "learning_rate": 2.5569117504601376e-06, "loss": 0.6297, "mean_token_accuracy": 0.8054231390357017, "num_tokens": 6351914.0, "step": 5280 }, { "entropy": 1.861028863489628, "epoch": 0.016398537151288906, "grad_norm": 14.428080558776855, "learning_rate": 2.561755303690788e-06, "loss": 0.681, "mean_token_accuracy": 0.7947863683104515, "num_tokens": 6364405.0, "step": 5290 }, { "entropy": 1.9019827499985695, "epoch": 0.0164295362763386, "grad_norm": 14.161713600158691, "learning_rate": 2.566598856921438e-06, "loss": 0.6824, "mean_token_accuracy": 0.8035579726099968, "num_tokens": 6375673.0, "step": 5300 }, { "entropy": 1.8992380529642106, "epoch": 0.016460535401388296, "grad_norm": 13.563090324401855, "learning_rate": 2.571442410152088e-06, "loss": 0.6121, "mean_token_accuracy": 0.8122377499938012, "num_tokens": 6387160.0, "step": 5310 }, { "entropy": 1.8584588319063187, "epoch": 0.016491534526437993, "grad_norm": 7.150653839111328, "learning_rate": 2.576285963382738e-06, "loss": 0.6296, "mean_token_accuracy": 0.8052732735872269, "num_tokens": 6399189.0, "step": 5320 }, { "entropy": 1.822690936923027, "epoch": 0.016522533651487686, "grad_norm": 13.757638931274414, "learning_rate": 2.5811295166133875e-06, "loss": 0.6489, "mean_token_accuracy": 0.8050631493330002, "num_tokens": 6411077.0, "step": 5330 }, { "entropy": 1.86706335991621, "epoch": 0.016553532776537382, "grad_norm": 12.249866485595703, "learning_rate": 2.5859730698440378e-06, "loss": 0.6379, "mean_token_accuracy": 0.8055833503603935, "num_tokens": 6422795.0, "step": 5340 }, { "entropy": 1.914227731525898, "epoch": 0.01658453190158708, "grad_norm": 13.125116348266602, "learning_rate": 2.5908166230746876e-06, "loss": 0.6412, "mean_token_accuracy": 0.8045208930969239, "num_tokens": 6434827.0, "step": 5350 }, { "entropy": 1.8957944259047508, "epoch": 0.016615531026636772, "grad_norm": 16.094440460205078, "learning_rate": 2.595660176305338e-06, "loss": 0.6912, "mean_token_accuracy": 0.8000255629420281, "num_tokens": 6446181.0, "step": 5360 }, { "entropy": 1.8262048691511155, "epoch": 0.01664653015168647, "grad_norm": 7.709341526031494, "learning_rate": 2.6005037295359883e-06, "loss": 0.5977, "mean_token_accuracy": 0.8138811901211739, "num_tokens": 6458511.0, "step": 5370 }, { "entropy": 1.9274291083216668, "epoch": 0.016677529276736165, "grad_norm": 15.519542694091797, "learning_rate": 2.6053472827666377e-06, "loss": 0.7491, "mean_token_accuracy": 0.7906691998243331, "num_tokens": 6470175.0, "step": 5380 }, { "entropy": 1.8656795375049113, "epoch": 0.01670852840178586, "grad_norm": 15.985265731811523, "learning_rate": 2.6101908359972876e-06, "loss": 0.6216, "mean_token_accuracy": 0.8056905150413514, "num_tokens": 6482873.0, "step": 5390 }, { "entropy": 1.9562640219926835, "epoch": 0.016739527526835555, "grad_norm": 12.090657234191895, "learning_rate": 2.615034389227938e-06, "loss": 0.6927, "mean_token_accuracy": 0.7986342236399651, "num_tokens": 6494045.0, "step": 5400 }, { "entropy": 1.919235098361969, "epoch": 0.01677052665188525, "grad_norm": 12.741799354553223, "learning_rate": 2.619877942458588e-06, "loss": 0.6563, "mean_token_accuracy": 0.8064717799425125, "num_tokens": 6504934.0, "step": 5410 }, { "entropy": 1.8321070238947867, "epoch": 0.016801525776934945, "grad_norm": 13.527047157287598, "learning_rate": 2.624721495689238e-06, "loss": 0.5623, "mean_token_accuracy": 0.8185374692082406, "num_tokens": 6517746.0, "step": 5420 }, { "entropy": 1.8565064251422883, "epoch": 0.01683252490198464, "grad_norm": 6.626979351043701, "learning_rate": 2.6295650489198876e-06, "loss": 0.6632, "mean_token_accuracy": 0.801714438199997, "num_tokens": 6530512.0, "step": 5430 }, { "entropy": 1.815597450733185, "epoch": 0.016863524027034338, "grad_norm": 14.231537818908691, "learning_rate": 2.634408602150538e-06, "loss": 0.6259, "mean_token_accuracy": 0.8043477773666382, "num_tokens": 6543281.0, "step": 5440 }, { "entropy": 1.847208908200264, "epoch": 0.01689452315208403, "grad_norm": 14.332513809204102, "learning_rate": 2.6392521553811878e-06, "loss": 0.5887, "mean_token_accuracy": 0.8129609242081642, "num_tokens": 6556103.0, "step": 5450 }, { "entropy": 1.8287439972162247, "epoch": 0.016925522277133728, "grad_norm": 17.275390625, "learning_rate": 2.644095708611838e-06, "loss": 0.5664, "mean_token_accuracy": 0.8119298800826072, "num_tokens": 6568831.0, "step": 5460 }, { "entropy": 1.9554360315203667, "epoch": 0.016956521402183424, "grad_norm": 16.211164474487305, "learning_rate": 2.648939261842488e-06, "loss": 0.7553, "mean_token_accuracy": 0.790603817999363, "num_tokens": 6580730.0, "step": 5470 }, { "entropy": 1.8777785643935203, "epoch": 0.016987520527233117, "grad_norm": 7.529537677764893, "learning_rate": 2.6537828150731374e-06, "loss": 0.5822, "mean_token_accuracy": 0.81802958548069, "num_tokens": 6593151.0, "step": 5480 }, { "entropy": 1.83996739089489, "epoch": 0.017018519652282814, "grad_norm": 16.056753158569336, "learning_rate": 2.6586263683037877e-06, "loss": 0.5795, "mean_token_accuracy": 0.8094681099057197, "num_tokens": 6605508.0, "step": 5490 }, { "entropy": 1.8156257584691047, "epoch": 0.01704951877733251, "grad_norm": 13.18197250366211, "learning_rate": 2.663469921534438e-06, "loss": 0.5445, "mean_token_accuracy": 0.818387684226036, "num_tokens": 6617913.0, "step": 5500 }, { "entropy": 1.830448153614998, "epoch": 0.017080517902382204, "grad_norm": 17.966718673706055, "learning_rate": 2.668313474765088e-06, "loss": 0.591, "mean_token_accuracy": 0.8055599793791771, "num_tokens": 6630804.0, "step": 5510 }, { "entropy": 1.9519992083311082, "epoch": 0.0171115170274319, "grad_norm": 15.51613998413086, "learning_rate": 2.6731570279957382e-06, "loss": 0.7094, "mean_token_accuracy": 0.7885363206267357, "num_tokens": 6641484.0, "step": 5520 }, { "entropy": 1.787497617304325, "epoch": 0.017142516152481597, "grad_norm": 13.95913028717041, "learning_rate": 2.678000581226388e-06, "loss": 0.5966, "mean_token_accuracy": 0.8152831554412842, "num_tokens": 6655013.0, "step": 5530 }, { "entropy": 1.8791037619113922, "epoch": 0.017173515277531293, "grad_norm": 12.858197212219238, "learning_rate": 2.6828441344570376e-06, "loss": 0.708, "mean_token_accuracy": 0.7970382794737816, "num_tokens": 6665889.0, "step": 5540 }, { "entropy": 1.8964224010705948, "epoch": 0.017204514402580987, "grad_norm": 14.71556282043457, "learning_rate": 2.687687687687688e-06, "loss": 0.7067, "mean_token_accuracy": 0.8008865773677826, "num_tokens": 6676444.0, "step": 5550 }, { "entropy": 1.868991169333458, "epoch": 0.017235513527630683, "grad_norm": 12.040255546569824, "learning_rate": 2.6925312409183378e-06, "loss": 0.6944, "mean_token_accuracy": 0.7887442111968994, "num_tokens": 6687681.0, "step": 5560 }, { "entropy": 1.7748029723763465, "epoch": 0.01726651265268038, "grad_norm": 16.154150009155273, "learning_rate": 2.697374794148988e-06, "loss": 0.558, "mean_token_accuracy": 0.818262355029583, "num_tokens": 6700056.0, "step": 5570 }, { "entropy": 1.8267305195331573, "epoch": 0.017297511777730073, "grad_norm": 13.24530029296875, "learning_rate": 2.7022183473796384e-06, "loss": 0.6354, "mean_token_accuracy": 0.8041493833065033, "num_tokens": 6712309.0, "step": 5580 }, { "entropy": 1.801226018369198, "epoch": 0.01732851090277977, "grad_norm": 13.650617599487305, "learning_rate": 2.707061900610288e-06, "loss": 0.6252, "mean_token_accuracy": 0.8037795275449753, "num_tokens": 6725171.0, "step": 5590 }, { "entropy": 1.9054549396038056, "epoch": 0.017359510027829466, "grad_norm": 13.89938735961914, "learning_rate": 2.7119054538409377e-06, "loss": 0.673, "mean_token_accuracy": 0.794456647336483, "num_tokens": 6735768.0, "step": 5600 }, { "entropy": 1.8770520448684693, "epoch": 0.01739050915287916, "grad_norm": 11.668399810791016, "learning_rate": 2.716749007071588e-06, "loss": 0.6098, "mean_token_accuracy": 0.8073992922902107, "num_tokens": 6748114.0, "step": 5610 }, { "entropy": 1.8336502104997634, "epoch": 0.017421508277928856, "grad_norm": 15.977221488952637, "learning_rate": 2.721592560302238e-06, "loss": 0.5509, "mean_token_accuracy": 0.7992581978440285, "num_tokens": 6761005.0, "step": 5620 }, { "entropy": 1.868009014427662, "epoch": 0.017452507402978552, "grad_norm": 14.214484214782715, "learning_rate": 2.726436113532888e-06, "loss": 0.6505, "mean_token_accuracy": 0.794841094315052, "num_tokens": 6773183.0, "step": 5630 }, { "entropy": 1.8636821389198304, "epoch": 0.017483506528028245, "grad_norm": 12.620464324951172, "learning_rate": 2.7312796667635377e-06, "loss": 0.6509, "mean_token_accuracy": 0.8034439250826836, "num_tokens": 6785345.0, "step": 5640 }, { "entropy": 1.9341711491346358, "epoch": 0.017514505653077942, "grad_norm": 15.093106269836426, "learning_rate": 2.736123219994188e-06, "loss": 0.6546, "mean_token_accuracy": 0.80091482847929, "num_tokens": 6796959.0, "step": 5650 }, { "entropy": 1.8229330405592918, "epoch": 0.01754550477812764, "grad_norm": 5.629141807556152, "learning_rate": 2.740966773224838e-06, "loss": 0.5279, "mean_token_accuracy": 0.8240866810083389, "num_tokens": 6809608.0, "step": 5660 }, { "entropy": 1.8584133207798004, "epoch": 0.017576503903177332, "grad_norm": 14.71494197845459, "learning_rate": 2.745810326455488e-06, "loss": 0.5945, "mean_token_accuracy": 0.8071165978908539, "num_tokens": 6821161.0, "step": 5670 }, { "entropy": 1.9193618685007094, "epoch": 0.01760750302822703, "grad_norm": 6.501834392547607, "learning_rate": 2.750653879686138e-06, "loss": 0.6277, "mean_token_accuracy": 0.8095588624477387, "num_tokens": 6832907.0, "step": 5680 }, { "entropy": 1.9340075165033341, "epoch": 0.017638502153276725, "grad_norm": 16.142831802368164, "learning_rate": 2.755497432916788e-06, "loss": 0.6317, "mean_token_accuracy": 0.8106139227747917, "num_tokens": 6844618.0, "step": 5690 }, { "entropy": 1.9251418694853784, "epoch": 0.017669501278326418, "grad_norm": 5.188778877258301, "learning_rate": 2.760340986147438e-06, "loss": 0.5842, "mean_token_accuracy": 0.8212108716368676, "num_tokens": 6856467.0, "step": 5700 }, { "entropy": 2.0215213894844055, "epoch": 0.017700500403376115, "grad_norm": 14.539006233215332, "learning_rate": 2.765184539378088e-06, "loss": 0.7512, "mean_token_accuracy": 0.7919975519180298, "num_tokens": 6867647.0, "step": 5710 }, { "entropy": 1.8021862357854843, "epoch": 0.01773149952842581, "grad_norm": 12.78034782409668, "learning_rate": 2.770028092608738e-06, "loss": 0.5369, "mean_token_accuracy": 0.8308590099215507, "num_tokens": 6879996.0, "step": 5720 }, { "entropy": 1.9824547916650772, "epoch": 0.017762498653475504, "grad_norm": 16.377723693847656, "learning_rate": 2.7748716458393883e-06, "loss": 0.6998, "mean_token_accuracy": 0.7981508508324623, "num_tokens": 6890686.0, "step": 5730 }, { "entropy": 1.8871808886528014, "epoch": 0.0177934977785252, "grad_norm": 8.088064193725586, "learning_rate": 2.779715199070038e-06, "loss": 0.6382, "mean_token_accuracy": 0.8009686887264251, "num_tokens": 6902728.0, "step": 5740 }, { "entropy": 1.9506133437156676, "epoch": 0.017824496903574898, "grad_norm": 13.706978797912598, "learning_rate": 2.7845587523006877e-06, "loss": 0.7109, "mean_token_accuracy": 0.7973517820239067, "num_tokens": 6913584.0, "step": 5750 }, { "entropy": 1.8800716385245324, "epoch": 0.01785549602862459, "grad_norm": 14.467167854309082, "learning_rate": 2.789402305531338e-06, "loss": 0.6481, "mean_token_accuracy": 0.8017850533127785, "num_tokens": 6925591.0, "step": 5760 }, { "entropy": 1.9344962298870088, "epoch": 0.017886495153674287, "grad_norm": 13.922574996948242, "learning_rate": 2.794245858761988e-06, "loss": 0.7058, "mean_token_accuracy": 0.8056110054254532, "num_tokens": 6936542.0, "step": 5770 }, { "entropy": 1.8384886384010315, "epoch": 0.017917494278723984, "grad_norm": 13.464262962341309, "learning_rate": 2.799089411992638e-06, "loss": 0.6779, "mean_token_accuracy": 0.8026763945817947, "num_tokens": 6948233.0, "step": 5780 }, { "entropy": 1.8690043538808823, "epoch": 0.017948493403773677, "grad_norm": 19.54326820373535, "learning_rate": 2.8039329652232885e-06, "loss": 0.6308, "mean_token_accuracy": 0.8030871346592903, "num_tokens": 6959330.0, "step": 5790 }, { "entropy": 1.8759321600198746, "epoch": 0.017979492528823374, "grad_norm": 12.744894981384277, "learning_rate": 2.808776518453938e-06, "loss": 0.6481, "mean_token_accuracy": 0.8097668170928956, "num_tokens": 6970815.0, "step": 5800 }, { "entropy": 1.828912016749382, "epoch": 0.01801049165387307, "grad_norm": 12.839016914367676, "learning_rate": 2.813620071684588e-06, "loss": 0.629, "mean_token_accuracy": 0.8034252509474754, "num_tokens": 6982527.0, "step": 5810 }, { "entropy": 1.8393566399812697, "epoch": 0.018041490778922763, "grad_norm": 12.746121406555176, "learning_rate": 2.818463624915238e-06, "loss": 0.6182, "mean_token_accuracy": 0.8172145113348961, "num_tokens": 6993866.0, "step": 5820 }, { "entropy": 1.9092845901846887, "epoch": 0.01807248990397246, "grad_norm": 6.71993350982666, "learning_rate": 2.823307178145888e-06, "loss": 0.6816, "mean_token_accuracy": 0.7965900272130966, "num_tokens": 7005083.0, "step": 5830 }, { "entropy": 1.8039664879441262, "epoch": 0.018103489029022157, "grad_norm": 14.516589164733887, "learning_rate": 2.8281507313765383e-06, "loss": 0.6241, "mean_token_accuracy": 0.8058669179677963, "num_tokens": 7017669.0, "step": 5840 }, { "entropy": 1.8136644974350928, "epoch": 0.01813448815407185, "grad_norm": 5.473270893096924, "learning_rate": 2.8329942846071878e-06, "loss": 0.5442, "mean_token_accuracy": 0.8144378513097763, "num_tokens": 7030554.0, "step": 5850 }, { "entropy": 1.8616077184677124, "epoch": 0.018165487279121546, "grad_norm": 6.733494281768799, "learning_rate": 2.837837837837838e-06, "loss": 0.6268, "mean_token_accuracy": 0.8107643172144889, "num_tokens": 7043459.0, "step": 5860 }, { "entropy": 1.8695634379982948, "epoch": 0.018196486404171243, "grad_norm": 13.21927547454834, "learning_rate": 2.842681391068488e-06, "loss": 0.66, "mean_token_accuracy": 0.7932876944541931, "num_tokens": 7055971.0, "step": 5870 }, { "entropy": 1.9111657977104186, "epoch": 0.018227485529220936, "grad_norm": 12.689513206481934, "learning_rate": 2.8475249442991383e-06, "loss": 0.6847, "mean_token_accuracy": 0.8026935487985611, "num_tokens": 7067129.0, "step": 5880 }, { "entropy": 1.9150221094489097, "epoch": 0.018258484654270633, "grad_norm": 15.985901832580566, "learning_rate": 2.852368497529788e-06, "loss": 0.636, "mean_token_accuracy": 0.8080303505063057, "num_tokens": 7078292.0, "step": 5890 }, { "entropy": 1.9083669915795327, "epoch": 0.01828948377932033, "grad_norm": 6.71138334274292, "learning_rate": 2.857212050760438e-06, "loss": 0.625, "mean_token_accuracy": 0.8103194802999496, "num_tokens": 7089362.0, "step": 5900 }, { "entropy": 1.9217876300215722, "epoch": 0.018320482904370026, "grad_norm": 15.180530548095703, "learning_rate": 2.862055603991088e-06, "loss": 0.7363, "mean_token_accuracy": 0.8016302824020386, "num_tokens": 7100640.0, "step": 5910 }, { "entropy": 1.8832744032144546, "epoch": 0.01835148202941972, "grad_norm": 7.438052654266357, "learning_rate": 2.8668991572217382e-06, "loss": 0.6582, "mean_token_accuracy": 0.7962846517562866, "num_tokens": 7112772.0, "step": 5920 }, { "entropy": 1.855891165137291, "epoch": 0.018382481154469416, "grad_norm": 14.392753601074219, "learning_rate": 2.871742710452388e-06, "loss": 0.6342, "mean_token_accuracy": 0.7954742982983589, "num_tokens": 7124974.0, "step": 5930 }, { "entropy": 1.906690326333046, "epoch": 0.018413480279519112, "grad_norm": 5.785147666931152, "learning_rate": 2.8765862636830384e-06, "loss": 0.6857, "mean_token_accuracy": 0.7974092051386833, "num_tokens": 7136071.0, "step": 5940 }, { "entropy": 1.8553122743964194, "epoch": 0.018444479404568805, "grad_norm": 12.362131118774414, "learning_rate": 2.8814298169136883e-06, "loss": 0.6059, "mean_token_accuracy": 0.8133484557271004, "num_tokens": 7148231.0, "step": 5950 }, { "entropy": 1.8504623636603355, "epoch": 0.018475478529618502, "grad_norm": 12.675224304199219, "learning_rate": 2.8862733701443378e-06, "loss": 0.6486, "mean_token_accuracy": 0.8114531069993973, "num_tokens": 7159463.0, "step": 5960 }, { "entropy": 1.7733413144946097, "epoch": 0.0185064776546682, "grad_norm": 14.104755401611328, "learning_rate": 2.891116923374988e-06, "loss": 0.6063, "mean_token_accuracy": 0.8127880603075027, "num_tokens": 7172067.0, "step": 5970 }, { "entropy": 1.9293433710932733, "epoch": 0.01853747677971789, "grad_norm": 13.121195793151855, "learning_rate": 2.895960476605638e-06, "loss": 0.6808, "mean_token_accuracy": 0.7942067563533783, "num_tokens": 7183215.0, "step": 5980 }, { "entropy": 1.9015644788742065, "epoch": 0.018568475904767588, "grad_norm": 14.695234298706055, "learning_rate": 2.9008040298362883e-06, "loss": 0.6561, "mean_token_accuracy": 0.807443767786026, "num_tokens": 7193804.0, "step": 5990 }, { "entropy": 1.8584507659077645, "epoch": 0.018599475029817285, "grad_norm": 16.059717178344727, "learning_rate": 2.9056475830669386e-06, "loss": 0.6235, "mean_token_accuracy": 0.808501772582531, "num_tokens": 7205778.0, "step": 6000 }, { "entropy": 1.8808137744665145, "epoch": 0.018630474154866978, "grad_norm": 13.323634147644043, "learning_rate": 2.910491136297588e-06, "loss": 0.6352, "mean_token_accuracy": 0.8102394938468933, "num_tokens": 7217406.0, "step": 6010 }, { "entropy": 1.9097854852676392, "epoch": 0.018661473279916675, "grad_norm": 12.783294677734375, "learning_rate": 2.915334689528238e-06, "loss": 0.6669, "mean_token_accuracy": 0.8101324290037155, "num_tokens": 7228513.0, "step": 6020 }, { "entropy": 1.949710837006569, "epoch": 0.01869247240496637, "grad_norm": 8.33492660522461, "learning_rate": 2.9201782427588882e-06, "loss": 0.7241, "mean_token_accuracy": 0.7899691253900528, "num_tokens": 7240091.0, "step": 6030 }, { "entropy": 1.7995387852191924, "epoch": 0.018723471530016064, "grad_norm": 14.658129692077637, "learning_rate": 2.925021795989538e-06, "loss": 0.5537, "mean_token_accuracy": 0.8123287737369538, "num_tokens": 7252717.0, "step": 6040 }, { "entropy": 1.8480943590402603, "epoch": 0.01875447065506576, "grad_norm": 12.393167495727539, "learning_rate": 2.9298653492201884e-06, "loss": 0.5651, "mean_token_accuracy": 0.8123827025294303, "num_tokens": 7265132.0, "step": 6050 }, { "entropy": 1.9156224384903908, "epoch": 0.018785469780115457, "grad_norm": 17.64678382873535, "learning_rate": 2.934708902450838e-06, "loss": 0.637, "mean_token_accuracy": 0.7986405953764916, "num_tokens": 7277392.0, "step": 6060 }, { "entropy": 1.8689854100346566, "epoch": 0.01881646890516515, "grad_norm": 13.045455932617188, "learning_rate": 2.939552455681488e-06, "loss": 0.605, "mean_token_accuracy": 0.8187311470508576, "num_tokens": 7289307.0, "step": 6070 }, { "entropy": 1.9299559414386749, "epoch": 0.018847468030214847, "grad_norm": 14.602327346801758, "learning_rate": 2.944396008912138e-06, "loss": 0.6499, "mean_token_accuracy": 0.8123447954654693, "num_tokens": 7300976.0, "step": 6080 }, { "entropy": 1.8863015726208687, "epoch": 0.018878467155264544, "grad_norm": 16.277345657348633, "learning_rate": 2.9492395621427884e-06, "loss": 0.6285, "mean_token_accuracy": 0.805467925965786, "num_tokens": 7312816.0, "step": 6090 }, { "entropy": 1.88625720590353, "epoch": 0.018909466280314237, "grad_norm": 13.414533615112305, "learning_rate": 2.9540831153734383e-06, "loss": 0.61, "mean_token_accuracy": 0.8089847132563591, "num_tokens": 7325107.0, "step": 6100 }, { "entropy": 1.9170991733670235, "epoch": 0.018940465405363933, "grad_norm": 12.490900039672852, "learning_rate": 2.958926668604088e-06, "loss": 0.5832, "mean_token_accuracy": 0.8148081079125404, "num_tokens": 7337186.0, "step": 6110 }, { "entropy": 1.8110515862703322, "epoch": 0.01897146453041363, "grad_norm": 6.398469924926758, "learning_rate": 2.963770221834738e-06, "loss": 0.5864, "mean_token_accuracy": 0.8025536656379699, "num_tokens": 7351631.0, "step": 6120 }, { "entropy": 1.925946244597435, "epoch": 0.019002463655463323, "grad_norm": 12.947382926940918, "learning_rate": 2.9686137750653883e-06, "loss": 0.6428, "mean_token_accuracy": 0.8030333101749421, "num_tokens": 7363334.0, "step": 6130 }, { "entropy": 1.84049913585186, "epoch": 0.01903346278051302, "grad_norm": 13.585901260375977, "learning_rate": 2.9734573282960382e-06, "loss": 0.5674, "mean_token_accuracy": 0.8090210378170013, "num_tokens": 7376221.0, "step": 6140 }, { "entropy": 1.913094098865986, "epoch": 0.019064461905562716, "grad_norm": 13.463624954223633, "learning_rate": 2.9783008815266885e-06, "loss": 0.6148, "mean_token_accuracy": 0.8032952442765235, "num_tokens": 7388430.0, "step": 6150 }, { "entropy": 1.9327594608068466, "epoch": 0.01909546103061241, "grad_norm": 13.593147277832031, "learning_rate": 2.9831444347573384e-06, "loss": 0.6631, "mean_token_accuracy": 0.8024881407618523, "num_tokens": 7400021.0, "step": 6160 }, { "entropy": 1.920905977487564, "epoch": 0.019126460155662106, "grad_norm": 12.407073020935059, "learning_rate": 2.987987987987988e-06, "loss": 0.6858, "mean_token_accuracy": 0.8053328812122345, "num_tokens": 7411092.0, "step": 6170 }, { "entropy": 1.9286428660154342, "epoch": 0.019157459280711803, "grad_norm": 11.15323543548584, "learning_rate": 2.992831541218638e-06, "loss": 0.6284, "mean_token_accuracy": 0.8029859021306038, "num_tokens": 7423534.0, "step": 6180 }, { "entropy": 1.91870975792408, "epoch": 0.019188458405761496, "grad_norm": 6.104159832000732, "learning_rate": 2.9976750944492885e-06, "loss": 0.6215, "mean_token_accuracy": 0.804423876106739, "num_tokens": 7434989.0, "step": 6190 }, { "entropy": 1.9800142824649811, "epoch": 0.019219457530811192, "grad_norm": 15.913412094116211, "learning_rate": 3.0025186476799384e-06, "loss": 0.6803, "mean_token_accuracy": 0.8035276308655739, "num_tokens": 7446518.0, "step": 6200 }, { "entropy": 1.925678089261055, "epoch": 0.01925045665586089, "grad_norm": 16.255130767822266, "learning_rate": 3.0073622009105887e-06, "loss": 0.6876, "mean_token_accuracy": 0.8033014148473739, "num_tokens": 7457926.0, "step": 6210 }, { "entropy": 1.951409675180912, "epoch": 0.019281455780910582, "grad_norm": 13.905477523803711, "learning_rate": 3.012205754141238e-06, "loss": 0.6945, "mean_token_accuracy": 0.793704767525196, "num_tokens": 7469282.0, "step": 6220 }, { "entropy": 1.8324480682611466, "epoch": 0.01931245490596028, "grad_norm": 6.767990589141846, "learning_rate": 3.017049307371888e-06, "loss": 0.5416, "mean_token_accuracy": 0.818550530076027, "num_tokens": 7481902.0, "step": 6230 }, { "entropy": 1.9617059826850891, "epoch": 0.019343454031009975, "grad_norm": 6.368531227111816, "learning_rate": 3.0218928606025383e-06, "loss": 0.6478, "mean_token_accuracy": 0.8018120333552361, "num_tokens": 7493336.0, "step": 6240 }, { "entropy": 1.8573770090937614, "epoch": 0.019374453156059672, "grad_norm": 6.3957295417785645, "learning_rate": 3.0267364138331882e-06, "loss": 0.5436, "mean_token_accuracy": 0.8206924736499787, "num_tokens": 7506208.0, "step": 6250 }, { "entropy": 1.8330803513526917, "epoch": 0.019405452281109365, "grad_norm": 6.754683494567871, "learning_rate": 3.0315799670638385e-06, "loss": 0.5934, "mean_token_accuracy": 0.8097323834896087, "num_tokens": 7519948.0, "step": 6260 }, { "entropy": 1.911997850239277, "epoch": 0.01943645140615906, "grad_norm": 14.22529125213623, "learning_rate": 3.036423520294488e-06, "loss": 0.5984, "mean_token_accuracy": 0.8061494305729866, "num_tokens": 7531446.0, "step": 6270 }, { "entropy": 1.7933774992823601, "epoch": 0.019467450531208758, "grad_norm": 12.579453468322754, "learning_rate": 3.0412670735251383e-06, "loss": 0.5515, "mean_token_accuracy": 0.8117320775985718, "num_tokens": 7545248.0, "step": 6280 }, { "entropy": 2.001555660367012, "epoch": 0.01949844965625845, "grad_norm": 11.950443267822266, "learning_rate": 3.046110626755788e-06, "loss": 0.6827, "mean_token_accuracy": 0.8007282823324203, "num_tokens": 7555914.0, "step": 6290 }, { "entropy": 1.832805335521698, "epoch": 0.019529448781308148, "grad_norm": 16.177709579467773, "learning_rate": 3.0509541799864385e-06, "loss": 0.5295, "mean_token_accuracy": 0.8259735867381096, "num_tokens": 7568442.0, "step": 6300 }, { "entropy": 1.9412631839513779, "epoch": 0.019560447906357845, "grad_norm": 13.161090850830078, "learning_rate": 3.0557977332170884e-06, "loss": 0.7107, "mean_token_accuracy": 0.7944168001413345, "num_tokens": 7579082.0, "step": 6310 }, { "entropy": 1.8757167264819146, "epoch": 0.019591447031407538, "grad_norm": 11.784358978271484, "learning_rate": 3.0606412864477382e-06, "loss": 0.5951, "mean_token_accuracy": 0.8066261202096939, "num_tokens": 7590950.0, "step": 6320 }, { "entropy": 1.8339521378278731, "epoch": 0.019622446156457234, "grad_norm": 14.665081977844238, "learning_rate": 3.065484839678388e-06, "loss": 0.5788, "mean_token_accuracy": 0.8134505435824394, "num_tokens": 7602873.0, "step": 6330 }, { "entropy": 1.853539614379406, "epoch": 0.01965344528150693, "grad_norm": 14.00977611541748, "learning_rate": 3.0703283929090384e-06, "loss": 0.6516, "mean_token_accuracy": 0.8087933987379075, "num_tokens": 7614636.0, "step": 6340 }, { "entropy": 1.8311856165528297, "epoch": 0.019684444406556624, "grad_norm": 13.730056762695312, "learning_rate": 3.0751719461396883e-06, "loss": 0.5566, "mean_token_accuracy": 0.8031821623444557, "num_tokens": 7627435.0, "step": 6350 }, { "entropy": 1.9072323009371757, "epoch": 0.01971544353160632, "grad_norm": 5.6056036949157715, "learning_rate": 3.0800154993703386e-06, "loss": 0.6932, "mean_token_accuracy": 0.7961813092231751, "num_tokens": 7639023.0, "step": 6360 }, { "entropy": 1.8226707085967064, "epoch": 0.019746442656656017, "grad_norm": 15.677154541015625, "learning_rate": 3.0848590526009885e-06, "loss": 0.6193, "mean_token_accuracy": 0.8108833208680153, "num_tokens": 7651962.0, "step": 6370 }, { "entropy": 1.7706042945384979, "epoch": 0.01977744178170571, "grad_norm": 13.05368423461914, "learning_rate": 3.089702605831638e-06, "loss": 0.5883, "mean_token_accuracy": 0.8179239988327026, "num_tokens": 7664383.0, "step": 6380 }, { "entropy": 1.888214285671711, "epoch": 0.019808440906755407, "grad_norm": 12.008337020874023, "learning_rate": 3.0945461590622883e-06, "loss": 0.6435, "mean_token_accuracy": 0.8116688475012779, "num_tokens": 7675538.0, "step": 6390 }, { "entropy": 1.8811195820569993, "epoch": 0.019839440031805104, "grad_norm": 13.883013725280762, "learning_rate": 3.0993897122929386e-06, "loss": 0.6319, "mean_token_accuracy": 0.8048406511545181, "num_tokens": 7687099.0, "step": 6400 }, { "entropy": 1.8231300503015517, "epoch": 0.019870439156854797, "grad_norm": 11.85662841796875, "learning_rate": 3.1042332655235885e-06, "loss": 0.5582, "mean_token_accuracy": 0.816590279340744, "num_tokens": 7700987.0, "step": 6410 }, { "entropy": 1.8579674810171127, "epoch": 0.019901438281904493, "grad_norm": 15.262175559997559, "learning_rate": 3.1090768187542388e-06, "loss": 0.6259, "mean_token_accuracy": 0.8095797553658486, "num_tokens": 7713844.0, "step": 6420 }, { "entropy": 1.879908984899521, "epoch": 0.01993243740695419, "grad_norm": 13.756391525268555, "learning_rate": 3.1139203719848882e-06, "loss": 0.6006, "mean_token_accuracy": 0.8058964297175407, "num_tokens": 7726962.0, "step": 6430 }, { "entropy": 1.9172916844487191, "epoch": 0.019963436532003883, "grad_norm": 6.88080358505249, "learning_rate": 3.118763925215538e-06, "loss": 0.6446, "mean_token_accuracy": 0.7972790181636811, "num_tokens": 7738773.0, "step": 6440 }, { "entropy": 1.9533959448337554, "epoch": 0.01999443565705358, "grad_norm": 14.810494422912598, "learning_rate": 3.1236074784461884e-06, "loss": 0.7092, "mean_token_accuracy": 0.797048932313919, "num_tokens": 7749295.0, "step": 6450 }, { "entropy": 1.904009547829628, "epoch": 0.020025434782103276, "grad_norm": 12.683305740356445, "learning_rate": 3.1284510316768383e-06, "loss": 0.6457, "mean_token_accuracy": 0.8019493162631989, "num_tokens": 7760568.0, "step": 6460 }, { "entropy": 1.8452860102057458, "epoch": 0.02005643390715297, "grad_norm": 6.6882171630859375, "learning_rate": 3.1332945849074886e-06, "loss": 0.591, "mean_token_accuracy": 0.8225623354315758, "num_tokens": 7773203.0, "step": 6470 }, { "entropy": 1.891256783902645, "epoch": 0.020087433032202666, "grad_norm": 15.097151756286621, "learning_rate": 3.138138138138138e-06, "loss": 0.6204, "mean_token_accuracy": 0.8069896474480629, "num_tokens": 7785046.0, "step": 6480 }, { "entropy": 1.841528557240963, "epoch": 0.020118432157252363, "grad_norm": 4.251867771148682, "learning_rate": 3.1429816913687884e-06, "loss": 0.5801, "mean_token_accuracy": 0.8183139115571976, "num_tokens": 7798180.0, "step": 6490 }, { "entropy": 1.8745207443833352, "epoch": 0.020149431282302056, "grad_norm": 13.895639419555664, "learning_rate": 3.1478252445994383e-06, "loss": 0.6282, "mean_token_accuracy": 0.8085477828979493, "num_tokens": 7810555.0, "step": 6500 }, { "entropy": 1.8891053274273872, "epoch": 0.020180430407351752, "grad_norm": 14.992795944213867, "learning_rate": 3.1526687978300886e-06, "loss": 0.6291, "mean_token_accuracy": 0.80573670566082, "num_tokens": 7821921.0, "step": 6510 }, { "entropy": 1.9124128386378287, "epoch": 0.02021142953240145, "grad_norm": 12.522514343261719, "learning_rate": 3.1575123510607385e-06, "loss": 0.6336, "mean_token_accuracy": 0.820842219889164, "num_tokens": 7832746.0, "step": 6520 }, { "entropy": 1.7800809681415557, "epoch": 0.020242428657451142, "grad_norm": 11.537628173828125, "learning_rate": 3.1623559042913884e-06, "loss": 0.5485, "mean_token_accuracy": 0.8302456393837929, "num_tokens": 7845908.0, "step": 6530 }, { "entropy": 1.8512769252061845, "epoch": 0.02027342778250084, "grad_norm": 13.687551498413086, "learning_rate": 3.1671994575220382e-06, "loss": 0.594, "mean_token_accuracy": 0.8206526070833207, "num_tokens": 7857703.0, "step": 6540 }, { "entropy": 1.9494429141283036, "epoch": 0.020304426907550535, "grad_norm": 13.054556846618652, "learning_rate": 3.1720430107526885e-06, "loss": 0.6967, "mean_token_accuracy": 0.7968014314770698, "num_tokens": 7868457.0, "step": 6550 }, { "entropy": 1.872872059047222, "epoch": 0.02033542603260023, "grad_norm": 14.116209030151367, "learning_rate": 3.1768865639833384e-06, "loss": 0.5831, "mean_token_accuracy": 0.8167121425271034, "num_tokens": 7880101.0, "step": 6560 }, { "entropy": 1.8299372598528862, "epoch": 0.020366425157649925, "grad_norm": 15.94222354888916, "learning_rate": 3.1817301172139887e-06, "loss": 0.6118, "mean_token_accuracy": 0.8142679139971734, "num_tokens": 7892509.0, "step": 6570 }, { "entropy": 1.8935213565826416, "epoch": 0.02039742428269962, "grad_norm": 15.16163158416748, "learning_rate": 3.1865736704446386e-06, "loss": 0.6278, "mean_token_accuracy": 0.8115665182471276, "num_tokens": 7903535.0, "step": 6580 }, { "entropy": 1.737954817712307, "epoch": 0.020428423407749315, "grad_norm": 13.716304779052734, "learning_rate": 3.191417223675288e-06, "loss": 0.4859, "mean_token_accuracy": 0.8255019202828408, "num_tokens": 7917342.0, "step": 6590 }, { "entropy": 1.8174697816371919, "epoch": 0.02045942253279901, "grad_norm": 13.559125900268555, "learning_rate": 3.1962607769059384e-06, "loss": 0.5972, "mean_token_accuracy": 0.8168426662683487, "num_tokens": 7929488.0, "step": 6600 }, { "entropy": 1.8946778982877732, "epoch": 0.020490421657848708, "grad_norm": 15.61142635345459, "learning_rate": 3.2011043301365887e-06, "loss": 0.7263, "mean_token_accuracy": 0.799173790216446, "num_tokens": 7940278.0, "step": 6610 }, { "entropy": 1.8623188078403472, "epoch": 0.020521420782898404, "grad_norm": 11.181509017944336, "learning_rate": 3.2059478833672386e-06, "loss": 0.6354, "mean_token_accuracy": 0.8031112432479859, "num_tokens": 7951379.0, "step": 6620 }, { "entropy": 1.814198338985443, "epoch": 0.020552419907948098, "grad_norm": 13.728723526000977, "learning_rate": 3.210791436597889e-06, "loss": 0.6129, "mean_token_accuracy": 0.8157623007893562, "num_tokens": 7963714.0, "step": 6630 }, { "entropy": 1.7929986909031868, "epoch": 0.020583419032997794, "grad_norm": 15.344460487365723, "learning_rate": 3.2156349898285383e-06, "loss": 0.5832, "mean_token_accuracy": 0.8098177522420883, "num_tokens": 7976870.0, "step": 6640 }, { "entropy": 1.947634482383728, "epoch": 0.02061441815804749, "grad_norm": 14.946516036987305, "learning_rate": 3.2204785430591882e-06, "loss": 0.7049, "mean_token_accuracy": 0.79284388422966, "num_tokens": 7987426.0, "step": 6650 }, { "entropy": 1.8572285056114197, "epoch": 0.020645417283097184, "grad_norm": 6.301254749298096, "learning_rate": 3.2253220962898385e-06, "loss": 0.6301, "mean_token_accuracy": 0.8085553154349328, "num_tokens": 8000549.0, "step": 6660 }, { "entropy": 1.803759203851223, "epoch": 0.02067641640814688, "grad_norm": 12.17286491394043, "learning_rate": 3.2301656495204884e-06, "loss": 0.5198, "mean_token_accuracy": 0.8267651617527008, "num_tokens": 8015000.0, "step": 6670 }, { "entropy": 1.8756111353635787, "epoch": 0.020707415533196577, "grad_norm": 15.4856595993042, "learning_rate": 3.2350092027511387e-06, "loss": 0.5969, "mean_token_accuracy": 0.80719293653965, "num_tokens": 8026773.0, "step": 6680 }, { "entropy": 1.9130279645323753, "epoch": 0.02073841465824627, "grad_norm": 14.601099014282227, "learning_rate": 3.239852755981788e-06, "loss": 0.6407, "mean_token_accuracy": 0.8074927061796189, "num_tokens": 8038375.0, "step": 6690 }, { "entropy": 1.84910279661417, "epoch": 0.020769413783295967, "grad_norm": 13.639535903930664, "learning_rate": 3.2446963092124385e-06, "loss": 0.6246, "mean_token_accuracy": 0.8115042328834534, "num_tokens": 8050746.0, "step": 6700 }, { "entropy": 1.9024951666593553, "epoch": 0.020800412908345663, "grad_norm": 11.334275245666504, "learning_rate": 3.2495398624430884e-06, "loss": 0.6458, "mean_token_accuracy": 0.8086831450462342, "num_tokens": 8062275.0, "step": 6710 }, { "entropy": 1.953022199869156, "epoch": 0.020831412033395356, "grad_norm": 12.958690643310547, "learning_rate": 3.2543834156737387e-06, "loss": 0.6934, "mean_token_accuracy": 0.8028324991464615, "num_tokens": 8073879.0, "step": 6720 }, { "entropy": 1.9028348535299302, "epoch": 0.020862411158445053, "grad_norm": 12.473337173461914, "learning_rate": 3.2592269689043886e-06, "loss": 0.6281, "mean_token_accuracy": 0.808925811946392, "num_tokens": 8086058.0, "step": 6730 }, { "entropy": 1.8883586995303632, "epoch": 0.02089341028349475, "grad_norm": 13.6912841796875, "learning_rate": 3.2640705221350385e-06, "loss": 0.5796, "mean_token_accuracy": 0.8137388199567794, "num_tokens": 8097481.0, "step": 6740 }, { "entropy": 1.8357958167791366, "epoch": 0.020924409408544443, "grad_norm": 12.956893920898438, "learning_rate": 3.2689140753656883e-06, "loss": 0.5703, "mean_token_accuracy": 0.8275958806276321, "num_tokens": 8109412.0, "step": 6750 }, { "entropy": 1.7908343866467475, "epoch": 0.02095540853359414, "grad_norm": 14.640457153320312, "learning_rate": 3.2737576285963386e-06, "loss": 0.5865, "mean_token_accuracy": 0.8118628889322281, "num_tokens": 8122837.0, "step": 6760 }, { "entropy": 1.7841477155685426, "epoch": 0.020986407658643836, "grad_norm": 12.808713912963867, "learning_rate": 3.2786011818269885e-06, "loss": 0.6055, "mean_token_accuracy": 0.8140031367540359, "num_tokens": 8136163.0, "step": 6770 }, { "entropy": 1.8466138496994973, "epoch": 0.02101740678369353, "grad_norm": 14.429435729980469, "learning_rate": 3.283444735057639e-06, "loss": 0.6075, "mean_token_accuracy": 0.8045207351446152, "num_tokens": 8148613.0, "step": 6780 }, { "entropy": 1.7560095891356469, "epoch": 0.021048405908743226, "grad_norm": 7.405726909637451, "learning_rate": 3.2882882882882887e-06, "loss": 0.5141, "mean_token_accuracy": 0.8221228688955307, "num_tokens": 8162089.0, "step": 6790 }, { "entropy": 1.9308898389339446, "epoch": 0.021079405033792922, "grad_norm": 14.129651069641113, "learning_rate": 3.293131841518938e-06, "loss": 0.6899, "mean_token_accuracy": 0.8042490169405937, "num_tokens": 8173779.0, "step": 6800 }, { "entropy": 1.9463881149888038, "epoch": 0.021110404158842615, "grad_norm": 8.938533782958984, "learning_rate": 3.2979753947495885e-06, "loss": 0.6966, "mean_token_accuracy": 0.8073234111070633, "num_tokens": 8184942.0, "step": 6810 }, { "entropy": 1.8929473131895065, "epoch": 0.021141403283892312, "grad_norm": 17.423473358154297, "learning_rate": 3.302818947980239e-06, "loss": 0.7093, "mean_token_accuracy": 0.7885128021240234, "num_tokens": 8196565.0, "step": 6820 }, { "entropy": 1.8256116762757302, "epoch": 0.02117240240894201, "grad_norm": 11.359122276306152, "learning_rate": 3.3076625012108887e-06, "loss": 0.6383, "mean_token_accuracy": 0.8059591919183731, "num_tokens": 8208383.0, "step": 6830 }, { "entropy": 1.8639884352684022, "epoch": 0.021203401533991702, "grad_norm": 14.2997407913208, "learning_rate": 3.312506054441539e-06, "loss": 0.6282, "mean_token_accuracy": 0.8068465679883957, "num_tokens": 8220104.0, "step": 6840 }, { "entropy": 1.9167477533221244, "epoch": 0.0212344006590414, "grad_norm": 6.671177387237549, "learning_rate": 3.3173496076721885e-06, "loss": 0.6529, "mean_token_accuracy": 0.8033589154481888, "num_tokens": 8231370.0, "step": 6850 }, { "entropy": 1.8746015638113023, "epoch": 0.021265399784091095, "grad_norm": 14.27453327178955, "learning_rate": 3.3221931609028383e-06, "loss": 0.6251, "mean_token_accuracy": 0.8159545630216598, "num_tokens": 8243023.0, "step": 6860 }, { "entropy": 1.917281800508499, "epoch": 0.021296398909140788, "grad_norm": 11.446333885192871, "learning_rate": 3.3270367141334886e-06, "loss": 0.6558, "mean_token_accuracy": 0.7994436591863632, "num_tokens": 8254401.0, "step": 6870 }, { "entropy": 1.8957316786050797, "epoch": 0.021327398034190485, "grad_norm": 11.739078521728516, "learning_rate": 3.3318802673641385e-06, "loss": 0.7036, "mean_token_accuracy": 0.8000991299748421, "num_tokens": 8265419.0, "step": 6880 }, { "entropy": 1.7615302629768848, "epoch": 0.02135839715924018, "grad_norm": 5.259082317352295, "learning_rate": 3.336723820594789e-06, "loss": 0.5323, "mean_token_accuracy": 0.8190023839473725, "num_tokens": 8278764.0, "step": 6890 }, { "entropy": 1.7910688430070878, "epoch": 0.021389396284289874, "grad_norm": 12.807435035705566, "learning_rate": 3.3415673738254383e-06, "loss": 0.5615, "mean_token_accuracy": 0.8227502018213272, "num_tokens": 8291327.0, "step": 6900 }, { "entropy": 1.7641884714365006, "epoch": 0.02142039540933957, "grad_norm": 12.872736930847168, "learning_rate": 3.3464109270560886e-06, "loss": 0.5901, "mean_token_accuracy": 0.8182562112808227, "num_tokens": 8303703.0, "step": 6910 }, { "entropy": 1.8527298361063003, "epoch": 0.021451394534389268, "grad_norm": 14.935564041137695, "learning_rate": 3.3512544802867385e-06, "loss": 0.6421, "mean_token_accuracy": 0.8045241579413414, "num_tokens": 8315526.0, "step": 6920 }, { "entropy": 1.8295287489891052, "epoch": 0.02148239365943896, "grad_norm": 13.789701461791992, "learning_rate": 3.356098033517389e-06, "loss": 0.6008, "mean_token_accuracy": 0.8134370803833008, "num_tokens": 8327672.0, "step": 6930 }, { "entropy": 1.896050798892975, "epoch": 0.021513392784488657, "grad_norm": 12.54214859008789, "learning_rate": 3.3609415867480387e-06, "loss": 0.6978, "mean_token_accuracy": 0.7970891669392586, "num_tokens": 8339084.0, "step": 6940 }, { "entropy": 1.8715087831020356, "epoch": 0.021544391909538354, "grad_norm": 13.210384368896484, "learning_rate": 3.3657851399786886e-06, "loss": 0.6879, "mean_token_accuracy": 0.7981110483407974, "num_tokens": 8350484.0, "step": 6950 }, { "entropy": 1.747877648472786, "epoch": 0.021575391034588047, "grad_norm": 13.573982238769531, "learning_rate": 3.3706286932093384e-06, "loss": 0.4994, "mean_token_accuracy": 0.8265791207551956, "num_tokens": 8363033.0, "step": 6960 }, { "entropy": 1.7737156122922897, "epoch": 0.021606390159637744, "grad_norm": 16.360973358154297, "learning_rate": 3.3754722464399888e-06, "loss": 0.5927, "mean_token_accuracy": 0.8136523142457008, "num_tokens": 8375536.0, "step": 6970 }, { "entropy": 1.7883441895246506, "epoch": 0.02163738928468744, "grad_norm": 13.998126029968262, "learning_rate": 3.3803157996706386e-06, "loss": 0.5853, "mean_token_accuracy": 0.8114990651607513, "num_tokens": 8388141.0, "step": 6980 }, { "entropy": 1.9003767311573028, "epoch": 0.021668388409737137, "grad_norm": 17.087512969970703, "learning_rate": 3.385159352901289e-06, "loss": 0.677, "mean_token_accuracy": 0.8007842287421226, "num_tokens": 8399614.0, "step": 6990 }, { "entropy": 1.8496545001864433, "epoch": 0.02169938753478683, "grad_norm": 11.947819709777832, "learning_rate": 3.390002906131939e-06, "loss": 0.5943, "mean_token_accuracy": 0.8178739234805107, "num_tokens": 8410656.0, "step": 7000 }, { "entropy": 1.8931600719690322, "epoch": 0.021730386659836527, "grad_norm": 12.223316192626953, "learning_rate": 3.3948464593625883e-06, "loss": 0.6515, "mean_token_accuracy": 0.804112882912159, "num_tokens": 8422509.0, "step": 7010 }, { "entropy": 1.8370696052908897, "epoch": 0.021761385784886223, "grad_norm": 13.245317459106445, "learning_rate": 3.3996900125932386e-06, "loss": 0.6179, "mean_token_accuracy": 0.8125099033117295, "num_tokens": 8434469.0, "step": 7020 }, { "entropy": 1.9446087509393692, "epoch": 0.021792384909935916, "grad_norm": 13.750452995300293, "learning_rate": 3.404533565823889e-06, "loss": 0.6823, "mean_token_accuracy": 0.793831068277359, "num_tokens": 8445989.0, "step": 7030 }, { "entropy": 1.8339137956500053, "epoch": 0.021823384034985613, "grad_norm": 12.146233558654785, "learning_rate": 3.409377119054539e-06, "loss": 0.585, "mean_token_accuracy": 0.8186416819691658, "num_tokens": 8459049.0, "step": 7040 }, { "entropy": 1.9154435515403747, "epoch": 0.02185438316003531, "grad_norm": 13.75799560546875, "learning_rate": 3.414220672285189e-06, "loss": 0.6339, "mean_token_accuracy": 0.8105358377099037, "num_tokens": 8470694.0, "step": 7050 }, { "entropy": 1.8546890705823897, "epoch": 0.021885382285085003, "grad_norm": 14.610628128051758, "learning_rate": 3.4190642255158386e-06, "loss": 0.6188, "mean_token_accuracy": 0.8084279328584671, "num_tokens": 8483926.0, "step": 7060 }, { "entropy": 1.9373271316289902, "epoch": 0.0219163814101347, "grad_norm": 13.321712493896484, "learning_rate": 3.4239077787464884e-06, "loss": 0.635, "mean_token_accuracy": 0.7939963206648827, "num_tokens": 8495258.0, "step": 7070 }, { "entropy": 1.8608694806694985, "epoch": 0.021947380535184396, "grad_norm": 12.879969596862793, "learning_rate": 3.4287513319771387e-06, "loss": 0.5935, "mean_token_accuracy": 0.8156390026211738, "num_tokens": 8507779.0, "step": 7080 }, { "entropy": 1.9322496995329856, "epoch": 0.02197837966023409, "grad_norm": 15.521547317504883, "learning_rate": 3.4335948852077886e-06, "loss": 0.65, "mean_token_accuracy": 0.8115120708942414, "num_tokens": 8519474.0, "step": 7090 }, { "entropy": 1.7319314986467362, "epoch": 0.022009378785283785, "grad_norm": 16.432004928588867, "learning_rate": 3.438438438438439e-06, "loss": 0.5074, "mean_token_accuracy": 0.8291773125529289, "num_tokens": 8533400.0, "step": 7100 }, { "entropy": 1.8349164828658104, "epoch": 0.022040377910333482, "grad_norm": 4.809366703033447, "learning_rate": 3.4432819916690884e-06, "loss": 0.598, "mean_token_accuracy": 0.809609466791153, "num_tokens": 8547001.0, "step": 7110 }, { "entropy": 1.8549170672893525, "epoch": 0.022071377035383175, "grad_norm": 12.525724411010742, "learning_rate": 3.4481255448997387e-06, "loss": 0.5441, "mean_token_accuracy": 0.8268691346049308, "num_tokens": 8559045.0, "step": 7120 }, { "entropy": 1.8226833492517471, "epoch": 0.022102376160432872, "grad_norm": 11.418012619018555, "learning_rate": 3.4529690981303886e-06, "loss": 0.5714, "mean_token_accuracy": 0.8124491006135941, "num_tokens": 8572509.0, "step": 7130 }, { "entropy": 1.7938767224550247, "epoch": 0.02213337528548257, "grad_norm": 14.121147155761719, "learning_rate": 3.457812651361039e-06, "loss": 0.5331, "mean_token_accuracy": 0.8254873216152191, "num_tokens": 8585878.0, "step": 7140 }, { "entropy": 1.7850069940090179, "epoch": 0.02216437441053226, "grad_norm": 3.6775126457214355, "learning_rate": 3.4626562045916888e-06, "loss": 0.563, "mean_token_accuracy": 0.8238705024123192, "num_tokens": 8599106.0, "step": 7150 }, { "entropy": 1.8556845039129257, "epoch": 0.022195373535581958, "grad_norm": 11.72307300567627, "learning_rate": 3.4674997578223387e-06, "loss": 0.5998, "mean_token_accuracy": 0.8144358694553375, "num_tokens": 8611671.0, "step": 7160 }, { "entropy": 1.9032898783683776, "epoch": 0.022226372660631655, "grad_norm": 15.8334379196167, "learning_rate": 3.4723433110529886e-06, "loss": 0.6952, "mean_token_accuracy": 0.7943542674183846, "num_tokens": 8623666.0, "step": 7170 }, { "entropy": 1.8691613361239434, "epoch": 0.022257371785681348, "grad_norm": 11.972046852111816, "learning_rate": 3.477186864283639e-06, "loss": 0.6065, "mean_token_accuracy": 0.8114256381988525, "num_tokens": 8635505.0, "step": 7180 }, { "entropy": 1.8365458711981772, "epoch": 0.022288370910731044, "grad_norm": 14.07669448852539, "learning_rate": 3.4820304175142887e-06, "loss": 0.5641, "mean_token_accuracy": 0.8142189055681228, "num_tokens": 8648420.0, "step": 7190 }, { "entropy": 1.8941682323813438, "epoch": 0.02231937003578074, "grad_norm": 12.625129699707031, "learning_rate": 3.486873970744939e-06, "loss": 0.6311, "mean_token_accuracy": 0.8075986996293067, "num_tokens": 8661128.0, "step": 7200 }, { "entropy": 1.8504259541630745, "epoch": 0.022350369160830434, "grad_norm": 6.088210105895996, "learning_rate": 3.491717523975589e-06, "loss": 0.6102, "mean_token_accuracy": 0.8124124839901924, "num_tokens": 8672830.0, "step": 7210 }, { "entropy": 1.8653683230280875, "epoch": 0.02238136828588013, "grad_norm": 15.06690502166748, "learning_rate": 3.496561077206239e-06, "loss": 0.6134, "mean_token_accuracy": 0.8111754149198532, "num_tokens": 8684754.0, "step": 7220 }, { "entropy": 1.9144769728183746, "epoch": 0.022412367410929827, "grad_norm": 13.07332992553711, "learning_rate": 3.5014046304368887e-06, "loss": 0.7025, "mean_token_accuracy": 0.8014708399772644, "num_tokens": 8695969.0, "step": 7230 }, { "entropy": 1.8702037513256073, "epoch": 0.02244336653597952, "grad_norm": 13.671896934509277, "learning_rate": 3.506248183667539e-06, "loss": 0.657, "mean_token_accuracy": 0.809376485645771, "num_tokens": 8707677.0, "step": 7240 }, { "entropy": 1.9119601994752884, "epoch": 0.022474365661029217, "grad_norm": 16.43385124206543, "learning_rate": 3.511091736898189e-06, "loss": 0.6931, "mean_token_accuracy": 0.7956320688128471, "num_tokens": 8718648.0, "step": 7250 }, { "entropy": 1.8430123403668404, "epoch": 0.022505364786078914, "grad_norm": 5.884398937225342, "learning_rate": 3.515935290128839e-06, "loss": 0.6187, "mean_token_accuracy": 0.8040406808257103, "num_tokens": 8731028.0, "step": 7260 }, { "entropy": 1.7957608908414842, "epoch": 0.022536363911128607, "grad_norm": 6.280706882476807, "learning_rate": 3.5207788433594887e-06, "loss": 0.5309, "mean_token_accuracy": 0.8192771345376968, "num_tokens": 8743771.0, "step": 7270 }, { "entropy": 1.8952230989933014, "epoch": 0.022567363036178303, "grad_norm": 11.960763931274414, "learning_rate": 3.5256223965901385e-06, "loss": 0.6716, "mean_token_accuracy": 0.7986912429332733, "num_tokens": 8754987.0, "step": 7280 }, { "entropy": 1.7763853132724763, "epoch": 0.022598362161228, "grad_norm": 11.904229164123535, "learning_rate": 3.530465949820789e-06, "loss": 0.5633, "mean_token_accuracy": 0.8182390749454498, "num_tokens": 8767593.0, "step": 7290 }, { "entropy": 1.8223341763019563, "epoch": 0.022629361286277693, "grad_norm": 13.150022506713867, "learning_rate": 3.5353095030514387e-06, "loss": 0.6274, "mean_token_accuracy": 0.8080811381340027, "num_tokens": 8779386.0, "step": 7300 }, { "entropy": 1.8453481420874596, "epoch": 0.02266036041132739, "grad_norm": 16.510391235351562, "learning_rate": 3.540153056282089e-06, "loss": 0.6204, "mean_token_accuracy": 0.8153272330760956, "num_tokens": 8791800.0, "step": 7310 }, { "entropy": 1.8597090423107148, "epoch": 0.022691359536377086, "grad_norm": 11.312858581542969, "learning_rate": 3.5449966095127385e-06, "loss": 0.6451, "mean_token_accuracy": 0.8115374326705933, "num_tokens": 8802831.0, "step": 7320 }, { "entropy": 1.7629368484020234, "epoch": 0.022722358661426783, "grad_norm": 10.922659873962402, "learning_rate": 3.549840162743389e-06, "loss": 0.5833, "mean_token_accuracy": 0.8109967395663261, "num_tokens": 8816085.0, "step": 7330 }, { "entropy": 1.8767753183841704, "epoch": 0.022753357786476476, "grad_norm": 12.062516212463379, "learning_rate": 3.5546837159740387e-06, "loss": 0.6765, "mean_token_accuracy": 0.7968204706907273, "num_tokens": 8828125.0, "step": 7340 }, { "entropy": 1.798590750992298, "epoch": 0.022784356911526173, "grad_norm": 6.616394996643066, "learning_rate": 3.559527269204689e-06, "loss": 0.534, "mean_token_accuracy": 0.8197192326188087, "num_tokens": 8840873.0, "step": 7350 }, { "entropy": 1.8430965647101403, "epoch": 0.02281535603657587, "grad_norm": 12.304119110107422, "learning_rate": 3.564370822435339e-06, "loss": 0.5905, "mean_token_accuracy": 0.8144068956375122, "num_tokens": 8853322.0, "step": 7360 }, { "entropy": 1.8360921517014503, "epoch": 0.022846355161625562, "grad_norm": 13.151806831359863, "learning_rate": 3.5692143756659888e-06, "loss": 0.5766, "mean_token_accuracy": 0.8184613898396492, "num_tokens": 8866377.0, "step": 7370 }, { "entropy": 1.8777944207191468, "epoch": 0.02287735428667526, "grad_norm": 11.325216293334961, "learning_rate": 3.5740579288966387e-06, "loss": 0.6267, "mean_token_accuracy": 0.8124101653695106, "num_tokens": 8877572.0, "step": 7380 }, { "entropy": 1.8015395179390907, "epoch": 0.022908353411724956, "grad_norm": 13.632472038269043, "learning_rate": 3.578901482127289e-06, "loss": 0.5888, "mean_token_accuracy": 0.8304689675569534, "num_tokens": 8890279.0, "step": 7390 }, { "entropy": 1.9334001183509826, "epoch": 0.02293935253677465, "grad_norm": 18.318246841430664, "learning_rate": 3.583745035357939e-06, "loss": 0.6549, "mean_token_accuracy": 0.8056276828050614, "num_tokens": 8901650.0, "step": 7400 }, { "entropy": 1.8892749726772309, "epoch": 0.022970351661824345, "grad_norm": 13.172524452209473, "learning_rate": 3.588588588588589e-06, "loss": 0.6269, "mean_token_accuracy": 0.8132000923156738, "num_tokens": 8912513.0, "step": 7410 }, { "entropy": 1.9355769097805022, "epoch": 0.023001350786874042, "grad_norm": 12.621254920959473, "learning_rate": 3.593432141819239e-06, "loss": 0.6664, "mean_token_accuracy": 0.8047688767313957, "num_tokens": 8923668.0, "step": 7420 }, { "entropy": 1.7977433398365974, "epoch": 0.023032349911923735, "grad_norm": 14.412604331970215, "learning_rate": 3.598275695049889e-06, "loss": 0.5684, "mean_token_accuracy": 0.8217224717140198, "num_tokens": 8936743.0, "step": 7430 }, { "entropy": 1.8730682261288165, "epoch": 0.02306334903697343, "grad_norm": 18.06209373474121, "learning_rate": 3.603119248280539e-06, "loss": 0.687, "mean_token_accuracy": 0.7976897180080413, "num_tokens": 8948361.0, "step": 7440 }, { "entropy": 1.8379228845238686, "epoch": 0.023094348162023128, "grad_norm": 16.5709171295166, "learning_rate": 3.607962801511189e-06, "loss": 0.647, "mean_token_accuracy": 0.8052407920360565, "num_tokens": 8960081.0, "step": 7450 }, { "entropy": 1.8426879778504373, "epoch": 0.02312534728707282, "grad_norm": 6.525999546051025, "learning_rate": 3.612806354741839e-06, "loss": 0.6043, "mean_token_accuracy": 0.8133143231272697, "num_tokens": 8972165.0, "step": 7460 }, { "entropy": 1.8481390669941902, "epoch": 0.023156346412122518, "grad_norm": 11.893180847167969, "learning_rate": 3.6176499079724893e-06, "loss": 0.5844, "mean_token_accuracy": 0.8177680626511574, "num_tokens": 8983683.0, "step": 7470 }, { "entropy": 1.8435255289077759, "epoch": 0.023187345537172215, "grad_norm": 13.133444786071777, "learning_rate": 3.6224934612031388e-06, "loss": 0.6272, "mean_token_accuracy": 0.8149840265512467, "num_tokens": 8994970.0, "step": 7480 }, { "entropy": 1.8090941041707993, "epoch": 0.023218344662221908, "grad_norm": 5.264407157897949, "learning_rate": 3.6273370144337886e-06, "loss": 0.6154, "mean_token_accuracy": 0.8030720934271812, "num_tokens": 9007512.0, "step": 7490 }, { "entropy": 1.8767491683363915, "epoch": 0.023249343787271604, "grad_norm": 12.091042518615723, "learning_rate": 3.632180567664439e-06, "loss": 0.6655, "mean_token_accuracy": 0.8082658454775811, "num_tokens": 9018984.0, "step": 7500 }, { "entropy": 1.8025119572877883, "epoch": 0.0232803429123213, "grad_norm": 14.580635070800781, "learning_rate": 3.637024120895089e-06, "loss": 0.5927, "mean_token_accuracy": 0.8104995712637901, "num_tokens": 9032061.0, "step": 7510 }, { "entropy": 1.807465235888958, "epoch": 0.023311342037370994, "grad_norm": 12.528902053833008, "learning_rate": 3.641867674125739e-06, "loss": 0.5885, "mean_token_accuracy": 0.8124234288930893, "num_tokens": 9044338.0, "step": 7520 }, { "entropy": 1.8326406210660935, "epoch": 0.02334234116242069, "grad_norm": 15.056619644165039, "learning_rate": 3.6467112273563886e-06, "loss": 0.5776, "mean_token_accuracy": 0.8178173303604126, "num_tokens": 9056355.0, "step": 7530 }, { "entropy": 1.7988777324557303, "epoch": 0.023373340287470387, "grad_norm": 12.382633209228516, "learning_rate": 3.651554780587039e-06, "loss": 0.638, "mean_token_accuracy": 0.8013660088181496, "num_tokens": 9068937.0, "step": 7540 }, { "entropy": 1.8599283427000046, "epoch": 0.02340433941252008, "grad_norm": 11.771322250366211, "learning_rate": 3.656398333817689e-06, "loss": 0.644, "mean_token_accuracy": 0.8095093712210655, "num_tokens": 9080598.0, "step": 7550 }, { "entropy": 1.9167393915355206, "epoch": 0.023435338537569777, "grad_norm": 15.524273872375488, "learning_rate": 3.661241887048339e-06, "loss": 0.6336, "mean_token_accuracy": 0.807981975376606, "num_tokens": 9091965.0, "step": 7560 }, { "entropy": 1.9095857173204422, "epoch": 0.023466337662619473, "grad_norm": 11.96094799041748, "learning_rate": 3.666085440278989e-06, "loss": 0.6676, "mean_token_accuracy": 0.7986720770597457, "num_tokens": 9103815.0, "step": 7570 }, { "entropy": 1.8987512320280076, "epoch": 0.023497336787669167, "grad_norm": 13.277989387512207, "learning_rate": 3.670928993509639e-06, "loss": 0.6695, "mean_token_accuracy": 0.8065518125891685, "num_tokens": 9115160.0, "step": 7580 }, { "entropy": 1.8922671616077422, "epoch": 0.023528335912718863, "grad_norm": 13.145485877990723, "learning_rate": 3.6757725467402888e-06, "loss": 0.685, "mean_token_accuracy": 0.8018161416053772, "num_tokens": 9126074.0, "step": 7590 }, { "entropy": 1.8771102100610733, "epoch": 0.02355933503776856, "grad_norm": 10.288253784179688, "learning_rate": 3.680616099970939e-06, "loss": 0.5876, "mean_token_accuracy": 0.8214558124542236, "num_tokens": 9138014.0, "step": 7600 }, { "entropy": 1.8878172472119332, "epoch": 0.023590334162818253, "grad_norm": 12.76760482788086, "learning_rate": 3.685459653201589e-06, "loss": 0.6105, "mean_token_accuracy": 0.8086960777640343, "num_tokens": 9149156.0, "step": 7610 }, { "entropy": 1.8934050709009171, "epoch": 0.02362133328786795, "grad_norm": 12.631563186645508, "learning_rate": 3.6903032064322393e-06, "loss": 0.7241, "mean_token_accuracy": 0.7934870898723603, "num_tokens": 9161339.0, "step": 7620 }, { "entropy": 1.9421094968914985, "epoch": 0.023652332412917646, "grad_norm": 14.10696792602539, "learning_rate": 3.695146759662889e-06, "loss": 0.6605, "mean_token_accuracy": 0.7969638511538506, "num_tokens": 9172617.0, "step": 7630 }, { "entropy": 1.8514475598931313, "epoch": 0.02368333153796734, "grad_norm": 13.774374008178711, "learning_rate": 3.699990312893539e-06, "loss": 0.6989, "mean_token_accuracy": 0.7987961351871491, "num_tokens": 9184783.0, "step": 7640 }, { "entropy": 1.7891797423362732, "epoch": 0.023714330663017036, "grad_norm": 5.731255054473877, "learning_rate": 3.704833866124189e-06, "loss": 0.595, "mean_token_accuracy": 0.8199530601501465, "num_tokens": 9197781.0, "step": 7650 }, { "entropy": 1.8698611691594125, "epoch": 0.023745329788066732, "grad_norm": 12.50722599029541, "learning_rate": 3.7096774193548392e-06, "loss": 0.5373, "mean_token_accuracy": 0.822203965485096, "num_tokens": 9210004.0, "step": 7660 }, { "entropy": 1.9275674849748612, "epoch": 0.023776328913116426, "grad_norm": 11.671341896057129, "learning_rate": 3.714520972585489e-06, "loss": 0.6877, "mean_token_accuracy": 0.7972215577960015, "num_tokens": 9221045.0, "step": 7670 }, { "entropy": 1.8518989741802216, "epoch": 0.023807328038166122, "grad_norm": 12.143589973449707, "learning_rate": 3.7193645258161394e-06, "loss": 0.5829, "mean_token_accuracy": 0.8160637483000756, "num_tokens": 9233582.0, "step": 7680 }, { "entropy": 1.9745838671922684, "epoch": 0.02383832716321582, "grad_norm": 8.162403106689453, "learning_rate": 3.724208079046789e-06, "loss": 0.6552, "mean_token_accuracy": 0.8079164877533913, "num_tokens": 9244759.0, "step": 7690 }, { "entropy": 1.906966118514538, "epoch": 0.023869326288265515, "grad_norm": 5.672482490539551, "learning_rate": 3.7290516322774388e-06, "loss": 0.6534, "mean_token_accuracy": 0.8026331990957261, "num_tokens": 9256899.0, "step": 7700 }, { "entropy": 1.8956855058670044, "epoch": 0.02390032541331521, "grad_norm": 11.346663475036621, "learning_rate": 3.733895185508089e-06, "loss": 0.5923, "mean_token_accuracy": 0.8169514790177346, "num_tokens": 9268280.0, "step": 7710 }, { "entropy": 1.9229377016425133, "epoch": 0.023931324538364905, "grad_norm": 6.657054424285889, "learning_rate": 3.7387387387387394e-06, "loss": 0.6111, "mean_token_accuracy": 0.8021419748663903, "num_tokens": 9280505.0, "step": 7720 }, { "entropy": 1.7850939154624939, "epoch": 0.0239623236634146, "grad_norm": 3.4274275302886963, "learning_rate": 3.7435822919693893e-06, "loss": 0.5326, "mean_token_accuracy": 0.829072143137455, "num_tokens": 9293099.0, "step": 7730 }, { "entropy": 1.9557977497577668, "epoch": 0.023993322788464295, "grad_norm": 13.326607704162598, "learning_rate": 3.7484258452000387e-06, "loss": 0.6884, "mean_token_accuracy": 0.798516571521759, "num_tokens": 9304408.0, "step": 7740 }, { "entropy": 1.8509003862738609, "epoch": 0.02402432191351399, "grad_norm": 13.444816589355469, "learning_rate": 3.753269398430689e-06, "loss": 0.6127, "mean_token_accuracy": 0.8102645307779313, "num_tokens": 9316689.0, "step": 7750 }, { "entropy": 1.7711122930049896, "epoch": 0.024055321038563688, "grad_norm": 5.156175136566162, "learning_rate": 3.758112951661339e-06, "loss": 0.5507, "mean_token_accuracy": 0.8136220246553421, "num_tokens": 9329593.0, "step": 7760 }, { "entropy": 1.8134596601128579, "epoch": 0.02408632016361338, "grad_norm": 12.387152671813965, "learning_rate": 3.7629565048919892e-06, "loss": 0.5433, "mean_token_accuracy": 0.8308922097086906, "num_tokens": 9341486.0, "step": 7770 }, { "entropy": 1.8755627959966659, "epoch": 0.024117319288663078, "grad_norm": 12.338440895080566, "learning_rate": 3.767800058122639e-06, "loss": 0.6576, "mean_token_accuracy": 0.8139973327517509, "num_tokens": 9352093.0, "step": 7780 }, { "entropy": 1.7985035717487334, "epoch": 0.024148318413712774, "grad_norm": 15.153599739074707, "learning_rate": 3.772643611353289e-06, "loss": 0.5679, "mean_token_accuracy": 0.8182680875062942, "num_tokens": 9364468.0, "step": 7790 }, { "entropy": 1.841192065924406, "epoch": 0.024179317538762467, "grad_norm": 4.2616729736328125, "learning_rate": 3.777487164583939e-06, "loss": 0.5846, "mean_token_accuracy": 0.8146574392914772, "num_tokens": 9377926.0, "step": 7800 }, { "entropy": 1.920288896560669, "epoch": 0.024210316663812164, "grad_norm": 14.35584545135498, "learning_rate": 3.782330717814589e-06, "loss": 0.7463, "mean_token_accuracy": 0.7909737974405289, "num_tokens": 9389642.0, "step": 7810 }, { "entropy": 1.8628991067409515, "epoch": 0.02424131578886186, "grad_norm": 15.304876327514648, "learning_rate": 3.787174271045239e-06, "loss": 0.586, "mean_token_accuracy": 0.8081997647881508, "num_tokens": 9402632.0, "step": 7820 }, { "entropy": 1.9279965221881867, "epoch": 0.024272314913911554, "grad_norm": 14.144513130187988, "learning_rate": 3.7920178242758894e-06, "loss": 0.6125, "mean_token_accuracy": 0.8075599849224091, "num_tokens": 9414752.0, "step": 7830 }, { "entropy": 1.8668213859200478, "epoch": 0.02430331403896125, "grad_norm": 13.267974853515625, "learning_rate": 3.7968613775065392e-06, "loss": 0.6041, "mean_token_accuracy": 0.8120471283793449, "num_tokens": 9426149.0, "step": 7840 }, { "entropy": 1.9365358769893646, "epoch": 0.024334313164010947, "grad_norm": 13.412445068359375, "learning_rate": 3.801704930737189e-06, "loss": 0.6644, "mean_token_accuracy": 0.8024364963173867, "num_tokens": 9437306.0, "step": 7850 }, { "entropy": 1.9267346888780594, "epoch": 0.02436531228906064, "grad_norm": 13.718780517578125, "learning_rate": 3.806548483967839e-06, "loss": 0.6209, "mean_token_accuracy": 0.7963616490364075, "num_tokens": 9449401.0, "step": 7860 }, { "entropy": 1.8495199769735335, "epoch": 0.024396311414110337, "grad_norm": 11.75393009185791, "learning_rate": 3.8113920371984893e-06, "loss": 0.5303, "mean_token_accuracy": 0.8257737800478935, "num_tokens": 9461930.0, "step": 7870 }, { "entropy": 1.8680783659219742, "epoch": 0.024427310539160033, "grad_norm": 10.541101455688477, "learning_rate": 3.81623559042914e-06, "loss": 0.5844, "mean_token_accuracy": 0.8095761522650718, "num_tokens": 9473753.0, "step": 7880 }, { "entropy": 1.874305109679699, "epoch": 0.024458309664209726, "grad_norm": 17.521041870117188, "learning_rate": 3.8210791436597895e-06, "loss": 0.5836, "mean_token_accuracy": 0.8210688814520836, "num_tokens": 9486165.0, "step": 7890 }, { "entropy": 1.9577010348439217, "epoch": 0.024489308789259423, "grad_norm": 11.793618202209473, "learning_rate": 3.8259226968904386e-06, "loss": 0.656, "mean_token_accuracy": 0.8024436876177787, "num_tokens": 9497697.0, "step": 7900 }, { "entropy": 1.9006599888205529, "epoch": 0.02452030791430912, "grad_norm": 13.784941673278809, "learning_rate": 3.830766250121089e-06, "loss": 0.5801, "mean_token_accuracy": 0.8112906947731972, "num_tokens": 9510027.0, "step": 7910 }, { "entropy": 1.943654653429985, "epoch": 0.024551307039358813, "grad_norm": 12.328388214111328, "learning_rate": 3.835609803351739e-06, "loss": 0.6456, "mean_token_accuracy": 0.8070406153798103, "num_tokens": 9521309.0, "step": 7920 }, { "entropy": 1.8668604627251626, "epoch": 0.02458230616440851, "grad_norm": 13.092666625976562, "learning_rate": 3.840453356582389e-06, "loss": 0.6008, "mean_token_accuracy": 0.8188393861055374, "num_tokens": 9533756.0, "step": 7930 }, { "entropy": 1.8963513985276221, "epoch": 0.024613305289458206, "grad_norm": 12.246744155883789, "learning_rate": 3.84529690981304e-06, "loss": 0.6602, "mean_token_accuracy": 0.8034892365336418, "num_tokens": 9545338.0, "step": 7940 }, { "entropy": 1.8683793991804123, "epoch": 0.0246443044145079, "grad_norm": 5.166250228881836, "learning_rate": 3.850140463043689e-06, "loss": 0.6427, "mean_token_accuracy": 0.8070285364985466, "num_tokens": 9558352.0, "step": 7950 }, { "entropy": 1.848561166226864, "epoch": 0.024675303539557596, "grad_norm": 12.724311828613281, "learning_rate": 3.854984016274339e-06, "loss": 0.603, "mean_token_accuracy": 0.8135858446359634, "num_tokens": 9570765.0, "step": 7960 }, { "entropy": 1.9112862929701806, "epoch": 0.024706302664607292, "grad_norm": 12.425032615661621, "learning_rate": 3.8598275695049894e-06, "loss": 0.6219, "mean_token_accuracy": 0.8120498448610306, "num_tokens": 9581809.0, "step": 7970 }, { "entropy": 1.9325566202402116, "epoch": 0.024737301789656985, "grad_norm": 12.291463851928711, "learning_rate": 3.864671122735639e-06, "loss": 0.7302, "mean_token_accuracy": 0.7900342762470245, "num_tokens": 9592614.0, "step": 7980 }, { "entropy": 1.8553043097257613, "epoch": 0.024768300914706682, "grad_norm": 14.992151260375977, "learning_rate": 3.869514675966289e-06, "loss": 0.6167, "mean_token_accuracy": 0.8028819039463997, "num_tokens": 9604643.0, "step": 7990 }, { "entropy": 1.86794516146183, "epoch": 0.02479930003975638, "grad_norm": 11.671442985534668, "learning_rate": 3.874358229196939e-06, "loss": 0.5971, "mean_token_accuracy": 0.8125581681728363, "num_tokens": 9617101.0, "step": 8000 }, { "entropy": 1.977939623594284, "epoch": 0.02483029916480607, "grad_norm": 14.295232772827148, "learning_rate": 3.879201782427589e-06, "loss": 0.7147, "mean_token_accuracy": 0.7930570155382156, "num_tokens": 9628082.0, "step": 8010 }, { "entropy": 1.9121051743626594, "epoch": 0.02486129828985577, "grad_norm": 13.757525444030762, "learning_rate": 3.884045335658239e-06, "loss": 0.6761, "mean_token_accuracy": 0.8087022632360459, "num_tokens": 9639932.0, "step": 8020 }, { "entropy": 1.841791082918644, "epoch": 0.024892297414905465, "grad_norm": 6.790825366973877, "learning_rate": 3.88888888888889e-06, "loss": 0.5533, "mean_token_accuracy": 0.8306658193469048, "num_tokens": 9652182.0, "step": 8030 }, { "entropy": 1.937432810664177, "epoch": 0.02492329653995516, "grad_norm": 13.771592140197754, "learning_rate": 3.8937324421195395e-06, "loss": 0.635, "mean_token_accuracy": 0.8062109544873237, "num_tokens": 9664539.0, "step": 8040 }, { "entropy": 1.8707983300089837, "epoch": 0.024954295665004855, "grad_norm": 12.660676956176758, "learning_rate": 3.898575995350189e-06, "loss": 0.6223, "mean_token_accuracy": 0.8023678585886955, "num_tokens": 9676330.0, "step": 8050 }, { "entropy": 1.9288925692439078, "epoch": 0.02498529479005455, "grad_norm": 14.32672119140625, "learning_rate": 3.903419548580839e-06, "loss": 0.65, "mean_token_accuracy": 0.8022472143173218, "num_tokens": 9687521.0, "step": 8060 }, { "entropy": 1.84913961738348, "epoch": 0.025016293915104248, "grad_norm": 6.136168003082275, "learning_rate": 3.908263101811489e-06, "loss": 0.5446, "mean_token_accuracy": 0.8220839589834213, "num_tokens": 9699208.0, "step": 8070 }, { "entropy": 1.8712972477078438, "epoch": 0.02504729304015394, "grad_norm": 11.905717849731445, "learning_rate": 3.913106655042139e-06, "loss": 0.6119, "mean_token_accuracy": 0.8191757917404174, "num_tokens": 9711146.0, "step": 8080 }, { "entropy": 1.9155054926872253, "epoch": 0.025078292165203638, "grad_norm": 12.616907119750977, "learning_rate": 3.91795020827279e-06, "loss": 0.689, "mean_token_accuracy": 0.8038010418415069, "num_tokens": 9722222.0, "step": 8090 }, { "entropy": 1.9103090360760688, "epoch": 0.025109291290253334, "grad_norm": 12.909311294555664, "learning_rate": 3.92279376150344e-06, "loss": 0.6034, "mean_token_accuracy": 0.812662410736084, "num_tokens": 9733890.0, "step": 8100 }, { "entropy": 1.8312827795743942, "epoch": 0.025140290415303027, "grad_norm": 14.832688331604004, "learning_rate": 3.927637314734089e-06, "loss": 0.5364, "mean_token_accuracy": 0.8249428883194924, "num_tokens": 9746809.0, "step": 8110 }, { "entropy": 1.8507552802562715, "epoch": 0.025171289540352724, "grad_norm": 13.549945831298828, "learning_rate": 3.932480867964739e-06, "loss": 0.5865, "mean_token_accuracy": 0.8166124686598778, "num_tokens": 9759355.0, "step": 8120 }, { "entropy": 1.8369555801153183, "epoch": 0.02520228866540242, "grad_norm": 11.552787780761719, "learning_rate": 3.937324421195389e-06, "loss": 0.5874, "mean_token_accuracy": 0.825542688369751, "num_tokens": 9771999.0, "step": 8130 }, { "entropy": 1.8414116382598877, "epoch": 0.025233287790452114, "grad_norm": 12.822636604309082, "learning_rate": 3.942167974426039e-06, "loss": 0.5757, "mean_token_accuracy": 0.8180017948150635, "num_tokens": 9784135.0, "step": 8140 }, { "entropy": 1.847972247004509, "epoch": 0.02526428691550181, "grad_norm": 6.315905570983887, "learning_rate": 3.94701152765669e-06, "loss": 0.6042, "mean_token_accuracy": 0.8069576799869538, "num_tokens": 9795872.0, "step": 8150 }, { "entropy": 1.8612953543663024, "epoch": 0.025295286040551507, "grad_norm": 7.054393768310547, "learning_rate": 3.951855080887339e-06, "loss": 0.6359, "mean_token_accuracy": 0.808410094678402, "num_tokens": 9807148.0, "step": 8160 }, { "entropy": 1.745456214249134, "epoch": 0.0253262851656012, "grad_norm": 10.395822525024414, "learning_rate": 3.956698634117989e-06, "loss": 0.5158, "mean_token_accuracy": 0.8252757102251053, "num_tokens": 9820692.0, "step": 8170 }, { "entropy": 1.933900985121727, "epoch": 0.025357284290650896, "grad_norm": 15.611356735229492, "learning_rate": 3.9615421873486395e-06, "loss": 0.8023, "mean_token_accuracy": 0.7968411594629288, "num_tokens": 9832230.0, "step": 8180 }, { "entropy": 1.8410508632659912, "epoch": 0.025388283415700593, "grad_norm": 15.906883239746094, "learning_rate": 3.966385740579289e-06, "loss": 0.5578, "mean_token_accuracy": 0.8164623320102692, "num_tokens": 9844672.0, "step": 8190 }, { "entropy": 1.9910759955644608, "epoch": 0.025419282540750286, "grad_norm": 13.914506912231445, "learning_rate": 3.971229293809939e-06, "loss": 0.7414, "mean_token_accuracy": 0.7940586417913437, "num_tokens": 9856014.0, "step": 8200 }, { "entropy": 1.9779651939868927, "epoch": 0.025450281665799983, "grad_norm": 12.388845443725586, "learning_rate": 3.976072847040589e-06, "loss": 0.6406, "mean_token_accuracy": 0.810752010345459, "num_tokens": 9867585.0, "step": 8210 }, { "entropy": 1.950091141462326, "epoch": 0.02548128079084968, "grad_norm": 15.438441276550293, "learning_rate": 3.980916400271239e-06, "loss": 0.6076, "mean_token_accuracy": 0.8093546569347382, "num_tokens": 9879228.0, "step": 8220 }, { "entropy": 1.8706778109073638, "epoch": 0.025512279915899373, "grad_norm": 10.18140983581543, "learning_rate": 3.985759953501889e-06, "loss": 0.5486, "mean_token_accuracy": 0.8231084540486335, "num_tokens": 9892299.0, "step": 8230 }, { "entropy": 1.9114290565252303, "epoch": 0.02554327904094907, "grad_norm": 11.814135551452637, "learning_rate": 3.99060350673254e-06, "loss": 0.6377, "mean_token_accuracy": 0.8094885662198067, "num_tokens": 9905027.0, "step": 8240 }, { "entropy": 1.8559225603938103, "epoch": 0.025574278165998766, "grad_norm": 16.11547088623047, "learning_rate": 3.9954470599631896e-06, "loss": 0.6124, "mean_token_accuracy": 0.8048399239778519, "num_tokens": 9917498.0, "step": 8250 }, { "entropy": 1.9120782747864724, "epoch": 0.02560527729104846, "grad_norm": 6.817448616027832, "learning_rate": 4.0002906131938395e-06, "loss": 0.6091, "mean_token_accuracy": 0.8105715274810791, "num_tokens": 9929963.0, "step": 8260 }, { "entropy": 1.8872886016964912, "epoch": 0.025636276416098155, "grad_norm": 13.45474624633789, "learning_rate": 4.005134166424489e-06, "loss": 0.5923, "mean_token_accuracy": 0.8145119935274124, "num_tokens": 9942393.0, "step": 8270 }, { "entropy": 1.9791032537817954, "epoch": 0.025667275541147852, "grad_norm": 12.72803020477295, "learning_rate": 4.009977719655139e-06, "loss": 0.6922, "mean_token_accuracy": 0.7994261741638183, "num_tokens": 9953810.0, "step": 8280 }, { "entropy": 1.9359009936451912, "epoch": 0.025698274666197545, "grad_norm": 11.705611228942871, "learning_rate": 4.014821272885789e-06, "loss": 0.6192, "mean_token_accuracy": 0.8066216111183167, "num_tokens": 9965045.0, "step": 8290 }, { "entropy": 1.8848050013184547, "epoch": 0.025729273791247242, "grad_norm": 14.14693832397461, "learning_rate": 4.01966482611644e-06, "loss": 0.6122, "mean_token_accuracy": 0.8066284090280533, "num_tokens": 9977622.0, "step": 8300 }, { "entropy": 1.9196738287806512, "epoch": 0.02576027291629694, "grad_norm": 13.333892822265625, "learning_rate": 4.02450837934709e-06, "loss": 0.6378, "mean_token_accuracy": 0.8114329054951668, "num_tokens": 9989396.0, "step": 8310 }, { "entropy": 1.9220668867230415, "epoch": 0.02579127204134663, "grad_norm": 8.307788848876953, "learning_rate": 4.029351932577739e-06, "loss": 0.676, "mean_token_accuracy": 0.8107616409659386, "num_tokens": 10001039.0, "step": 8320 }, { "entropy": 1.924265030026436, "epoch": 0.025822271166396328, "grad_norm": 13.42192554473877, "learning_rate": 4.0341954858083895e-06, "loss": 0.6432, "mean_token_accuracy": 0.8023953422904014, "num_tokens": 10012862.0, "step": 8330 }, { "entropy": 1.8803733557462692, "epoch": 0.025853270291446025, "grad_norm": 14.513736724853516, "learning_rate": 4.039039039039039e-06, "loss": 0.614, "mean_token_accuracy": 0.8223489403724671, "num_tokens": 10024615.0, "step": 8340 }, { "entropy": 1.8715410023927688, "epoch": 0.025884269416495718, "grad_norm": 13.777809143066406, "learning_rate": 4.043882592269689e-06, "loss": 0.5892, "mean_token_accuracy": 0.8061651542782784, "num_tokens": 10036943.0, "step": 8350 }, { "entropy": 1.8626436039805412, "epoch": 0.025915268541545414, "grad_norm": 5.88369607925415, "learning_rate": 4.04872614550034e-06, "loss": 0.5776, "mean_token_accuracy": 0.816595071554184, "num_tokens": 10049582.0, "step": 8360 }, { "entropy": 1.893427351117134, "epoch": 0.02594626766659511, "grad_norm": 5.693576335906982, "learning_rate": 4.053569698730989e-06, "loss": 0.5993, "mean_token_accuracy": 0.8074675589799881, "num_tokens": 10061474.0, "step": 8370 }, { "entropy": 1.9071956276893616, "epoch": 0.025977266791644804, "grad_norm": 6.2122063636779785, "learning_rate": 4.058413251961639e-06, "loss": 0.5746, "mean_token_accuracy": 0.8195055171847343, "num_tokens": 10073377.0, "step": 8380 }, { "entropy": 1.9347770288586617, "epoch": 0.0260082659166945, "grad_norm": 6.3852434158325195, "learning_rate": 4.06325680519229e-06, "loss": 0.6468, "mean_token_accuracy": 0.7985034555196762, "num_tokens": 10085414.0, "step": 8390 }, { "entropy": 1.879788914322853, "epoch": 0.026039265041744197, "grad_norm": 5.960961818695068, "learning_rate": 4.0681003584229395e-06, "loss": 0.5019, "mean_token_accuracy": 0.8187567561864852, "num_tokens": 10098381.0, "step": 8400 }, { "entropy": 1.915135058760643, "epoch": 0.026070264166793894, "grad_norm": 5.453474521636963, "learning_rate": 4.072943911653589e-06, "loss": 0.5682, "mean_token_accuracy": 0.8210947424173355, "num_tokens": 10110037.0, "step": 8410 }, { "entropy": 1.9362727150321006, "epoch": 0.026101263291843587, "grad_norm": 14.594948768615723, "learning_rate": 4.077787464884239e-06, "loss": 0.6608, "mean_token_accuracy": 0.7989747643470764, "num_tokens": 10121901.0, "step": 8420 }, { "entropy": 1.8305003002285958, "epoch": 0.026132262416893284, "grad_norm": 14.329607963562012, "learning_rate": 4.082631018114889e-06, "loss": 0.5911, "mean_token_accuracy": 0.8075525343418122, "num_tokens": 10134905.0, "step": 8430 }, { "entropy": 1.8578375533223153, "epoch": 0.02616326154194298, "grad_norm": 13.74600887298584, "learning_rate": 4.087474571345539e-06, "loss": 0.5891, "mean_token_accuracy": 0.8238616168498993, "num_tokens": 10146939.0, "step": 8440 }, { "entropy": 1.9704759448766709, "epoch": 0.026194260666992673, "grad_norm": 12.829072952270508, "learning_rate": 4.09231812457619e-06, "loss": 0.6982, "mean_token_accuracy": 0.7840036064386368, "num_tokens": 10158552.0, "step": 8450 }, { "entropy": 1.8708418890833856, "epoch": 0.02622525979204237, "grad_norm": 12.569872856140137, "learning_rate": 4.09716167780684e-06, "loss": 0.5778, "mean_token_accuracy": 0.8157704427838326, "num_tokens": 10170966.0, "step": 8460 }, { "entropy": 1.9303714036941528, "epoch": 0.026256258917092067, "grad_norm": 14.426962852478027, "learning_rate": 4.1020052310374896e-06, "loss": 0.6081, "mean_token_accuracy": 0.8103082060813904, "num_tokens": 10183633.0, "step": 8470 }, { "entropy": 1.862780438363552, "epoch": 0.02628725804214176, "grad_norm": 5.966731548309326, "learning_rate": 4.1068487842681394e-06, "loss": 0.593, "mean_token_accuracy": 0.804168826341629, "num_tokens": 10195713.0, "step": 8480 }, { "entropy": 1.9560957163572312, "epoch": 0.026318257167191456, "grad_norm": 13.00171947479248, "learning_rate": 4.111692337498789e-06, "loss": 0.6648, "mean_token_accuracy": 0.799228847026825, "num_tokens": 10207536.0, "step": 8490 }, { "entropy": 1.889628753066063, "epoch": 0.026349256292241153, "grad_norm": 6.381255626678467, "learning_rate": 4.116535890729439e-06, "loss": 0.5822, "mean_token_accuracy": 0.8094806551933289, "num_tokens": 10220149.0, "step": 8500 }, { "entropy": 1.9717933177947997, "epoch": 0.026380255417290846, "grad_norm": 12.86796760559082, "learning_rate": 4.12137944396009e-06, "loss": 0.6727, "mean_token_accuracy": 0.8050152316689492, "num_tokens": 10231949.0, "step": 8510 }, { "entropy": 1.934307949244976, "epoch": 0.026411254542340543, "grad_norm": 11.483423233032227, "learning_rate": 4.12622299719074e-06, "loss": 0.5974, "mean_token_accuracy": 0.8233723506331444, "num_tokens": 10243617.0, "step": 8520 }, { "entropy": 1.9711205691099167, "epoch": 0.02644225366739024, "grad_norm": 11.086894035339355, "learning_rate": 4.131066550421389e-06, "loss": 0.67, "mean_token_accuracy": 0.7918197363615036, "num_tokens": 10254929.0, "step": 8530 }, { "entropy": 1.8315666139125824, "epoch": 0.026473252792439932, "grad_norm": 12.072243690490723, "learning_rate": 4.13591010365204e-06, "loss": 0.5831, "mean_token_accuracy": 0.8164573073387146, "num_tokens": 10268348.0, "step": 8540 }, { "entropy": 1.8700304999947548, "epoch": 0.02650425191748963, "grad_norm": 7.212819576263428, "learning_rate": 4.1407536568826895e-06, "loss": 0.5612, "mean_token_accuracy": 0.8133175373077393, "num_tokens": 10281420.0, "step": 8550 }, { "entropy": 1.8715410739183427, "epoch": 0.026535251042539326, "grad_norm": 15.102907180786133, "learning_rate": 4.145597210113339e-06, "loss": 0.5518, "mean_token_accuracy": 0.8137237802147865, "num_tokens": 10293312.0, "step": 8560 }, { "entropy": 1.8536837711930274, "epoch": 0.02656625016758902, "grad_norm": 18.23057746887207, "learning_rate": 4.15044076334399e-06, "loss": 0.5679, "mean_token_accuracy": 0.819478040933609, "num_tokens": 10305555.0, "step": 8570 }, { "entropy": 1.8068823255598545, "epoch": 0.026597249292638715, "grad_norm": 6.016721725463867, "learning_rate": 4.155284316574639e-06, "loss": 0.5376, "mean_token_accuracy": 0.8214111477136612, "num_tokens": 10318500.0, "step": 8580 }, { "entropy": 1.927972738444805, "epoch": 0.026628248417688412, "grad_norm": 13.958089828491211, "learning_rate": 4.160127869805289e-06, "loss": 0.6243, "mean_token_accuracy": 0.8016948789358139, "num_tokens": 10330081.0, "step": 8590 }, { "entropy": 1.8768570333719254, "epoch": 0.026659247542738105, "grad_norm": 11.411067962646484, "learning_rate": 4.16497142303594e-06, "loss": 0.5993, "mean_token_accuracy": 0.8204882755875588, "num_tokens": 10341630.0, "step": 8600 }, { "entropy": 1.9243871226906777, "epoch": 0.0266902466677878, "grad_norm": 12.71778678894043, "learning_rate": 4.16981497626659e-06, "loss": 0.6498, "mean_token_accuracy": 0.8025202408432961, "num_tokens": 10353521.0, "step": 8610 }, { "entropy": 1.8957347482442857, "epoch": 0.026721245792837498, "grad_norm": 13.246805191040039, "learning_rate": 4.1746585294972395e-06, "loss": 0.5698, "mean_token_accuracy": 0.8230790719389915, "num_tokens": 10365655.0, "step": 8620 }, { "entropy": 1.9469288036227226, "epoch": 0.02675224491788719, "grad_norm": 13.388124465942383, "learning_rate": 4.179502082727889e-06, "loss": 0.6473, "mean_token_accuracy": 0.8096310868859291, "num_tokens": 10377607.0, "step": 8630 }, { "entropy": 1.8346548154950142, "epoch": 0.026783244042936888, "grad_norm": 13.726068496704102, "learning_rate": 4.184345635958539e-06, "loss": 0.5094, "mean_token_accuracy": 0.8302373275160789, "num_tokens": 10390062.0, "step": 8640 }, { "entropy": 1.892690037190914, "epoch": 0.026814243167986584, "grad_norm": 15.530143737792969, "learning_rate": 4.189189189189189e-06, "loss": 0.6238, "mean_token_accuracy": 0.8069882616400719, "num_tokens": 10402361.0, "step": 8650 }, { "entropy": 1.8609767884016037, "epoch": 0.026845242293036278, "grad_norm": 6.528801918029785, "learning_rate": 4.19403274241984e-06, "loss": 0.5531, "mean_token_accuracy": 0.8153262332081794, "num_tokens": 10415461.0, "step": 8660 }, { "entropy": 1.869553703069687, "epoch": 0.026876241418085974, "grad_norm": 12.446064949035645, "learning_rate": 4.19887629565049e-06, "loss": 0.5657, "mean_token_accuracy": 0.8269910797476768, "num_tokens": 10427235.0, "step": 8670 }, { "entropy": 1.8075580313801765, "epoch": 0.02690724054313567, "grad_norm": 5.994472980499268, "learning_rate": 4.20371984888114e-06, "loss": 0.5459, "mean_token_accuracy": 0.8210108175873756, "num_tokens": 10440500.0, "step": 8680 }, { "entropy": 1.857536444067955, "epoch": 0.026938239668185364, "grad_norm": 7.034928321838379, "learning_rate": 4.2085634021117895e-06, "loss": 0.6801, "mean_token_accuracy": 0.8013882115483284, "num_tokens": 10452545.0, "step": 8690 }, { "entropy": 1.8659397497773171, "epoch": 0.02696923879323506, "grad_norm": 12.33353042602539, "learning_rate": 4.2134069553424394e-06, "loss": 0.5794, "mean_token_accuracy": 0.8184954881668091, "num_tokens": 10464085.0, "step": 8700 }, { "entropy": 1.9467614516615868, "epoch": 0.027000237918284757, "grad_norm": 14.808907508850098, "learning_rate": 4.218250508573089e-06, "loss": 0.6854, "mean_token_accuracy": 0.8033217743039132, "num_tokens": 10475471.0, "step": 8710 }, { "entropy": 1.9029730170965196, "epoch": 0.02703123704333445, "grad_norm": 6.386728763580322, "learning_rate": 4.22309406180374e-06, "loss": 0.5904, "mean_token_accuracy": 0.8169241547584534, "num_tokens": 10487494.0, "step": 8720 }, { "entropy": 1.8430425137281419, "epoch": 0.027062236168384147, "grad_norm": 5.942121982574463, "learning_rate": 4.22793761503439e-06, "loss": 0.5563, "mean_token_accuracy": 0.8265012905001641, "num_tokens": 10499362.0, "step": 8730 }, { "entropy": 1.8841292724013328, "epoch": 0.027093235293433843, "grad_norm": 5.7246198654174805, "learning_rate": 4.232781168265039e-06, "loss": 0.6059, "mean_token_accuracy": 0.8051643744111061, "num_tokens": 10510802.0, "step": 8740 }, { "entropy": 1.8404579356312751, "epoch": 0.02712423441848354, "grad_norm": 14.219062805175781, "learning_rate": 4.23762472149569e-06, "loss": 0.5386, "mean_token_accuracy": 0.8213804766535759, "num_tokens": 10523317.0, "step": 8750 }, { "entropy": 1.8721755370497704, "epoch": 0.027155233543533233, "grad_norm": 16.284940719604492, "learning_rate": 4.24246827472634e-06, "loss": 0.6095, "mean_token_accuracy": 0.807612107694149, "num_tokens": 10534703.0, "step": 8760 }, { "entropy": 1.8348284885287285, "epoch": 0.02718623266858293, "grad_norm": 11.95847225189209, "learning_rate": 4.2473118279569895e-06, "loss": 0.5816, "mean_token_accuracy": 0.8146825641393661, "num_tokens": 10547080.0, "step": 8770 }, { "entropy": 1.8284624338150024, "epoch": 0.027217231793632626, "grad_norm": 14.658815383911133, "learning_rate": 4.25215538118764e-06, "loss": 0.6493, "mean_token_accuracy": 0.8004456847906113, "num_tokens": 10559321.0, "step": 8780 }, { "entropy": 1.7974061518907547, "epoch": 0.02724823091868232, "grad_norm": 8.202540397644043, "learning_rate": 4.256998934418289e-06, "loss": 0.6258, "mean_token_accuracy": 0.8043118000030518, "num_tokens": 10572053.0, "step": 8790 }, { "entropy": 1.8207244381308556, "epoch": 0.027279230043732016, "grad_norm": 14.289605140686035, "learning_rate": 4.261842487648939e-06, "loss": 0.5533, "mean_token_accuracy": 0.8130275592207908, "num_tokens": 10584178.0, "step": 8800 }, { "entropy": 1.7965356424450873, "epoch": 0.027310229168781713, "grad_norm": 5.546839714050293, "learning_rate": 4.26668604087959e-06, "loss": 0.5518, "mean_token_accuracy": 0.8202337950468064, "num_tokens": 10597231.0, "step": 8810 }, { "entropy": 1.849162083864212, "epoch": 0.027341228293831406, "grad_norm": 13.331665992736816, "learning_rate": 4.27152959411024e-06, "loss": 0.5852, "mean_token_accuracy": 0.8217923492193222, "num_tokens": 10609555.0, "step": 8820 }, { "entropy": 1.8186723545193673, "epoch": 0.027372227418881102, "grad_norm": 13.591072082519531, "learning_rate": 4.27637314734089e-06, "loss": 0.6013, "mean_token_accuracy": 0.8041837394237519, "num_tokens": 10621885.0, "step": 8830 }, { "entropy": 1.8741968870162964, "epoch": 0.0274032265439308, "grad_norm": 12.443244934082031, "learning_rate": 4.2812167005715395e-06, "loss": 0.598, "mean_token_accuracy": 0.807262459397316, "num_tokens": 10633736.0, "step": 8840 }, { "entropy": 1.9037314355373383, "epoch": 0.027434225668980492, "grad_norm": 13.946099281311035, "learning_rate": 4.286060253802189e-06, "loss": 0.6851, "mean_token_accuracy": 0.808636249601841, "num_tokens": 10645193.0, "step": 8850 }, { "entropy": 1.8897378742694855, "epoch": 0.02746522479403019, "grad_norm": 13.437854766845703, "learning_rate": 4.290903807032839e-06, "loss": 0.6297, "mean_token_accuracy": 0.8217552006244659, "num_tokens": 10656076.0, "step": 8860 }, { "entropy": 1.9088585555553437, "epoch": 0.027496223919079885, "grad_norm": 15.542739868164062, "learning_rate": 4.29574736026349e-06, "loss": 0.6387, "mean_token_accuracy": 0.8007620498538017, "num_tokens": 10666538.0, "step": 8870 }, { "entropy": 1.8244119063019753, "epoch": 0.02752722304412958, "grad_norm": 13.687409400939941, "learning_rate": 4.30059091349414e-06, "loss": 0.5834, "mean_token_accuracy": 0.8106234610080719, "num_tokens": 10679216.0, "step": 8880 }, { "entropy": 1.8295306876301765, "epoch": 0.027558222169179275, "grad_norm": 13.513102531433105, "learning_rate": 4.30543446672479e-06, "loss": 0.5764, "mean_token_accuracy": 0.8163102254271507, "num_tokens": 10690908.0, "step": 8890 }, { "entropy": 1.8037668392062187, "epoch": 0.02758922129422897, "grad_norm": 10.39285659790039, "learning_rate": 4.31027801995544e-06, "loss": 0.5876, "mean_token_accuracy": 0.8148684576153755, "num_tokens": 10702625.0, "step": 8900 }, { "entropy": 1.826628988981247, "epoch": 0.027620220419278665, "grad_norm": 10.972448348999023, "learning_rate": 4.3151215731860895e-06, "loss": 0.6049, "mean_token_accuracy": 0.8112099289894104, "num_tokens": 10715241.0, "step": 8910 }, { "entropy": 1.9132762208580971, "epoch": 0.02765121954432836, "grad_norm": 11.588542938232422, "learning_rate": 4.319965126416739e-06, "loss": 0.6643, "mean_token_accuracy": 0.800639072060585, "num_tokens": 10726279.0, "step": 8920 }, { "entropy": 1.8298836424946785, "epoch": 0.027682218669378058, "grad_norm": 11.887832641601562, "learning_rate": 4.32480867964739e-06, "loss": 0.5828, "mean_token_accuracy": 0.81552804261446, "num_tokens": 10738806.0, "step": 8930 }, { "entropy": 1.8580557018518449, "epoch": 0.02771321779442775, "grad_norm": 10.222613334655762, "learning_rate": 4.32965223287804e-06, "loss": 0.6352, "mean_token_accuracy": 0.8103110626339912, "num_tokens": 10750764.0, "step": 8940 }, { "entropy": 1.7699718743562698, "epoch": 0.027744216919477448, "grad_norm": 5.6745147705078125, "learning_rate": 4.334495786108689e-06, "loss": 0.5092, "mean_token_accuracy": 0.8255440399050713, "num_tokens": 10764128.0, "step": 8950 }, { "entropy": 1.8009968280792237, "epoch": 0.027775216044527144, "grad_norm": 11.968152046203613, "learning_rate": 4.33933933933934e-06, "loss": 0.5227, "mean_token_accuracy": 0.8253274574875832, "num_tokens": 10776273.0, "step": 8960 }, { "entropy": 1.8581290900707246, "epoch": 0.027806215169576837, "grad_norm": 11.328075408935547, "learning_rate": 4.34418289256999e-06, "loss": 0.6036, "mean_token_accuracy": 0.7987002015113831, "num_tokens": 10787895.0, "step": 8970 }, { "entropy": 1.835921722650528, "epoch": 0.027837214294626534, "grad_norm": 15.295876502990723, "learning_rate": 4.3490264458006396e-06, "loss": 0.5987, "mean_token_accuracy": 0.8129387706518173, "num_tokens": 10800249.0, "step": 8980 }, { "entropy": 1.8953902840614318, "epoch": 0.02786821341967623, "grad_norm": 11.311132431030273, "learning_rate": 4.35386999903129e-06, "loss": 0.6602, "mean_token_accuracy": 0.7996082842350006, "num_tokens": 10811728.0, "step": 8990 }, { "entropy": 1.869747567176819, "epoch": 0.027899212544725924, "grad_norm": 13.892035484313965, "learning_rate": 4.358713552261939e-06, "loss": 0.6344, "mean_token_accuracy": 0.8065097257494926, "num_tokens": 10822949.0, "step": 9000 }, { "entropy": 1.8128714829683303, "epoch": 0.02793021166977562, "grad_norm": 10.744576454162598, "learning_rate": 4.363557105492589e-06, "loss": 0.5757, "mean_token_accuracy": 0.8185910269618034, "num_tokens": 10835661.0, "step": 9010 }, { "entropy": 1.8096760362386703, "epoch": 0.027961210794825317, "grad_norm": 5.07841682434082, "learning_rate": 4.36840065872324e-06, "loss": 0.5479, "mean_token_accuracy": 0.8252274811267852, "num_tokens": 10848267.0, "step": 9020 }, { "entropy": 1.9240417778491974, "epoch": 0.02799220991987501, "grad_norm": 13.373735427856445, "learning_rate": 4.37324421195389e-06, "loss": 0.6678, "mean_token_accuracy": 0.7926280677318573, "num_tokens": 10859705.0, "step": 9030 }, { "entropy": 1.9146719083189965, "epoch": 0.028023209044924707, "grad_norm": 11.327962875366211, "learning_rate": 4.37808776518454e-06, "loss": 0.6178, "mean_token_accuracy": 0.826309834420681, "num_tokens": 10871134.0, "step": 9040 }, { "entropy": 1.8291181206703186, "epoch": 0.028054208169974403, "grad_norm": 14.080740928649902, "learning_rate": 4.38293131841519e-06, "loss": 0.5395, "mean_token_accuracy": 0.8222734659910202, "num_tokens": 10883664.0, "step": 9050 }, { "entropy": 1.931309811770916, "epoch": 0.028085207295024096, "grad_norm": 14.17933177947998, "learning_rate": 4.3877748716458395e-06, "loss": 0.671, "mean_token_accuracy": 0.7961855351924896, "num_tokens": 10894837.0, "step": 9060 }, { "entropy": 1.9364981949329376, "epoch": 0.028116206420073793, "grad_norm": 16.13866424560547, "learning_rate": 4.392618424876489e-06, "loss": 0.6832, "mean_token_accuracy": 0.8080704003572464, "num_tokens": 10905463.0, "step": 9070 }, { "entropy": 1.8846597149968147, "epoch": 0.02814720554512349, "grad_norm": 12.987224578857422, "learning_rate": 4.39746197810714e-06, "loss": 0.6404, "mean_token_accuracy": 0.8108007118105889, "num_tokens": 10917119.0, "step": 9080 }, { "entropy": 1.9063777923583984, "epoch": 0.028178204670173183, "grad_norm": 14.922365188598633, "learning_rate": 4.40230553133779e-06, "loss": 0.6725, "mean_token_accuracy": 0.8062027513980865, "num_tokens": 10927669.0, "step": 9090 }, { "entropy": 1.8114924758672715, "epoch": 0.02820920379522288, "grad_norm": 10.971781730651855, "learning_rate": 4.40714908456844e-06, "loss": 0.5537, "mean_token_accuracy": 0.8184123501181603, "num_tokens": 10939764.0, "step": 9100 }, { "entropy": 1.9051185458898545, "epoch": 0.028240202920272576, "grad_norm": 5.854284763336182, "learning_rate": 4.41199263779909e-06, "loss": 0.6057, "mean_token_accuracy": 0.8078436881303788, "num_tokens": 10951263.0, "step": 9110 }, { "entropy": 2.0060752868652343, "epoch": 0.028271202045322272, "grad_norm": 13.479512214660645, "learning_rate": 4.41683619102974e-06, "loss": 0.7195, "mean_token_accuracy": 0.7970252841711044, "num_tokens": 10962301.0, "step": 9120 }, { "entropy": 1.9399765685200692, "epoch": 0.028302201170371966, "grad_norm": 13.337047576904297, "learning_rate": 4.4216797442603895e-06, "loss": 0.6101, "mean_token_accuracy": 0.8143787398934365, "num_tokens": 10973469.0, "step": 9130 }, { "entropy": 1.8391277641057968, "epoch": 0.028333200295421662, "grad_norm": 6.049315452575684, "learning_rate": 4.42652329749104e-06, "loss": 0.5842, "mean_token_accuracy": 0.8160114631056785, "num_tokens": 10986193.0, "step": 9140 }, { "entropy": 1.8236407771706582, "epoch": 0.02836419942047136, "grad_norm": 12.77992057800293, "learning_rate": 4.43136685072169e-06, "loss": 0.6036, "mean_token_accuracy": 0.8185855254530907, "num_tokens": 10998743.0, "step": 9150 }, { "entropy": 1.9133103117346764, "epoch": 0.028395198545521052, "grad_norm": 11.950321197509766, "learning_rate": 4.436210403952339e-06, "loss": 0.6247, "mean_token_accuracy": 0.8190466240048409, "num_tokens": 11010167.0, "step": 9160 }, { "entropy": 1.8094365164637565, "epoch": 0.02842619767057075, "grad_norm": 15.926004409790039, "learning_rate": 4.44105395718299e-06, "loss": 0.4952, "mean_token_accuracy": 0.8278137043118476, "num_tokens": 11023270.0, "step": 9170 }, { "entropy": 1.9534984081983566, "epoch": 0.028457196795620445, "grad_norm": 13.150014877319336, "learning_rate": 4.44589751041364e-06, "loss": 0.6237, "mean_token_accuracy": 0.8099380254745483, "num_tokens": 11034084.0, "step": 9180 }, { "entropy": 1.9308686077594757, "epoch": 0.028488195920670138, "grad_norm": 13.831839561462402, "learning_rate": 4.45074106364429e-06, "loss": 0.6109, "mean_token_accuracy": 0.8114223212003708, "num_tokens": 11045463.0, "step": 9190 }, { "entropy": 1.922127565741539, "epoch": 0.028519195045719835, "grad_norm": 14.833643913269043, "learning_rate": 4.45558461687494e-06, "loss": 0.6848, "mean_token_accuracy": 0.8016584351658821, "num_tokens": 11057209.0, "step": 9200 }, { "entropy": 1.9555729806423188, "epoch": 0.02855019417076953, "grad_norm": 12.715826988220215, "learning_rate": 4.4604281701055894e-06, "loss": 0.6802, "mean_token_accuracy": 0.805061075091362, "num_tokens": 11068291.0, "step": 9210 }, { "entropy": 1.942775259912014, "epoch": 0.028581193295819225, "grad_norm": 7.836392879486084, "learning_rate": 4.465271723336239e-06, "loss": 0.6386, "mean_token_accuracy": 0.8060253381729126, "num_tokens": 11079075.0, "step": 9220 }, { "entropy": 1.9078367695212364, "epoch": 0.02861219242086892, "grad_norm": 13.057887077331543, "learning_rate": 4.47011527656689e-06, "loss": 0.6303, "mean_token_accuracy": 0.806829422712326, "num_tokens": 11090899.0, "step": 9230 }, { "entropy": 1.9537988051772117, "epoch": 0.028643191545918618, "grad_norm": 13.034477233886719, "learning_rate": 4.47495882979754e-06, "loss": 0.643, "mean_token_accuracy": 0.8042517855763436, "num_tokens": 11102296.0, "step": 9240 }, { "entropy": 1.8851880788803101, "epoch": 0.02867419067096831, "grad_norm": 15.656882286071777, "learning_rate": 4.47980238302819e-06, "loss": 0.5952, "mean_token_accuracy": 0.8147235870361328, "num_tokens": 11113748.0, "step": 9250 }, { "entropy": 1.770183938741684, "epoch": 0.028705189796018007, "grad_norm": 12.287094116210938, "learning_rate": 4.48464593625884e-06, "loss": 0.5044, "mean_token_accuracy": 0.8376102089881897, "num_tokens": 11126535.0, "step": 9260 }, { "entropy": 1.8187786787748337, "epoch": 0.028736188921067704, "grad_norm": 12.566884994506836, "learning_rate": 4.48948948948949e-06, "loss": 0.5549, "mean_token_accuracy": 0.8142774716019631, "num_tokens": 11139226.0, "step": 9270 }, { "entropy": 1.9643137067556382, "epoch": 0.028767188046117397, "grad_norm": 13.199538230895996, "learning_rate": 4.4943330427201395e-06, "loss": 0.7005, "mean_token_accuracy": 0.7958864018321037, "num_tokens": 11150412.0, "step": 9280 }, { "entropy": 1.7707602977752686, "epoch": 0.028798187171167094, "grad_norm": 11.749283790588379, "learning_rate": 4.49917659595079e-06, "loss": 0.4821, "mean_token_accuracy": 0.826721802353859, "num_tokens": 11164245.0, "step": 9290 }, { "entropy": 1.8696741789579392, "epoch": 0.02882918629621679, "grad_norm": 13.251580238342285, "learning_rate": 4.50402014918144e-06, "loss": 0.5609, "mean_token_accuracy": 0.819263182580471, "num_tokens": 11176738.0, "step": 9300 }, { "entropy": 1.9785585284233094, "epoch": 0.028860185421266484, "grad_norm": 14.014039039611816, "learning_rate": 4.50886370241209e-06, "loss": 0.629, "mean_token_accuracy": 0.8204264283180237, "num_tokens": 11187416.0, "step": 9310 }, { "entropy": 1.9258405417203903, "epoch": 0.02889118454631618, "grad_norm": 12.051921844482422, "learning_rate": 4.51370725564274e-06, "loss": 0.6911, "mean_token_accuracy": 0.8011237904429436, "num_tokens": 11198731.0, "step": 9320 }, { "entropy": 1.7640791162848473, "epoch": 0.028922183671365877, "grad_norm": 13.392481803894043, "learning_rate": 4.51855080887339e-06, "loss": 0.5373, "mean_token_accuracy": 0.8237902507185936, "num_tokens": 11212726.0, "step": 9330 }, { "entropy": 1.9153805747628212, "epoch": 0.02895318279641557, "grad_norm": 12.259576797485352, "learning_rate": 4.52339436210404e-06, "loss": 0.5935, "mean_token_accuracy": 0.8111542999744416, "num_tokens": 11225651.0, "step": 9340 }, { "entropy": 2.023898732662201, "epoch": 0.028984181921465266, "grad_norm": 15.207432746887207, "learning_rate": 4.52823791533469e-06, "loss": 0.7204, "mean_token_accuracy": 0.7969193458557129, "num_tokens": 11236751.0, "step": 9350 }, { "entropy": 1.9569285050034524, "epoch": 0.029015181046514963, "grad_norm": 15.066774368286133, "learning_rate": 4.53308146856534e-06, "loss": 0.6648, "mean_token_accuracy": 0.8044527977705002, "num_tokens": 11248128.0, "step": 9360 }, { "entropy": 1.8909387812018394, "epoch": 0.029046180171564656, "grad_norm": 12.793632507324219, "learning_rate": 4.537925021795989e-06, "loss": 0.6201, "mean_token_accuracy": 0.8120290517807007, "num_tokens": 11260048.0, "step": 9370 }, { "entropy": 1.9061389192938805, "epoch": 0.029077179296614353, "grad_norm": 10.562239646911621, "learning_rate": 4.54276857502664e-06, "loss": 0.6658, "mean_token_accuracy": 0.7987107247114181, "num_tokens": 11271960.0, "step": 9380 }, { "entropy": 1.9409636542201043, "epoch": 0.02910817842166405, "grad_norm": 12.370716094970703, "learning_rate": 4.54761212825729e-06, "loss": 0.641, "mean_token_accuracy": 0.8062451392412185, "num_tokens": 11283082.0, "step": 9390 }, { "entropy": 1.9235450312495233, "epoch": 0.029139177546713742, "grad_norm": 11.286612510681152, "learning_rate": 4.55245568148794e-06, "loss": 0.6299, "mean_token_accuracy": 0.8160214677453042, "num_tokens": 11294418.0, "step": 9400 }, { "entropy": 1.9037536427378654, "epoch": 0.02917017667176344, "grad_norm": 12.946687698364258, "learning_rate": 4.5572992347185905e-06, "loss": 0.6663, "mean_token_accuracy": 0.7993274956941605, "num_tokens": 11306264.0, "step": 9410 }, { "entropy": 1.9348094776272773, "epoch": 0.029201175796813136, "grad_norm": 11.020223617553711, "learning_rate": 4.5621427879492395e-06, "loss": 0.647, "mean_token_accuracy": 0.8057587504386902, "num_tokens": 11316953.0, "step": 9420 }, { "entropy": 1.8925198674201966, "epoch": 0.02923217492186283, "grad_norm": 12.39678955078125, "learning_rate": 4.5669863411798894e-06, "loss": 0.6654, "mean_token_accuracy": 0.8044493556022644, "num_tokens": 11328541.0, "step": 9430 }, { "entropy": 1.871733333170414, "epoch": 0.029263174046912525, "grad_norm": 4.86548376083374, "learning_rate": 4.57182989441054e-06, "loss": 0.6326, "mean_token_accuracy": 0.806520962715149, "num_tokens": 11341221.0, "step": 9440 }, { "entropy": 1.9219319432973863, "epoch": 0.029294173171962222, "grad_norm": 13.70630168914795, "learning_rate": 4.57667344764119e-06, "loss": 0.6243, "mean_token_accuracy": 0.8117819979786873, "num_tokens": 11352400.0, "step": 9450 }, { "entropy": 1.9954150676727296, "epoch": 0.029325172297011915, "grad_norm": 16.177236557006836, "learning_rate": 4.58151700087184e-06, "loss": 0.6861, "mean_token_accuracy": 0.7988776013255119, "num_tokens": 11363461.0, "step": 9460 }, { "entropy": 1.960495764017105, "epoch": 0.02935617142206161, "grad_norm": 13.248772621154785, "learning_rate": 4.58636055410249e-06, "loss": 0.6945, "mean_token_accuracy": 0.799818865954876, "num_tokens": 11375111.0, "step": 9470 }, { "entropy": 1.9463176384568215, "epoch": 0.02938717054711131, "grad_norm": 13.98116683959961, "learning_rate": 4.59120410733314e-06, "loss": 0.6175, "mean_token_accuracy": 0.814936289191246, "num_tokens": 11386133.0, "step": 9480 }, { "entropy": 1.8891835004091262, "epoch": 0.029418169672161005, "grad_norm": 3.9220001697540283, "learning_rate": 4.59604766056379e-06, "loss": 0.58, "mean_token_accuracy": 0.8183272242546081, "num_tokens": 11399042.0, "step": 9490 }, { "entropy": 1.9922261223196984, "epoch": 0.029449168797210698, "grad_norm": 10.860396385192871, "learning_rate": 4.60089121379444e-06, "loss": 0.6858, "mean_token_accuracy": 0.8004097312688827, "num_tokens": 11410000.0, "step": 9500 }, { "entropy": 1.9224519044160844, "epoch": 0.029480167922260395, "grad_norm": 11.63853645324707, "learning_rate": 4.60573476702509e-06, "loss": 0.5717, "mean_token_accuracy": 0.816666342318058, "num_tokens": 11422038.0, "step": 9510 }, { "entropy": 1.8657364815473556, "epoch": 0.02951116704731009, "grad_norm": 13.743812561035156, "learning_rate": 4.61057832025574e-06, "loss": 0.5471, "mean_token_accuracy": 0.8122471138834954, "num_tokens": 11435434.0, "step": 9520 }, { "entropy": 1.925044772028923, "epoch": 0.029542166172359784, "grad_norm": 7.6943230628967285, "learning_rate": 4.61542187348639e-06, "loss": 0.6255, "mean_token_accuracy": 0.8038001671433449, "num_tokens": 11446785.0, "step": 9530 }, { "entropy": 1.9045713931322097, "epoch": 0.02957316529740948, "grad_norm": 14.207799911499023, "learning_rate": 4.62026542671704e-06, "loss": 0.6089, "mean_token_accuracy": 0.8201724767684937, "num_tokens": 11458736.0, "step": 9540 }, { "entropy": 1.9084357440471649, "epoch": 0.029604164422459178, "grad_norm": 13.012181282043457, "learning_rate": 4.62510897994769e-06, "loss": 0.6914, "mean_token_accuracy": 0.8042386144399643, "num_tokens": 11470670.0, "step": 9550 }, { "entropy": 1.8864588722586633, "epoch": 0.02963516354750887, "grad_norm": 11.62660026550293, "learning_rate": 4.6299525331783405e-06, "loss": 0.6257, "mean_token_accuracy": 0.8103240177035331, "num_tokens": 11482128.0, "step": 9560 }, { "entropy": 1.8707260951399802, "epoch": 0.029666162672558567, "grad_norm": 12.582703590393066, "learning_rate": 4.63479608640899e-06, "loss": 0.5899, "mean_token_accuracy": 0.8164230138063431, "num_tokens": 11494873.0, "step": 9570 }, { "entropy": 1.927544179558754, "epoch": 0.029697161797608264, "grad_norm": 14.461875915527344, "learning_rate": 4.63963963963964e-06, "loss": 0.624, "mean_token_accuracy": 0.8096611618995666, "num_tokens": 11506364.0, "step": 9580 }, { "entropy": 1.8048232197761536, "epoch": 0.029728160922657957, "grad_norm": 7.559422016143799, "learning_rate": 4.64448319287029e-06, "loss": 0.5429, "mean_token_accuracy": 0.8254976123571396, "num_tokens": 11519319.0, "step": 9590 }, { "entropy": 1.868215946853161, "epoch": 0.029759160047707654, "grad_norm": 7.954585075378418, "learning_rate": 4.64932674610094e-06, "loss": 0.5627, "mean_token_accuracy": 0.8161717414855957, "num_tokens": 11531172.0, "step": 9600 }, { "entropy": 1.9226632490754128, "epoch": 0.02979015917275735, "grad_norm": 12.952104568481445, "learning_rate": 4.65417029933159e-06, "loss": 0.6432, "mean_token_accuracy": 0.7985331505537033, "num_tokens": 11542291.0, "step": 9610 }, { "entropy": 1.763420394062996, "epoch": 0.029821158297807043, "grad_norm": 4.180752277374268, "learning_rate": 4.659013852562241e-06, "loss": 0.5277, "mean_token_accuracy": 0.8286846920847892, "num_tokens": 11555662.0, "step": 9620 }, { "entropy": 1.9089765697717667, "epoch": 0.02985215742285674, "grad_norm": 5.77734899520874, "learning_rate": 4.66385740579289e-06, "loss": 0.6166, "mean_token_accuracy": 0.8161043807864189, "num_tokens": 11566884.0, "step": 9630 }, { "entropy": 1.7613512337207795, "epoch": 0.029883156547906436, "grad_norm": 11.983185768127441, "learning_rate": 4.6687009590235395e-06, "loss": 0.535, "mean_token_accuracy": 0.8203961864113808, "num_tokens": 11580772.0, "step": 9640 }, { "entropy": 1.8508950725197792, "epoch": 0.02991415567295613, "grad_norm": 4.04287576675415, "learning_rate": 4.67354451225419e-06, "loss": 0.5947, "mean_token_accuracy": 0.8185210764408112, "num_tokens": 11593156.0, "step": 9650 }, { "entropy": 1.9007043689489365, "epoch": 0.029945154798005826, "grad_norm": 12.041136741638184, "learning_rate": 4.67838806548484e-06, "loss": 0.687, "mean_token_accuracy": 0.7983652919530868, "num_tokens": 11604534.0, "step": 9660 }, { "entropy": 1.8662898167967796, "epoch": 0.029976153923055523, "grad_norm": 5.588154315948486, "learning_rate": 4.68323161871549e-06, "loss": 0.5698, "mean_token_accuracy": 0.8151004850864411, "num_tokens": 11616960.0, "step": 9670 }, { "entropy": 1.8811151057481765, "epoch": 0.030007153048105216, "grad_norm": 15.325428009033203, "learning_rate": 4.68807517194614e-06, "loss": 0.6616, "mean_token_accuracy": 0.7986567124724389, "num_tokens": 11628059.0, "step": 9680 }, { "entropy": 1.8047630965709687, "epoch": 0.030038152173154913, "grad_norm": 5.080140590667725, "learning_rate": 4.69291872517679e-06, "loss": 0.5694, "mean_token_accuracy": 0.8145760789513588, "num_tokens": 11640534.0, "step": 9690 }, { "entropy": 1.9280565515160561, "epoch": 0.03006915129820461, "grad_norm": 6.16276216506958, "learning_rate": 4.69776227840744e-06, "loss": 0.6692, "mean_token_accuracy": 0.8054024249315261, "num_tokens": 11651985.0, "step": 9700 }, { "entropy": 1.9135492697358132, "epoch": 0.030100150423254302, "grad_norm": 13.727761268615723, "learning_rate": 4.70260583163809e-06, "loss": 0.5566, "mean_token_accuracy": 0.8213163688778877, "num_tokens": 11664292.0, "step": 9710 }, { "entropy": 1.8022937417030334, "epoch": 0.030131149548304, "grad_norm": 11.88475513458252, "learning_rate": 4.70744938486874e-06, "loss": 0.5153, "mean_token_accuracy": 0.8253460243344307, "num_tokens": 11677213.0, "step": 9720 }, { "entropy": 1.9173430383205414, "epoch": 0.030162148673353695, "grad_norm": 10.792257308959961, "learning_rate": 4.71229293809939e-06, "loss": 0.5897, "mean_token_accuracy": 0.8223668470978737, "num_tokens": 11689083.0, "step": 9730 }, { "entropy": 1.8793220937252044, "epoch": 0.03019314779840339, "grad_norm": 13.49764347076416, "learning_rate": 4.71713649133004e-06, "loss": 0.6808, "mean_token_accuracy": 0.7959660813212395, "num_tokens": 11701156.0, "step": 9740 }, { "entropy": 1.7766539767384528, "epoch": 0.030224146923453085, "grad_norm": 12.855195045471191, "learning_rate": 4.72198004456069e-06, "loss": 0.506, "mean_token_accuracy": 0.8286942020058632, "num_tokens": 11715335.0, "step": 9750 }, { "entropy": 1.914339354634285, "epoch": 0.030255146048502782, "grad_norm": 12.800863265991211, "learning_rate": 4.72682359779134e-06, "loss": 0.6046, "mean_token_accuracy": 0.8148805812001229, "num_tokens": 11727922.0, "step": 9760 }, { "entropy": 1.910454373061657, "epoch": 0.030286145173552475, "grad_norm": 13.581644058227539, "learning_rate": 4.7316671510219906e-06, "loss": 0.5975, "mean_token_accuracy": 0.8178346559405327, "num_tokens": 11739422.0, "step": 9770 }, { "entropy": 1.8079843878746034, "epoch": 0.03031714429860217, "grad_norm": 14.549524307250977, "learning_rate": 4.7365107042526405e-06, "loss": 0.543, "mean_token_accuracy": 0.8197317391633987, "num_tokens": 11752317.0, "step": 9780 }, { "entropy": 1.9487209469079971, "epoch": 0.030348143423651868, "grad_norm": 12.98496150970459, "learning_rate": 4.74135425748329e-06, "loss": 0.6664, "mean_token_accuracy": 0.8045255482196808, "num_tokens": 11763631.0, "step": 9790 }, { "entropy": 1.9213872998952866, "epoch": 0.03037914254870156, "grad_norm": 14.27520751953125, "learning_rate": 4.74619781071394e-06, "loss": 0.6433, "mean_token_accuracy": 0.8067422285676003, "num_tokens": 11775491.0, "step": 9800 }, { "entropy": 1.8178927034139634, "epoch": 0.030410141673751258, "grad_norm": 14.970010757446289, "learning_rate": 4.75104136394459e-06, "loss": 0.5629, "mean_token_accuracy": 0.8228003144264221, "num_tokens": 11788717.0, "step": 9810 }, { "entropy": 1.9698729425668717, "epoch": 0.030441140798800954, "grad_norm": 12.28023624420166, "learning_rate": 4.75588491717524e-06, "loss": 0.6749, "mean_token_accuracy": 0.8033818736672401, "num_tokens": 11799616.0, "step": 9820 }, { "entropy": 1.8569468915462495, "epoch": 0.03047213992385065, "grad_norm": 12.524866104125977, "learning_rate": 4.760728470405891e-06, "loss": 0.5486, "mean_token_accuracy": 0.8196278423070907, "num_tokens": 11812567.0, "step": 9830 }, { "entropy": 1.9362822517752647, "epoch": 0.030503139048900344, "grad_norm": 12.16551685333252, "learning_rate": 4.76557202363654e-06, "loss": 0.6641, "mean_token_accuracy": 0.802090086042881, "num_tokens": 11824072.0, "step": 9840 }, { "entropy": 1.9034365549683572, "epoch": 0.03053413817395004, "grad_norm": 11.310152053833008, "learning_rate": 4.77041557686719e-06, "loss": 0.6016, "mean_token_accuracy": 0.8146296158432961, "num_tokens": 11835828.0, "step": 9850 }, { "entropy": 1.949262234568596, "epoch": 0.030565137298999737, "grad_norm": 13.651056289672852, "learning_rate": 4.77525913009784e-06, "loss": 0.6148, "mean_token_accuracy": 0.8138030260801316, "num_tokens": 11847123.0, "step": 9860 }, { "entropy": 1.8827594295144081, "epoch": 0.03059613642404943, "grad_norm": 11.080418586730957, "learning_rate": 4.78010268332849e-06, "loss": 0.597, "mean_token_accuracy": 0.8088963508605957, "num_tokens": 11859028.0, "step": 9870 }, { "entropy": 1.8874688804149629, "epoch": 0.030627135549099127, "grad_norm": 12.232693672180176, "learning_rate": 4.78494623655914e-06, "loss": 0.6113, "mean_token_accuracy": 0.814998921751976, "num_tokens": 11869876.0, "step": 9880 }, { "entropy": 1.8576917335391046, "epoch": 0.030658134674148824, "grad_norm": 11.815441131591797, "learning_rate": 4.78978978978979e-06, "loss": 0.6147, "mean_token_accuracy": 0.8156881853938103, "num_tokens": 11881144.0, "step": 9890 }, { "entropy": 1.8440957143902779, "epoch": 0.030689133799198517, "grad_norm": 12.98918628692627, "learning_rate": 4.79463334302044e-06, "loss": 0.6312, "mean_token_accuracy": 0.8034987449645996, "num_tokens": 11893147.0, "step": 9900 }, { "entropy": 1.88355852663517, "epoch": 0.030720132924248213, "grad_norm": 6.641689300537109, "learning_rate": 4.79947689625109e-06, "loss": 0.6617, "mean_token_accuracy": 0.7953455910086632, "num_tokens": 11905450.0, "step": 9910 }, { "entropy": 1.9107140555977822, "epoch": 0.03075113204929791, "grad_norm": 5.667116641998291, "learning_rate": 4.8043204494817405e-06, "loss": 0.6608, "mean_token_accuracy": 0.8010135576128959, "num_tokens": 11916859.0, "step": 9920 }, { "entropy": 1.9044810444116593, "epoch": 0.030782131174347603, "grad_norm": 13.490184783935547, "learning_rate": 4.80916400271239e-06, "loss": 0.6331, "mean_token_accuracy": 0.8088266268372536, "num_tokens": 11927564.0, "step": 9930 }, { "entropy": 1.899574799835682, "epoch": 0.0308131302993973, "grad_norm": 13.318133354187012, "learning_rate": 4.81400755594304e-06, "loss": 0.6828, "mean_token_accuracy": 0.8059917777776718, "num_tokens": 11938823.0, "step": 9940 }, { "entropy": 1.8431586802005768, "epoch": 0.030844129424446996, "grad_norm": 11.968719482421875, "learning_rate": 4.81885110917369e-06, "loss": 0.5588, "mean_token_accuracy": 0.8303678393363952, "num_tokens": 11950208.0, "step": 9950 }, { "entropy": 1.8151010930538178, "epoch": 0.03087512854949669, "grad_norm": 13.34570026397705, "learning_rate": 4.82369466240434e-06, "loss": 0.5367, "mean_token_accuracy": 0.8233583375811577, "num_tokens": 11963211.0, "step": 9960 }, { "entropy": 1.9202764973044395, "epoch": 0.030906127674546386, "grad_norm": 12.33815860748291, "learning_rate": 4.82853821563499e-06, "loss": 0.6453, "mean_token_accuracy": 0.7984903916716576, "num_tokens": 11974413.0, "step": 9970 }, { "entropy": 1.7811052426695824, "epoch": 0.030937126799596083, "grad_norm": 13.612791061401367, "learning_rate": 4.833381768865641e-06, "loss": 0.5295, "mean_token_accuracy": 0.8154791623353959, "num_tokens": 11986759.0, "step": 9980 }, { "entropy": 1.8961547821760179, "epoch": 0.030968125924645776, "grad_norm": 13.582847595214844, "learning_rate": 4.8382253220962906e-06, "loss": 0.6424, "mean_token_accuracy": 0.8010322540998459, "num_tokens": 11997882.0, "step": 9990 }, { "entropy": 1.7699449375271796, "epoch": 0.030999125049695472, "grad_norm": 12.8268461227417, "learning_rate": 4.8430688753269404e-06, "loss": 0.5349, "mean_token_accuracy": 0.8104767709970474, "num_tokens": 12011562.0, "step": 10000 }, { "entropy": 1.7704377323389053, "epoch": 0.03103012417474517, "grad_norm": 7.003698348999023, "learning_rate": 4.84791242855759e-06, "loss": 0.4874, "mean_token_accuracy": 0.817914342880249, "num_tokens": 12024642.0, "step": 10010 }, { "entropy": 1.8417405053973197, "epoch": 0.031061123299794862, "grad_norm": 12.85013198852539, "learning_rate": 4.85275598178824e-06, "loss": 0.6207, "mean_token_accuracy": 0.8156400442123413, "num_tokens": 12035688.0, "step": 10020 }, { "entropy": 1.871736840903759, "epoch": 0.03109212242484456, "grad_norm": 10.676023483276367, "learning_rate": 4.85759953501889e-06, "loss": 0.6359, "mean_token_accuracy": 0.8091729044914245, "num_tokens": 12046810.0, "step": 10030 }, { "entropy": 1.8386752873659133, "epoch": 0.031123121549894255, "grad_norm": 5.60599422454834, "learning_rate": 4.862443088249541e-06, "loss": 0.5317, "mean_token_accuracy": 0.8189519822597504, "num_tokens": 12059567.0, "step": 10040 }, { "entropy": 1.9133656844496727, "epoch": 0.03115412067494395, "grad_norm": 13.941995620727539, "learning_rate": 4.86728664148019e-06, "loss": 0.6437, "mean_token_accuracy": 0.8070616811513901, "num_tokens": 12070777.0, "step": 10050 }, { "entropy": 1.859761357307434, "epoch": 0.031185119799993645, "grad_norm": 11.398521423339844, "learning_rate": 4.87213019471084e-06, "loss": 0.557, "mean_token_accuracy": 0.8148921579122543, "num_tokens": 12082916.0, "step": 10060 }, { "entropy": 1.8685668006539344, "epoch": 0.03121611892504334, "grad_norm": 11.567840576171875, "learning_rate": 4.8769737479414905e-06, "loss": 0.6081, "mean_token_accuracy": 0.812491662800312, "num_tokens": 12094481.0, "step": 10070 }, { "entropy": 1.7921167001128198, "epoch": 0.031247118050093035, "grad_norm": 11.81413745880127, "learning_rate": 4.88181730117214e-06, "loss": 0.4818, "mean_token_accuracy": 0.8256913438439369, "num_tokens": 12107579.0, "step": 10080 }, { "entropy": 1.9026924163103103, "epoch": 0.031278117175142735, "grad_norm": 5.831927299499512, "learning_rate": 4.88666085440279e-06, "loss": 0.5999, "mean_token_accuracy": 0.8175748229026795, "num_tokens": 12119504.0, "step": 10090 }, { "entropy": 1.8218435019254684, "epoch": 0.031309116300192424, "grad_norm": 6.674549579620361, "learning_rate": 4.89150440763344e-06, "loss": 0.5129, "mean_token_accuracy": 0.8210835456848145, "num_tokens": 12132997.0, "step": 10100 }, { "entropy": 1.7946079865098, "epoch": 0.03134011542524212, "grad_norm": 6.814689636230469, "learning_rate": 4.89634796086409e-06, "loss": 0.5653, "mean_token_accuracy": 0.8189253076910973, "num_tokens": 12145732.0, "step": 10110 }, { "entropy": 1.8335962176322937, "epoch": 0.03137111455029182, "grad_norm": 12.774432182312012, "learning_rate": 4.90119151409474e-06, "loss": 0.5843, "mean_token_accuracy": 0.8219740748405456, "num_tokens": 12157527.0, "step": 10120 }, { "entropy": 1.9156937181949616, "epoch": 0.031402113675341514, "grad_norm": 13.017987251281738, "learning_rate": 4.906035067325391e-06, "loss": 0.6375, "mean_token_accuracy": 0.8114208281040192, "num_tokens": 12168529.0, "step": 10130 }, { "entropy": 1.929344841837883, "epoch": 0.03143311280039121, "grad_norm": 12.184325218200684, "learning_rate": 4.9108786205560405e-06, "loss": 0.6351, "mean_token_accuracy": 0.816659900546074, "num_tokens": 12179791.0, "step": 10140 }, { "entropy": 1.8231835559010505, "epoch": 0.03146411192544091, "grad_norm": 13.120546340942383, "learning_rate": 4.91572217378669e-06, "loss": 0.5967, "mean_token_accuracy": 0.8101472899317741, "num_tokens": 12192369.0, "step": 10150 }, { "entropy": 1.7879482999444007, "epoch": 0.0314951110504906, "grad_norm": 14.349749565124512, "learning_rate": 4.92056572701734e-06, "loss": 0.5482, "mean_token_accuracy": 0.8103778302669525, "num_tokens": 12205341.0, "step": 10160 }, { "entropy": 1.9244187206029892, "epoch": 0.031526110175540294, "grad_norm": 11.694113731384277, "learning_rate": 4.92540928024799e-06, "loss": 0.643, "mean_token_accuracy": 0.8135433197021484, "num_tokens": 12217222.0, "step": 10170 }, { "entropy": 1.9573982939124108, "epoch": 0.03155710930058999, "grad_norm": 12.177897453308105, "learning_rate": 4.93025283347864e-06, "loss": 0.6126, "mean_token_accuracy": 0.809203888475895, "num_tokens": 12228264.0, "step": 10180 }, { "entropy": 1.9127047300338744, "epoch": 0.03158810842563969, "grad_norm": 13.4846830368042, "learning_rate": 4.935096386709291e-06, "loss": 0.6306, "mean_token_accuracy": 0.8137389361858368, "num_tokens": 12240050.0, "step": 10190 }, { "entropy": 1.910162153840065, "epoch": 0.03161910755068938, "grad_norm": 14.54655647277832, "learning_rate": 4.939939939939941e-06, "loss": 0.6165, "mean_token_accuracy": 0.8089611247181893, "num_tokens": 12251372.0, "step": 10200 }, { "entropy": 1.9011714354157447, "epoch": 0.03165010667573908, "grad_norm": 12.256708145141602, "learning_rate": 4.9447834931705905e-06, "loss": 0.5892, "mean_token_accuracy": 0.8112087488174439, "num_tokens": 12263039.0, "step": 10210 }, { "entropy": 1.8994778007268907, "epoch": 0.03168110580078877, "grad_norm": 14.00627613067627, "learning_rate": 4.9496270464012404e-06, "loss": 0.732, "mean_token_accuracy": 0.7959692940115929, "num_tokens": 12275633.0, "step": 10220 }, { "entropy": 1.7719287261366845, "epoch": 0.031712104925838466, "grad_norm": 10.754083633422852, "learning_rate": 4.95447059963189e-06, "loss": 0.5132, "mean_token_accuracy": 0.8201853647828102, "num_tokens": 12288123.0, "step": 10230 }, { "entropy": 1.9072040289640426, "epoch": 0.03174310405088816, "grad_norm": 10.166359901428223, "learning_rate": 4.95931415286254e-06, "loss": 0.6276, "mean_token_accuracy": 0.8175885871052742, "num_tokens": 12299187.0, "step": 10240 }, { "entropy": 1.897184392809868, "epoch": 0.03177410317593786, "grad_norm": 10.87751293182373, "learning_rate": 4.964157706093191e-06, "loss": 0.6334, "mean_token_accuracy": 0.8149495676159859, "num_tokens": 12309767.0, "step": 10250 }, { "entropy": 1.8759140372276306, "epoch": 0.031805102300987556, "grad_norm": 10.203163146972656, "learning_rate": 4.96900125932384e-06, "loss": 0.634, "mean_token_accuracy": 0.8065581321716309, "num_tokens": 12320842.0, "step": 10260 }, { "entropy": 1.8192242681980133, "epoch": 0.03183610142603725, "grad_norm": 13.203461647033691, "learning_rate": 4.97384481255449e-06, "loss": 0.5818, "mean_token_accuracy": 0.8157040163874626, "num_tokens": 12332641.0, "step": 10270 }, { "entropy": 1.9448095709085464, "epoch": 0.03186710055108694, "grad_norm": 16.432714462280273, "learning_rate": 4.978688365785141e-06, "loss": 0.7231, "mean_token_accuracy": 0.7967922583222389, "num_tokens": 12343283.0, "step": 10280 }, { "entropy": 1.8554027765989303, "epoch": 0.03189809967613664, "grad_norm": 10.973164558410645, "learning_rate": 4.9835319190157905e-06, "loss": 0.6028, "mean_token_accuracy": 0.8122664794325829, "num_tokens": 12355277.0, "step": 10290 }, { "entropy": 1.8099492847919465, "epoch": 0.031929098801186336, "grad_norm": 13.487873077392578, "learning_rate": 4.98837547224644e-06, "loss": 0.5666, "mean_token_accuracy": 0.8163850530982018, "num_tokens": 12367374.0, "step": 10300 }, { "entropy": 1.907564203441143, "epoch": 0.03196009792623603, "grad_norm": 14.73357105255127, "learning_rate": 4.99321902547709e-06, "loss": 0.6864, "mean_token_accuracy": 0.804863877594471, "num_tokens": 12379185.0, "step": 10310 }, { "entropy": 1.9018947511911393, "epoch": 0.03199109705128573, "grad_norm": 12.761754035949707, "learning_rate": 4.99806257870774e-06, "loss": 0.6285, "mean_token_accuracy": 0.7974288746714592, "num_tokens": 12390639.0, "step": 10320 }, { "entropy": 1.8945544630289077, "epoch": 0.032022096176335425, "grad_norm": 12.64550495147705, "learning_rate": 5.002906131938391e-06, "loss": 0.696, "mean_token_accuracy": 0.7960980877280235, "num_tokens": 12401445.0, "step": 10330 }, { "entropy": 1.8769317850470544, "epoch": 0.032053095301385115, "grad_norm": 10.872472763061523, "learning_rate": 5.00774968516904e-06, "loss": 0.6736, "mean_token_accuracy": 0.8056196987628936, "num_tokens": 12412842.0, "step": 10340 }, { "entropy": 1.874910195171833, "epoch": 0.03208409442643481, "grad_norm": 11.449685096740723, "learning_rate": 5.01259323839969e-06, "loss": 0.6081, "mean_token_accuracy": 0.8147278919816017, "num_tokens": 12424094.0, "step": 10350 }, { "entropy": 1.8590925335884094, "epoch": 0.03211509355148451, "grad_norm": 13.95566177368164, "learning_rate": 5.0174367916303405e-06, "loss": 0.5992, "mean_token_accuracy": 0.8060554280877114, "num_tokens": 12436102.0, "step": 10360 }, { "entropy": 1.9232334434986114, "epoch": 0.032146092676534205, "grad_norm": 13.31452751159668, "learning_rate": 5.02228034486099e-06, "loss": 0.6143, "mean_token_accuracy": 0.819976469874382, "num_tokens": 12447424.0, "step": 10370 }, { "entropy": 1.860893575847149, "epoch": 0.0321770918015839, "grad_norm": 11.070972442626953, "learning_rate": 5.02712389809164e-06, "loss": 0.6446, "mean_token_accuracy": 0.8143087536096573, "num_tokens": 12459620.0, "step": 10380 }, { "entropy": 1.8405973672866822, "epoch": 0.0322080909266336, "grad_norm": 11.35168743133545, "learning_rate": 5.03196745132229e-06, "loss": 0.5767, "mean_token_accuracy": 0.8185856312513351, "num_tokens": 12471581.0, "step": 10390 }, { "entropy": 1.8092712104320525, "epoch": 0.032239090051683295, "grad_norm": 12.700512886047363, "learning_rate": 5.036811004552941e-06, "loss": 0.5496, "mean_token_accuracy": 0.8279015332460403, "num_tokens": 12483574.0, "step": 10400 }, { "entropy": 1.772294245660305, "epoch": 0.032270089176732984, "grad_norm": 11.940170288085938, "learning_rate": 5.041654557783591e-06, "loss": 0.5448, "mean_token_accuracy": 0.8129552945494651, "num_tokens": 12496082.0, "step": 10410 }, { "entropy": 1.7716655775904655, "epoch": 0.03230108830178268, "grad_norm": 5.408534049987793, "learning_rate": 5.046498111014241e-06, "loss": 0.5785, "mean_token_accuracy": 0.8172632664442062, "num_tokens": 12508722.0, "step": 10420 }, { "entropy": 1.7848970398306847, "epoch": 0.03233208742683238, "grad_norm": 10.916991233825684, "learning_rate": 5.0513416642448905e-06, "loss": 0.5381, "mean_token_accuracy": 0.822851151227951, "num_tokens": 12521638.0, "step": 10430 }, { "entropy": 1.88555389046669, "epoch": 0.032363086551882074, "grad_norm": 16.887577056884766, "learning_rate": 5.0561852174755396e-06, "loss": 0.6788, "mean_token_accuracy": 0.7913921490311623, "num_tokens": 12533306.0, "step": 10440 }, { "entropy": 1.8448584645986557, "epoch": 0.03239408567693177, "grad_norm": 5.770571708679199, "learning_rate": 5.06102877070619e-06, "loss": 0.545, "mean_token_accuracy": 0.8273654997348785, "num_tokens": 12544900.0, "step": 10450 }, { "entropy": 1.8120036974549294, "epoch": 0.03242508480198147, "grad_norm": 12.20251178741455, "learning_rate": 5.06587232393684e-06, "loss": 0.5249, "mean_token_accuracy": 0.8255019530653953, "num_tokens": 12557961.0, "step": 10460 }, { "entropy": 1.8894508898258209, "epoch": 0.03245608392703116, "grad_norm": 12.401219367980957, "learning_rate": 5.07071587716749e-06, "loss": 0.6334, "mean_token_accuracy": 0.8066725865006447, "num_tokens": 12569409.0, "step": 10470 }, { "entropy": 1.8568513855338096, "epoch": 0.03248708305208085, "grad_norm": 15.682320594787598, "learning_rate": 5.07555943039814e-06, "loss": 0.5906, "mean_token_accuracy": 0.8160646855831146, "num_tokens": 12581146.0, "step": 10480 }, { "entropy": 1.806802648305893, "epoch": 0.03251808217713055, "grad_norm": 13.31312370300293, "learning_rate": 5.080402983628791e-06, "loss": 0.5478, "mean_token_accuracy": 0.8170752301812172, "num_tokens": 12593979.0, "step": 10490 }, { "entropy": 1.8914892196655273, "epoch": 0.03254908130218025, "grad_norm": 13.41655445098877, "learning_rate": 5.0852465368594406e-06, "loss": 0.6523, "mean_token_accuracy": 0.8074697732925415, "num_tokens": 12604872.0, "step": 10500 }, { "entropy": 1.8397974416613578, "epoch": 0.03258008042722994, "grad_norm": 13.245908737182617, "learning_rate": 5.0900900900900905e-06, "loss": 0.6411, "mean_token_accuracy": 0.8051371321082115, "num_tokens": 12616464.0, "step": 10510 }, { "entropy": 1.7762023329734802, "epoch": 0.03261107955227964, "grad_norm": 12.972115516662598, "learning_rate": 5.094933643320741e-06, "loss": 0.5699, "mean_token_accuracy": 0.8134799718856811, "num_tokens": 12628431.0, "step": 10520 }, { "entropy": 1.7372042164206505, "epoch": 0.03264207867732933, "grad_norm": 14.975220680236816, "learning_rate": 5.099777196551391e-06, "loss": 0.5565, "mean_token_accuracy": 0.8230763703584671, "num_tokens": 12641420.0, "step": 10530 }, { "entropy": 1.8806333974003793, "epoch": 0.032673077802379026, "grad_norm": 10.917582511901855, "learning_rate": 5.104620749782041e-06, "loss": 0.6079, "mean_token_accuracy": 0.8124522119760513, "num_tokens": 12652755.0, "step": 10540 }, { "entropy": 1.8468591958284377, "epoch": 0.03270407692742872, "grad_norm": 15.210384368896484, "learning_rate": 5.10946430301269e-06, "loss": 0.6178, "mean_token_accuracy": 0.8065985247492791, "num_tokens": 12664093.0, "step": 10550 }, { "entropy": 1.7937588766217232, "epoch": 0.03273507605247842, "grad_norm": 11.96823787689209, "learning_rate": 5.11430785624334e-06, "loss": 0.5948, "mean_token_accuracy": 0.8129746183753014, "num_tokens": 12676810.0, "step": 10560 }, { "entropy": 1.8590254932641983, "epoch": 0.032766075177528116, "grad_norm": 13.634770393371582, "learning_rate": 5.119151409473991e-06, "loss": 0.622, "mean_token_accuracy": 0.812176737189293, "num_tokens": 12688390.0, "step": 10570 }, { "entropy": 1.8288069173693657, "epoch": 0.03279707430257781, "grad_norm": 19.997926712036133, "learning_rate": 5.1239949627046405e-06, "loss": 0.6365, "mean_token_accuracy": 0.8077499896287919, "num_tokens": 12700682.0, "step": 10580 }, { "entropy": 1.9130954205989839, "epoch": 0.0328280734276275, "grad_norm": 13.037125587463379, "learning_rate": 5.12883851593529e-06, "loss": 0.7036, "mean_token_accuracy": 0.8007219597697258, "num_tokens": 12711160.0, "step": 10590 }, { "entropy": 1.908418272435665, "epoch": 0.0328590725526772, "grad_norm": 9.9063720703125, "learning_rate": 5.13368206916594e-06, "loss": 0.6536, "mean_token_accuracy": 0.8085066750645638, "num_tokens": 12722404.0, "step": 10600 }, { "entropy": 1.88576088398695, "epoch": 0.032890071677726895, "grad_norm": 15.735233306884766, "learning_rate": 5.138525622396591e-06, "loss": 0.6415, "mean_token_accuracy": 0.8067146822810173, "num_tokens": 12735219.0, "step": 10610 }, { "entropy": 1.8483037024736404, "epoch": 0.03292107080277659, "grad_norm": 16.17683982849121, "learning_rate": 5.143369175627241e-06, "loss": 0.5999, "mean_token_accuracy": 0.809206846356392, "num_tokens": 12747804.0, "step": 10620 }, { "entropy": 1.7985512629151343, "epoch": 0.03295206992782629, "grad_norm": 5.797553062438965, "learning_rate": 5.148212728857891e-06, "loss": 0.5436, "mean_token_accuracy": 0.8201219871640205, "num_tokens": 12760529.0, "step": 10630 }, { "entropy": 1.829072842001915, "epoch": 0.032983069052875985, "grad_norm": 11.334760665893555, "learning_rate": 5.153056282088541e-06, "loss": 0.5998, "mean_token_accuracy": 0.8162604227662087, "num_tokens": 12772598.0, "step": 10640 }, { "entropy": 1.8073836967349053, "epoch": 0.033014068177925675, "grad_norm": 11.55426025390625, "learning_rate": 5.15789983531919e-06, "loss": 0.5323, "mean_token_accuracy": 0.8164139956235885, "num_tokens": 12784907.0, "step": 10650 }, { "entropy": 1.8018400803208352, "epoch": 0.03304506730297537, "grad_norm": 11.91412353515625, "learning_rate": 5.16274338854984e-06, "loss": 0.5519, "mean_token_accuracy": 0.8153271213173866, "num_tokens": 12797883.0, "step": 10660 }, { "entropy": 1.7438832193613052, "epoch": 0.03307606642802507, "grad_norm": 13.903759956359863, "learning_rate": 5.16758694178049e-06, "loss": 0.5102, "mean_token_accuracy": 0.8333475485444068, "num_tokens": 12810429.0, "step": 10670 }, { "entropy": 1.8345250859856606, "epoch": 0.033107065553074765, "grad_norm": 12.650176048278809, "learning_rate": 5.17243049501114e-06, "loss": 0.5793, "mean_token_accuracy": 0.8180657878518105, "num_tokens": 12822129.0, "step": 10680 }, { "entropy": 1.8904076486825943, "epoch": 0.03313806467812446, "grad_norm": 10.247991561889648, "learning_rate": 5.17727404824179e-06, "loss": 0.6336, "mean_token_accuracy": 0.8089143499732018, "num_tokens": 12833756.0, "step": 10690 }, { "entropy": 1.8453685097396373, "epoch": 0.03316906380317416, "grad_norm": 6.841571807861328, "learning_rate": 5.182117601472441e-06, "loss": 0.5723, "mean_token_accuracy": 0.8071049571037292, "num_tokens": 12846112.0, "step": 10700 }, { "entropy": 1.8263072147965431, "epoch": 0.03320006292822385, "grad_norm": 10.83479118347168, "learning_rate": 5.186961154703091e-06, "loss": 0.5561, "mean_token_accuracy": 0.8183331623673439, "num_tokens": 12858793.0, "step": 10710 }, { "entropy": 1.9237968116998672, "epoch": 0.033231062053273544, "grad_norm": 14.300333023071289, "learning_rate": 5.1918047079337406e-06, "loss": 0.6285, "mean_token_accuracy": 0.8162373065948486, "num_tokens": 12869520.0, "step": 10720 }, { "entropy": 1.804058338701725, "epoch": 0.03326206117832324, "grad_norm": 12.697318077087402, "learning_rate": 5.196648261164391e-06, "loss": 0.5877, "mean_token_accuracy": 0.8053817078471184, "num_tokens": 12883158.0, "step": 10730 }, { "entropy": 1.858067548274994, "epoch": 0.03329306030337294, "grad_norm": 12.211063385009766, "learning_rate": 5.201491814395041e-06, "loss": 0.5606, "mean_token_accuracy": 0.8255058348178863, "num_tokens": 12894190.0, "step": 10740 }, { "entropy": 1.8079882711172104, "epoch": 0.033324059428422634, "grad_norm": 6.42866325378418, "learning_rate": 5.206335367625691e-06, "loss": 0.5676, "mean_token_accuracy": 0.8178721934556961, "num_tokens": 12907324.0, "step": 10750 }, { "entropy": 1.851043240725994, "epoch": 0.03335505855347233, "grad_norm": 13.750490188598633, "learning_rate": 5.21117892085634e-06, "loss": 0.6343, "mean_token_accuracy": 0.8060193166136742, "num_tokens": 12918683.0, "step": 10760 }, { "entropy": 1.8697371006011962, "epoch": 0.03338605767852203, "grad_norm": 6.069192409515381, "learning_rate": 5.21602247408699e-06, "loss": 0.6548, "mean_token_accuracy": 0.8067969933152199, "num_tokens": 12930503.0, "step": 10770 }, { "entropy": 1.833288662135601, "epoch": 0.03341705680357172, "grad_norm": 11.35355281829834, "learning_rate": 5.220866027317641e-06, "loss": 0.575, "mean_token_accuracy": 0.8082702040672303, "num_tokens": 12943156.0, "step": 10780 }, { "entropy": 1.895593549311161, "epoch": 0.03344805592862141, "grad_norm": 6.248627662658691, "learning_rate": 5.225709580548291e-06, "loss": 0.629, "mean_token_accuracy": 0.8071183800697327, "num_tokens": 12955988.0, "step": 10790 }, { "entropy": 1.8191024243831635, "epoch": 0.03347905505367111, "grad_norm": 4.646948337554932, "learning_rate": 5.2305531337789405e-06, "loss": 0.6002, "mean_token_accuracy": 0.81056018024683, "num_tokens": 12969403.0, "step": 10800 }, { "entropy": 1.8694421991705894, "epoch": 0.033510054178720806, "grad_norm": 6.7921977043151855, "learning_rate": 5.23539668700959e-06, "loss": 0.5993, "mean_token_accuracy": 0.8087976023554801, "num_tokens": 12981414.0, "step": 10810 }, { "entropy": 1.8624958485364913, "epoch": 0.0335410533037705, "grad_norm": 14.213687896728516, "learning_rate": 5.240240240240241e-06, "loss": 0.6583, "mean_token_accuracy": 0.8021715626120567, "num_tokens": 12992353.0, "step": 10820 }, { "entropy": 1.7845357745885848, "epoch": 0.0335720524288202, "grad_norm": 5.307274341583252, "learning_rate": 5.245083793470891e-06, "loss": 0.5681, "mean_token_accuracy": 0.8313144221901894, "num_tokens": 13004955.0, "step": 10830 }, { "entropy": 1.8418287009000778, "epoch": 0.03360305155386989, "grad_norm": 8.079380989074707, "learning_rate": 5.249927346701541e-06, "loss": 0.5771, "mean_token_accuracy": 0.8063233241438865, "num_tokens": 13017444.0, "step": 10840 }, { "entropy": 1.8194942593574523, "epoch": 0.033634050678919586, "grad_norm": 10.57718563079834, "learning_rate": 5.254770899932191e-06, "loss": 0.5911, "mean_token_accuracy": 0.8096415162086487, "num_tokens": 13029772.0, "step": 10850 }, { "entropy": 1.910832443833351, "epoch": 0.03366504980396928, "grad_norm": 11.854331970214844, "learning_rate": 5.25961445316284e-06, "loss": 0.6181, "mean_token_accuracy": 0.8215459808707237, "num_tokens": 13041096.0, "step": 10860 }, { "entropy": 1.8814880549907684, "epoch": 0.03369604892901898, "grad_norm": 12.540098190307617, "learning_rate": 5.2644580063934905e-06, "loss": 0.6347, "mean_token_accuracy": 0.805778457224369, "num_tokens": 13052804.0, "step": 10870 }, { "entropy": 1.8503670692443848, "epoch": 0.033727048054068676, "grad_norm": 5.048069477081299, "learning_rate": 5.26930155962414e-06, "loss": 0.5956, "mean_token_accuracy": 0.8164945662021637, "num_tokens": 13064995.0, "step": 10880 }, { "entropy": 1.871761327981949, "epoch": 0.03375804717911837, "grad_norm": 17.457483291625977, "learning_rate": 5.27414511285479e-06, "loss": 0.5965, "mean_token_accuracy": 0.8171331033110618, "num_tokens": 13076407.0, "step": 10890 }, { "entropy": 1.8655700251460074, "epoch": 0.03378904630416806, "grad_norm": 5.408127307891846, "learning_rate": 5.27898866608544e-06, "loss": 0.6076, "mean_token_accuracy": 0.8107570946216583, "num_tokens": 13088254.0, "step": 10900 }, { "entropy": 1.9251561522483827, "epoch": 0.03382004542921776, "grad_norm": 11.156283378601074, "learning_rate": 5.283832219316091e-06, "loss": 0.708, "mean_token_accuracy": 0.7975843846797943, "num_tokens": 13099187.0, "step": 10910 }, { "entropy": 1.9327981039881705, "epoch": 0.033851044554267455, "grad_norm": 15.034012794494629, "learning_rate": 5.288675772546741e-06, "loss": 0.6567, "mean_token_accuracy": 0.8078183338046074, "num_tokens": 13110633.0, "step": 10920 }, { "entropy": 1.8207261368632317, "epoch": 0.03388204367931715, "grad_norm": 13.088176727294922, "learning_rate": 5.293519325777391e-06, "loss": 0.5434, "mean_token_accuracy": 0.8240138441324234, "num_tokens": 13122812.0, "step": 10930 }, { "entropy": 1.914224511384964, "epoch": 0.03391304280436685, "grad_norm": 12.61460018157959, "learning_rate": 5.298362879008041e-06, "loss": 0.6495, "mean_token_accuracy": 0.8125144988298416, "num_tokens": 13133966.0, "step": 10940 }, { "entropy": 1.8054762348532676, "epoch": 0.033944041929416545, "grad_norm": 12.354558944702148, "learning_rate": 5.303206432238691e-06, "loss": 0.5174, "mean_token_accuracy": 0.8258527040481567, "num_tokens": 13146315.0, "step": 10950 }, { "entropy": 1.7843435242772103, "epoch": 0.033975041054466235, "grad_norm": 11.364967346191406, "learning_rate": 5.308049985469341e-06, "loss": 0.5668, "mean_token_accuracy": 0.8167797103524208, "num_tokens": 13158286.0, "step": 10960 }, { "entropy": 1.8914837822318078, "epoch": 0.03400604017951593, "grad_norm": 12.683460235595703, "learning_rate": 5.31289353869999e-06, "loss": 0.6436, "mean_token_accuracy": 0.8010558471083641, "num_tokens": 13170443.0, "step": 10970 }, { "entropy": 1.8247480258345603, "epoch": 0.03403703930456563, "grad_norm": 5.69301176071167, "learning_rate": 5.31773709193064e-06, "loss": 0.5587, "mean_token_accuracy": 0.8227360025048256, "num_tokens": 13181845.0, "step": 10980 }, { "entropy": 1.8641392514109612, "epoch": 0.034068038429615324, "grad_norm": 12.605916023254395, "learning_rate": 5.322580645161291e-06, "loss": 0.6679, "mean_token_accuracy": 0.80641338378191, "num_tokens": 13193899.0, "step": 10990 }, { "entropy": 1.8444667972624302, "epoch": 0.03409903755466502, "grad_norm": 5.727068901062012, "learning_rate": 5.327424198391941e-06, "loss": 0.5699, "mean_token_accuracy": 0.8202215030789375, "num_tokens": 13206043.0, "step": 11000 }, { "entropy": 1.773200060427189, "epoch": 0.03413003667971472, "grad_norm": 13.433945655822754, "learning_rate": 5.332267751622591e-06, "loss": 0.5159, "mean_token_accuracy": 0.827538113296032, "num_tokens": 13218691.0, "step": 11010 }, { "entropy": 1.9272786289453507, "epoch": 0.03416103580476441, "grad_norm": 11.8448486328125, "learning_rate": 5.3371113048532405e-06, "loss": 0.6449, "mean_token_accuracy": 0.8075118377804756, "num_tokens": 13229756.0, "step": 11020 }, { "entropy": 1.8923597291111947, "epoch": 0.034192034929814104, "grad_norm": 5.769096374511719, "learning_rate": 5.341954858083891e-06, "loss": 0.5911, "mean_token_accuracy": 0.8067701622843743, "num_tokens": 13241705.0, "step": 11030 }, { "entropy": 1.8403789684176446, "epoch": 0.0342230340548638, "grad_norm": 12.353031158447266, "learning_rate": 5.346798411314541e-06, "loss": 0.5409, "mean_token_accuracy": 0.8218411967158318, "num_tokens": 13254778.0, "step": 11040 }, { "entropy": 1.8932681947946548, "epoch": 0.0342540331799135, "grad_norm": 5.458434581756592, "learning_rate": 5.351641964545191e-06, "loss": 0.5972, "mean_token_accuracy": 0.8183223947882652, "num_tokens": 13266049.0, "step": 11050 }, { "entropy": 1.86180839240551, "epoch": 0.034285032304963194, "grad_norm": 8.72637939453125, "learning_rate": 5.356485517775841e-06, "loss": 0.5903, "mean_token_accuracy": 0.8129496097564697, "num_tokens": 13277888.0, "step": 11060 }, { "entropy": 1.9068016976118087, "epoch": 0.03431603143001289, "grad_norm": 12.052847862243652, "learning_rate": 5.36132907100649e-06, "loss": 0.6476, "mean_token_accuracy": 0.8061621204018593, "num_tokens": 13289432.0, "step": 11070 }, { "entropy": 1.8573195546865464, "epoch": 0.03434703055506259, "grad_norm": 10.996872901916504, "learning_rate": 5.366172624237141e-06, "loss": 0.5506, "mean_token_accuracy": 0.818072022497654, "num_tokens": 13301147.0, "step": 11080 }, { "entropy": 1.8704863846302033, "epoch": 0.034378029680112276, "grad_norm": 13.163795471191406, "learning_rate": 5.3710161774677905e-06, "loss": 0.5488, "mean_token_accuracy": 0.816085159778595, "num_tokens": 13312829.0, "step": 11090 }, { "entropy": 1.8130246475338936, "epoch": 0.03440902880516197, "grad_norm": 14.593989372253418, "learning_rate": 5.37585973069844e-06, "loss": 0.585, "mean_token_accuracy": 0.8232684478163719, "num_tokens": 13325336.0, "step": 11100 }, { "entropy": 1.835708625614643, "epoch": 0.03444002793021167, "grad_norm": 12.702786445617676, "learning_rate": 5.380703283929091e-06, "loss": 0.5843, "mean_token_accuracy": 0.8147074475884437, "num_tokens": 13337824.0, "step": 11110 }, { "entropy": 1.8307800009846686, "epoch": 0.034471027055261366, "grad_norm": 11.551653861999512, "learning_rate": 5.385546837159741e-06, "loss": 0.594, "mean_token_accuracy": 0.811168585717678, "num_tokens": 13349534.0, "step": 11120 }, { "entropy": 1.8517664805054665, "epoch": 0.03450202618031106, "grad_norm": 2.7256433963775635, "learning_rate": 5.390390390390391e-06, "loss": 0.602, "mean_token_accuracy": 0.8006679996848106, "num_tokens": 13363408.0, "step": 11130 }, { "entropy": 1.8399043202400207, "epoch": 0.03453302530536076, "grad_norm": 7.206852912902832, "learning_rate": 5.395233943621041e-06, "loss": 0.6157, "mean_token_accuracy": 0.8117770627140999, "num_tokens": 13376213.0, "step": 11140 }, { "entropy": 1.9139886111021043, "epoch": 0.03456402443041045, "grad_norm": 12.817607879638672, "learning_rate": 5.4000774968516915e-06, "loss": 0.6346, "mean_token_accuracy": 0.8084447503089904, "num_tokens": 13387413.0, "step": 11150 }, { "entropy": 1.8213655844330787, "epoch": 0.034595023555460146, "grad_norm": 11.663803100585938, "learning_rate": 5.404921050082341e-06, "loss": 0.6088, "mean_token_accuracy": 0.8164772510528564, "num_tokens": 13399017.0, "step": 11160 }, { "entropy": 1.8936782956123352, "epoch": 0.03462602268050984, "grad_norm": 10.823102951049805, "learning_rate": 5.409764603312991e-06, "loss": 0.6693, "mean_token_accuracy": 0.810118442773819, "num_tokens": 13410180.0, "step": 11170 }, { "entropy": 1.8289520889520645, "epoch": 0.03465702180555954, "grad_norm": 12.268133163452148, "learning_rate": 5.41460815654364e-06, "loss": 0.5937, "mean_token_accuracy": 0.812929916381836, "num_tokens": 13422880.0, "step": 11180 }, { "entropy": 1.7611708968877793, "epoch": 0.034688020930609235, "grad_norm": 5.681921482086182, "learning_rate": 5.41945170977429e-06, "loss": 0.5208, "mean_token_accuracy": 0.8295134902000427, "num_tokens": 13436525.0, "step": 11190 }, { "entropy": 1.894238282740116, "epoch": 0.03471902005565893, "grad_norm": 9.887272834777832, "learning_rate": 5.424295263004941e-06, "loss": 0.6165, "mean_token_accuracy": 0.8203635275363922, "num_tokens": 13447769.0, "step": 11200 }, { "entropy": 1.8931616693735123, "epoch": 0.03475001918070862, "grad_norm": 10.567079544067383, "learning_rate": 5.429138816235591e-06, "loss": 0.6151, "mean_token_accuracy": 0.813125379383564, "num_tokens": 13459142.0, "step": 11210 }, { "entropy": 1.8201383396983146, "epoch": 0.03478101830575832, "grad_norm": 6.484344482421875, "learning_rate": 5.433982369466241e-06, "loss": 0.5731, "mean_token_accuracy": 0.8103004142642021, "num_tokens": 13471313.0, "step": 11220 }, { "entropy": 1.8642778664827346, "epoch": 0.034812017430808015, "grad_norm": 13.053911209106445, "learning_rate": 5.4388259226968906e-06, "loss": 0.5937, "mean_token_accuracy": 0.8092061296105385, "num_tokens": 13483091.0, "step": 11230 }, { "entropy": 1.8669605866074561, "epoch": 0.03484301655585771, "grad_norm": 11.9703369140625, "learning_rate": 5.443669475927541e-06, "loss": 0.6023, "mean_token_accuracy": 0.8229537546634674, "num_tokens": 13494177.0, "step": 11240 }, { "entropy": 1.85545664280653, "epoch": 0.03487401568090741, "grad_norm": 6.333286762237549, "learning_rate": 5.448513029158191e-06, "loss": 0.6025, "mean_token_accuracy": 0.810517068207264, "num_tokens": 13507247.0, "step": 11250 }, { "entropy": 1.7715984418988229, "epoch": 0.034905014805957105, "grad_norm": 10.145940780639648, "learning_rate": 5.453356582388841e-06, "loss": 0.5342, "mean_token_accuracy": 0.825646486878395, "num_tokens": 13520781.0, "step": 11260 }, { "entropy": 1.9231941044330596, "epoch": 0.034936013931006794, "grad_norm": 12.940445899963379, "learning_rate": 5.458200135619491e-06, "loss": 0.6732, "mean_token_accuracy": 0.799634762108326, "num_tokens": 13531942.0, "step": 11270 }, { "entropy": 1.8580936834216117, "epoch": 0.03496701305605649, "grad_norm": 13.450532913208008, "learning_rate": 5.46304368885014e-06, "loss": 0.6076, "mean_token_accuracy": 0.8047621414065361, "num_tokens": 13543831.0, "step": 11280 }, { "entropy": 1.8055393621325493, "epoch": 0.03499801218110619, "grad_norm": 12.781180381774902, "learning_rate": 5.467887242080791e-06, "loss": 0.5527, "mean_token_accuracy": 0.8207426607608795, "num_tokens": 13555967.0, "step": 11290 }, { "entropy": 1.804980905354023, "epoch": 0.035029011306155884, "grad_norm": 12.926373481750488, "learning_rate": 5.472730795311441e-06, "loss": 0.5702, "mean_token_accuracy": 0.8110515549778938, "num_tokens": 13569161.0, "step": 11300 }, { "entropy": 1.9503383368253708, "epoch": 0.03506001043120558, "grad_norm": 12.764983177185059, "learning_rate": 5.4775743485420905e-06, "loss": 0.6721, "mean_token_accuracy": 0.7947013214230537, "num_tokens": 13580275.0, "step": 11310 }, { "entropy": 1.88107870221138, "epoch": 0.03509100955625528, "grad_norm": 14.808971405029297, "learning_rate": 5.482417901772741e-06, "loss": 0.6075, "mean_token_accuracy": 0.8077093288302422, "num_tokens": 13592745.0, "step": 11320 }, { "entropy": 1.8953585937619208, "epoch": 0.03512200868130497, "grad_norm": 12.039188385009766, "learning_rate": 5.487261455003391e-06, "loss": 0.6186, "mean_token_accuracy": 0.8103330656886101, "num_tokens": 13604350.0, "step": 11330 }, { "entropy": 1.8628807738423347, "epoch": 0.035153007806354664, "grad_norm": 11.431326866149902, "learning_rate": 5.492105008234041e-06, "loss": 0.5886, "mean_token_accuracy": 0.8245170086622238, "num_tokens": 13616428.0, "step": 11340 }, { "entropy": 1.9492103517055512, "epoch": 0.03518400693140436, "grad_norm": 13.414464950561523, "learning_rate": 5.496948561464691e-06, "loss": 0.6753, "mean_token_accuracy": 0.7961807489395142, "num_tokens": 13627865.0, "step": 11350 }, { "entropy": 1.9296086087822915, "epoch": 0.03521500605645406, "grad_norm": 12.078662872314453, "learning_rate": 5.501792114695342e-06, "loss": 0.7047, "mean_token_accuracy": 0.7942573204636574, "num_tokens": 13639469.0, "step": 11360 }, { "entropy": 1.9266355335712433, "epoch": 0.03524600518150375, "grad_norm": 10.940107345581055, "learning_rate": 5.5066356679259915e-06, "loss": 0.6479, "mean_token_accuracy": 0.8066198214888572, "num_tokens": 13650098.0, "step": 11370 }, { "entropy": 1.7104891866445542, "epoch": 0.03527700430655345, "grad_norm": 13.101654052734375, "learning_rate": 5.511479221156641e-06, "loss": 0.5101, "mean_token_accuracy": 0.8279589235782623, "num_tokens": 13664250.0, "step": 11380 }, { "entropy": 1.8383988574147225, "epoch": 0.03530800343160314, "grad_norm": 12.127103805541992, "learning_rate": 5.51632277438729e-06, "loss": 0.6044, "mean_token_accuracy": 0.8270195707678795, "num_tokens": 13675591.0, "step": 11390 }, { "entropy": 1.8982412829995154, "epoch": 0.035339002556652836, "grad_norm": 13.012894630432129, "learning_rate": 5.52116632761794e-06, "loss": 0.6433, "mean_token_accuracy": 0.8037685632705689, "num_tokens": 13687364.0, "step": 11400 }, { "entropy": 1.85855975151062, "epoch": 0.03537000168170253, "grad_norm": 5.7059454917907715, "learning_rate": 5.526009880848591e-06, "loss": 0.5678, "mean_token_accuracy": 0.8210862413048744, "num_tokens": 13699442.0, "step": 11410 }, { "entropy": 1.8508766368031502, "epoch": 0.03540100080675223, "grad_norm": 12.565545082092285, "learning_rate": 5.530853434079241e-06, "loss": 0.6132, "mean_token_accuracy": 0.8048885554075241, "num_tokens": 13711862.0, "step": 11420 }, { "entropy": 1.8528811484575272, "epoch": 0.035431999931801926, "grad_norm": 10.851259231567383, "learning_rate": 5.535696987309891e-06, "loss": 0.6248, "mean_token_accuracy": 0.8083219036459923, "num_tokens": 13723049.0, "step": 11430 }, { "entropy": 1.7467531949281692, "epoch": 0.03546299905685162, "grad_norm": 12.76328182220459, "learning_rate": 5.540540540540541e-06, "loss": 0.4742, "mean_token_accuracy": 0.8341274619102478, "num_tokens": 13736433.0, "step": 11440 }, { "entropy": 1.9269695818424224, "epoch": 0.03549399818190132, "grad_norm": 13.179561614990234, "learning_rate": 5.545384093771191e-06, "loss": 0.6138, "mean_token_accuracy": 0.8090446889400482, "num_tokens": 13748433.0, "step": 11450 }, { "entropy": 1.871768619120121, "epoch": 0.03552499730695101, "grad_norm": 11.79694652557373, "learning_rate": 5.550227647001841e-06, "loss": 0.6131, "mean_token_accuracy": 0.8084805279970169, "num_tokens": 13760554.0, "step": 11460 }, { "entropy": 1.899412962794304, "epoch": 0.035555996432000705, "grad_norm": 10.14828872680664, "learning_rate": 5.555071200232491e-06, "loss": 0.6452, "mean_token_accuracy": 0.8113683596253395, "num_tokens": 13771849.0, "step": 11470 }, { "entropy": 1.8810555890202523, "epoch": 0.0355869955570504, "grad_norm": 13.629297256469727, "learning_rate": 5.559914753463141e-06, "loss": 0.6262, "mean_token_accuracy": 0.8057652726769448, "num_tokens": 13783349.0, "step": 11480 }, { "entropy": 1.8832147046923637, "epoch": 0.0356179946821001, "grad_norm": 11.237863540649414, "learning_rate": 5.56475830669379e-06, "loss": 0.6097, "mean_token_accuracy": 0.80062695145607, "num_tokens": 13794988.0, "step": 11490 }, { "entropy": 1.734711329638958, "epoch": 0.035648993807149795, "grad_norm": 15.71545696258545, "learning_rate": 5.569601859924441e-06, "loss": 0.459, "mean_token_accuracy": 0.8244102329015732, "num_tokens": 13808196.0, "step": 11500 }, { "entropy": 1.8736824676394463, "epoch": 0.03567999293219949, "grad_norm": 10.090490341186523, "learning_rate": 5.574445413155091e-06, "loss": 0.5995, "mean_token_accuracy": 0.8152802541851998, "num_tokens": 13820833.0, "step": 11510 }, { "entropy": 1.7825218871235848, "epoch": 0.03571099205724918, "grad_norm": 10.388526916503906, "learning_rate": 5.579288966385741e-06, "loss": 0.4678, "mean_token_accuracy": 0.8239463314414024, "num_tokens": 13834118.0, "step": 11520 }, { "entropy": 1.9106709718704225, "epoch": 0.03574199118229888, "grad_norm": 5.574368000030518, "learning_rate": 5.584132519616391e-06, "loss": 0.6794, "mean_token_accuracy": 0.8046201914548874, "num_tokens": 13845774.0, "step": 11530 }, { "entropy": 1.9362187922000884, "epoch": 0.035772990307348575, "grad_norm": 11.167245864868164, "learning_rate": 5.588976072847041e-06, "loss": 0.6901, "mean_token_accuracy": 0.7971135929226876, "num_tokens": 13856205.0, "step": 11540 }, { "entropy": 1.792908415198326, "epoch": 0.03580398943239827, "grad_norm": 5.944072723388672, "learning_rate": 5.593819626077691e-06, "loss": 0.5636, "mean_token_accuracy": 0.8229343697428704, "num_tokens": 13868827.0, "step": 11550 }, { "entropy": 1.7896666795015335, "epoch": 0.03583498855744797, "grad_norm": 5.675596237182617, "learning_rate": 5.598663179308341e-06, "loss": 0.5246, "mean_token_accuracy": 0.8251826211810112, "num_tokens": 13881592.0, "step": 11560 }, { "entropy": 1.9190635159611702, "epoch": 0.035865987682497665, "grad_norm": 14.54418659210205, "learning_rate": 5.603506732538992e-06, "loss": 0.6535, "mean_token_accuracy": 0.8147982686758042, "num_tokens": 13892831.0, "step": 11570 }, { "entropy": 1.9398990795016289, "epoch": 0.035896986807547354, "grad_norm": 5.379489898681641, "learning_rate": 5.608350285769642e-06, "loss": 0.6346, "mean_token_accuracy": 0.807078929245472, "num_tokens": 13903824.0, "step": 11580 }, { "entropy": 1.8159923285245896, "epoch": 0.03592798593259705, "grad_norm": 12.649768829345703, "learning_rate": 5.6131938390002915e-06, "loss": 0.5691, "mean_token_accuracy": 0.8225948542356492, "num_tokens": 13916211.0, "step": 11590 }, { "entropy": 1.8722815930843353, "epoch": 0.03595898505764675, "grad_norm": 11.376723289489746, "learning_rate": 5.6180373922309405e-06, "loss": 0.5969, "mean_token_accuracy": 0.8152380838990212, "num_tokens": 13927975.0, "step": 11600 }, { "entropy": 1.8421933129429817, "epoch": 0.035989984182696444, "grad_norm": 11.949515342712402, "learning_rate": 5.62288094546159e-06, "loss": 0.6167, "mean_token_accuracy": 0.8134393319487572, "num_tokens": 13939474.0, "step": 11610 }, { "entropy": 1.888566829264164, "epoch": 0.03602098330774614, "grad_norm": 17.51249885559082, "learning_rate": 5.627724498692241e-06, "loss": 0.624, "mean_token_accuracy": 0.817312179505825, "num_tokens": 13951148.0, "step": 11620 }, { "entropy": 1.8805680245161056, "epoch": 0.03605198243279584, "grad_norm": 11.506142616271973, "learning_rate": 5.632568051922891e-06, "loss": 0.6413, "mean_token_accuracy": 0.8139393076300621, "num_tokens": 13962535.0, "step": 11630 }, { "entropy": 1.939769622683525, "epoch": 0.03608298155784553, "grad_norm": 11.291607856750488, "learning_rate": 5.637411605153541e-06, "loss": 0.6347, "mean_token_accuracy": 0.81319679915905, "num_tokens": 13973297.0, "step": 11640 }, { "entropy": 1.871855989843607, "epoch": 0.03611398068289522, "grad_norm": 13.56961727142334, "learning_rate": 5.642255158384191e-06, "loss": 0.5749, "mean_token_accuracy": 0.8270270243287087, "num_tokens": 13985337.0, "step": 11650 }, { "entropy": 1.807421001791954, "epoch": 0.03614497980794492, "grad_norm": 4.713283538818359, "learning_rate": 5.6470987116148415e-06, "loss": 0.4808, "mean_token_accuracy": 0.8386715143918991, "num_tokens": 13998425.0, "step": 11660 }, { "entropy": 1.784872618317604, "epoch": 0.03617597893299462, "grad_norm": 11.134720802307129, "learning_rate": 5.651942264845491e-06, "loss": 0.5639, "mean_token_accuracy": 0.8182505384087563, "num_tokens": 14011957.0, "step": 11670 }, { "entropy": 1.842606595158577, "epoch": 0.03620697805804431, "grad_norm": 5.860245227813721, "learning_rate": 5.656785818076141e-06, "loss": 0.5969, "mean_token_accuracy": 0.8185199961066246, "num_tokens": 14023989.0, "step": 11680 }, { "entropy": 1.7569019049406052, "epoch": 0.03623797718309401, "grad_norm": 11.391021728515625, "learning_rate": 5.661629371306791e-06, "loss": 0.4851, "mean_token_accuracy": 0.8376433789730072, "num_tokens": 14037138.0, "step": 11690 }, { "entropy": 1.8167656242847443, "epoch": 0.0362689763081437, "grad_norm": 14.077984809875488, "learning_rate": 5.66647292453744e-06, "loss": 0.5651, "mean_token_accuracy": 0.8293104261159897, "num_tokens": 14049119.0, "step": 11700 }, { "entropy": 1.8543547958135604, "epoch": 0.036299975433193396, "grad_norm": 9.835662841796875, "learning_rate": 5.671316477768091e-06, "loss": 0.598, "mean_token_accuracy": 0.8196697190403939, "num_tokens": 14061251.0, "step": 11710 }, { "entropy": 1.8527500122785567, "epoch": 0.03633097455824309, "grad_norm": 13.09237289428711, "learning_rate": 5.676160030998741e-06, "loss": 0.5879, "mean_token_accuracy": 0.8196158647537232, "num_tokens": 14073877.0, "step": 11720 }, { "entropy": 1.870137917995453, "epoch": 0.03636197368329279, "grad_norm": 12.378225326538086, "learning_rate": 5.681003584229391e-06, "loss": 0.6102, "mean_token_accuracy": 0.81648840457201, "num_tokens": 14086089.0, "step": 11730 }, { "entropy": 1.8904090464115142, "epoch": 0.036392972808342486, "grad_norm": 10.42041301727295, "learning_rate": 5.6858471374600414e-06, "loss": 0.5896, "mean_token_accuracy": 0.8135670512914658, "num_tokens": 14098409.0, "step": 11740 }, { "entropy": 1.9056991636753082, "epoch": 0.03642397193339218, "grad_norm": 5.637151718139648, "learning_rate": 5.690690690690691e-06, "loss": 0.6525, "mean_token_accuracy": 0.8007242888212204, "num_tokens": 14110078.0, "step": 11750 }, { "entropy": 1.7788226932287217, "epoch": 0.03645497105844187, "grad_norm": 6.211390018463135, "learning_rate": 5.695534243921341e-06, "loss": 0.5271, "mean_token_accuracy": 0.8264580845832825, "num_tokens": 14122941.0, "step": 11760 }, { "entropy": 1.8122435554862022, "epoch": 0.03648597018349157, "grad_norm": 10.39504623413086, "learning_rate": 5.700377797151991e-06, "loss": 0.5682, "mean_token_accuracy": 0.8231259554624557, "num_tokens": 14135164.0, "step": 11770 }, { "entropy": 1.9115518778562546, "epoch": 0.036516969308541265, "grad_norm": 11.225345611572266, "learning_rate": 5.705221350382642e-06, "loss": 0.6966, "mean_token_accuracy": 0.8027231857180596, "num_tokens": 14145993.0, "step": 11780 }, { "entropy": 1.833278726041317, "epoch": 0.03654796843359096, "grad_norm": 9.642135620117188, "learning_rate": 5.710064903613292e-06, "loss": 0.6039, "mean_token_accuracy": 0.8039411261677742, "num_tokens": 14159775.0, "step": 11790 }, { "entropy": 1.7943380519747734, "epoch": 0.03657896755864066, "grad_norm": 11.112443923950195, "learning_rate": 5.714908456843942e-06, "loss": 0.5057, "mean_token_accuracy": 0.8308914095163346, "num_tokens": 14172584.0, "step": 11800 }, { "entropy": 1.7970113635063172, "epoch": 0.036609966683690355, "grad_norm": 12.6159029006958, "learning_rate": 5.719752010074591e-06, "loss": 0.5638, "mean_token_accuracy": 0.8075689300894737, "num_tokens": 14184681.0, "step": 11810 }, { "entropy": 1.7890537694096564, "epoch": 0.03664096580874005, "grad_norm": 4.384820461273193, "learning_rate": 5.7245955633052405e-06, "loss": 0.6038, "mean_token_accuracy": 0.8110948413610458, "num_tokens": 14198022.0, "step": 11820 }, { "entropy": 1.814190225303173, "epoch": 0.03667196493378974, "grad_norm": 4.793267726898193, "learning_rate": 5.729439116535891e-06, "loss": 0.5878, "mean_token_accuracy": 0.8212701037526131, "num_tokens": 14209842.0, "step": 11830 }, { "entropy": 1.7815122097730636, "epoch": 0.03670296405883944, "grad_norm": 10.278016090393066, "learning_rate": 5.734282669766541e-06, "loss": 0.5281, "mean_token_accuracy": 0.8179376661777497, "num_tokens": 14222224.0, "step": 11840 }, { "entropy": 1.8016631290316583, "epoch": 0.036733963183889135, "grad_norm": 11.094133377075195, "learning_rate": 5.739126222997191e-06, "loss": 0.5766, "mean_token_accuracy": 0.8138196378946304, "num_tokens": 14234855.0, "step": 11850 }, { "entropy": 1.8083413437008857, "epoch": 0.03676496230893883, "grad_norm": 6.810488700866699, "learning_rate": 5.743969776227841e-06, "loss": 0.6051, "mean_token_accuracy": 0.8194642826914788, "num_tokens": 14246248.0, "step": 11860 }, { "entropy": 1.754711863398552, "epoch": 0.03679596143398853, "grad_norm": 5.887208938598633, "learning_rate": 5.748813329458492e-06, "loss": 0.6317, "mean_token_accuracy": 0.8068835064768791, "num_tokens": 14258816.0, "step": 11870 }, { "entropy": 1.7588348999619483, "epoch": 0.036826960559038224, "grad_norm": 12.953771591186523, "learning_rate": 5.7536568826891415e-06, "loss": 0.5202, "mean_token_accuracy": 0.8235802292823792, "num_tokens": 14271525.0, "step": 11880 }, { "entropy": 1.8742596834897995, "epoch": 0.036857959684087914, "grad_norm": 11.466156959533691, "learning_rate": 5.758500435919791e-06, "loss": 0.6855, "mean_token_accuracy": 0.8022027894854545, "num_tokens": 14282330.0, "step": 11890 }, { "entropy": 1.7715501874685287, "epoch": 0.03688895880913761, "grad_norm": 10.178691864013672, "learning_rate": 5.763343989150441e-06, "loss": 0.5384, "mean_token_accuracy": 0.8171212136745453, "num_tokens": 14294446.0, "step": 11900 }, { "entropy": 1.8660178661346436, "epoch": 0.03691995793418731, "grad_norm": 11.348404884338379, "learning_rate": 5.76818754238109e-06, "loss": 0.6431, "mean_token_accuracy": 0.8038701593875885, "num_tokens": 14305947.0, "step": 11910 }, { "entropy": 1.7659137800335885, "epoch": 0.036950957059237004, "grad_norm": 11.47120475769043, "learning_rate": 5.773031095611741e-06, "loss": 0.5643, "mean_token_accuracy": 0.8206716164946556, "num_tokens": 14318249.0, "step": 11920 }, { "entropy": 1.747839505970478, "epoch": 0.0369819561842867, "grad_norm": 5.5586018562316895, "learning_rate": 5.777874648842391e-06, "loss": 0.559, "mean_token_accuracy": 0.8257301524281502, "num_tokens": 14330993.0, "step": 11930 }, { "entropy": 1.8217350393533707, "epoch": 0.0370129553093364, "grad_norm": 10.629807472229004, "learning_rate": 5.782718202073041e-06, "loss": 0.6345, "mean_token_accuracy": 0.8065755069255829, "num_tokens": 14343198.0, "step": 11940 }, { "entropy": 1.8037606567144393, "epoch": 0.03704395443438609, "grad_norm": 8.647960662841797, "learning_rate": 5.7875617553036915e-06, "loss": 0.598, "mean_token_accuracy": 0.8171042606234551, "num_tokens": 14355609.0, "step": 11950 }, { "entropy": 1.8690994665026666, "epoch": 0.03707495355943578, "grad_norm": 13.143199920654297, "learning_rate": 5.792405308534341e-06, "loss": 0.6191, "mean_token_accuracy": 0.8137925118207932, "num_tokens": 14367235.0, "step": 11960 }, { "entropy": 1.7714310929179191, "epoch": 0.03710595268448548, "grad_norm": 11.381326675415039, "learning_rate": 5.797248861764991e-06, "loss": 0.5207, "mean_token_accuracy": 0.8231312274932862, "num_tokens": 14379885.0, "step": 11970 }, { "entropy": 1.8847584262490273, "epoch": 0.037136951809535176, "grad_norm": 12.403789520263672, "learning_rate": 5.802092414995641e-06, "loss": 0.6177, "mean_token_accuracy": 0.8122402995824813, "num_tokens": 14390884.0, "step": 11980 }, { "entropy": 1.7906603574752809, "epoch": 0.03716795093458487, "grad_norm": 12.341434478759766, "learning_rate": 5.806935968226292e-06, "loss": 0.6091, "mean_token_accuracy": 0.8056104972958564, "num_tokens": 14402659.0, "step": 11990 }, { "entropy": 1.8461511224508285, "epoch": 0.03719895005963457, "grad_norm": 11.438586235046387, "learning_rate": 5.811779521456942e-06, "loss": 0.6092, "mean_token_accuracy": 0.8133256167173386, "num_tokens": 14413793.0, "step": 12000 }, { "entropy": 1.797146451473236, "epoch": 0.03722994918468426, "grad_norm": 12.915786743164062, "learning_rate": 5.816623074687592e-06, "loss": 0.5399, "mean_token_accuracy": 0.826125793159008, "num_tokens": 14426225.0, "step": 12010 }, { "entropy": 1.7861742541193961, "epoch": 0.037260948309733956, "grad_norm": 11.406455993652344, "learning_rate": 5.821466627918241e-06, "loss": 0.5892, "mean_token_accuracy": 0.8146935313940048, "num_tokens": 14438143.0, "step": 12020 }, { "entropy": 1.8457993239164352, "epoch": 0.03729194743478365, "grad_norm": 10.185896873474121, "learning_rate": 5.826310181148891e-06, "loss": 0.6127, "mean_token_accuracy": 0.8021126002073288, "num_tokens": 14451306.0, "step": 12030 }, { "entropy": 1.8906755059957505, "epoch": 0.03732294655983335, "grad_norm": 12.862022399902344, "learning_rate": 5.831153734379541e-06, "loss": 0.722, "mean_token_accuracy": 0.7925963416695595, "num_tokens": 14463297.0, "step": 12040 }, { "entropy": 1.760866042971611, "epoch": 0.037353945684883046, "grad_norm": 12.60151195526123, "learning_rate": 5.835997287610191e-06, "loss": 0.5435, "mean_token_accuracy": 0.8173602819442749, "num_tokens": 14476740.0, "step": 12050 }, { "entropy": 1.8725519806146622, "epoch": 0.03738494480993274, "grad_norm": 13.23414134979248, "learning_rate": 5.840840840840841e-06, "loss": 0.6585, "mean_token_accuracy": 0.804969422519207, "num_tokens": 14488274.0, "step": 12060 }, { "entropy": 1.821217942237854, "epoch": 0.03741594393498243, "grad_norm": 12.871102333068848, "learning_rate": 5.845684394071491e-06, "loss": 0.5877, "mean_token_accuracy": 0.8180323630571366, "num_tokens": 14499989.0, "step": 12070 }, { "entropy": 1.8127268552780151, "epoch": 0.03744694306003213, "grad_norm": 12.796869277954102, "learning_rate": 5.850527947302142e-06, "loss": 0.5265, "mean_token_accuracy": 0.8186282083392143, "num_tokens": 14512825.0, "step": 12080 }, { "entropy": 1.8107111915946006, "epoch": 0.037477942185081825, "grad_norm": 5.6175360679626465, "learning_rate": 5.855371500532792e-06, "loss": 0.5838, "mean_token_accuracy": 0.809958079457283, "num_tokens": 14525109.0, "step": 12090 }, { "entropy": 1.8637285217642785, "epoch": 0.03750894131013152, "grad_norm": 12.704855918884277, "learning_rate": 5.8602150537634415e-06, "loss": 0.6368, "mean_token_accuracy": 0.8088982105255127, "num_tokens": 14536754.0, "step": 12100 }, { "entropy": 1.7325578138232232, "epoch": 0.03753994043518122, "grad_norm": 6.377108573913574, "learning_rate": 5.865058606994092e-06, "loss": 0.5025, "mean_token_accuracy": 0.8252245962619782, "num_tokens": 14551185.0, "step": 12110 }, { "entropy": 1.802834041416645, "epoch": 0.037570939560230915, "grad_norm": 12.303117752075195, "learning_rate": 5.86990216022474e-06, "loss": 0.5427, "mean_token_accuracy": 0.8231405153870582, "num_tokens": 14564613.0, "step": 12120 }, { "entropy": 1.8981367230415345, "epoch": 0.037601938685280605, "grad_norm": 12.030719757080078, "learning_rate": 5.874745713455391e-06, "loss": 0.6299, "mean_token_accuracy": 0.8138510972261429, "num_tokens": 14575459.0, "step": 12130 }, { "entropy": 1.8835298061370849, "epoch": 0.0376329378103303, "grad_norm": 11.876147270202637, "learning_rate": 5.879589266686041e-06, "loss": 0.624, "mean_token_accuracy": 0.8129880890250206, "num_tokens": 14587190.0, "step": 12140 }, { "entropy": 1.8323535963892936, "epoch": 0.03766393693538, "grad_norm": 13.480051040649414, "learning_rate": 5.884432819916691e-06, "loss": 0.5953, "mean_token_accuracy": 0.819337697327137, "num_tokens": 14598092.0, "step": 12150 }, { "entropy": 1.874643650650978, "epoch": 0.037694936060429694, "grad_norm": 10.773601531982422, "learning_rate": 5.889276373147342e-06, "loss": 0.6391, "mean_token_accuracy": 0.8008844360709191, "num_tokens": 14609253.0, "step": 12160 }, { "entropy": 1.864101167023182, "epoch": 0.03772593518547939, "grad_norm": 5.442148208618164, "learning_rate": 5.8941199263779915e-06, "loss": 0.6138, "mean_token_accuracy": 0.8117442667484284, "num_tokens": 14620881.0, "step": 12170 }, { "entropy": 1.9181242629885673, "epoch": 0.03775693431052909, "grad_norm": 10.096672058105469, "learning_rate": 5.898963479608641e-06, "loss": 0.6121, "mean_token_accuracy": 0.8077910885214805, "num_tokens": 14632799.0, "step": 12180 }, { "entropy": 1.870473875105381, "epoch": 0.037787933435578784, "grad_norm": 12.328435897827148, "learning_rate": 5.903807032839291e-06, "loss": 0.5875, "mean_token_accuracy": 0.8157344311475754, "num_tokens": 14643832.0, "step": 12190 }, { "entropy": 1.8657541304826737, "epoch": 0.037818932560628474, "grad_norm": 14.332444190979004, "learning_rate": 5.908650586069942e-06, "loss": 0.62, "mean_token_accuracy": 0.8118143856525422, "num_tokens": 14655521.0, "step": 12200 }, { "entropy": 1.8177946463227272, "epoch": 0.03784993168567817, "grad_norm": 11.780436515808105, "learning_rate": 5.913494139300592e-06, "loss": 0.6046, "mean_token_accuracy": 0.8122022807598114, "num_tokens": 14668492.0, "step": 12210 }, { "entropy": 1.8311690405011176, "epoch": 0.03788093081072787, "grad_norm": 10.684754371643066, "learning_rate": 5.918337692531242e-06, "loss": 0.5527, "mean_token_accuracy": 0.8185856059193611, "num_tokens": 14681336.0, "step": 12220 }, { "entropy": 1.8790747031569481, "epoch": 0.037911929935777564, "grad_norm": 12.979804039001465, "learning_rate": 5.923181245761891e-06, "loss": 0.6315, "mean_token_accuracy": 0.8123493686318397, "num_tokens": 14693888.0, "step": 12230 }, { "entropy": 1.8931312650442123, "epoch": 0.03794292906082726, "grad_norm": 12.229876518249512, "learning_rate": 5.928024798992541e-06, "loss": 0.625, "mean_token_accuracy": 0.8202015697956085, "num_tokens": 14704937.0, "step": 12240 }, { "entropy": 1.8407990396022798, "epoch": 0.03797392818587696, "grad_norm": 11.653843879699707, "learning_rate": 5.9328683522231914e-06, "loss": 0.5902, "mean_token_accuracy": 0.8176357612013817, "num_tokens": 14717674.0, "step": 12250 }, { "entropy": 1.8561912134289742, "epoch": 0.038004927310926646, "grad_norm": 13.023904800415039, "learning_rate": 5.937711905453841e-06, "loss": 0.5849, "mean_token_accuracy": 0.8190959095954895, "num_tokens": 14729749.0, "step": 12260 }, { "entropy": 1.9160822182893753, "epoch": 0.03803592643597634, "grad_norm": 12.721014976501465, "learning_rate": 5.942555458684491e-06, "loss": 0.6519, "mean_token_accuracy": 0.8113419517874718, "num_tokens": 14741605.0, "step": 12270 }, { "entropy": 1.8194750413298606, "epoch": 0.03806692556102604, "grad_norm": 11.955613136291504, "learning_rate": 5.947399011915141e-06, "loss": 0.5518, "mean_token_accuracy": 0.8214196056127548, "num_tokens": 14754303.0, "step": 12280 }, { "entropy": 1.8162646040320396, "epoch": 0.038097924686075736, "grad_norm": 5.1032328605651855, "learning_rate": 5.952242565145792e-06, "loss": 0.5327, "mean_token_accuracy": 0.818191859126091, "num_tokens": 14766657.0, "step": 12290 }, { "entropy": 1.846658205986023, "epoch": 0.03812892381112543, "grad_norm": 5.808051109313965, "learning_rate": 5.957086118376442e-06, "loss": 0.63, "mean_token_accuracy": 0.8142410039901733, "num_tokens": 14778196.0, "step": 12300 }, { "entropy": 1.7994149655103684, "epoch": 0.03815992293617513, "grad_norm": 12.859893798828125, "learning_rate": 5.961929671607092e-06, "loss": 0.5125, "mean_token_accuracy": 0.8262584999203682, "num_tokens": 14790293.0, "step": 12310 }, { "entropy": 1.951202955842018, "epoch": 0.03819092206122482, "grad_norm": 9.63222599029541, "learning_rate": 5.966773224837742e-06, "loss": 0.6364, "mean_token_accuracy": 0.8155479997396469, "num_tokens": 14801257.0, "step": 12320 }, { "entropy": 1.8327611729502677, "epoch": 0.038221921186274516, "grad_norm": 12.394364356994629, "learning_rate": 5.9716167780683905e-06, "loss": 0.5406, "mean_token_accuracy": 0.8244035348296166, "num_tokens": 14813680.0, "step": 12330 }, { "entropy": 1.8879570379853248, "epoch": 0.03825292031132421, "grad_norm": 9.891463279724121, "learning_rate": 5.976460331299041e-06, "loss": 0.5948, "mean_token_accuracy": 0.815042594075203, "num_tokens": 14825260.0, "step": 12340 }, { "entropy": 1.8341692447662354, "epoch": 0.03828391943637391, "grad_norm": 14.30816650390625, "learning_rate": 5.981303884529691e-06, "loss": 0.5992, "mean_token_accuracy": 0.8117454037070274, "num_tokens": 14837969.0, "step": 12350 }, { "entropy": 1.8351563602685927, "epoch": 0.038314918561423605, "grad_norm": 12.036502838134766, "learning_rate": 5.986147437760341e-06, "loss": 0.5528, "mean_token_accuracy": 0.820985272526741, "num_tokens": 14850504.0, "step": 12360 }, { "entropy": 1.8652123495936395, "epoch": 0.0383459176864733, "grad_norm": 5.860367298126221, "learning_rate": 5.990990990990992e-06, "loss": 0.5289, "mean_token_accuracy": 0.8303888604044914, "num_tokens": 14862430.0, "step": 12370 }, { "entropy": 1.8906607389450074, "epoch": 0.03837691681152299, "grad_norm": 11.164518356323242, "learning_rate": 5.995834544221642e-06, "loss": 0.5727, "mean_token_accuracy": 0.8233511716127395, "num_tokens": 14874608.0, "step": 12380 }, { "entropy": 1.9229727298021317, "epoch": 0.03840791593657269, "grad_norm": 10.484548568725586, "learning_rate": 6.0006780974522915e-06, "loss": 0.617, "mean_token_accuracy": 0.8203805893659591, "num_tokens": 14885555.0, "step": 12390 }, { "entropy": 1.858140294253826, "epoch": 0.038438915061622385, "grad_norm": 3.475583791732788, "learning_rate": 6.005521650682941e-06, "loss": 0.5348, "mean_token_accuracy": 0.8278280302882195, "num_tokens": 14899067.0, "step": 12400 }, { "entropy": 1.9508850559592248, "epoch": 0.03846991418667208, "grad_norm": 14.349974632263184, "learning_rate": 6.010365203913592e-06, "loss": 0.6307, "mean_token_accuracy": 0.8059279710054398, "num_tokens": 14910681.0, "step": 12410 }, { "entropy": 1.9063424944877625, "epoch": 0.03850091331172178, "grad_norm": 12.377765655517578, "learning_rate": 6.015208757144242e-06, "loss": 0.5669, "mean_token_accuracy": 0.8251767575740814, "num_tokens": 14922446.0, "step": 12420 }, { "entropy": 1.8672158405184747, "epoch": 0.038531912436771475, "grad_norm": 14.53426456451416, "learning_rate": 6.020052310374892e-06, "loss": 0.5257, "mean_token_accuracy": 0.8267401665449142, "num_tokens": 14934912.0, "step": 12430 }, { "entropy": 1.946044033765793, "epoch": 0.038562911561821164, "grad_norm": 12.349529266357422, "learning_rate": 6.024895863605541e-06, "loss": 0.7026, "mean_token_accuracy": 0.7958286583423615, "num_tokens": 14946270.0, "step": 12440 }, { "entropy": 1.9352963030338288, "epoch": 0.03859391068687086, "grad_norm": 15.071447372436523, "learning_rate": 6.029739416836191e-06, "loss": 0.6828, "mean_token_accuracy": 0.8007722824811936, "num_tokens": 14957285.0, "step": 12450 }, { "entropy": 1.9621846169233321, "epoch": 0.03862490981192056, "grad_norm": 11.935574531555176, "learning_rate": 6.0345829700668415e-06, "loss": 0.7034, "mean_token_accuracy": 0.7971277639269829, "num_tokens": 14968080.0, "step": 12460 }, { "entropy": 1.8536121487617492, "epoch": 0.038655908936970254, "grad_norm": 12.425955772399902, "learning_rate": 6.0394265232974914e-06, "loss": 0.571, "mean_token_accuracy": 0.8072732269763947, "num_tokens": 14980128.0, "step": 12470 }, { "entropy": 1.853446225821972, "epoch": 0.03868690806201995, "grad_norm": 5.7373366355896, "learning_rate": 6.044270076528141e-06, "loss": 0.6053, "mean_token_accuracy": 0.8060764700174332, "num_tokens": 14991962.0, "step": 12480 }, { "entropy": 1.792759819328785, "epoch": 0.03871790718706965, "grad_norm": 11.446489334106445, "learning_rate": 6.049113629758791e-06, "loss": 0.5256, "mean_token_accuracy": 0.8205834746360778, "num_tokens": 15004797.0, "step": 12490 }, { "entropy": 1.8071394935250282, "epoch": 0.038748906312119344, "grad_norm": 10.317464828491211, "learning_rate": 6.053957182989442e-06, "loss": 0.5875, "mean_token_accuracy": 0.817318132519722, "num_tokens": 15017391.0, "step": 12500 }, { "entropy": 1.8499668538570404, "epoch": 0.038779905437169034, "grad_norm": 11.95541763305664, "learning_rate": 6.058800736220092e-06, "loss": 0.6207, "mean_token_accuracy": 0.8172942087054252, "num_tokens": 15028762.0, "step": 12510 }, { "entropy": 1.8761356472969055, "epoch": 0.03881090456221873, "grad_norm": 11.275162696838379, "learning_rate": 6.063644289450742e-06, "loss": 0.615, "mean_token_accuracy": 0.8089762479066849, "num_tokens": 15040328.0, "step": 12520 }, { "entropy": 1.8349191531538964, "epoch": 0.03884190368726843, "grad_norm": 12.247875213623047, "learning_rate": 6.0684878426813924e-06, "loss": 0.628, "mean_token_accuracy": 0.8030536040663719, "num_tokens": 15052829.0, "step": 12530 }, { "entropy": 1.8827327758073806, "epoch": 0.03887290281231812, "grad_norm": 13.34996509552002, "learning_rate": 6.073331395912041e-06, "loss": 0.5946, "mean_token_accuracy": 0.8162320896983146, "num_tokens": 15064110.0, "step": 12540 }, { "entropy": 1.825656296312809, "epoch": 0.03890390193736782, "grad_norm": 9.6633939743042, "learning_rate": 6.078174949142691e-06, "loss": 0.5161, "mean_token_accuracy": 0.8281683087348938, "num_tokens": 15076335.0, "step": 12550 }, { "entropy": 1.8427977129817008, "epoch": 0.038934901062417517, "grad_norm": 7.553597927093506, "learning_rate": 6.083018502373341e-06, "loss": 0.6407, "mean_token_accuracy": 0.8068282768130303, "num_tokens": 15088344.0, "step": 12560 }, { "entropy": 1.7809947788715363, "epoch": 0.038965900187467206, "grad_norm": 5.229321002960205, "learning_rate": 6.087862055603991e-06, "loss": 0.5344, "mean_token_accuracy": 0.8179076954722404, "num_tokens": 15101575.0, "step": 12570 }, { "entropy": 1.9495843350887299, "epoch": 0.0389968993125169, "grad_norm": 10.395573616027832, "learning_rate": 6.092705608834642e-06, "loss": 0.6253, "mean_token_accuracy": 0.8120292127132416, "num_tokens": 15112551.0, "step": 12580 }, { "entropy": 1.8639352589845657, "epoch": 0.0390278984375666, "grad_norm": 14.738471984863281, "learning_rate": 6.097549162065292e-06, "loss": 0.6255, "mean_token_accuracy": 0.8092112436890602, "num_tokens": 15124473.0, "step": 12590 }, { "entropy": 1.8332767322659493, "epoch": 0.039058897562616296, "grad_norm": 6.293068885803223, "learning_rate": 6.102392715295942e-06, "loss": 0.6048, "mean_token_accuracy": 0.8051409214735031, "num_tokens": 15136268.0, "step": 12600 }, { "entropy": 1.8370594762265682, "epoch": 0.03908989668766599, "grad_norm": 11.538333892822266, "learning_rate": 6.1072362685265915e-06, "loss": 0.5955, "mean_token_accuracy": 0.8151108309626579, "num_tokens": 15147270.0, "step": 12610 }, { "entropy": 1.8287363216280936, "epoch": 0.03912089581271569, "grad_norm": 12.497879981994629, "learning_rate": 6.112079821757242e-06, "loss": 0.5737, "mean_token_accuracy": 0.8114735826849937, "num_tokens": 15159305.0, "step": 12620 }, { "entropy": 1.841526921093464, "epoch": 0.03915189493776538, "grad_norm": 15.22536563873291, "learning_rate": 6.116923374987892e-06, "loss": 0.6205, "mean_token_accuracy": 0.8125358670949936, "num_tokens": 15170956.0, "step": 12630 }, { "entropy": 1.8685049675405025, "epoch": 0.039182894062815075, "grad_norm": 10.67180347442627, "learning_rate": 6.121766928218542e-06, "loss": 0.6239, "mean_token_accuracy": 0.8018307209014892, "num_tokens": 15183129.0, "step": 12640 }, { "entropy": 1.8348088413476944, "epoch": 0.03921389318786477, "grad_norm": 6.374260902404785, "learning_rate": 6.126610481449191e-06, "loss": 0.5658, "mean_token_accuracy": 0.8142863139510155, "num_tokens": 15195411.0, "step": 12650 }, { "entropy": 1.9104970484972, "epoch": 0.03924489231291447, "grad_norm": 11.84320068359375, "learning_rate": 6.131454034679841e-06, "loss": 0.7097, "mean_token_accuracy": 0.7972751423716545, "num_tokens": 15206633.0, "step": 12660 }, { "entropy": 1.8191149190068245, "epoch": 0.039275891437964165, "grad_norm": 14.073987007141113, "learning_rate": 6.136297587910492e-06, "loss": 0.6266, "mean_token_accuracy": 0.8092659756541252, "num_tokens": 15218732.0, "step": 12670 }, { "entropy": 1.839244581758976, "epoch": 0.03930689056301386, "grad_norm": 9.864472389221191, "learning_rate": 6.1411411411411415e-06, "loss": 0.5842, "mean_token_accuracy": 0.8092280417680741, "num_tokens": 15231394.0, "step": 12680 }, { "entropy": 1.8042413115501403, "epoch": 0.03933788968806355, "grad_norm": 12.334712982177734, "learning_rate": 6.145984694371791e-06, "loss": 0.6022, "mean_token_accuracy": 0.8171849220991134, "num_tokens": 15242791.0, "step": 12690 }, { "entropy": 1.8627389699220658, "epoch": 0.03936888881311325, "grad_norm": 13.441896438598633, "learning_rate": 6.150828247602441e-06, "loss": 0.5941, "mean_token_accuracy": 0.8153049975633622, "num_tokens": 15254161.0, "step": 12700 }, { "entropy": 1.8331183463335037, "epoch": 0.039399887938162945, "grad_norm": 12.164008140563965, "learning_rate": 6.155671800833092e-06, "loss": 0.5919, "mean_token_accuracy": 0.8250399813055992, "num_tokens": 15265756.0, "step": 12710 }, { "entropy": 1.8537740007042884, "epoch": 0.03943088706321264, "grad_norm": 11.0955810546875, "learning_rate": 6.160515354063742e-06, "loss": 0.5697, "mean_token_accuracy": 0.8286507219076157, "num_tokens": 15277513.0, "step": 12720 }, { "entropy": 1.8841774478554725, "epoch": 0.03946188618826234, "grad_norm": 6.4962663650512695, "learning_rate": 6.165358907294392e-06, "loss": 0.6102, "mean_token_accuracy": 0.8118945837020874, "num_tokens": 15288556.0, "step": 12730 }, { "entropy": 1.8026639834046363, "epoch": 0.039492885313312034, "grad_norm": 12.76574420928955, "learning_rate": 6.1702024605250425e-06, "loss": 0.5569, "mean_token_accuracy": 0.8168751522898674, "num_tokens": 15301444.0, "step": 12740 }, { "entropy": 1.8253360256552695, "epoch": 0.039523884438361724, "grad_norm": 5.973870754241943, "learning_rate": 6.175046013755691e-06, "loss": 0.5451, "mean_token_accuracy": 0.8278811991214752, "num_tokens": 15313764.0, "step": 12750 }, { "entropy": 1.7969791740179062, "epoch": 0.03955488356341142, "grad_norm": 14.835697174072266, "learning_rate": 6.1798895669863415e-06, "loss": 0.5442, "mean_token_accuracy": 0.8214199602603912, "num_tokens": 15326494.0, "step": 12760 }, { "entropy": 1.8358357205986977, "epoch": 0.03958588268846112, "grad_norm": 6.574784278869629, "learning_rate": 6.184733120216991e-06, "loss": 0.6013, "mean_token_accuracy": 0.8155452191829682, "num_tokens": 15338454.0, "step": 12770 }, { "entropy": 1.7818957820534707, "epoch": 0.039616881813510814, "grad_norm": 5.757773399353027, "learning_rate": 6.189576673447641e-06, "loss": 0.5374, "mean_token_accuracy": 0.8222944095730782, "num_tokens": 15351199.0, "step": 12780 }, { "entropy": 1.7962487503886222, "epoch": 0.03964788093856051, "grad_norm": 11.691226959228516, "learning_rate": 6.194420226678292e-06, "loss": 0.5549, "mean_token_accuracy": 0.8140451073646545, "num_tokens": 15363661.0, "step": 12790 }, { "entropy": 1.869248776137829, "epoch": 0.03967888006361021, "grad_norm": 13.061457633972168, "learning_rate": 6.199263779908942e-06, "loss": 0.6036, "mean_token_accuracy": 0.8111810341477395, "num_tokens": 15375191.0, "step": 12800 }, { "entropy": 1.9534028589725494, "epoch": 0.0397098791886599, "grad_norm": 12.596964836120605, "learning_rate": 6.204107333139592e-06, "loss": 0.7028, "mean_token_accuracy": 0.7962926715612412, "num_tokens": 15386223.0, "step": 12810 }, { "entropy": 1.8273673072457313, "epoch": 0.03974087831370959, "grad_norm": 6.692715644836426, "learning_rate": 6.208950886370242e-06, "loss": 0.6008, "mean_token_accuracy": 0.8225413948297501, "num_tokens": 15398818.0, "step": 12820 }, { "entropy": 1.8166001297533512, "epoch": 0.03977187743875929, "grad_norm": 13.053438186645508, "learning_rate": 6.213794439600892e-06, "loss": 0.5571, "mean_token_accuracy": 0.825442411005497, "num_tokens": 15411042.0, "step": 12830 }, { "entropy": 1.8038362354040145, "epoch": 0.03980287656380899, "grad_norm": 14.547704696655273, "learning_rate": 6.218637992831542e-06, "loss": 0.5879, "mean_token_accuracy": 0.8112800106406212, "num_tokens": 15423517.0, "step": 12840 }, { "entropy": 1.8522241950035094, "epoch": 0.03983387568885868, "grad_norm": 13.58086109161377, "learning_rate": 6.223481546062192e-06, "loss": 0.6283, "mean_token_accuracy": 0.8121964901685714, "num_tokens": 15434996.0, "step": 12850 }, { "entropy": 1.8666557878255845, "epoch": 0.03986487481390838, "grad_norm": 12.256643295288086, "learning_rate": 6.228325099292841e-06, "loss": 0.6245, "mean_token_accuracy": 0.7993756279349327, "num_tokens": 15446746.0, "step": 12860 }, { "entropy": 1.8094900250434875, "epoch": 0.039895873938958076, "grad_norm": 12.334588050842285, "learning_rate": 6.233168652523491e-06, "loss": 0.5687, "mean_token_accuracy": 0.8260209634900093, "num_tokens": 15459054.0, "step": 12870 }, { "entropy": 1.8770345002412796, "epoch": 0.039926873064007766, "grad_norm": 12.718692779541016, "learning_rate": 6.238012205754142e-06, "loss": 0.6855, "mean_token_accuracy": 0.7952335923910141, "num_tokens": 15469948.0, "step": 12880 }, { "entropy": 1.8727562785148621, "epoch": 0.03995787218905746, "grad_norm": 9.794669151306152, "learning_rate": 6.242855758984792e-06, "loss": 0.6304, "mean_token_accuracy": 0.816286937892437, "num_tokens": 15482541.0, "step": 12890 }, { "entropy": 1.8690064042806624, "epoch": 0.03998887131410716, "grad_norm": 11.124152183532715, "learning_rate": 6.2476993122154415e-06, "loss": 0.589, "mean_token_accuracy": 0.8171432822942734, "num_tokens": 15494339.0, "step": 12900 }, { "entropy": 1.815145094692707, "epoch": 0.040019870439156856, "grad_norm": 11.725676536560059, "learning_rate": 6.252542865446091e-06, "loss": 0.5313, "mean_token_accuracy": 0.822598272562027, "num_tokens": 15506838.0, "step": 12910 }, { "entropy": 1.7810951352119446, "epoch": 0.04005086956420655, "grad_norm": 6.2188520431518555, "learning_rate": 6.257386418676742e-06, "loss": 0.5694, "mean_token_accuracy": 0.8137658298015594, "num_tokens": 15520324.0, "step": 12920 }, { "entropy": 1.8406692005693912, "epoch": 0.04008186868925625, "grad_norm": 10.676637649536133, "learning_rate": 6.262229971907392e-06, "loss": 0.5757, "mean_token_accuracy": 0.8103611707687378, "num_tokens": 15533133.0, "step": 12930 }, { "entropy": 1.9034065037965775, "epoch": 0.04011286781430594, "grad_norm": 12.021625518798828, "learning_rate": 6.267073525138042e-06, "loss": 0.6135, "mean_token_accuracy": 0.8219456240534783, "num_tokens": 15543784.0, "step": 12940 }, { "entropy": 1.859899564087391, "epoch": 0.040143866939355635, "grad_norm": 5.559539794921875, "learning_rate": 6.271917078368693e-06, "loss": 0.6117, "mean_token_accuracy": 0.8060070484876632, "num_tokens": 15555512.0, "step": 12950 }, { "entropy": 1.8671554252505302, "epoch": 0.04017486606440533, "grad_norm": 4.944991111755371, "learning_rate": 6.276760631599341e-06, "loss": 0.6074, "mean_token_accuracy": 0.8140358075499534, "num_tokens": 15567319.0, "step": 12960 }, { "entropy": 1.8478776171803475, "epoch": 0.04020586518945503, "grad_norm": 6.253074645996094, "learning_rate": 6.2816041848299916e-06, "loss": 0.6098, "mean_token_accuracy": 0.8053146034479142, "num_tokens": 15579682.0, "step": 12970 }, { "entropy": 1.9027994275093079, "epoch": 0.040236864314504725, "grad_norm": 11.593799591064453, "learning_rate": 6.2864477380606414e-06, "loss": 0.6247, "mean_token_accuracy": 0.8165087446570396, "num_tokens": 15590423.0, "step": 12980 }, { "entropy": 1.8880388498306275, "epoch": 0.04026786343955442, "grad_norm": 12.472247123718262, "learning_rate": 6.291291291291291e-06, "loss": 0.6243, "mean_token_accuracy": 0.8147116348147392, "num_tokens": 15601491.0, "step": 12990 }, { "entropy": 1.8754856497049333, "epoch": 0.04029886256460411, "grad_norm": 10.099672317504883, "learning_rate": 6.296134844521942e-06, "loss": 0.615, "mean_token_accuracy": 0.8218295946717262, "num_tokens": 15611664.0, "step": 13000 }, { "entropy": 1.8529302537441255, "epoch": 0.04032986168965381, "grad_norm": 11.330911636352539, "learning_rate": 6.300978397752592e-06, "loss": 0.6227, "mean_token_accuracy": 0.8136036321520805, "num_tokens": 15623729.0, "step": 13010 }, { "entropy": 1.8294827699661256, "epoch": 0.040360860814703504, "grad_norm": 12.999713897705078, "learning_rate": 6.305821950983242e-06, "loss": 0.6562, "mean_token_accuracy": 0.8110322475433349, "num_tokens": 15634983.0, "step": 13020 }, { "entropy": 1.8562739863991737, "epoch": 0.0403918599397532, "grad_norm": 9.199260711669922, "learning_rate": 6.310665504213892e-06, "loss": 0.5929, "mean_token_accuracy": 0.8223294109106064, "num_tokens": 15646086.0, "step": 13030 }, { "entropy": 1.8410490170121192, "epoch": 0.0404228590648029, "grad_norm": 12.867775917053223, "learning_rate": 6.3155090574445424e-06, "loss": 0.6028, "mean_token_accuracy": 0.8097691163420677, "num_tokens": 15658322.0, "step": 13040 }, { "entropy": 1.770823860168457, "epoch": 0.040453858189852594, "grad_norm": 5.460647106170654, "learning_rate": 6.320352610675192e-06, "loss": 0.4783, "mean_token_accuracy": 0.8246093928813935, "num_tokens": 15671766.0, "step": 13050 }, { "entropy": 1.856894339621067, "epoch": 0.040484857314902284, "grad_norm": 12.062054634094238, "learning_rate": 6.325196163905842e-06, "loss": 0.5404, "mean_token_accuracy": 0.8271098077297211, "num_tokens": 15683812.0, "step": 13060 }, { "entropy": 1.8751189470291139, "epoch": 0.04051585643995198, "grad_norm": 6.459671974182129, "learning_rate": 6.330039717136491e-06, "loss": 0.6067, "mean_token_accuracy": 0.8083498194813729, "num_tokens": 15695486.0, "step": 13070 }, { "entropy": 1.8417337238788605, "epoch": 0.04054685556500168, "grad_norm": 6.831528663635254, "learning_rate": 6.334883270367141e-06, "loss": 0.6252, "mean_token_accuracy": 0.8124615296721458, "num_tokens": 15707915.0, "step": 13080 }, { "entropy": 1.854557254910469, "epoch": 0.040577854690051374, "grad_norm": 10.786755561828613, "learning_rate": 6.339726823597792e-06, "loss": 0.5652, "mean_token_accuracy": 0.8192732855677605, "num_tokens": 15720535.0, "step": 13090 }, { "entropy": 1.9224106088280677, "epoch": 0.04060885381510107, "grad_norm": 12.00683879852295, "learning_rate": 6.344570376828442e-06, "loss": 0.698, "mean_token_accuracy": 0.7983192473649978, "num_tokens": 15732872.0, "step": 13100 }, { "entropy": 1.8551457852125168, "epoch": 0.04063985294015077, "grad_norm": 7.283522605895996, "learning_rate": 6.349413930059092e-06, "loss": 0.6393, "mean_token_accuracy": 0.8065184399485588, "num_tokens": 15745391.0, "step": 13110 }, { "entropy": 1.8762536928057671, "epoch": 0.04067085206520046, "grad_norm": 5.411208629608154, "learning_rate": 6.3542574832897415e-06, "loss": 0.5594, "mean_token_accuracy": 0.8252108618617058, "num_tokens": 15756896.0, "step": 13120 }, { "entropy": 1.9300520285964011, "epoch": 0.04070185119025015, "grad_norm": 11.322684288024902, "learning_rate": 6.359101036520392e-06, "loss": 0.6418, "mean_token_accuracy": 0.8126945987343788, "num_tokens": 15768386.0, "step": 13130 }, { "entropy": 1.7394558861851692, "epoch": 0.04073285031529985, "grad_norm": 3.4418604373931885, "learning_rate": 6.363944589751042e-06, "loss": 0.5333, "mean_token_accuracy": 0.8297542706131935, "num_tokens": 15782338.0, "step": 13140 }, { "entropy": 1.886605440080166, "epoch": 0.040763849440349546, "grad_norm": 10.883418083190918, "learning_rate": 6.368788142981692e-06, "loss": 0.5851, "mean_token_accuracy": 0.8118449002504349, "num_tokens": 15794126.0, "step": 13150 }, { "entropy": 1.833412842452526, "epoch": 0.04079484856539924, "grad_norm": 5.032846450805664, "learning_rate": 6.373631696212343e-06, "loss": 0.5319, "mean_token_accuracy": 0.8236513167619706, "num_tokens": 15806365.0, "step": 13160 }, { "entropy": 1.8451514735817909, "epoch": 0.04082584769044894, "grad_norm": 12.876490592956543, "learning_rate": 6.378475249442991e-06, "loss": 0.6205, "mean_token_accuracy": 0.8131675496697426, "num_tokens": 15817789.0, "step": 13170 }, { "entropy": 1.9239716470241546, "epoch": 0.04085684681549863, "grad_norm": 10.991585731506348, "learning_rate": 6.383318802673642e-06, "loss": 0.6288, "mean_token_accuracy": 0.8010996967554093, "num_tokens": 15828472.0, "step": 13180 }, { "entropy": 1.861264744400978, "epoch": 0.040887845940548326, "grad_norm": 12.282683372497559, "learning_rate": 6.3881623559042915e-06, "loss": 0.5954, "mean_token_accuracy": 0.8243414610624313, "num_tokens": 15840805.0, "step": 13190 }, { "entropy": 1.7732505962252616, "epoch": 0.04091884506559802, "grad_norm": 10.902586936950684, "learning_rate": 6.3930059091349414e-06, "loss": 0.5532, "mean_token_accuracy": 0.8143455445766449, "num_tokens": 15854669.0, "step": 13200 }, { "entropy": 1.8966855704784393, "epoch": 0.04094984419064772, "grad_norm": 15.284767150878906, "learning_rate": 6.397849462365592e-06, "loss": 0.6365, "mean_token_accuracy": 0.8180679067969322, "num_tokens": 15866135.0, "step": 13210 }, { "entropy": 1.8465805247426033, "epoch": 0.040980843315697416, "grad_norm": 10.624847412109375, "learning_rate": 6.402693015596242e-06, "loss": 0.5707, "mean_token_accuracy": 0.8213504150509834, "num_tokens": 15878224.0, "step": 13220 }, { "entropy": 1.9168522462248803, "epoch": 0.04101184244074711, "grad_norm": 10.14426326751709, "learning_rate": 6.407536568826892e-06, "loss": 0.6252, "mean_token_accuracy": 0.8033525243401527, "num_tokens": 15891104.0, "step": 13230 }, { "entropy": 1.902791763842106, "epoch": 0.04104284156579681, "grad_norm": 12.496599197387695, "learning_rate": 6.412380122057542e-06, "loss": 0.5903, "mean_token_accuracy": 0.809710368514061, "num_tokens": 15903392.0, "step": 13240 }, { "entropy": 1.834274485707283, "epoch": 0.0410738406908465, "grad_norm": 10.14233684539795, "learning_rate": 6.4172236752881925e-06, "loss": 0.5469, "mean_token_accuracy": 0.8176365941762924, "num_tokens": 15916354.0, "step": 13250 }, { "entropy": 1.7371825829148293, "epoch": 0.041104839815896195, "grad_norm": 4.078171730041504, "learning_rate": 6.4220672285188424e-06, "loss": 0.4872, "mean_token_accuracy": 0.8276883006095886, "num_tokens": 15929845.0, "step": 13260 }, { "entropy": 1.906320759654045, "epoch": 0.04113583894094589, "grad_norm": 13.177295684814453, "learning_rate": 6.426910781749492e-06, "loss": 0.6577, "mean_token_accuracy": 0.803479115664959, "num_tokens": 15941475.0, "step": 13270 }, { "entropy": 1.86391938328743, "epoch": 0.04116683806599559, "grad_norm": 12.480244636535645, "learning_rate": 6.431754334980141e-06, "loss": 0.6242, "mean_token_accuracy": 0.8072491884231567, "num_tokens": 15953798.0, "step": 13280 }, { "entropy": 1.8636515244841576, "epoch": 0.041197837191045285, "grad_norm": 11.828653335571289, "learning_rate": 6.436597888210791e-06, "loss": 0.6048, "mean_token_accuracy": 0.8113080978393554, "num_tokens": 15965803.0, "step": 13290 }, { "entropy": 1.8319952994585038, "epoch": 0.04122883631609498, "grad_norm": 10.738685607910156, "learning_rate": 6.441441441441442e-06, "loss": 0.6163, "mean_token_accuracy": 0.8101550653576851, "num_tokens": 15978557.0, "step": 13300 }, { "entropy": 1.8741496190428735, "epoch": 0.04125983544114467, "grad_norm": 11.56949234008789, "learning_rate": 6.446284994672092e-06, "loss": 0.5879, "mean_token_accuracy": 0.811277537047863, "num_tokens": 15990542.0, "step": 13310 }, { "entropy": 1.9263570591807366, "epoch": 0.04129083456619437, "grad_norm": 5.565122127532959, "learning_rate": 6.451128547902742e-06, "loss": 0.6913, "mean_token_accuracy": 0.7948109433054924, "num_tokens": 16001976.0, "step": 13320 }, { "entropy": 1.9291939318180085, "epoch": 0.041321833691244064, "grad_norm": 11.844070434570312, "learning_rate": 6.455972101133392e-06, "loss": 0.6205, "mean_token_accuracy": 0.8096133157610893, "num_tokens": 16012910.0, "step": 13330 }, { "entropy": 1.8042861357331277, "epoch": 0.04135283281629376, "grad_norm": 10.28139877319336, "learning_rate": 6.460815654364042e-06, "loss": 0.5329, "mean_token_accuracy": 0.825696873664856, "num_tokens": 16025765.0, "step": 13340 }, { "entropy": 1.9436326138675213, "epoch": 0.04138383194134346, "grad_norm": 12.31657886505127, "learning_rate": 6.465659207594692e-06, "loss": 0.6457, "mean_token_accuracy": 0.7972278758883476, "num_tokens": 16037425.0, "step": 13350 }, { "entropy": 1.8341301783919335, "epoch": 0.041414831066393154, "grad_norm": 11.382768630981445, "learning_rate": 6.470502760825342e-06, "loss": 0.5102, "mean_token_accuracy": 0.8359068840742111, "num_tokens": 16050108.0, "step": 13360 }, { "entropy": 1.9616862878203392, "epoch": 0.041445830191442844, "grad_norm": 11.222855567932129, "learning_rate": 6.475346314055993e-06, "loss": 0.6498, "mean_token_accuracy": 0.8021759241819382, "num_tokens": 16061548.0, "step": 13370 }, { "entropy": 1.9346465274691582, "epoch": 0.04147682931649254, "grad_norm": 10.253416061401367, "learning_rate": 6.480189867286641e-06, "loss": 0.6582, "mean_token_accuracy": 0.8080699786543846, "num_tokens": 16072805.0, "step": 13380 }, { "entropy": 1.8889733031392097, "epoch": 0.04150782844154224, "grad_norm": 10.119233131408691, "learning_rate": 6.485033420517292e-06, "loss": 0.6144, "mean_token_accuracy": 0.8134203433990479, "num_tokens": 16084811.0, "step": 13390 }, { "entropy": 1.935821218788624, "epoch": 0.041538827566591933, "grad_norm": 11.066147804260254, "learning_rate": 6.489876973747942e-06, "loss": 0.6034, "mean_token_accuracy": 0.8229247480630875, "num_tokens": 16095975.0, "step": 13400 }, { "entropy": 1.8281036272644997, "epoch": 0.04156982669164163, "grad_norm": 12.909806251525879, "learning_rate": 6.4947205269785915e-06, "loss": 0.5791, "mean_token_accuracy": 0.8127323508262634, "num_tokens": 16108978.0, "step": 13410 }, { "entropy": 1.915538875758648, "epoch": 0.04160082581669133, "grad_norm": 11.429049491882324, "learning_rate": 6.499564080209242e-06, "loss": 0.6331, "mean_token_accuracy": 0.8143791824579238, "num_tokens": 16121160.0, "step": 13420 }, { "entropy": 1.8657421082258225, "epoch": 0.041631824941741016, "grad_norm": 5.312261581420898, "learning_rate": 6.504407633439892e-06, "loss": 0.5418, "mean_token_accuracy": 0.8279088050127029, "num_tokens": 16133755.0, "step": 13430 }, { "entropy": 1.9448801666498183, "epoch": 0.04166282406679071, "grad_norm": 11.335988998413086, "learning_rate": 6.509251186670542e-06, "loss": 0.6206, "mean_token_accuracy": 0.8046508118510246, "num_tokens": 16145359.0, "step": 13440 }, { "entropy": 1.837200105190277, "epoch": 0.04169382319184041, "grad_norm": 8.97873592376709, "learning_rate": 6.514094739901192e-06, "loss": 0.577, "mean_token_accuracy": 0.823949719965458, "num_tokens": 16157112.0, "step": 13450 }, { "entropy": 1.8169792860746383, "epoch": 0.041724822316890106, "grad_norm": 13.009424209594727, "learning_rate": 6.518938293131843e-06, "loss": 0.5284, "mean_token_accuracy": 0.8228383541107178, "num_tokens": 16169936.0, "step": 13460 }, { "entropy": 1.7715341180562973, "epoch": 0.0417558214419398, "grad_norm": 4.483689308166504, "learning_rate": 6.5237818463624925e-06, "loss": 0.5045, "mean_token_accuracy": 0.8237165853381156, "num_tokens": 16184134.0, "step": 13470 }, { "entropy": 1.9055643543601035, "epoch": 0.0417868205669895, "grad_norm": 11.176976203918457, "learning_rate": 6.528625399593142e-06, "loss": 0.616, "mean_token_accuracy": 0.807232391834259, "num_tokens": 16195929.0, "step": 13480 }, { "entropy": 1.7436404943466186, "epoch": 0.04181781969203919, "grad_norm": 11.869129180908203, "learning_rate": 6.5334689528237915e-06, "loss": 0.4807, "mean_token_accuracy": 0.8363062143325806, "num_tokens": 16210306.0, "step": 13490 }, { "entropy": 1.8251071318984031, "epoch": 0.041848818817088886, "grad_norm": 6.912018775939941, "learning_rate": 6.538312506054441e-06, "loss": 0.5753, "mean_token_accuracy": 0.8134068369865417, "num_tokens": 16222376.0, "step": 13500 }, { "entropy": 1.8436161801218987, "epoch": 0.04187981794213858, "grad_norm": 10.873340606689453, "learning_rate": 6.543156059285092e-06, "loss": 0.5726, "mean_token_accuracy": 0.8138954237103462, "num_tokens": 16235581.0, "step": 13510 }, { "entropy": 1.9201088815927505, "epoch": 0.04191081706718828, "grad_norm": 12.203166007995605, "learning_rate": 6.547999612515742e-06, "loss": 0.6399, "mean_token_accuracy": 0.809160690009594, "num_tokens": 16246598.0, "step": 13520 }, { "entropy": 1.9009541541337966, "epoch": 0.041941816192237975, "grad_norm": 12.666475296020508, "learning_rate": 6.552843165746392e-06, "loss": 0.6352, "mean_token_accuracy": 0.7980454340577126, "num_tokens": 16258072.0, "step": 13530 }, { "entropy": 1.8785468250513078, "epoch": 0.04197281531728767, "grad_norm": 10.72907829284668, "learning_rate": 6.557686718977042e-06, "loss": 0.6576, "mean_token_accuracy": 0.8070585578680038, "num_tokens": 16269003.0, "step": 13540 }, { "entropy": 1.7919475421309472, "epoch": 0.04200381444233736, "grad_norm": 12.568443298339844, "learning_rate": 6.5625302722076924e-06, "loss": 0.5273, "mean_token_accuracy": 0.8193079605698586, "num_tokens": 16281900.0, "step": 13550 }, { "entropy": 1.865149575471878, "epoch": 0.04203481356738706, "grad_norm": 11.324207305908203, "learning_rate": 6.567373825438342e-06, "loss": 0.6203, "mean_token_accuracy": 0.810130500793457, "num_tokens": 16293834.0, "step": 13560 }, { "entropy": 1.9343768432736397, "epoch": 0.042065812692436755, "grad_norm": 13.249279975891113, "learning_rate": 6.572217378668992e-06, "loss": 0.6251, "mean_token_accuracy": 0.8070745259523392, "num_tokens": 16305965.0, "step": 13570 }, { "entropy": 1.916192325949669, "epoch": 0.04209681181748645, "grad_norm": 12.691089630126953, "learning_rate": 6.577060931899643e-06, "loss": 0.5989, "mean_token_accuracy": 0.8105158194899559, "num_tokens": 16317096.0, "step": 13580 }, { "entropy": 1.888793683052063, "epoch": 0.04212781094253615, "grad_norm": 9.996535301208496, "learning_rate": 6.581904485130292e-06, "loss": 0.6723, "mean_token_accuracy": 0.8087509065866471, "num_tokens": 16329359.0, "step": 13590 }, { "entropy": 1.8468269050121306, "epoch": 0.042158810067585845, "grad_norm": 10.06303596496582, "learning_rate": 6.586748038360942e-06, "loss": 0.5719, "mean_token_accuracy": 0.8199773550033569, "num_tokens": 16341773.0, "step": 13600 }, { "entropy": 1.8991917297244072, "epoch": 0.04218980919263554, "grad_norm": 3.874143362045288, "learning_rate": 6.591591591591592e-06, "loss": 0.6202, "mean_token_accuracy": 0.8165913313627243, "num_tokens": 16353377.0, "step": 13610 }, { "entropy": 1.9188966274261474, "epoch": 0.04222080831768523, "grad_norm": 11.466548919677734, "learning_rate": 6.596435144822242e-06, "loss": 0.6185, "mean_token_accuracy": 0.8085856437683105, "num_tokens": 16365134.0, "step": 13620 }, { "entropy": 1.9046609073877334, "epoch": 0.04225180744273493, "grad_norm": 10.0979585647583, "learning_rate": 6.601278698052892e-06, "loss": 0.5963, "mean_token_accuracy": 0.8179086208343506, "num_tokens": 16376299.0, "step": 13630 }, { "entropy": 1.828147941827774, "epoch": 0.042282806567784624, "grad_norm": 10.194396018981934, "learning_rate": 6.606122251283542e-06, "loss": 0.5521, "mean_token_accuracy": 0.8191608220338822, "num_tokens": 16388768.0, "step": 13640 }, { "entropy": 1.8713184520602226, "epoch": 0.04231380569283432, "grad_norm": 12.538405418395996, "learning_rate": 6.610965804514192e-06, "loss": 0.6416, "mean_token_accuracy": 0.8056328803300857, "num_tokens": 16400797.0, "step": 13650 }, { "entropy": 1.8713348254561424, "epoch": 0.04234480481788402, "grad_norm": 12.18179988861084, "learning_rate": 6.615809357744842e-06, "loss": 0.5399, "mean_token_accuracy": 0.8290207415819169, "num_tokens": 16412316.0, "step": 13660 }, { "entropy": 1.7908138126134872, "epoch": 0.042375803942933714, "grad_norm": 11.162075996398926, "learning_rate": 6.620652910975493e-06, "loss": 0.548, "mean_token_accuracy": 0.8253503978252411, "num_tokens": 16426142.0, "step": 13670 }, { "entropy": 1.8941476494073868, "epoch": 0.042406803067983403, "grad_norm": 15.437644958496094, "learning_rate": 6.625496464206143e-06, "loss": 0.6019, "mean_token_accuracy": 0.8184331357479095, "num_tokens": 16438321.0, "step": 13680 }, { "entropy": 1.9231675088405609, "epoch": 0.0424378021930331, "grad_norm": 12.137004852294922, "learning_rate": 6.6303400174367925e-06, "loss": 0.6361, "mean_token_accuracy": 0.814939396083355, "num_tokens": 16449946.0, "step": 13690 }, { "entropy": 1.8700689136981965, "epoch": 0.0424688013180828, "grad_norm": 10.420080184936523, "learning_rate": 6.6351835706674416e-06, "loss": 0.5667, "mean_token_accuracy": 0.8213317602872848, "num_tokens": 16461823.0, "step": 13700 }, { "entropy": 1.9616285428404807, "epoch": 0.04249980044313249, "grad_norm": 11.999855995178223, "learning_rate": 6.6400271238980914e-06, "loss": 0.6274, "mean_token_accuracy": 0.8095705136656761, "num_tokens": 16473326.0, "step": 13710 }, { "entropy": 1.8622044518589973, "epoch": 0.04253079956818219, "grad_norm": 6.192050457000732, "learning_rate": 6.644870677128742e-06, "loss": 0.5515, "mean_token_accuracy": 0.8142581716179847, "num_tokens": 16486260.0, "step": 13720 }, { "entropy": 1.9137049853801726, "epoch": 0.042561798693231886, "grad_norm": 10.177970886230469, "learning_rate": 6.649714230359392e-06, "loss": 0.6307, "mean_token_accuracy": 0.8191399827599526, "num_tokens": 16497734.0, "step": 13730 }, { "entropy": 1.8745167449116706, "epoch": 0.042592797818281576, "grad_norm": 10.241938591003418, "learning_rate": 6.654557783590042e-06, "loss": 0.5971, "mean_token_accuracy": 0.8165997639298439, "num_tokens": 16509288.0, "step": 13740 }, { "entropy": 1.93837161809206, "epoch": 0.04262379694333127, "grad_norm": 12.580158233642578, "learning_rate": 6.659401336820692e-06, "loss": 0.6656, "mean_token_accuracy": 0.7997888430953026, "num_tokens": 16520584.0, "step": 13750 }, { "entropy": 1.9007338181138038, "epoch": 0.04265479606838097, "grad_norm": 10.395405769348145, "learning_rate": 6.6642448900513426e-06, "loss": 0.6244, "mean_token_accuracy": 0.8126766815781593, "num_tokens": 16532328.0, "step": 13760 }, { "entropy": 1.9072567522525787, "epoch": 0.042685795193430666, "grad_norm": 12.813956260681152, "learning_rate": 6.6690884432819924e-06, "loss": 0.6346, "mean_token_accuracy": 0.81207554936409, "num_tokens": 16543530.0, "step": 13770 }, { "entropy": 1.9019085496664048, "epoch": 0.04271679431848036, "grad_norm": 12.529980659484863, "learning_rate": 6.673931996512642e-06, "loss": 0.66, "mean_token_accuracy": 0.8073429599404335, "num_tokens": 16554610.0, "step": 13780 }, { "entropy": 1.915639691054821, "epoch": 0.04274779344353006, "grad_norm": 6.579958438873291, "learning_rate": 6.678775549743293e-06, "loss": 0.6188, "mean_token_accuracy": 0.8055771961808205, "num_tokens": 16566534.0, "step": 13790 }, { "entropy": 1.897673524916172, "epoch": 0.04277879256857975, "grad_norm": 13.530618667602539, "learning_rate": 6.683619102973943e-06, "loss": 0.5869, "mean_token_accuracy": 0.821336168050766, "num_tokens": 16577725.0, "step": 13800 }, { "entropy": 1.8759017661213875, "epoch": 0.042809791693629445, "grad_norm": 10.226716995239258, "learning_rate": 6.688462656204592e-06, "loss": 0.5829, "mean_token_accuracy": 0.823052391409874, "num_tokens": 16589628.0, "step": 13810 }, { "entropy": 1.8439259082078934, "epoch": 0.04284079081867914, "grad_norm": 10.295611381530762, "learning_rate": 6.693306209435242e-06, "loss": 0.581, "mean_token_accuracy": 0.8230698376893997, "num_tokens": 16601483.0, "step": 13820 }, { "entropy": 1.9123721539974212, "epoch": 0.04287178994372884, "grad_norm": 4.241229057312012, "learning_rate": 6.698149762665892e-06, "loss": 0.5893, "mean_token_accuracy": 0.8144331857562065, "num_tokens": 16613580.0, "step": 13830 }, { "entropy": 1.8960613742470742, "epoch": 0.042902789068778535, "grad_norm": 6.314639091491699, "learning_rate": 6.7029933158965425e-06, "loss": 0.591, "mean_token_accuracy": 0.8104770466685295, "num_tokens": 16626132.0, "step": 13840 }, { "entropy": 1.9086899921298026, "epoch": 0.04293378819382823, "grad_norm": 5.750073432922363, "learning_rate": 6.707836869127192e-06, "loss": 0.6769, "mean_token_accuracy": 0.8074093982577324, "num_tokens": 16637715.0, "step": 13850 }, { "entropy": 1.9291119009256363, "epoch": 0.04296478731887792, "grad_norm": 10.997711181640625, "learning_rate": 6.712680422357842e-06, "loss": 0.64, "mean_token_accuracy": 0.8164422243833542, "num_tokens": 16648790.0, "step": 13860 }, { "entropy": 1.7718395471572876, "epoch": 0.04299578644392762, "grad_norm": 5.029774188995361, "learning_rate": 6.717523975588492e-06, "loss": 0.5216, "mean_token_accuracy": 0.824312150478363, "num_tokens": 16661991.0, "step": 13870 }, { "entropy": 1.897067406773567, "epoch": 0.043026785568977315, "grad_norm": 13.85363483428955, "learning_rate": 6.722367528819143e-06, "loss": 0.6711, "mean_token_accuracy": 0.8090870007872581, "num_tokens": 16672565.0, "step": 13880 }, { "entropy": 1.8265616819262505, "epoch": 0.04305778469402701, "grad_norm": 13.524333000183105, "learning_rate": 6.727211082049793e-06, "loss": 0.6015, "mean_token_accuracy": 0.8150227144360542, "num_tokens": 16685456.0, "step": 13890 }, { "entropy": 1.8813450008630752, "epoch": 0.04308878381907671, "grad_norm": 12.762773513793945, "learning_rate": 6.732054635280443e-06, "loss": 0.5886, "mean_token_accuracy": 0.8163325443863869, "num_tokens": 16697034.0, "step": 13900 }, { "entropy": 1.8345113858580588, "epoch": 0.043119782944126404, "grad_norm": 12.19513988494873, "learning_rate": 6.736898188511092e-06, "loss": 0.5777, "mean_token_accuracy": 0.8157405987381935, "num_tokens": 16708871.0, "step": 13910 }, { "entropy": 1.8650273010134697, "epoch": 0.043150782069176094, "grad_norm": 12.387640953063965, "learning_rate": 6.7417417417417415e-06, "loss": 0.6953, "mean_token_accuracy": 0.8010308906435967, "num_tokens": 16720631.0, "step": 13920 }, { "entropy": 1.8238304048776626, "epoch": 0.04318178119422579, "grad_norm": 10.031027793884277, "learning_rate": 6.746585294972392e-06, "loss": 0.5536, "mean_token_accuracy": 0.8242588087916374, "num_tokens": 16733234.0, "step": 13930 }, { "entropy": 1.8230559036135674, "epoch": 0.04321278031927549, "grad_norm": 12.307608604431152, "learning_rate": 6.751428848203042e-06, "loss": 0.5682, "mean_token_accuracy": 0.8173372864723205, "num_tokens": 16745599.0, "step": 13940 }, { "entropy": 1.9341980874538423, "epoch": 0.043243779444325184, "grad_norm": 12.861936569213867, "learning_rate": 6.756272401433692e-06, "loss": 0.6583, "mean_token_accuracy": 0.8072870954871177, "num_tokens": 16756209.0, "step": 13950 }, { "entropy": 1.8573587149381638, "epoch": 0.04327477856937488, "grad_norm": 11.555964469909668, "learning_rate": 6.761115954664342e-06, "loss": 0.5883, "mean_token_accuracy": 0.8267058104276657, "num_tokens": 16767433.0, "step": 13960 }, { "entropy": 1.8955969214439392, "epoch": 0.04330577769442458, "grad_norm": 11.410296440124512, "learning_rate": 6.765959507894993e-06, "loss": 0.5956, "mean_token_accuracy": 0.8207617849111557, "num_tokens": 16778097.0, "step": 13970 }, { "entropy": 1.876904509961605, "epoch": 0.043336776819474274, "grad_norm": 10.337623596191406, "learning_rate": 6.7708030611256425e-06, "loss": 0.636, "mean_token_accuracy": 0.8040330380201339, "num_tokens": 16789800.0, "step": 13980 }, { "entropy": 1.9401722326874733, "epoch": 0.04336777594452396, "grad_norm": 11.589249610900879, "learning_rate": 6.7756466143562924e-06, "loss": 0.6594, "mean_token_accuracy": 0.8035330668091774, "num_tokens": 16801605.0, "step": 13990 }, { "entropy": 1.890137755870819, "epoch": 0.04339877506957366, "grad_norm": 10.558548927307129, "learning_rate": 6.780490167586943e-06, "loss": 0.6248, "mean_token_accuracy": 0.8102419808506965, "num_tokens": 16813740.0, "step": 14000 }, { "entropy": 1.888036273419857, "epoch": 0.043429774194623356, "grad_norm": 6.69431734085083, "learning_rate": 6.785333720817593e-06, "loss": 0.6036, "mean_token_accuracy": 0.81003537774086, "num_tokens": 16826251.0, "step": 14010 }, { "entropy": 1.949825246632099, "epoch": 0.04346077331967305, "grad_norm": 11.190848350524902, "learning_rate": 6.790177274048242e-06, "loss": 0.6626, "mean_token_accuracy": 0.8077096566557884, "num_tokens": 16837543.0, "step": 14020 }, { "entropy": 1.8271329566836356, "epoch": 0.04349177244472275, "grad_norm": 5.520564556121826, "learning_rate": 6.795020827278892e-06, "loss": 0.5847, "mean_token_accuracy": 0.8192524507641792, "num_tokens": 16850237.0, "step": 14030 }, { "entropy": 1.8742077186703683, "epoch": 0.043522771569772446, "grad_norm": 12.861546516418457, "learning_rate": 6.799864380509542e-06, "loss": 0.5947, "mean_token_accuracy": 0.8112540423870087, "num_tokens": 16861883.0, "step": 14040 }, { "entropy": 1.813956792652607, "epoch": 0.043553770694822136, "grad_norm": 13.800911903381348, "learning_rate": 6.8047079337401926e-06, "loss": 0.5804, "mean_token_accuracy": 0.8147607937455177, "num_tokens": 16874756.0, "step": 14050 }, { "entropy": 1.9326902404427528, "epoch": 0.04358476981987183, "grad_norm": 11.105195045471191, "learning_rate": 6.8095514869708425e-06, "loss": 0.6847, "mean_token_accuracy": 0.793427674472332, "num_tokens": 16885758.0, "step": 14060 }, { "entropy": 1.9657369270920753, "epoch": 0.04361576894492153, "grad_norm": 11.325508117675781, "learning_rate": 6.814395040201492e-06, "loss": 0.6517, "mean_token_accuracy": 0.7974928095936775, "num_tokens": 16896989.0, "step": 14070 }, { "entropy": 1.7869946122169496, "epoch": 0.043646768069971226, "grad_norm": 10.723480224609375, "learning_rate": 6.819238593432142e-06, "loss": 0.5484, "mean_token_accuracy": 0.8241583168506622, "num_tokens": 16910317.0, "step": 14080 }, { "entropy": 1.886980764567852, "epoch": 0.04367776719502092, "grad_norm": 10.611824989318848, "learning_rate": 6.824082146662793e-06, "loss": 0.5972, "mean_token_accuracy": 0.821302755177021, "num_tokens": 16921546.0, "step": 14090 }, { "entropy": 1.795514563471079, "epoch": 0.04370876632007062, "grad_norm": 3.9574859142303467, "learning_rate": 6.828925699893443e-06, "loss": 0.5667, "mean_token_accuracy": 0.8096185430884362, "num_tokens": 16934810.0, "step": 14100 }, { "entropy": 1.8915703102946282, "epoch": 0.04373976544512031, "grad_norm": 12.432307243347168, "learning_rate": 6.833769253124093e-06, "loss": 0.6694, "mean_token_accuracy": 0.7994834899902343, "num_tokens": 16946379.0, "step": 14110 }, { "entropy": 1.872760045528412, "epoch": 0.043770764570170005, "grad_norm": 12.616002082824707, "learning_rate": 6.838612806354742e-06, "loss": 0.6253, "mean_token_accuracy": 0.8020053207874298, "num_tokens": 16958537.0, "step": 14120 }, { "entropy": 1.8190848156809807, "epoch": 0.0438017636952197, "grad_norm": 6.749497890472412, "learning_rate": 6.843456359585392e-06, "loss": 0.566, "mean_token_accuracy": 0.8224000081419944, "num_tokens": 16971281.0, "step": 14130 }, { "entropy": 1.7950861573219299, "epoch": 0.0438327628202694, "grad_norm": 5.5427021980285645, "learning_rate": 6.848299912816042e-06, "loss": 0.523, "mean_token_accuracy": 0.821921581029892, "num_tokens": 16983378.0, "step": 14140 }, { "entropy": 1.7081642150878906, "epoch": 0.043863761945319095, "grad_norm": 14.56338882446289, "learning_rate": 6.853143466046692e-06, "loss": 0.4977, "mean_token_accuracy": 0.8312652423977852, "num_tokens": 16996461.0, "step": 14150 }, { "entropy": 1.7555221557617187, "epoch": 0.04389476107036879, "grad_norm": 5.769707679748535, "learning_rate": 6.857987019277342e-06, "loss": 0.5165, "mean_token_accuracy": 0.81901466101408, "num_tokens": 17009223.0, "step": 14160 }, { "entropy": 1.8228839874267577, "epoch": 0.04392576019541848, "grad_norm": 14.009598731994629, "learning_rate": 6.862830572507992e-06, "loss": 0.5565, "mean_token_accuracy": 0.8186228111386299, "num_tokens": 17021369.0, "step": 14170 }, { "entropy": 1.8004897370934487, "epoch": 0.04395675932046818, "grad_norm": 13.954216957092285, "learning_rate": 6.867674125738643e-06, "loss": 0.6168, "mean_token_accuracy": 0.8179115906357766, "num_tokens": 17033773.0, "step": 14180 }, { "entropy": 1.8727715358138084, "epoch": 0.043987758445517874, "grad_norm": 5.836693286895752, "learning_rate": 6.872517678969293e-06, "loss": 0.63, "mean_token_accuracy": 0.8152304857969284, "num_tokens": 17045247.0, "step": 14190 }, { "entropy": 1.9000848352909088, "epoch": 0.04401875757056757, "grad_norm": 13.797229766845703, "learning_rate": 6.8773612321999425e-06, "loss": 0.6389, "mean_token_accuracy": 0.8070854544639587, "num_tokens": 17056322.0, "step": 14200 }, { "entropy": 1.8419507443904877, "epoch": 0.04404975669561727, "grad_norm": 10.07559585571289, "learning_rate": 6.882204785430593e-06, "loss": 0.607, "mean_token_accuracy": 0.8120106473565102, "num_tokens": 17068663.0, "step": 14210 }, { "entropy": 1.8738445043563843, "epoch": 0.044080755820666964, "grad_norm": 12.083559036254883, "learning_rate": 6.887048338661243e-06, "loss": 0.597, "mean_token_accuracy": 0.8096369743347168, "num_tokens": 17080468.0, "step": 14220 }, { "entropy": 1.8636516377329826, "epoch": 0.044111754945716654, "grad_norm": 10.079278945922852, "learning_rate": 6.891891891891892e-06, "loss": 0.6145, "mean_token_accuracy": 0.8174061790108681, "num_tokens": 17091833.0, "step": 14230 }, { "entropy": 1.8882641837000846, "epoch": 0.04414275407076635, "grad_norm": 13.434144973754883, "learning_rate": 6.896735445122542e-06, "loss": 0.6377, "mean_token_accuracy": 0.8117697656154632, "num_tokens": 17102951.0, "step": 14240 }, { "entropy": 1.9334753528237343, "epoch": 0.04417375319581605, "grad_norm": 9.094237327575684, "learning_rate": 6.901578998353192e-06, "loss": 0.6594, "mean_token_accuracy": 0.8049651876091957, "num_tokens": 17114582.0, "step": 14250 }, { "entropy": 1.8502332031726838, "epoch": 0.044204752320865744, "grad_norm": 9.20546817779541, "learning_rate": 6.906422551583843e-06, "loss": 0.6099, "mean_token_accuracy": 0.8162536874413491, "num_tokens": 17127285.0, "step": 14260 }, { "entropy": 1.8943143799901008, "epoch": 0.04423575144591544, "grad_norm": 10.137446403503418, "learning_rate": 6.9112661048144926e-06, "loss": 0.6203, "mean_token_accuracy": 0.8087219893932343, "num_tokens": 17139089.0, "step": 14270 }, { "entropy": 1.887183803319931, "epoch": 0.04426675057096514, "grad_norm": 10.977912902832031, "learning_rate": 6.9161096580451424e-06, "loss": 0.6091, "mean_token_accuracy": 0.8069622635841369, "num_tokens": 17151186.0, "step": 14280 }, { "entropy": 1.8306689888238907, "epoch": 0.04429774969601483, "grad_norm": 11.67708969116211, "learning_rate": 6.920953211275792e-06, "loss": 0.567, "mean_token_accuracy": 0.8185241803526878, "num_tokens": 17162472.0, "step": 14290 }, { "entropy": 1.9653134107589723, "epoch": 0.04432874882106452, "grad_norm": 12.62784481048584, "learning_rate": 6.925796764506443e-06, "loss": 0.7129, "mean_token_accuracy": 0.7954209297895432, "num_tokens": 17173491.0, "step": 14300 }, { "entropy": 1.8972666263580322, "epoch": 0.04435974794611422, "grad_norm": 10.280572891235352, "learning_rate": 6.930640317737093e-06, "loss": 0.6188, "mean_token_accuracy": 0.8118947297334671, "num_tokens": 17184219.0, "step": 14310 }, { "entropy": 1.8529438108205796, "epoch": 0.044390747071163916, "grad_norm": 13.168403625488281, "learning_rate": 6.935483870967743e-06, "loss": 0.5719, "mean_token_accuracy": 0.8159810289740562, "num_tokens": 17195740.0, "step": 14320 }, { "entropy": 1.8751512482762336, "epoch": 0.04442174619621361, "grad_norm": 5.939009666442871, "learning_rate": 6.940327424198392e-06, "loss": 0.6072, "mean_token_accuracy": 0.8138178676366806, "num_tokens": 17207488.0, "step": 14330 }, { "entropy": 1.9205424144864083, "epoch": 0.04445274532126331, "grad_norm": 9.95887565612793, "learning_rate": 6.945170977429042e-06, "loss": 0.6255, "mean_token_accuracy": 0.8147617742419243, "num_tokens": 17218552.0, "step": 14340 }, { "entropy": 1.9514837980270385, "epoch": 0.044483744446313006, "grad_norm": 11.541547775268555, "learning_rate": 6.9500145306596925e-06, "loss": 0.7025, "mean_token_accuracy": 0.7957576259970665, "num_tokens": 17229630.0, "step": 14350 }, { "entropy": 1.9413245290517807, "epoch": 0.044514743571362696, "grad_norm": 11.1705322265625, "learning_rate": 6.954858083890342e-06, "loss": 0.6673, "mean_token_accuracy": 0.8048760443925858, "num_tokens": 17240633.0, "step": 14360 }, { "entropy": 1.8450009673833847, "epoch": 0.04454574269641239, "grad_norm": 6.041028022766113, "learning_rate": 6.959701637120992e-06, "loss": 0.6093, "mean_token_accuracy": 0.8054246112704277, "num_tokens": 17253698.0, "step": 14370 }, { "entropy": 1.833503720164299, "epoch": 0.04457674182146209, "grad_norm": 4.289499759674072, "learning_rate": 6.964545190351643e-06, "loss": 0.5688, "mean_token_accuracy": 0.8216146498918533, "num_tokens": 17266350.0, "step": 14380 }, { "entropy": 1.7655683636665345, "epoch": 0.044607740946511786, "grad_norm": 12.362275123596191, "learning_rate": 6.969388743582293e-06, "loss": 0.5197, "mean_token_accuracy": 0.8274104654788971, "num_tokens": 17279397.0, "step": 14390 }, { "entropy": 1.8648739516735078, "epoch": 0.04463874007156148, "grad_norm": 11.831520080566406, "learning_rate": 6.974232296812943e-06, "loss": 0.5705, "mean_token_accuracy": 0.8219448134303093, "num_tokens": 17291500.0, "step": 14400 }, { "entropy": 1.8236894220113755, "epoch": 0.04466973919661118, "grad_norm": 10.356793403625488, "learning_rate": 6.979075850043593e-06, "loss": 0.5211, "mean_token_accuracy": 0.8268671408295631, "num_tokens": 17304068.0, "step": 14410 }, { "entropy": 1.8797018930315972, "epoch": 0.04470073832166087, "grad_norm": 9.828136444091797, "learning_rate": 6.983919403274243e-06, "loss": 0.5973, "mean_token_accuracy": 0.8069930672645569, "num_tokens": 17316443.0, "step": 14420 }, { "entropy": 1.9420302003622054, "epoch": 0.044731737446710565, "grad_norm": 12.722853660583496, "learning_rate": 6.988762956504893e-06, "loss": 0.6601, "mean_token_accuracy": 0.8111462906002999, "num_tokens": 17326906.0, "step": 14430 }, { "entropy": 1.8793850436806678, "epoch": 0.04476273657176026, "grad_norm": 13.556890487670898, "learning_rate": 6.993606509735542e-06, "loss": 0.5825, "mean_token_accuracy": 0.8150814548134804, "num_tokens": 17338259.0, "step": 14440 }, { "entropy": 1.8735877990722656, "epoch": 0.04479373569680996, "grad_norm": 10.880188941955566, "learning_rate": 6.998450062966192e-06, "loss": 0.5381, "mean_token_accuracy": 0.8156931459903717, "num_tokens": 17351172.0, "step": 14450 }, { "entropy": 1.8445069909095764, "epoch": 0.044824734821859655, "grad_norm": 11.748570442199707, "learning_rate": 7.003293616196842e-06, "loss": 0.5565, "mean_token_accuracy": 0.8234645172953605, "num_tokens": 17363083.0, "step": 14460 }, { "entropy": 1.8664287984371186, "epoch": 0.04485573394690935, "grad_norm": 11.027554512023926, "learning_rate": 7.008137169427493e-06, "loss": 0.6284, "mean_token_accuracy": 0.8028878584504128, "num_tokens": 17374802.0, "step": 14470 }, { "entropy": 1.8942121878266334, "epoch": 0.04488673307195904, "grad_norm": 12.925647735595703, "learning_rate": 7.012980722658143e-06, "loss": 0.6221, "mean_token_accuracy": 0.8129858180880547, "num_tokens": 17386778.0, "step": 14480 }, { "entropy": 1.8534621268510818, "epoch": 0.04491773219700874, "grad_norm": 11.046483039855957, "learning_rate": 7.0178242758887926e-06, "loss": 0.5628, "mean_token_accuracy": 0.8115181505680085, "num_tokens": 17399462.0, "step": 14490 }, { "entropy": 1.8930703341960906, "epoch": 0.044948731322058434, "grad_norm": 14.628288269042969, "learning_rate": 7.0226678291194424e-06, "loss": 0.6421, "mean_token_accuracy": 0.8067412465810776, "num_tokens": 17411740.0, "step": 14500 }, { "entropy": 1.9142782092094421, "epoch": 0.04497973044710813, "grad_norm": 13.652180671691895, "learning_rate": 7.027511382350093e-06, "loss": 0.6096, "mean_token_accuracy": 0.8202868893742561, "num_tokens": 17423288.0, "step": 14510 }, { "entropy": 1.9144923388957977, "epoch": 0.04501072957215783, "grad_norm": 9.822308540344238, "learning_rate": 7.032354935580743e-06, "loss": 0.5705, "mean_token_accuracy": 0.8314526736736297, "num_tokens": 17434455.0, "step": 14520 }, { "entropy": 1.926996847987175, "epoch": 0.045041728697207524, "grad_norm": 12.479947090148926, "learning_rate": 7.037198488811393e-06, "loss": 0.6008, "mean_token_accuracy": 0.8161266520619392, "num_tokens": 17445979.0, "step": 14530 }, { "entropy": 1.9374195352196693, "epoch": 0.045072727822257214, "grad_norm": 15.353309631347656, "learning_rate": 7.042042042042042e-06, "loss": 0.6577, "mean_token_accuracy": 0.8026217222213745, "num_tokens": 17457698.0, "step": 14540 }, { "entropy": 1.960123062133789, "epoch": 0.04510372694730691, "grad_norm": 13.066014289855957, "learning_rate": 7.046885595272692e-06, "loss": 0.664, "mean_token_accuracy": 0.8060840681195259, "num_tokens": 17468922.0, "step": 14550 }, { "entropy": 1.9222340703010559, "epoch": 0.04513472607235661, "grad_norm": 11.0694580078125, "learning_rate": 7.051729148503343e-06, "loss": 0.6043, "mean_token_accuracy": 0.8135023102164268, "num_tokens": 17480520.0, "step": 14560 }, { "entropy": 1.8666946336627006, "epoch": 0.0451657251974063, "grad_norm": 7.747961044311523, "learning_rate": 7.0565727017339925e-06, "loss": 0.5063, "mean_token_accuracy": 0.837678550183773, "num_tokens": 17493323.0, "step": 14570 }, { "entropy": 1.8420935034751893, "epoch": 0.045196724322456, "grad_norm": 10.135159492492676, "learning_rate": 7.061416254964642e-06, "loss": 0.502, "mean_token_accuracy": 0.8299189880490303, "num_tokens": 17506110.0, "step": 14580 }, { "entropy": 1.9041017875075341, "epoch": 0.0452277234475057, "grad_norm": 9.9268159866333, "learning_rate": 7.066259808195293e-06, "loss": 0.6489, "mean_token_accuracy": 0.8125099197030068, "num_tokens": 17517261.0, "step": 14590 }, { "entropy": 1.8802707374095917, "epoch": 0.045258722572555386, "grad_norm": 9.548727989196777, "learning_rate": 7.071103361425943e-06, "loss": 0.6023, "mean_token_accuracy": 0.8187322363257408, "num_tokens": 17529058.0, "step": 14600 }, { "entropy": 1.9222388476133347, "epoch": 0.04528972169760508, "grad_norm": 13.59824275970459, "learning_rate": 7.075946914656593e-06, "loss": 0.6298, "mean_token_accuracy": 0.8107534229755402, "num_tokens": 17540313.0, "step": 14610 }, { "entropy": 1.8212895065546035, "epoch": 0.04532072082265478, "grad_norm": 5.862724304199219, "learning_rate": 7.080790467887243e-06, "loss": 0.5766, "mean_token_accuracy": 0.8119758501648903, "num_tokens": 17553154.0, "step": 14620 }, { "entropy": 1.919022636115551, "epoch": 0.045351719947704476, "grad_norm": 11.380285263061523, "learning_rate": 7.0856340211178935e-06, "loss": 0.6269, "mean_token_accuracy": 0.804240868985653, "num_tokens": 17564581.0, "step": 14630 }, { "entropy": 1.8831742450594902, "epoch": 0.04538271907275417, "grad_norm": 12.382061004638672, "learning_rate": 7.090477574348543e-06, "loss": 0.5588, "mean_token_accuracy": 0.8190920054912567, "num_tokens": 17576403.0, "step": 14640 }, { "entropy": 1.8691495105624198, "epoch": 0.04541371819780387, "grad_norm": 5.398478984832764, "learning_rate": 7.095321127579192e-06, "loss": 0.5668, "mean_token_accuracy": 0.8149119704961777, "num_tokens": 17587726.0, "step": 14650 }, { "entropy": 1.7776469945907594, "epoch": 0.045444717322853566, "grad_norm": 11.435405731201172, "learning_rate": 7.100164680809842e-06, "loss": 0.5784, "mean_token_accuracy": 0.8274336785078049, "num_tokens": 17600178.0, "step": 14660 }, { "entropy": 1.8829714879393578, "epoch": 0.045475716447903256, "grad_norm": 6.995675563812256, "learning_rate": 7.105008234040492e-06, "loss": 0.6475, "mean_token_accuracy": 0.8128258779644966, "num_tokens": 17611382.0, "step": 14670 }, { "entropy": 1.906617347896099, "epoch": 0.04550671557295295, "grad_norm": 6.328146457672119, "learning_rate": 7.109851787271143e-06, "loss": 0.6191, "mean_token_accuracy": 0.8149865731596947, "num_tokens": 17623005.0, "step": 14680 }, { "entropy": 1.8847622662782668, "epoch": 0.04553771469800265, "grad_norm": 12.374866485595703, "learning_rate": 7.114695340501793e-06, "loss": 0.5861, "mean_token_accuracy": 0.8213684529066085, "num_tokens": 17634543.0, "step": 14690 }, { "entropy": 1.8927973687648774, "epoch": 0.045568713823052345, "grad_norm": 11.521233558654785, "learning_rate": 7.119538893732443e-06, "loss": 0.5843, "mean_token_accuracy": 0.8219080328941345, "num_tokens": 17645810.0, "step": 14700 }, { "entropy": 1.9088530361652374, "epoch": 0.04559971294810204, "grad_norm": 11.580865859985352, "learning_rate": 7.1243824469630925e-06, "loss": 0.6404, "mean_token_accuracy": 0.8121732175350189, "num_tokens": 17657230.0, "step": 14710 }, { "entropy": 1.906784760951996, "epoch": 0.04563071207315174, "grad_norm": 10.853860855102539, "learning_rate": 7.129226000193743e-06, "loss": 0.6028, "mean_token_accuracy": 0.8171251997351646, "num_tokens": 17669157.0, "step": 14720 }, { "entropy": 1.8272288024425507, "epoch": 0.04566171119820143, "grad_norm": 11.097689628601074, "learning_rate": 7.134069553424393e-06, "loss": 0.5436, "mean_token_accuracy": 0.8214239537715912, "num_tokens": 17682522.0, "step": 14730 }, { "entropy": 1.9602118730545044, "epoch": 0.045692710323251125, "grad_norm": 11.482585906982422, "learning_rate": 7.138913106655043e-06, "loss": 0.6694, "mean_token_accuracy": 0.8125876560807228, "num_tokens": 17693830.0, "step": 14740 }, { "entropy": 1.8845744907855988, "epoch": 0.04572370944830082, "grad_norm": 11.898690223693848, "learning_rate": 7.143756659885692e-06, "loss": 0.621, "mean_token_accuracy": 0.8045976728200912, "num_tokens": 17706045.0, "step": 14750 }, { "entropy": 1.8579593807458878, "epoch": 0.04575470857335052, "grad_norm": 5.022470474243164, "learning_rate": 7.148600213116342e-06, "loss": 0.6451, "mean_token_accuracy": 0.804808932542801, "num_tokens": 17718119.0, "step": 14760 }, { "entropy": 1.9066347777843475, "epoch": 0.045785707698400215, "grad_norm": 10.700624465942383, "learning_rate": 7.153443766346993e-06, "loss": 0.61, "mean_token_accuracy": 0.8180598929524422, "num_tokens": 17729029.0, "step": 14770 }, { "entropy": 1.8993641972541808, "epoch": 0.04581670682344991, "grad_norm": 10.812292098999023, "learning_rate": 7.1582873195776426e-06, "loss": 0.6628, "mean_token_accuracy": 0.8115692853927612, "num_tokens": 17740072.0, "step": 14780 }, { "entropy": 1.8603774085640907, "epoch": 0.0458477059484996, "grad_norm": 13.853246688842773, "learning_rate": 7.1631308728082925e-06, "loss": 0.609, "mean_token_accuracy": 0.8194773256778717, "num_tokens": 17751523.0, "step": 14790 }, { "entropy": 1.7647764384746552, "epoch": 0.0458787050735493, "grad_norm": 13.947328567504883, "learning_rate": 7.167974426038943e-06, "loss": 0.5377, "mean_token_accuracy": 0.8233131125569344, "num_tokens": 17763817.0, "step": 14800 }, { "entropy": 1.750529208779335, "epoch": 0.045909704198598994, "grad_norm": 5.576157569885254, "learning_rate": 7.172817979269593e-06, "loss": 0.5442, "mean_token_accuracy": 0.8132794231176377, "num_tokens": 17777124.0, "step": 14810 }, { "entropy": 1.851732324063778, "epoch": 0.04594070332364869, "grad_norm": 11.042110443115234, "learning_rate": 7.177661532500243e-06, "loss": 0.6467, "mean_token_accuracy": 0.8058951094746589, "num_tokens": 17788612.0, "step": 14820 }, { "entropy": 1.7839807882905006, "epoch": 0.04597170244869839, "grad_norm": 10.22685718536377, "learning_rate": 7.182505085730893e-06, "loss": 0.5832, "mean_token_accuracy": 0.8168496385216713, "num_tokens": 17801156.0, "step": 14830 }, { "entropy": 1.8429742008447647, "epoch": 0.046002701573748084, "grad_norm": 5.943838596343994, "learning_rate": 7.1873486389615436e-06, "loss": 0.5539, "mean_token_accuracy": 0.8247615218162536, "num_tokens": 17813245.0, "step": 14840 }, { "entropy": 1.8560289978981017, "epoch": 0.04603370069879777, "grad_norm": 11.962748527526855, "learning_rate": 7.1921921921921935e-06, "loss": 0.651, "mean_token_accuracy": 0.8067975297570229, "num_tokens": 17824749.0, "step": 14850 }, { "entropy": 1.7992481097579003, "epoch": 0.04606469982384747, "grad_norm": 13.64804744720459, "learning_rate": 7.1970357454228425e-06, "loss": 0.5454, "mean_token_accuracy": 0.8238918125629425, "num_tokens": 17838276.0, "step": 14860 }, { "entropy": 1.925189484655857, "epoch": 0.04609569894889717, "grad_norm": 10.546503067016602, "learning_rate": 7.201879298653492e-06, "loss": 0.6402, "mean_token_accuracy": 0.8055410549044609, "num_tokens": 17849457.0, "step": 14870 }, { "entropy": 1.939789319038391, "epoch": 0.04612669807394686, "grad_norm": 13.753273963928223, "learning_rate": 7.206722851884142e-06, "loss": 0.6455, "mean_token_accuracy": 0.807138554751873, "num_tokens": 17860102.0, "step": 14880 }, { "entropy": 1.8846666172146798, "epoch": 0.04615769719899656, "grad_norm": 11.280816078186035, "learning_rate": 7.211566405114793e-06, "loss": 0.6067, "mean_token_accuracy": 0.8096478402614593, "num_tokens": 17871606.0, "step": 14890 }, { "entropy": 1.918617771565914, "epoch": 0.046188696324046256, "grad_norm": 10.672183990478516, "learning_rate": 7.216409958345443e-06, "loss": 0.6779, "mean_token_accuracy": 0.8008549973368645, "num_tokens": 17882968.0, "step": 14900 }, { "entropy": 1.9250801861286164, "epoch": 0.046219695449095946, "grad_norm": 10.703913688659668, "learning_rate": 7.221253511576093e-06, "loss": 0.6541, "mean_token_accuracy": 0.8061710268259048, "num_tokens": 17894617.0, "step": 14910 }, { "entropy": 1.8236558228731155, "epoch": 0.04625069457414564, "grad_norm": 11.690652847290039, "learning_rate": 7.226097064806743e-06, "loss": 0.5484, "mean_token_accuracy": 0.8214540228247642, "num_tokens": 17907854.0, "step": 14920 }, { "entropy": 1.8554262310266494, "epoch": 0.04628169369919534, "grad_norm": 10.404338836669922, "learning_rate": 7.230940618037393e-06, "loss": 0.5821, "mean_token_accuracy": 0.8083539769053459, "num_tokens": 17920382.0, "step": 14930 }, { "entropy": 1.855211953818798, "epoch": 0.046312692824245036, "grad_norm": 10.875576972961426, "learning_rate": 7.235784171268043e-06, "loss": 0.5979, "mean_token_accuracy": 0.8178864941000938, "num_tokens": 17932205.0, "step": 14940 }, { "entropy": 1.8454056218266488, "epoch": 0.04634369194929473, "grad_norm": 11.040106773376465, "learning_rate": 7.240627724498693e-06, "loss": 0.58, "mean_token_accuracy": 0.8090640112757683, "num_tokens": 17943421.0, "step": 14950 }, { "entropy": 1.901095400750637, "epoch": 0.04637469107434443, "grad_norm": 10.230569839477539, "learning_rate": 7.245471277729342e-06, "loss": 0.5863, "mean_token_accuracy": 0.8283701628446579, "num_tokens": 17954715.0, "step": 14960 }, { "entropy": 1.8909978330135346, "epoch": 0.04640569019939412, "grad_norm": 12.003896713256836, "learning_rate": 7.250314830959992e-06, "loss": 0.6415, "mean_token_accuracy": 0.809289188683033, "num_tokens": 17966078.0, "step": 14970 }, { "entropy": 1.893742746114731, "epoch": 0.046436689324443815, "grad_norm": 12.008511543273926, "learning_rate": 7.255158384190643e-06, "loss": 0.5831, "mean_token_accuracy": 0.8165367186069489, "num_tokens": 17977838.0, "step": 14980 }, { "entropy": 1.9413186386227608, "epoch": 0.04646768844949351, "grad_norm": 11.98163890838623, "learning_rate": 7.260001937421293e-06, "loss": 0.632, "mean_token_accuracy": 0.8125703752040863, "num_tokens": 17989242.0, "step": 14990 }, { "entropy": 1.8333601012825966, "epoch": 0.04649868757454321, "grad_norm": 5.240977764129639, "learning_rate": 7.2648454906519426e-06, "loss": 0.537, "mean_token_accuracy": 0.8350656241178512, "num_tokens": 18001491.0, "step": 15000 }, { "entropy": 1.8348119348287582, "epoch": 0.046529686699592905, "grad_norm": 5.844879150390625, "learning_rate": 7.269689043882593e-06, "loss": 0.602, "mean_token_accuracy": 0.8153130680322647, "num_tokens": 18014233.0, "step": 15010 }, { "entropy": 1.8076252147555352, "epoch": 0.0465606858246426, "grad_norm": 5.277684688568115, "learning_rate": 7.274532597113243e-06, "loss": 0.5404, "mean_token_accuracy": 0.824401643872261, "num_tokens": 18027418.0, "step": 15020 }, { "entropy": 1.9218341365456582, "epoch": 0.0465916849496923, "grad_norm": 10.615427017211914, "learning_rate": 7.279376150343893e-06, "loss": 0.608, "mean_token_accuracy": 0.8177351862192154, "num_tokens": 18038854.0, "step": 15030 }, { "entropy": 1.9281207531690598, "epoch": 0.04662268407474199, "grad_norm": 9.128558158874512, "learning_rate": 7.284219703574543e-06, "loss": 0.598, "mean_token_accuracy": 0.8243363618850708, "num_tokens": 18049861.0, "step": 15040 }, { "entropy": 1.8921822875738143, "epoch": 0.046653683199791685, "grad_norm": 13.774589538574219, "learning_rate": 7.289063256805194e-06, "loss": 0.6362, "mean_token_accuracy": 0.8057242497801781, "num_tokens": 18061596.0, "step": 15050 }, { "entropy": 1.832139255106449, "epoch": 0.04668468232484138, "grad_norm": 10.49471378326416, "learning_rate": 7.2939068100358436e-06, "loss": 0.5717, "mean_token_accuracy": 0.816807533800602, "num_tokens": 18073371.0, "step": 15060 }, { "entropy": 1.8594244837760925, "epoch": 0.04671568144989108, "grad_norm": 11.264952659606934, "learning_rate": 7.298750363266493e-06, "loss": 0.6349, "mean_token_accuracy": 0.8005358681082726, "num_tokens": 18086010.0, "step": 15070 }, { "entropy": 1.8758624821901322, "epoch": 0.046746680574940774, "grad_norm": 13.645959854125977, "learning_rate": 7.3035939164971425e-06, "loss": 0.5787, "mean_token_accuracy": 0.8165410995483399, "num_tokens": 18097503.0, "step": 15080 }, { "entropy": 1.8632541045546531, "epoch": 0.04677767969999047, "grad_norm": 12.606849670410156, "learning_rate": 7.308437469727792e-06, "loss": 0.5916, "mean_token_accuracy": 0.8230202242732048, "num_tokens": 18109025.0, "step": 15090 }, { "entropy": 1.8468170419335366, "epoch": 0.04680867882504016, "grad_norm": 5.763012409210205, "learning_rate": 7.313281022958443e-06, "loss": 0.5543, "mean_token_accuracy": 0.82147196829319, "num_tokens": 18120920.0, "step": 15100 }, { "entropy": 1.8019221499562263, "epoch": 0.04683967795008986, "grad_norm": 12.194572448730469, "learning_rate": 7.318124576189093e-06, "loss": 0.534, "mean_token_accuracy": 0.8267902106046676, "num_tokens": 18134004.0, "step": 15110 }, { "entropy": 1.9301525950431824, "epoch": 0.046870677075139554, "grad_norm": 11.044305801391602, "learning_rate": 7.322968129419743e-06, "loss": 0.7477, "mean_token_accuracy": 0.798159584403038, "num_tokens": 18145789.0, "step": 15120 }, { "entropy": 1.9049590498209, "epoch": 0.04690167620018925, "grad_norm": 12.671975135803223, "learning_rate": 7.327811682650393e-06, "loss": 0.6079, "mean_token_accuracy": 0.8189090758562088, "num_tokens": 18157088.0, "step": 15130 }, { "entropy": 1.8682396605610847, "epoch": 0.04693267532523895, "grad_norm": 9.93550968170166, "learning_rate": 7.3326552358810435e-06, "loss": 0.5791, "mean_token_accuracy": 0.8179777503013611, "num_tokens": 18169072.0, "step": 15140 }, { "entropy": 1.8468847021460533, "epoch": 0.046963674450288644, "grad_norm": 9.431316375732422, "learning_rate": 7.337498789111693e-06, "loss": 0.5507, "mean_token_accuracy": 0.8228641420602798, "num_tokens": 18181019.0, "step": 15150 }, { "entropy": 1.775972270965576, "epoch": 0.04699467357533833, "grad_norm": 10.788688659667969, "learning_rate": 7.342342342342343e-06, "loss": 0.4845, "mean_token_accuracy": 0.8265146508812904, "num_tokens": 18194155.0, "step": 15160 }, { "entropy": 1.8457147806882859, "epoch": 0.04702567270038803, "grad_norm": 9.854268074035645, "learning_rate": 7.347185895572992e-06, "loss": 0.5475, "mean_token_accuracy": 0.8127473801374435, "num_tokens": 18206750.0, "step": 15170 }, { "entropy": 1.7577089250087738, "epoch": 0.047056671825437726, "grad_norm": 5.139494895935059, "learning_rate": 7.352029448803642e-06, "loss": 0.4973, "mean_token_accuracy": 0.8430294960737228, "num_tokens": 18219202.0, "step": 15180 }, { "entropy": 1.8935526996850967, "epoch": 0.04708767095048742, "grad_norm": 5.4716057777404785, "learning_rate": 7.356873002034293e-06, "loss": 0.5882, "mean_token_accuracy": 0.8127918854355812, "num_tokens": 18230861.0, "step": 15190 }, { "entropy": 1.8892264723777772, "epoch": 0.04711867007553712, "grad_norm": 12.728769302368164, "learning_rate": 7.361716555264943e-06, "loss": 0.5607, "mean_token_accuracy": 0.8149558499455452, "num_tokens": 18242915.0, "step": 15200 }, { "entropy": 1.8911462113261224, "epoch": 0.047149669200586816, "grad_norm": 9.735945701599121, "learning_rate": 7.366560108495593e-06, "loss": 0.6442, "mean_token_accuracy": 0.8063351511955261, "num_tokens": 18253977.0, "step": 15210 }, { "entropy": 1.9077469795942306, "epoch": 0.047180668325636506, "grad_norm": 13.403579711914062, "learning_rate": 7.371403661726243e-06, "loss": 0.6695, "mean_token_accuracy": 0.8085703745484352, "num_tokens": 18264673.0, "step": 15220 }, { "entropy": 1.86230625808239, "epoch": 0.0472116674506862, "grad_norm": 5.826930999755859, "learning_rate": 7.376247214956893e-06, "loss": 0.5957, "mean_token_accuracy": 0.8189520880579948, "num_tokens": 18276799.0, "step": 15230 }, { "entropy": 1.9372797414660454, "epoch": 0.0472426665757359, "grad_norm": 12.060833930969238, "learning_rate": 7.381090768187543e-06, "loss": 0.5974, "mean_token_accuracy": 0.8150049105286599, "num_tokens": 18288796.0, "step": 15240 }, { "entropy": 1.788297127187252, "epoch": 0.047273665700785596, "grad_norm": 5.706685543060303, "learning_rate": 7.385934321418193e-06, "loss": 0.5107, "mean_token_accuracy": 0.8224897965788841, "num_tokens": 18301726.0, "step": 15250 }, { "entropy": 1.9241210684180259, "epoch": 0.04730466482583529, "grad_norm": 11.742621421813965, "learning_rate": 7.390777874648844e-06, "loss": 0.6246, "mean_token_accuracy": 0.810030820965767, "num_tokens": 18312680.0, "step": 15260 }, { "entropy": 1.9467894092202187, "epoch": 0.04733566395088499, "grad_norm": 9.387283325195312, "learning_rate": 7.395621427879494e-06, "loss": 0.6334, "mean_token_accuracy": 0.8097245275974274, "num_tokens": 18324139.0, "step": 15270 }, { "entropy": 1.9100206598639489, "epoch": 0.04736666307593468, "grad_norm": 10.441351890563965, "learning_rate": 7.400464981110143e-06, "loss": 0.6332, "mean_token_accuracy": 0.7996808186173439, "num_tokens": 18336228.0, "step": 15280 }, { "entropy": 1.96844242811203, "epoch": 0.047397662200984375, "grad_norm": 11.180095672607422, "learning_rate": 7.405308534340793e-06, "loss": 0.675, "mean_token_accuracy": 0.8107746347784996, "num_tokens": 18346950.0, "step": 15290 }, { "entropy": 1.9150108516216278, "epoch": 0.04742866132603407, "grad_norm": 9.653234481811523, "learning_rate": 7.4101520875714425e-06, "loss": 0.6054, "mean_token_accuracy": 0.8183826908469201, "num_tokens": 18358537.0, "step": 15300 }, { "entropy": 1.8632504418492317, "epoch": 0.04745966045108377, "grad_norm": 13.043754577636719, "learning_rate": 7.414995640802093e-06, "loss": 0.5891, "mean_token_accuracy": 0.8238464057445526, "num_tokens": 18371462.0, "step": 15310 }, { "entropy": 1.8496581763029099, "epoch": 0.047490659576133465, "grad_norm": 11.938183784484863, "learning_rate": 7.419839194032743e-06, "loss": 0.5633, "mean_token_accuracy": 0.827662679553032, "num_tokens": 18384026.0, "step": 15320 }, { "entropy": 1.8263291805982589, "epoch": 0.04752165870118316, "grad_norm": 10.78771686553955, "learning_rate": 7.424682747263393e-06, "loss": 0.5709, "mean_token_accuracy": 0.815859617292881, "num_tokens": 18396414.0, "step": 15330 }, { "entropy": 1.8266072757542133, "epoch": 0.04755265782623285, "grad_norm": 11.682705879211426, "learning_rate": 7.429526300494043e-06, "loss": 0.5583, "mean_token_accuracy": 0.8147292017936707, "num_tokens": 18409403.0, "step": 15340 }, { "entropy": 1.8509322211146355, "epoch": 0.04758365695128255, "grad_norm": 4.418774127960205, "learning_rate": 7.434369853724694e-06, "loss": 0.5744, "mean_token_accuracy": 0.822237353026867, "num_tokens": 18421350.0, "step": 15350 }, { "entropy": 1.8988280490040779, "epoch": 0.047614656076332244, "grad_norm": 11.46986198425293, "learning_rate": 7.4392134069553435e-06, "loss": 0.5755, "mean_token_accuracy": 0.8086514964699745, "num_tokens": 18433565.0, "step": 15360 }, { "entropy": 1.883128097653389, "epoch": 0.04764565520138194, "grad_norm": 10.333133697509766, "learning_rate": 7.444056960185993e-06, "loss": 0.6242, "mean_token_accuracy": 0.8101354941725731, "num_tokens": 18445233.0, "step": 15370 }, { "entropy": 1.7941233783960342, "epoch": 0.04767665432643164, "grad_norm": 4.953238010406494, "learning_rate": 7.448900513416642e-06, "loss": 0.4966, "mean_token_accuracy": 0.8319120317697525, "num_tokens": 18457975.0, "step": 15380 }, { "entropy": 1.9092483133077622, "epoch": 0.047707653451481334, "grad_norm": 11.126029014587402, "learning_rate": 7.453744066647292e-06, "loss": 0.6611, "mean_token_accuracy": 0.8040424823760987, "num_tokens": 18469319.0, "step": 15390 }, { "entropy": 1.8364340022206307, "epoch": 0.04773865257653103, "grad_norm": 5.549501419067383, "learning_rate": 7.458587619877943e-06, "loss": 0.6241, "mean_token_accuracy": 0.8029653668403626, "num_tokens": 18482789.0, "step": 15400 }, { "entropy": 1.8426788106560708, "epoch": 0.04776965170158072, "grad_norm": 11.887903213500977, "learning_rate": 7.463431173108593e-06, "loss": 0.5929, "mean_token_accuracy": 0.8010544419288635, "num_tokens": 18495234.0, "step": 15410 }, { "entropy": 1.898276437819004, "epoch": 0.04780065082663042, "grad_norm": 11.306375503540039, "learning_rate": 7.468274726339243e-06, "loss": 0.5772, "mean_token_accuracy": 0.8193996638059616, "num_tokens": 18507024.0, "step": 15420 }, { "entropy": 1.9499488294124603, "epoch": 0.047831649951680114, "grad_norm": 13.077889442443848, "learning_rate": 7.4731182795698935e-06, "loss": 0.7473, "mean_token_accuracy": 0.7906885385513306, "num_tokens": 18518699.0, "step": 15430 }, { "entropy": 1.8603081166744233, "epoch": 0.04786264907672981, "grad_norm": 11.552556037902832, "learning_rate": 7.477961832800543e-06, "loss": 0.5398, "mean_token_accuracy": 0.8289436608552933, "num_tokens": 18530154.0, "step": 15440 }, { "entropy": 1.8968264564871788, "epoch": 0.04789364820177951, "grad_norm": 12.209922790527344, "learning_rate": 7.482805386031193e-06, "loss": 0.6208, "mean_token_accuracy": 0.8118745431303978, "num_tokens": 18541201.0, "step": 15450 }, { "entropy": 1.7692169472575188, "epoch": 0.0479246473268292, "grad_norm": 11.662141799926758, "learning_rate": 7.487648939261843e-06, "loss": 0.5171, "mean_token_accuracy": 0.8198059305548668, "num_tokens": 18554224.0, "step": 15460 }, { "entropy": 1.927010752260685, "epoch": 0.04795564645187889, "grad_norm": 10.837479591369629, "learning_rate": 7.492492492492494e-06, "loss": 0.5931, "mean_token_accuracy": 0.8046490982174873, "num_tokens": 18566184.0, "step": 15470 }, { "entropy": 1.854862241446972, "epoch": 0.04798664557692859, "grad_norm": 9.293188095092773, "learning_rate": 7.497336045723144e-06, "loss": 0.5992, "mean_token_accuracy": 0.8218624517321587, "num_tokens": 18578244.0, "step": 15480 }, { "entropy": 1.8932930827140808, "epoch": 0.048017644701978286, "grad_norm": 10.518685340881348, "learning_rate": 7.502179598953793e-06, "loss": 0.6514, "mean_token_accuracy": 0.8079543009400367, "num_tokens": 18589205.0, "step": 15490 }, { "entropy": 1.8627614587545396, "epoch": 0.04804864382702798, "grad_norm": 14.004125595092773, "learning_rate": 7.507023152184443e-06, "loss": 0.6486, "mean_token_accuracy": 0.8162886321544647, "num_tokens": 18600091.0, "step": 15500 }, { "entropy": 1.8322096571326256, "epoch": 0.04807964295207768, "grad_norm": 10.797554016113281, "learning_rate": 7.5118667054150926e-06, "loss": 0.5938, "mean_token_accuracy": 0.8135915264487267, "num_tokens": 18611780.0, "step": 15510 }, { "entropy": 1.7482316687703132, "epoch": 0.048110642077127376, "grad_norm": 11.08513069152832, "learning_rate": 7.516710258645743e-06, "loss": 0.581, "mean_token_accuracy": 0.8262606129050255, "num_tokens": 18624882.0, "step": 15520 }, { "entropy": 1.8912085920572281, "epoch": 0.048141641202177066, "grad_norm": 12.140768051147461, "learning_rate": 7.521553811876393e-06, "loss": 0.6816, "mean_token_accuracy": 0.8002603054046631, "num_tokens": 18636954.0, "step": 15530 }, { "entropy": 1.8268685653805732, "epoch": 0.04817264032722676, "grad_norm": 12.404664039611816, "learning_rate": 7.526397365107043e-06, "loss": 0.5911, "mean_token_accuracy": 0.8149344757199287, "num_tokens": 18649266.0, "step": 15540 }, { "entropy": 1.8767468333244324, "epoch": 0.04820363945227646, "grad_norm": 10.459026336669922, "learning_rate": 7.531240918337693e-06, "loss": 0.6271, "mean_token_accuracy": 0.8109247386455536, "num_tokens": 18660751.0, "step": 15550 }, { "entropy": 1.831296342611313, "epoch": 0.048234638577326155, "grad_norm": 10.499650955200195, "learning_rate": 7.536084471568344e-06, "loss": 0.5749, "mean_token_accuracy": 0.8176342695951462, "num_tokens": 18673033.0, "step": 15560 }, { "entropy": 1.8214371725916862, "epoch": 0.04826563770237585, "grad_norm": 14.110614776611328, "learning_rate": 7.5409280247989936e-06, "loss": 0.5775, "mean_token_accuracy": 0.8230008244514465, "num_tokens": 18685183.0, "step": 15570 }, { "entropy": 1.8123807251453399, "epoch": 0.04829663682742555, "grad_norm": 10.320479393005371, "learning_rate": 7.5457715780296435e-06, "loss": 0.5839, "mean_token_accuracy": 0.8221219345927239, "num_tokens": 18696924.0, "step": 15580 }, { "entropy": 1.7216524064540863, "epoch": 0.04832763595247524, "grad_norm": 9.836614608764648, "learning_rate": 7.5506151312602925e-06, "loss": 0.4975, "mean_token_accuracy": 0.8300070941448212, "num_tokens": 18710414.0, "step": 15590 }, { "entropy": 1.889795495569706, "epoch": 0.048358635077524935, "grad_norm": 11.627612113952637, "learning_rate": 7.555458684490942e-06, "loss": 0.6584, "mean_token_accuracy": 0.8059937000274658, "num_tokens": 18721801.0, "step": 15600 }, { "entropy": 1.8819514483213424, "epoch": 0.04838963420257463, "grad_norm": 6.095297813415527, "learning_rate": 7.560302237721593e-06, "loss": 0.6108, "mean_token_accuracy": 0.807888326048851, "num_tokens": 18733560.0, "step": 15610 }, { "entropy": 1.8655756518244744, "epoch": 0.04842063332762433, "grad_norm": 10.622197151184082, "learning_rate": 7.565145790952243e-06, "loss": 0.6113, "mean_token_accuracy": 0.8073986008763313, "num_tokens": 18746189.0, "step": 15620 }, { "entropy": 1.8603959396481513, "epoch": 0.048451632452674025, "grad_norm": 11.854666709899902, "learning_rate": 7.569989344182893e-06, "loss": 0.5933, "mean_token_accuracy": 0.817948243021965, "num_tokens": 18757924.0, "step": 15630 }, { "entropy": 1.9059595987200737, "epoch": 0.04848263157772372, "grad_norm": 9.098125457763672, "learning_rate": 7.574832897413544e-06, "loss": 0.6782, "mean_token_accuracy": 0.8045271784067154, "num_tokens": 18769446.0, "step": 15640 }, { "entropy": 1.7989111676812173, "epoch": 0.04851363070277341, "grad_norm": 11.551828384399414, "learning_rate": 7.5796764506441935e-06, "loss": 0.5537, "mean_token_accuracy": 0.8248863905668259, "num_tokens": 18782700.0, "step": 15650 }, { "entropy": 1.8210396483540534, "epoch": 0.04854462982782311, "grad_norm": 4.9011759757995605, "learning_rate": 7.584520003874843e-06, "loss": 0.5557, "mean_token_accuracy": 0.8225024446845055, "num_tokens": 18795390.0, "step": 15660 }, { "entropy": 1.811398984491825, "epoch": 0.048575628952872804, "grad_norm": 11.048978805541992, "learning_rate": 7.589363557105493e-06, "loss": 0.5542, "mean_token_accuracy": 0.8189741969108582, "num_tokens": 18807396.0, "step": 15670 }, { "entropy": 1.892940777540207, "epoch": 0.0486066280779225, "grad_norm": 12.325209617614746, "learning_rate": 7.594207110336144e-06, "loss": 0.6986, "mean_token_accuracy": 0.7981882244348526, "num_tokens": 18819378.0, "step": 15680 }, { "entropy": 1.886229231953621, "epoch": 0.0486376272029722, "grad_norm": 10.576592445373535, "learning_rate": 7.599050663566794e-06, "loss": 0.6074, "mean_token_accuracy": 0.8123501881957054, "num_tokens": 18830551.0, "step": 15690 }, { "entropy": 1.8603569209575652, "epoch": 0.048668626328021894, "grad_norm": 11.795734405517578, "learning_rate": 7.603894216797443e-06, "loss": 0.5157, "mean_token_accuracy": 0.8348536252975464, "num_tokens": 18842716.0, "step": 15700 }, { "entropy": 1.9385449796915055, "epoch": 0.04869962545307159, "grad_norm": 10.914395332336426, "learning_rate": 7.608737770028093e-06, "loss": 0.6356, "mean_token_accuracy": 0.8061470225453377, "num_tokens": 18853817.0, "step": 15710 }, { "entropy": 1.8827759057283402, "epoch": 0.04873062457812128, "grad_norm": 14.19170093536377, "learning_rate": 7.613581323258743e-06, "loss": 0.6335, "mean_token_accuracy": 0.8078219577670097, "num_tokens": 18865592.0, "step": 15720 }, { "entropy": 1.8566590577363968, "epoch": 0.04876162370317098, "grad_norm": 13.042546272277832, "learning_rate": 7.618424876489393e-06, "loss": 0.6099, "mean_token_accuracy": 0.8074685573577881, "num_tokens": 18878717.0, "step": 15730 }, { "entropy": 1.8778809905052185, "epoch": 0.04879262282822067, "grad_norm": 9.526302337646484, "learning_rate": 7.623268429720043e-06, "loss": 0.5393, "mean_token_accuracy": 0.8302160531282425, "num_tokens": 18890522.0, "step": 15740 }, { "entropy": 1.843679629266262, "epoch": 0.04882362195327037, "grad_norm": 11.040970802307129, "learning_rate": 7.628111982950693e-06, "loss": 0.6254, "mean_token_accuracy": 0.812717217206955, "num_tokens": 18902667.0, "step": 15750 }, { "entropy": 1.9100946336984634, "epoch": 0.04885462107832007, "grad_norm": 7.509261608123779, "learning_rate": 7.632955536181344e-06, "loss": 0.6673, "mean_token_accuracy": 0.8177873358130455, "num_tokens": 18913868.0, "step": 15760 }, { "entropy": 1.8826175585389138, "epoch": 0.04888562020336976, "grad_norm": 11.155854225158691, "learning_rate": 7.637799089411994e-06, "loss": 0.6481, "mean_token_accuracy": 0.8087216004729271, "num_tokens": 18925988.0, "step": 15770 }, { "entropy": 1.7796615183353424, "epoch": 0.04891661932841945, "grad_norm": 5.291562080383301, "learning_rate": 7.642642642642644e-06, "loss": 0.5153, "mean_token_accuracy": 0.8381139814853669, "num_tokens": 18938466.0, "step": 15780 }, { "entropy": 1.9144254103302956, "epoch": 0.04894761845346915, "grad_norm": 10.03088092803955, "learning_rate": 7.647486195873294e-06, "loss": 0.6672, "mean_token_accuracy": 0.8007161229848861, "num_tokens": 18949499.0, "step": 15790 }, { "entropy": 1.8293558046221734, "epoch": 0.048978617578518846, "grad_norm": 10.299362182617188, "learning_rate": 7.652329749103943e-06, "loss": 0.6104, "mean_token_accuracy": 0.8068207755684853, "num_tokens": 18961354.0, "step": 15800 }, { "entropy": 1.843856942653656, "epoch": 0.04900961670356854, "grad_norm": 5.878364562988281, "learning_rate": 7.657173302334593e-06, "loss": 0.5521, "mean_token_accuracy": 0.8132404252886772, "num_tokens": 18973459.0, "step": 15810 }, { "entropy": 1.8029664367437364, "epoch": 0.04904061582861824, "grad_norm": 13.001435279846191, "learning_rate": 7.662016855565243e-06, "loss": 0.5321, "mean_token_accuracy": 0.8184084549546242, "num_tokens": 18986940.0, "step": 15820 }, { "entropy": 1.836187256872654, "epoch": 0.049071614953667936, "grad_norm": 14.026951789855957, "learning_rate": 7.666860408795893e-06, "loss": 0.5792, "mean_token_accuracy": 0.8184718936681747, "num_tokens": 19000053.0, "step": 15830 }, { "entropy": 2.0127352684736253, "epoch": 0.049102614078717625, "grad_norm": 10.804215431213379, "learning_rate": 7.671703962026543e-06, "loss": 0.6748, "mean_token_accuracy": 0.8046320468187332, "num_tokens": 19010864.0, "step": 15840 }, { "entropy": 1.8914284870028495, "epoch": 0.04913361320376732, "grad_norm": 12.271016120910645, "learning_rate": 7.676547515257193e-06, "loss": 0.6046, "mean_token_accuracy": 0.8181375965476037, "num_tokens": 19022131.0, "step": 15850 }, { "entropy": 1.8896478191018105, "epoch": 0.04916461232881702, "grad_norm": 10.453010559082031, "learning_rate": 7.681391068487843e-06, "loss": 0.6347, "mean_token_accuracy": 0.8188618034124374, "num_tokens": 19033759.0, "step": 15860 }, { "entropy": 1.9267440363764763, "epoch": 0.049195611453866715, "grad_norm": 12.614462852478027, "learning_rate": 7.686234621718494e-06, "loss": 0.6087, "mean_token_accuracy": 0.819622540473938, "num_tokens": 19044630.0, "step": 15870 }, { "entropy": 1.8585731774568557, "epoch": 0.04922661057891641, "grad_norm": 11.191518783569336, "learning_rate": 7.691078174949144e-06, "loss": 0.6359, "mean_token_accuracy": 0.8025752380490303, "num_tokens": 19056706.0, "step": 15880 }, { "entropy": 1.8346818998456, "epoch": 0.04925760970396611, "grad_norm": 10.337366104125977, "learning_rate": 7.695921728179794e-06, "loss": 0.5781, "mean_token_accuracy": 0.8142318353056908, "num_tokens": 19069370.0, "step": 15890 }, { "entropy": 1.8798568680882455, "epoch": 0.0492886088290158, "grad_norm": 11.572778701782227, "learning_rate": 7.700765281410444e-06, "loss": 0.6304, "mean_token_accuracy": 0.8030693680047989, "num_tokens": 19081586.0, "step": 15900 }, { "entropy": 1.8789875194430352, "epoch": 0.049319607954065495, "grad_norm": 6.301096439361572, "learning_rate": 7.705608834641092e-06, "loss": 0.618, "mean_token_accuracy": 0.8181715860962868, "num_tokens": 19093093.0, "step": 15910 }, { "entropy": 1.7927953436970712, "epoch": 0.04935060707911519, "grad_norm": 6.900021076202393, "learning_rate": 7.710452387871744e-06, "loss": 0.5129, "mean_token_accuracy": 0.8337122991681098, "num_tokens": 19105589.0, "step": 15920 }, { "entropy": 1.8942086696624756, "epoch": 0.04938160620416489, "grad_norm": 9.592936515808105, "learning_rate": 7.715295941102394e-06, "loss": 0.6314, "mean_token_accuracy": 0.8135825991630554, "num_tokens": 19117515.0, "step": 15930 }, { "entropy": 1.8845902875065803, "epoch": 0.049412605329214584, "grad_norm": 5.827622890472412, "learning_rate": 7.720139494333044e-06, "loss": 0.5792, "mean_token_accuracy": 0.8127764299511909, "num_tokens": 19129539.0, "step": 15940 }, { "entropy": 1.8030100539326668, "epoch": 0.04944360445426428, "grad_norm": 10.93111515045166, "learning_rate": 7.724983047563693e-06, "loss": 0.5162, "mean_token_accuracy": 0.8306656986474991, "num_tokens": 19142450.0, "step": 15950 }, { "entropy": 1.8265103816986084, "epoch": 0.04947460357931397, "grad_norm": 4.990983486175537, "learning_rate": 7.729826600794343e-06, "loss": 0.5402, "mean_token_accuracy": 0.8204995647072792, "num_tokens": 19154745.0, "step": 15960 }, { "entropy": 1.8918519958853721, "epoch": 0.04950560270436367, "grad_norm": 9.628335952758789, "learning_rate": 7.734670154024993e-06, "loss": 0.6261, "mean_token_accuracy": 0.8113234505057335, "num_tokens": 19166521.0, "step": 15970 }, { "entropy": 1.886893954873085, "epoch": 0.049536601829413364, "grad_norm": 11.60335922241211, "learning_rate": 7.739513707255643e-06, "loss": 0.612, "mean_token_accuracy": 0.8159346550703048, "num_tokens": 19177833.0, "step": 15980 }, { "entropy": 1.831916256248951, "epoch": 0.04956760095446306, "grad_norm": 9.314860343933105, "learning_rate": 7.744357260486293e-06, "loss": 0.5776, "mean_token_accuracy": 0.816097392141819, "num_tokens": 19189742.0, "step": 15990 }, { "entropy": 1.9015469133853913, "epoch": 0.04959860007951276, "grad_norm": 11.75980281829834, "learning_rate": 7.749200813716945e-06, "loss": 0.6715, "mean_token_accuracy": 0.8024095699191094, "num_tokens": 19200772.0, "step": 16000 }, { "entropy": 1.8420792520046234, "epoch": 0.049629599204562454, "grad_norm": 10.332155227661133, "learning_rate": 7.754044366947593e-06, "loss": 0.5571, "mean_token_accuracy": 0.8230875134468079, "num_tokens": 19212754.0, "step": 16010 }, { "entropy": 1.9134253069758416, "epoch": 0.04966059832961214, "grad_norm": 12.374646186828613, "learning_rate": 7.758887920178243e-06, "loss": 0.6448, "mean_token_accuracy": 0.8036070108413697, "num_tokens": 19224725.0, "step": 16020 }, { "entropy": 1.8665786176919936, "epoch": 0.04969159745466184, "grad_norm": 4.8785624504089355, "learning_rate": 7.763731473408892e-06, "loss": 0.6109, "mean_token_accuracy": 0.8155334115028381, "num_tokens": 19236500.0, "step": 16030 }, { "entropy": 1.827447460591793, "epoch": 0.04972259657971154, "grad_norm": 10.244511604309082, "learning_rate": 7.768575026639544e-06, "loss": 0.5549, "mean_token_accuracy": 0.8148609265685082, "num_tokens": 19249091.0, "step": 16040 }, { "entropy": 1.875395241379738, "epoch": 0.04975359570476123, "grad_norm": 11.924630165100098, "learning_rate": 7.773418579870194e-06, "loss": 0.5739, "mean_token_accuracy": 0.8249801099300385, "num_tokens": 19260725.0, "step": 16050 }, { "entropy": 1.8245926171541214, "epoch": 0.04978459482981093, "grad_norm": 5.5314226150512695, "learning_rate": 7.778262133100844e-06, "loss": 0.5986, "mean_token_accuracy": 0.8085296213626861, "num_tokens": 19272815.0, "step": 16060 }, { "entropy": 1.879467526078224, "epoch": 0.049815593954860626, "grad_norm": 10.704133033752441, "learning_rate": 7.783105686331494e-06, "loss": 0.6219, "mean_token_accuracy": 0.8161226466298104, "num_tokens": 19284571.0, "step": 16070 }, { "entropy": 1.8190572828054428, "epoch": 0.04984659307991032, "grad_norm": 12.365442276000977, "learning_rate": 7.787949239562144e-06, "loss": 0.5841, "mean_token_accuracy": 0.8116471886634826, "num_tokens": 19297247.0, "step": 16080 }, { "entropy": 1.8453327685594558, "epoch": 0.04987759220496001, "grad_norm": 11.743905067443848, "learning_rate": 7.792792792792793e-06, "loss": 0.5898, "mean_token_accuracy": 0.8130872502923012, "num_tokens": 19309118.0, "step": 16090 }, { "entropy": 1.8390842065215112, "epoch": 0.04990859133000971, "grad_norm": 11.728593826293945, "learning_rate": 7.797636346023443e-06, "loss": 0.5926, "mean_token_accuracy": 0.8115570530295372, "num_tokens": 19321049.0, "step": 16100 }, { "entropy": 1.8872535049915313, "epoch": 0.049939590455059406, "grad_norm": 9.616110801696777, "learning_rate": 7.802479899254093e-06, "loss": 0.6174, "mean_token_accuracy": 0.8136581242084503, "num_tokens": 19332620.0, "step": 16110 }, { "entropy": 1.9285971507430077, "epoch": 0.0499705895801091, "grad_norm": 10.771924018859863, "learning_rate": 7.807323452484743e-06, "loss": 0.6794, "mean_token_accuracy": 0.7982939928770065, "num_tokens": 19344422.0, "step": 16120 }, { "entropy": 1.8309599250555038, "epoch": 0.0500015887051588, "grad_norm": 11.928840637207031, "learning_rate": 7.812167005715393e-06, "loss": 0.5435, "mean_token_accuracy": 0.8183940485119819, "num_tokens": 19356711.0, "step": 16130 }, { "entropy": 1.7304350897669791, "epoch": 0.050032587830208496, "grad_norm": 11.800362586975098, "learning_rate": 7.817010558946043e-06, "loss": 0.5798, "mean_token_accuracy": 0.8197820097208023, "num_tokens": 19370369.0, "step": 16140 }, { "entropy": 1.8792952716350555, "epoch": 0.050063586955258185, "grad_norm": 12.090376853942871, "learning_rate": 7.821854112176693e-06, "loss": 0.6114, "mean_token_accuracy": 0.8166804388165474, "num_tokens": 19381499.0, "step": 16150 }, { "entropy": 1.8935234278440476, "epoch": 0.05009458608030788, "grad_norm": 11.883420944213867, "learning_rate": 7.826697665407343e-06, "loss": 0.6121, "mean_token_accuracy": 0.8147160053253174, "num_tokens": 19392659.0, "step": 16160 }, { "entropy": 1.846675968170166, "epoch": 0.05012558520535758, "grad_norm": 11.219228744506836, "learning_rate": 7.831541218637994e-06, "loss": 0.6005, "mean_token_accuracy": 0.8102226793766022, "num_tokens": 19404388.0, "step": 16170 }, { "entropy": 1.7764321342110634, "epoch": 0.050156584330407275, "grad_norm": 10.584576606750488, "learning_rate": 7.836384771868644e-06, "loss": 0.5192, "mean_token_accuracy": 0.823691800236702, "num_tokens": 19417440.0, "step": 16180 }, { "entropy": 1.82427935898304, "epoch": 0.05018758345545697, "grad_norm": 5.237261772155762, "learning_rate": 7.841228325099294e-06, "loss": 0.5448, "mean_token_accuracy": 0.8294490948319435, "num_tokens": 19429964.0, "step": 16190 }, { "entropy": 1.830063909292221, "epoch": 0.05021858258050667, "grad_norm": 10.078315734863281, "learning_rate": 7.846071878329944e-06, "loss": 0.5906, "mean_token_accuracy": 0.816213846206665, "num_tokens": 19441944.0, "step": 16200 }, { "entropy": 1.8124560460448265, "epoch": 0.05024958170555636, "grad_norm": 7.47139835357666, "learning_rate": 7.850915431560594e-06, "loss": 0.6127, "mean_token_accuracy": 0.8125569224357605, "num_tokens": 19454488.0, "step": 16210 }, { "entropy": 1.8454759851098061, "epoch": 0.050280580830606054, "grad_norm": 11.923384666442871, "learning_rate": 7.855758984791244e-06, "loss": 0.5783, "mean_token_accuracy": 0.8168366953730584, "num_tokens": 19466582.0, "step": 16220 }, { "entropy": 1.8621361300349235, "epoch": 0.05031157995565575, "grad_norm": 11.48657512664795, "learning_rate": 7.860602538021894e-06, "loss": 0.6007, "mean_token_accuracy": 0.8186299324035644, "num_tokens": 19478249.0, "step": 16230 }, { "entropy": 1.8037197709083557, "epoch": 0.05034257908070545, "grad_norm": 5.228190898895264, "learning_rate": 7.865446091252543e-06, "loss": 0.5935, "mean_token_accuracy": 0.8192895010113717, "num_tokens": 19489977.0, "step": 16240 }, { "entropy": 1.9085147067904473, "epoch": 0.050373578205755144, "grad_norm": 12.668362617492676, "learning_rate": 7.870289644483193e-06, "loss": 0.6271, "mean_token_accuracy": 0.8066832914948463, "num_tokens": 19501682.0, "step": 16250 }, { "entropy": 1.8951284110546112, "epoch": 0.05040457733080484, "grad_norm": 5.59218168258667, "learning_rate": 7.875133197713843e-06, "loss": 0.5867, "mean_token_accuracy": 0.8141255229711533, "num_tokens": 19513281.0, "step": 16260 }, { "entropy": 1.8662694096565247, "epoch": 0.05043557645585453, "grad_norm": 11.334489822387695, "learning_rate": 7.879976750944493e-06, "loss": 0.5662, "mean_token_accuracy": 0.8265201568603515, "num_tokens": 19525550.0, "step": 16270 }, { "entropy": 1.838777382671833, "epoch": 0.05046657558090423, "grad_norm": 12.854705810546875, "learning_rate": 7.884820304175143e-06, "loss": 0.5784, "mean_token_accuracy": 0.8137482151389122, "num_tokens": 19537201.0, "step": 16280 }, { "entropy": 1.8109822481870652, "epoch": 0.050497574705953924, "grad_norm": 11.828731536865234, "learning_rate": 7.889663857405795e-06, "loss": 0.6238, "mean_token_accuracy": 0.8079608172178269, "num_tokens": 19550217.0, "step": 16290 }, { "entropy": 1.8977823466062547, "epoch": 0.05052857383100362, "grad_norm": 6.278931140899658, "learning_rate": 7.894507410636444e-06, "loss": 0.63, "mean_token_accuracy": 0.8086236611008644, "num_tokens": 19562455.0, "step": 16300 }, { "entropy": 1.885308313369751, "epoch": 0.05055957295605332, "grad_norm": 12.805869102478027, "learning_rate": 7.899350963867094e-06, "loss": 0.6222, "mean_token_accuracy": 0.8057803988456727, "num_tokens": 19573567.0, "step": 16310 }, { "entropy": 1.8054832592606544, "epoch": 0.050590572081103014, "grad_norm": 10.450759887695312, "learning_rate": 7.904194517097744e-06, "loss": 0.5596, "mean_token_accuracy": 0.8225147247314453, "num_tokens": 19585912.0, "step": 16320 }, { "entropy": 1.836893168091774, "epoch": 0.0506215712061527, "grad_norm": 8.395682334899902, "learning_rate": 7.909038070328392e-06, "loss": 0.604, "mean_token_accuracy": 0.8111759915947914, "num_tokens": 19597962.0, "step": 16330 }, { "entropy": 1.8264792799949645, "epoch": 0.0506525703312024, "grad_norm": 9.940167427062988, "learning_rate": 7.913881623559044e-06, "loss": 0.5945, "mean_token_accuracy": 0.8128548488020897, "num_tokens": 19610522.0, "step": 16340 }, { "entropy": 1.7342656940221786, "epoch": 0.050683569456252096, "grad_norm": 5.958490371704102, "learning_rate": 7.918725176789694e-06, "loss": 0.4647, "mean_token_accuracy": 0.8350143611431122, "num_tokens": 19623800.0, "step": 16350 }, { "entropy": 1.8424322217702866, "epoch": 0.05071456858130179, "grad_norm": 10.20142936706543, "learning_rate": 7.923568730020344e-06, "loss": 0.6181, "mean_token_accuracy": 0.8129007190465927, "num_tokens": 19635543.0, "step": 16360 }, { "entropy": 1.8671317547559738, "epoch": 0.05074556770635149, "grad_norm": 11.634441375732422, "learning_rate": 7.928412283250994e-06, "loss": 0.5988, "mean_token_accuracy": 0.8180768474936485, "num_tokens": 19646796.0, "step": 16370 }, { "entropy": 1.8461050242185593, "epoch": 0.050776566831401186, "grad_norm": 5.321835041046143, "learning_rate": 7.933255836481643e-06, "loss": 0.6055, "mean_token_accuracy": 0.8090061485767365, "num_tokens": 19658855.0, "step": 16380 }, { "entropy": 1.8895393520593644, "epoch": 0.050807565956450876, "grad_norm": 12.092207908630371, "learning_rate": 7.938099389712293e-06, "loss": 0.6138, "mean_token_accuracy": 0.8007647573947907, "num_tokens": 19670438.0, "step": 16390 }, { "entropy": 1.93761787712574, "epoch": 0.05083856508150057, "grad_norm": 13.244606971740723, "learning_rate": 7.942942942942943e-06, "loss": 0.6953, "mean_token_accuracy": 0.7985577821731568, "num_tokens": 19681443.0, "step": 16400 }, { "entropy": 1.8733505114912987, "epoch": 0.05086956420655027, "grad_norm": 10.528936386108398, "learning_rate": 7.947786496173593e-06, "loss": 0.5767, "mean_token_accuracy": 0.8295532435178756, "num_tokens": 19693237.0, "step": 16410 }, { "entropy": 1.8836349695920944, "epoch": 0.050900563331599966, "grad_norm": 9.953451156616211, "learning_rate": 7.952630049404245e-06, "loss": 0.5955, "mean_token_accuracy": 0.8276330158114433, "num_tokens": 19704482.0, "step": 16420 }, { "entropy": 1.8280857503414154, "epoch": 0.05093156245664966, "grad_norm": 11.806560516357422, "learning_rate": 7.957473602634893e-06, "loss": 0.5736, "mean_token_accuracy": 0.8197454378008843, "num_tokens": 19716072.0, "step": 16430 }, { "entropy": 1.868168619275093, "epoch": 0.05096256158169936, "grad_norm": 14.613922119140625, "learning_rate": 7.962317155865543e-06, "loss": 0.6294, "mean_token_accuracy": 0.8104723706841469, "num_tokens": 19727908.0, "step": 16440 }, { "entropy": 1.8337607711553574, "epoch": 0.050993560706749055, "grad_norm": 11.018813133239746, "learning_rate": 7.967160709096193e-06, "loss": 0.5722, "mean_token_accuracy": 0.815605454146862, "num_tokens": 19740124.0, "step": 16450 }, { "entropy": 1.8944316014647484, "epoch": 0.051024559831798745, "grad_norm": 12.068158149719238, "learning_rate": 7.972004262326844e-06, "loss": 0.611, "mean_token_accuracy": 0.8088446974754333, "num_tokens": 19752042.0, "step": 16460 }, { "entropy": 1.8141653031110763, "epoch": 0.05105555895684844, "grad_norm": 11.938932418823242, "learning_rate": 7.976847815557494e-06, "loss": 0.5529, "mean_token_accuracy": 0.8189818143844605, "num_tokens": 19764559.0, "step": 16470 }, { "entropy": 1.8091833949089051, "epoch": 0.05108655808189814, "grad_norm": 6.31152868270874, "learning_rate": 7.981691368788144e-06, "loss": 0.6014, "mean_token_accuracy": 0.8165456935763359, "num_tokens": 19776260.0, "step": 16480 }, { "entropy": 1.812686663866043, "epoch": 0.051117557206947835, "grad_norm": 5.221765995025635, "learning_rate": 7.986534922018794e-06, "loss": 0.5688, "mean_token_accuracy": 0.8260109111666679, "num_tokens": 19788934.0, "step": 16490 }, { "entropy": 1.8625101804733277, "epoch": 0.05114855633199753, "grad_norm": 11.6726713180542, "learning_rate": 7.991378475249444e-06, "loss": 0.5964, "mean_token_accuracy": 0.8176575809717178, "num_tokens": 19800717.0, "step": 16500 }, { "entropy": 1.8866008058190347, "epoch": 0.05117955545704723, "grad_norm": 13.77607250213623, "learning_rate": 7.996222028480094e-06, "loss": 0.6522, "mean_token_accuracy": 0.7992734283208847, "num_tokens": 19812273.0, "step": 16510 }, { "entropy": 1.9345821738243103, "epoch": 0.05121055458209692, "grad_norm": 10.338969230651855, "learning_rate": 8.001065581710744e-06, "loss": 0.684, "mean_token_accuracy": 0.7988832354545593, "num_tokens": 19822905.0, "step": 16520 }, { "entropy": 1.9000392645597457, "epoch": 0.051241553707146614, "grad_norm": 10.453362464904785, "learning_rate": 8.005909134941393e-06, "loss": 0.6257, "mean_token_accuracy": 0.8212269991636276, "num_tokens": 19833662.0, "step": 16530 }, { "entropy": 1.8661847695708276, "epoch": 0.05127255283219631, "grad_norm": 10.485684394836426, "learning_rate": 8.010752688172043e-06, "loss": 0.643, "mean_token_accuracy": 0.8103910699486733, "num_tokens": 19845416.0, "step": 16540 }, { "entropy": 1.8219714432954788, "epoch": 0.05130355195724601, "grad_norm": 12.01816463470459, "learning_rate": 8.015596241402693e-06, "loss": 0.6021, "mean_token_accuracy": 0.8123174890875816, "num_tokens": 19857098.0, "step": 16550 }, { "entropy": 1.8485499978065492, "epoch": 0.051334551082295704, "grad_norm": 9.451250076293945, "learning_rate": 8.020439794633343e-06, "loss": 0.6128, "mean_token_accuracy": 0.8077960431575775, "num_tokens": 19869230.0, "step": 16560 }, { "entropy": 1.8622397229075431, "epoch": 0.0513655502073454, "grad_norm": 10.47313404083252, "learning_rate": 8.025283347863993e-06, "loss": 0.6179, "mean_token_accuracy": 0.8121888309717178, "num_tokens": 19881415.0, "step": 16570 }, { "entropy": 1.763058114051819, "epoch": 0.05139654933239509, "grad_norm": 5.442761421203613, "learning_rate": 8.030126901094643e-06, "loss": 0.5156, "mean_token_accuracy": 0.8221294581890106, "num_tokens": 19894415.0, "step": 16580 }, { "entropy": 1.7793406203389168, "epoch": 0.05142754845744479, "grad_norm": 12.98482894897461, "learning_rate": 8.034970454325294e-06, "loss": 0.5569, "mean_token_accuracy": 0.8247652933001518, "num_tokens": 19907687.0, "step": 16590 }, { "entropy": 1.874410592019558, "epoch": 0.051458547582494484, "grad_norm": 4.950082778930664, "learning_rate": 8.039814007555944e-06, "loss": 0.6167, "mean_token_accuracy": 0.8031128868460655, "num_tokens": 19919926.0, "step": 16600 }, { "entropy": 1.8641373217105865, "epoch": 0.05148954670754418, "grad_norm": 11.791984558105469, "learning_rate": 8.044657560786594e-06, "loss": 0.6205, "mean_token_accuracy": 0.8170140698552132, "num_tokens": 19932297.0, "step": 16610 }, { "entropy": 1.8777590736746788, "epoch": 0.05152054583259388, "grad_norm": 14.65937614440918, "learning_rate": 8.049501114017244e-06, "loss": 0.6264, "mean_token_accuracy": 0.8093700706958771, "num_tokens": 19944296.0, "step": 16620 }, { "entropy": 1.83215441852808, "epoch": 0.05155154495764357, "grad_norm": 18.492868423461914, "learning_rate": 8.054344667247894e-06, "loss": 0.5367, "mean_token_accuracy": 0.83001299649477, "num_tokens": 19956294.0, "step": 16630 }, { "entropy": 1.8398912638425826, "epoch": 0.05158254408269326, "grad_norm": 12.522979736328125, "learning_rate": 8.059188220478544e-06, "loss": 0.615, "mean_token_accuracy": 0.8146472930908203, "num_tokens": 19968456.0, "step": 16640 }, { "entropy": 1.8897781684994697, "epoch": 0.05161354320774296, "grad_norm": 11.673059463500977, "learning_rate": 8.064031773709194e-06, "loss": 0.6458, "mean_token_accuracy": 0.8108509004116058, "num_tokens": 19980146.0, "step": 16650 }, { "entropy": 1.9111702471971512, "epoch": 0.051644542332792656, "grad_norm": 11.109082221984863, "learning_rate": 8.068875326939844e-06, "loss": 0.633, "mean_token_accuracy": 0.8093647316098214, "num_tokens": 19992383.0, "step": 16660 }, { "entropy": 1.9211183845996858, "epoch": 0.05167554145784235, "grad_norm": 11.176039695739746, "learning_rate": 8.073718880170494e-06, "loss": 0.6448, "mean_token_accuracy": 0.8067373007535934, "num_tokens": 20002938.0, "step": 16670 }, { "entropy": 1.8170073732733727, "epoch": 0.05170654058289205, "grad_norm": 11.94314193725586, "learning_rate": 8.078562433401143e-06, "loss": 0.5507, "mean_token_accuracy": 0.8301214516162873, "num_tokens": 20014929.0, "step": 16680 }, { "entropy": 1.8377844780683517, "epoch": 0.051737539707941746, "grad_norm": 9.638590812683105, "learning_rate": 8.083405986631793e-06, "loss": 0.5725, "mean_token_accuracy": 0.8208416193723679, "num_tokens": 20026901.0, "step": 16690 }, { "entropy": 1.866816473007202, "epoch": 0.051768538832991436, "grad_norm": 10.777813911437988, "learning_rate": 8.088249539862443e-06, "loss": 0.639, "mean_token_accuracy": 0.8086013346910477, "num_tokens": 20038260.0, "step": 16700 }, { "entropy": 1.8705125212669373, "epoch": 0.05179953795804113, "grad_norm": 12.59802532196045, "learning_rate": 8.093093093093095e-06, "loss": 0.5945, "mean_token_accuracy": 0.8204304948449135, "num_tokens": 20049362.0, "step": 16710 }, { "entropy": 1.81406359821558, "epoch": 0.05183053708309083, "grad_norm": 14.44949722290039, "learning_rate": 8.097936646323745e-06, "loss": 0.6025, "mean_token_accuracy": 0.8081631407141685, "num_tokens": 20062608.0, "step": 16720 }, { "entropy": 1.9008635878562927, "epoch": 0.051861536208140525, "grad_norm": 9.96707534790039, "learning_rate": 8.102780199554395e-06, "loss": 0.6399, "mean_token_accuracy": 0.8111796498298645, "num_tokens": 20074235.0, "step": 16730 }, { "entropy": 1.8784829795360565, "epoch": 0.05189253533319022, "grad_norm": 11.282867431640625, "learning_rate": 8.107623752785044e-06, "loss": 0.5778, "mean_token_accuracy": 0.8110943511128426, "num_tokens": 20086392.0, "step": 16740 }, { "entropy": 1.915185084939003, "epoch": 0.05192353445823992, "grad_norm": 10.151687622070312, "learning_rate": 8.112467306015693e-06, "loss": 0.6498, "mean_token_accuracy": 0.8103260189294815, "num_tokens": 20097631.0, "step": 16750 }, { "entropy": 1.791477920114994, "epoch": 0.05195453358328961, "grad_norm": 5.157210350036621, "learning_rate": 8.117310859246344e-06, "loss": 0.5729, "mean_token_accuracy": 0.8181603282690049, "num_tokens": 20111249.0, "step": 16760 }, { "entropy": 1.8779030591249466, "epoch": 0.051985532708339305, "grad_norm": 10.711991310119629, "learning_rate": 8.122154412476994e-06, "loss": 0.6099, "mean_token_accuracy": 0.8104298338294029, "num_tokens": 20122999.0, "step": 16770 }, { "entropy": 1.8585640251636506, "epoch": 0.052016531833389, "grad_norm": 5.962745666503906, "learning_rate": 8.126997965707644e-06, "loss": 0.5765, "mean_token_accuracy": 0.8122934922575951, "num_tokens": 20135004.0, "step": 16780 }, { "entropy": 1.8683840811252594, "epoch": 0.0520475309584387, "grad_norm": 10.104706764221191, "learning_rate": 8.131841518938294e-06, "loss": 0.5892, "mean_token_accuracy": 0.8203515037894249, "num_tokens": 20146845.0, "step": 16790 }, { "entropy": 1.8777433142066002, "epoch": 0.052078530083488395, "grad_norm": 11.71458625793457, "learning_rate": 8.136685072168944e-06, "loss": 0.6173, "mean_token_accuracy": 0.816298334300518, "num_tokens": 20157635.0, "step": 16800 }, { "entropy": 1.8254910349845885, "epoch": 0.05210952920853809, "grad_norm": 12.500772476196289, "learning_rate": 8.141528625399594e-06, "loss": 0.5744, "mean_token_accuracy": 0.8122817128896713, "num_tokens": 20169780.0, "step": 16810 }, { "entropy": 1.7841567173600197, "epoch": 0.05214052833358779, "grad_norm": 5.604972839355469, "learning_rate": 8.146372178630243e-06, "loss": 0.5449, "mean_token_accuracy": 0.8187017843127251, "num_tokens": 20182608.0, "step": 16820 }, { "entropy": 1.8085433050990105, "epoch": 0.05217152745863748, "grad_norm": 12.112675666809082, "learning_rate": 8.151215731860895e-06, "loss": 0.5584, "mean_token_accuracy": 0.819066570699215, "num_tokens": 20194982.0, "step": 16830 }, { "entropy": 1.8413061544299125, "epoch": 0.052202526583687174, "grad_norm": 12.422449111938477, "learning_rate": 8.156059285091545e-06, "loss": 0.5611, "mean_token_accuracy": 0.8272005066275596, "num_tokens": 20206354.0, "step": 16840 }, { "entropy": 1.8800123199820518, "epoch": 0.05223352570873687, "grad_norm": 5.7091779708862305, "learning_rate": 8.160902838322193e-06, "loss": 0.6096, "mean_token_accuracy": 0.8087597385048866, "num_tokens": 20218011.0, "step": 16850 }, { "entropy": 1.7969383016228675, "epoch": 0.05226452483378657, "grad_norm": 10.848820686340332, "learning_rate": 8.165746391552843e-06, "loss": 0.5235, "mean_token_accuracy": 0.8236014172434807, "num_tokens": 20230550.0, "step": 16860 }, { "entropy": 1.8589994862675667, "epoch": 0.052295523958836264, "grad_norm": 10.558844566345215, "learning_rate": 8.170589944783493e-06, "loss": 0.56, "mean_token_accuracy": 0.8191485345363617, "num_tokens": 20242485.0, "step": 16870 }, { "entropy": 1.8285365521907806, "epoch": 0.05232652308388596, "grad_norm": 10.265873908996582, "learning_rate": 8.175433498014144e-06, "loss": 0.5369, "mean_token_accuracy": 0.827988238632679, "num_tokens": 20254416.0, "step": 16880 }, { "entropy": 1.7561014324426651, "epoch": 0.05235752220893565, "grad_norm": 10.504303932189941, "learning_rate": 8.180277051244794e-06, "loss": 0.5194, "mean_token_accuracy": 0.8308162286877632, "num_tokens": 20267582.0, "step": 16890 }, { "entropy": 1.8213954389095306, "epoch": 0.05238852133398535, "grad_norm": 11.907408714294434, "learning_rate": 8.185120604475444e-06, "loss": 0.5975, "mean_token_accuracy": 0.8134548336267471, "num_tokens": 20279358.0, "step": 16900 }, { "entropy": 1.855673785507679, "epoch": 0.05241952045903504, "grad_norm": 12.381454467773438, "learning_rate": 8.189964157706094e-06, "loss": 0.6341, "mean_token_accuracy": 0.8102763295173645, "num_tokens": 20291624.0, "step": 16910 }, { "entropy": 1.8303613483905792, "epoch": 0.05245051958408474, "grad_norm": 11.09730052947998, "learning_rate": 8.194807710936744e-06, "loss": 0.5403, "mean_token_accuracy": 0.8215647727251053, "num_tokens": 20304029.0, "step": 16920 }, { "entropy": 1.7606044977903366, "epoch": 0.052481518709134437, "grad_norm": 14.970794677734375, "learning_rate": 8.199651264167394e-06, "loss": 0.5461, "mean_token_accuracy": 0.8292558521032334, "num_tokens": 20317399.0, "step": 16930 }, { "entropy": 1.9114146530628204, "epoch": 0.05251251783418413, "grad_norm": 11.057762145996094, "learning_rate": 8.204494817398044e-06, "loss": 0.6605, "mean_token_accuracy": 0.8093030959367752, "num_tokens": 20329008.0, "step": 16940 }, { "entropy": 1.9093146950006485, "epoch": 0.05254351695923382, "grad_norm": 10.629437446594238, "learning_rate": 8.209338370628694e-06, "loss": 0.6731, "mean_token_accuracy": 0.8056405037641525, "num_tokens": 20340231.0, "step": 16950 }, { "entropy": 1.8436574935913086, "epoch": 0.05257451608428352, "grad_norm": 11.923999786376953, "learning_rate": 8.214181923859344e-06, "loss": 0.5748, "mean_token_accuracy": 0.8199576199054718, "num_tokens": 20352072.0, "step": 16960 }, { "entropy": 1.8853593990206718, "epoch": 0.052605515209333216, "grad_norm": 10.41568660736084, "learning_rate": 8.219025477089993e-06, "loss": 0.6458, "mean_token_accuracy": 0.8017956003546715, "num_tokens": 20363499.0, "step": 16970 }, { "entropy": 1.918022060394287, "epoch": 0.05263651433438291, "grad_norm": 8.378069877624512, "learning_rate": 8.223869030320643e-06, "loss": 0.6415, "mean_token_accuracy": 0.8088062942028046, "num_tokens": 20375464.0, "step": 16980 }, { "entropy": 1.8862889647483825, "epoch": 0.05266751345943261, "grad_norm": 10.294898986816406, "learning_rate": 8.228712583551293e-06, "loss": 0.636, "mean_token_accuracy": 0.8037711903452873, "num_tokens": 20387540.0, "step": 16990 }, { "entropy": 1.9208069905638694, "epoch": 0.052698512584482306, "grad_norm": 10.81685733795166, "learning_rate": 8.233556136781943e-06, "loss": 0.7022, "mean_token_accuracy": 0.8014640405774116, "num_tokens": 20399798.0, "step": 17000 }, { "entropy": 1.8465980917215348, "epoch": 0.052729511709531995, "grad_norm": 12.846884727478027, "learning_rate": 8.238399690012595e-06, "loss": 0.592, "mean_token_accuracy": 0.820027408003807, "num_tokens": 20411783.0, "step": 17010 }, { "entropy": 1.8712544098496438, "epoch": 0.05276051083458169, "grad_norm": 12.44199275970459, "learning_rate": 8.243243243243245e-06, "loss": 0.6153, "mean_token_accuracy": 0.8144369840621948, "num_tokens": 20423720.0, "step": 17020 }, { "entropy": 1.861243399977684, "epoch": 0.05279150995963139, "grad_norm": 10.822843551635742, "learning_rate": 8.248086796473894e-06, "loss": 0.6249, "mean_token_accuracy": 0.8173502340912819, "num_tokens": 20435655.0, "step": 17030 }, { "entropy": 1.831656913459301, "epoch": 0.052822509084681085, "grad_norm": 10.160820960998535, "learning_rate": 8.252930349704544e-06, "loss": 0.5545, "mean_token_accuracy": 0.8063213124871254, "num_tokens": 20449025.0, "step": 17040 }, { "entropy": 1.78798815459013, "epoch": 0.05285350820973078, "grad_norm": 5.9856414794921875, "learning_rate": 8.257773902935194e-06, "loss": 0.5513, "mean_token_accuracy": 0.8148110911250115, "num_tokens": 20461447.0, "step": 17050 }, { "entropy": 1.86494023501873, "epoch": 0.05288450733478048, "grad_norm": 9.526678085327148, "learning_rate": 8.262617456165844e-06, "loss": 0.6113, "mean_token_accuracy": 0.815309202671051, "num_tokens": 20472983.0, "step": 17060 }, { "entropy": 1.8194462105631828, "epoch": 0.05291550645983017, "grad_norm": 5.433764934539795, "learning_rate": 8.267461009396494e-06, "loss": 0.5941, "mean_token_accuracy": 0.8140083074569702, "num_tokens": 20485617.0, "step": 17070 }, { "entropy": 1.876285555958748, "epoch": 0.052946505584879865, "grad_norm": 5.180232524871826, "learning_rate": 8.272304562627144e-06, "loss": 0.5879, "mean_token_accuracy": 0.8107918843626976, "num_tokens": 20496764.0, "step": 17080 }, { "entropy": 1.7992719858884811, "epoch": 0.05297750470992956, "grad_norm": 10.29720687866211, "learning_rate": 8.277148115857794e-06, "loss": 0.5607, "mean_token_accuracy": 0.8232490435242653, "num_tokens": 20509654.0, "step": 17090 }, { "entropy": 1.8305759117007256, "epoch": 0.05300850383497926, "grad_norm": 11.629636764526367, "learning_rate": 8.281991669088444e-06, "loss": 0.5491, "mean_token_accuracy": 0.8326339006423951, "num_tokens": 20521330.0, "step": 17100 }, { "entropy": 1.8921547949314117, "epoch": 0.053039502960028954, "grad_norm": 10.69015884399414, "learning_rate": 8.286835222319093e-06, "loss": 0.5905, "mean_token_accuracy": 0.8138958439230919, "num_tokens": 20532830.0, "step": 17110 }, { "entropy": 1.8374129354953765, "epoch": 0.05307050208507865, "grad_norm": 14.065573692321777, "learning_rate": 8.291678775549743e-06, "loss": 0.5772, "mean_token_accuracy": 0.8190833449363708, "num_tokens": 20545189.0, "step": 17120 }, { "entropy": 1.8749100968241692, "epoch": 0.05310150121012834, "grad_norm": 11.14418888092041, "learning_rate": 8.296522328780395e-06, "loss": 0.5848, "mean_token_accuracy": 0.8282869830727577, "num_tokens": 20557028.0, "step": 17130 }, { "entropy": 1.9455086424946786, "epoch": 0.05313250033517804, "grad_norm": 6.915708065032959, "learning_rate": 8.301365882011045e-06, "loss": 0.6958, "mean_token_accuracy": 0.7915912076830864, "num_tokens": 20567980.0, "step": 17140 }, { "entropy": 1.962218326330185, "epoch": 0.053163499460227734, "grad_norm": 12.134001731872559, "learning_rate": 8.306209435241695e-06, "loss": 0.6929, "mean_token_accuracy": 0.8000232353806496, "num_tokens": 20578839.0, "step": 17150 }, { "entropy": 1.845068359375, "epoch": 0.05319449858527743, "grad_norm": 11.644062995910645, "learning_rate": 8.311052988472345e-06, "loss": 0.6593, "mean_token_accuracy": 0.8096413478255272, "num_tokens": 20590558.0, "step": 17160 }, { "entropy": 1.8619411289691925, "epoch": 0.05322549771032713, "grad_norm": 10.61169719696045, "learning_rate": 8.315896541702993e-06, "loss": 0.5858, "mean_token_accuracy": 0.8225790098309517, "num_tokens": 20601969.0, "step": 17170 }, { "entropy": 1.8861952975392342, "epoch": 0.053256496835376824, "grad_norm": 5.271222114562988, "learning_rate": 8.320740094933644e-06, "loss": 0.6097, "mean_token_accuracy": 0.8090833589434624, "num_tokens": 20614378.0, "step": 17180 }, { "entropy": 1.7966760843992233, "epoch": 0.05328749596042652, "grad_norm": 10.908940315246582, "learning_rate": 8.325583648164294e-06, "loss": 0.5047, "mean_token_accuracy": 0.8201152965426445, "num_tokens": 20628222.0, "step": 17190 }, { "entropy": 1.8242161065340041, "epoch": 0.05331849508547621, "grad_norm": 5.578505992889404, "learning_rate": 8.330427201394944e-06, "loss": 0.5196, "mean_token_accuracy": 0.8250388026237487, "num_tokens": 20641356.0, "step": 17200 }, { "entropy": 1.9080461800098418, "epoch": 0.053349494210525907, "grad_norm": 10.300594329833984, "learning_rate": 8.335270754625594e-06, "loss": 0.6218, "mean_token_accuracy": 0.8083810389041901, "num_tokens": 20653007.0, "step": 17210 }, { "entropy": 1.9093635857105256, "epoch": 0.0533804933355756, "grad_norm": 11.184460639953613, "learning_rate": 8.340114307856244e-06, "loss": 0.6377, "mean_token_accuracy": 0.8104760378599167, "num_tokens": 20663792.0, "step": 17220 }, { "entropy": 1.9165063306689263, "epoch": 0.0534114924606253, "grad_norm": 10.442662239074707, "learning_rate": 8.344957861086894e-06, "loss": 0.5942, "mean_token_accuracy": 0.8143234044313431, "num_tokens": 20675785.0, "step": 17230 }, { "entropy": 1.9359146520495414, "epoch": 0.053442491585674996, "grad_norm": 8.756479263305664, "learning_rate": 8.349801414317544e-06, "loss": 0.6133, "mean_token_accuracy": 0.8071959316730499, "num_tokens": 20687331.0, "step": 17240 }, { "entropy": 1.945779299736023, "epoch": 0.05347349071072469, "grad_norm": 11.406476020812988, "learning_rate": 8.354644967548195e-06, "loss": 0.6008, "mean_token_accuracy": 0.8256149128079414, "num_tokens": 20697727.0, "step": 17250 }, { "entropy": 1.8896187216043472, "epoch": 0.05350448983577438, "grad_norm": 12.807340621948242, "learning_rate": 8.359488520778845e-06, "loss": 0.6007, "mean_token_accuracy": 0.8113436698913574, "num_tokens": 20709839.0, "step": 17260 }, { "entropy": 1.8658792898058891, "epoch": 0.05353548896082408, "grad_norm": 11.434633255004883, "learning_rate": 8.364332074009493e-06, "loss": 0.6164, "mean_token_accuracy": 0.806533083319664, "num_tokens": 20721529.0, "step": 17270 }, { "entropy": 1.8530214801430702, "epoch": 0.053566488085873776, "grad_norm": 10.531983375549316, "learning_rate": 8.369175627240143e-06, "loss": 0.5375, "mean_token_accuracy": 0.819164864718914, "num_tokens": 20734410.0, "step": 17280 }, { "entropy": 1.864107219874859, "epoch": 0.05359748721092347, "grad_norm": 11.01248550415039, "learning_rate": 8.374019180470793e-06, "loss": 0.6512, "mean_token_accuracy": 0.8096802055835723, "num_tokens": 20746182.0, "step": 17290 }, { "entropy": 1.8512002035975457, "epoch": 0.05362848633597317, "grad_norm": 10.909806251525879, "learning_rate": 8.378862733701445e-06, "loss": 0.6076, "mean_token_accuracy": 0.8117505341768265, "num_tokens": 20758266.0, "step": 17300 }, { "entropy": 1.9608330637216569, "epoch": 0.053659485461022866, "grad_norm": 9.702219009399414, "learning_rate": 8.383706286932095e-06, "loss": 0.6645, "mean_token_accuracy": 0.8040051057934761, "num_tokens": 20768751.0, "step": 17310 }, { "entropy": 1.9042405292391777, "epoch": 0.053690484586072555, "grad_norm": 10.729706764221191, "learning_rate": 8.388549840162744e-06, "loss": 0.6057, "mean_token_accuracy": 0.817810270190239, "num_tokens": 20780886.0, "step": 17320 }, { "entropy": 1.8193862006068229, "epoch": 0.05372148371112225, "grad_norm": 5.850473880767822, "learning_rate": 8.393393393393394e-06, "loss": 0.5504, "mean_token_accuracy": 0.8224448531866073, "num_tokens": 20793098.0, "step": 17330 }, { "entropy": 1.7956085950136185, "epoch": 0.05375248283617195, "grad_norm": 4.813007354736328, "learning_rate": 8.398236946624044e-06, "loss": 0.5571, "mean_token_accuracy": 0.818726347386837, "num_tokens": 20806408.0, "step": 17340 }, { "entropy": 1.8429106876254082, "epoch": 0.053783481961221645, "grad_norm": 4.898284435272217, "learning_rate": 8.403080499854694e-06, "loss": 0.5251, "mean_token_accuracy": 0.828820888698101, "num_tokens": 20819353.0, "step": 17350 }, { "entropy": 1.8711024463176726, "epoch": 0.05381448108627134, "grad_norm": 6.471118927001953, "learning_rate": 8.407924053085344e-06, "loss": 0.588, "mean_token_accuracy": 0.8236263871192933, "num_tokens": 20830636.0, "step": 17360 }, { "entropy": 1.8240604743361473, "epoch": 0.05384548021132104, "grad_norm": 10.193392753601074, "learning_rate": 8.412767606315994e-06, "loss": 0.5247, "mean_token_accuracy": 0.8229044646024704, "num_tokens": 20843265.0, "step": 17370 }, { "entropy": 1.9022013053297997, "epoch": 0.05387647933637073, "grad_norm": 11.367668151855469, "learning_rate": 8.417611159546644e-06, "loss": 0.6635, "mean_token_accuracy": 0.8005659952759743, "num_tokens": 20855224.0, "step": 17380 }, { "entropy": 1.874705444276333, "epoch": 0.053907478461420424, "grad_norm": 13.563992500305176, "learning_rate": 8.422454712777294e-06, "loss": 0.6151, "mean_token_accuracy": 0.8215208485722542, "num_tokens": 20866903.0, "step": 17390 }, { "entropy": 1.9510533064603806, "epoch": 0.05393847758647012, "grad_norm": 12.109170913696289, "learning_rate": 8.427298266007944e-06, "loss": 0.6787, "mean_token_accuracy": 0.793503324687481, "num_tokens": 20878367.0, "step": 17400 }, { "entropy": 1.877579514682293, "epoch": 0.05396947671151982, "grad_norm": 10.051459312438965, "learning_rate": 8.432141819238593e-06, "loss": 0.5547, "mean_token_accuracy": 0.8256360620260239, "num_tokens": 20890360.0, "step": 17410 }, { "entropy": 1.9146401450037955, "epoch": 0.054000475836569514, "grad_norm": 10.120962142944336, "learning_rate": 8.436985372469243e-06, "loss": 0.6321, "mean_token_accuracy": 0.8045231074094772, "num_tokens": 20902287.0, "step": 17420 }, { "entropy": 1.8735201194882394, "epoch": 0.05403147496161921, "grad_norm": 5.3620524406433105, "learning_rate": 8.441828925699895e-06, "loss": 0.6369, "mean_token_accuracy": 0.811458395421505, "num_tokens": 20913921.0, "step": 17430 }, { "entropy": 1.8790420606732368, "epoch": 0.0540624740866689, "grad_norm": 11.361139297485352, "learning_rate": 8.446672478930545e-06, "loss": 0.5784, "mean_token_accuracy": 0.8240441083908081, "num_tokens": 20925521.0, "step": 17440 }, { "entropy": 1.8804828137159348, "epoch": 0.0540934732117186, "grad_norm": 12.176139831542969, "learning_rate": 8.451516032161195e-06, "loss": 0.5856, "mean_token_accuracy": 0.8197601407766342, "num_tokens": 20937530.0, "step": 17450 }, { "entropy": 1.8536333724856378, "epoch": 0.054124472336768294, "grad_norm": 8.680792808532715, "learning_rate": 8.456359585391845e-06, "loss": 0.5507, "mean_token_accuracy": 0.8246132522821427, "num_tokens": 20949885.0, "step": 17460 }, { "entropy": 1.9130322173237801, "epoch": 0.05415547146181799, "grad_norm": 9.968182563781738, "learning_rate": 8.461203138622494e-06, "loss": 0.6107, "mean_token_accuracy": 0.8138394117355346, "num_tokens": 20961126.0, "step": 17470 }, { "entropy": 1.8640489429235458, "epoch": 0.05418647058686769, "grad_norm": 14.096504211425781, "learning_rate": 8.466046691853144e-06, "loss": 0.5921, "mean_token_accuracy": 0.8181562379002572, "num_tokens": 20973024.0, "step": 17480 }, { "entropy": 1.7738539189100266, "epoch": 0.05421746971191738, "grad_norm": 10.170340538024902, "learning_rate": 8.470890245083794e-06, "loss": 0.5353, "mean_token_accuracy": 0.8179555460810661, "num_tokens": 20985918.0, "step": 17490 }, { "entropy": 1.8802258223295212, "epoch": 0.05424846883696708, "grad_norm": 10.039809226989746, "learning_rate": 8.475733798314444e-06, "loss": 0.6347, "mean_token_accuracy": 0.8079760164022446, "num_tokens": 20998102.0, "step": 17500 }, { "entropy": 1.9156350553035737, "epoch": 0.05427946796201677, "grad_norm": 9.540949821472168, "learning_rate": 8.480577351545094e-06, "loss": 0.6298, "mean_token_accuracy": 0.8177491262555122, "num_tokens": 21009239.0, "step": 17510 }, { "entropy": 1.804891037940979, "epoch": 0.054310467087066466, "grad_norm": 9.833329200744629, "learning_rate": 8.485420904775744e-06, "loss": 0.6428, "mean_token_accuracy": 0.808480116724968, "num_tokens": 21021894.0, "step": 17520 }, { "entropy": 1.9138577803969383, "epoch": 0.05434146621211616, "grad_norm": 6.600327968597412, "learning_rate": 8.490264458006394e-06, "loss": 0.5808, "mean_token_accuracy": 0.8139492854475975, "num_tokens": 21033383.0, "step": 17530 }, { "entropy": 1.7570361971855164, "epoch": 0.05437246533716586, "grad_norm": 6.329018592834473, "learning_rate": 8.495108011237044e-06, "loss": 0.5523, "mean_token_accuracy": 0.818885837495327, "num_tokens": 21047301.0, "step": 17540 }, { "entropy": 1.9657986283302307, "epoch": 0.054403464462215556, "grad_norm": 10.000340461730957, "learning_rate": 8.499951564467695e-06, "loss": 0.7018, "mean_token_accuracy": 0.7982934132218361, "num_tokens": 21058156.0, "step": 17550 }, { "entropy": 1.881192384660244, "epoch": 0.05443446358726525, "grad_norm": 9.785136222839355, "learning_rate": 8.504795117698345e-06, "loss": 0.6109, "mean_token_accuracy": 0.8185616865754127, "num_tokens": 21070030.0, "step": 17560 }, { "entropy": 1.8533698439598083, "epoch": 0.05446546271231494, "grad_norm": 4.606391906738281, "learning_rate": 8.509638670928995e-06, "loss": 0.6352, "mean_token_accuracy": 0.8094547167420387, "num_tokens": 21082808.0, "step": 17570 }, { "entropy": 1.8658347025513649, "epoch": 0.05449646183736464, "grad_norm": 10.683303833007812, "learning_rate": 8.514482224159645e-06, "loss": 0.5723, "mean_token_accuracy": 0.8231870085000992, "num_tokens": 21095739.0, "step": 17580 }, { "entropy": 1.9149486675858498, "epoch": 0.054527460962414336, "grad_norm": 7.256167411804199, "learning_rate": 8.519325777390293e-06, "loss": 0.6271, "mean_token_accuracy": 0.8080606922507286, "num_tokens": 21107649.0, "step": 17590 }, { "entropy": 1.857093758881092, "epoch": 0.05455846008746403, "grad_norm": 10.606932640075684, "learning_rate": 8.524169330620945e-06, "loss": 0.575, "mean_token_accuracy": 0.8211929991841316, "num_tokens": 21120550.0, "step": 17600 }, { "entropy": 1.8073217749595643, "epoch": 0.05458945921251373, "grad_norm": 10.49376392364502, "learning_rate": 8.529012883851594e-06, "loss": 0.5124, "mean_token_accuracy": 0.8263013437390327, "num_tokens": 21134529.0, "step": 17610 }, { "entropy": 1.8306412398815155, "epoch": 0.054620458337563425, "grad_norm": 9.479334831237793, "learning_rate": 8.533856437082244e-06, "loss": 0.6191, "mean_token_accuracy": 0.8138529911637307, "num_tokens": 21148789.0, "step": 17620 }, { "entropy": 1.9288539364933968, "epoch": 0.054651457462613115, "grad_norm": 9.992156028747559, "learning_rate": 8.538699990312894e-06, "loss": 0.6597, "mean_token_accuracy": 0.8028325289487839, "num_tokens": 21160363.0, "step": 17630 }, { "entropy": 1.865708639472723, "epoch": 0.05468245658766281, "grad_norm": 5.635770797729492, "learning_rate": 8.543543543543544e-06, "loss": 0.5549, "mean_token_accuracy": 0.8194977372884751, "num_tokens": 21173133.0, "step": 17640 }, { "entropy": 1.8686399355530738, "epoch": 0.05471345571271251, "grad_norm": 14.837812423706055, "learning_rate": 8.548387096774194e-06, "loss": 0.5678, "mean_token_accuracy": 0.8228330850601197, "num_tokens": 21185944.0, "step": 17650 }, { "entropy": 1.936301600933075, "epoch": 0.054744454837762205, "grad_norm": 9.631260871887207, "learning_rate": 8.553230650004844e-06, "loss": 0.5762, "mean_token_accuracy": 0.8145893216133118, "num_tokens": 21197684.0, "step": 17660 }, { "entropy": 1.9080315500497818, "epoch": 0.0547754539628119, "grad_norm": 9.75123119354248, "learning_rate": 8.558074203235495e-06, "loss": 0.5961, "mean_token_accuracy": 0.8067243576049805, "num_tokens": 21209448.0, "step": 17670 }, { "entropy": 1.941755273938179, "epoch": 0.0548064530878616, "grad_norm": 7.447671413421631, "learning_rate": 8.562917756466145e-06, "loss": 0.6306, "mean_token_accuracy": 0.8096268206834794, "num_tokens": 21222011.0, "step": 17680 }, { "entropy": 1.9120370209217072, "epoch": 0.05483745221291129, "grad_norm": 5.153918266296387, "learning_rate": 8.567761309696794e-06, "loss": 0.5873, "mean_token_accuracy": 0.8094914838671684, "num_tokens": 21234368.0, "step": 17690 }, { "entropy": 1.8817522883415223, "epoch": 0.054868451337960984, "grad_norm": 10.505568504333496, "learning_rate": 8.572604862927443e-06, "loss": 0.5737, "mean_token_accuracy": 0.8192173585295677, "num_tokens": 21246765.0, "step": 17700 }, { "entropy": 1.8698602363467216, "epoch": 0.05489945046301068, "grad_norm": 10.905664443969727, "learning_rate": 8.577448416158093e-06, "loss": 0.5834, "mean_token_accuracy": 0.81576436907053, "num_tokens": 21258879.0, "step": 17710 }, { "entropy": 1.8876920908689498, "epoch": 0.05493044958806038, "grad_norm": 10.402029991149902, "learning_rate": 8.582291969388745e-06, "loss": 0.5738, "mean_token_accuracy": 0.8234436243772507, "num_tokens": 21270505.0, "step": 17720 }, { "entropy": 1.8734422281384469, "epoch": 0.054961448713110074, "grad_norm": 10.90580940246582, "learning_rate": 8.587135522619395e-06, "loss": 0.5968, "mean_token_accuracy": 0.8190149515867233, "num_tokens": 21282053.0, "step": 17730 }, { "entropy": 1.8915531307458877, "epoch": 0.05499244783815977, "grad_norm": 8.619651794433594, "learning_rate": 8.591979075850045e-06, "loss": 0.6141, "mean_token_accuracy": 0.8292980402708053, "num_tokens": 21293252.0, "step": 17740 }, { "entropy": 1.819742462038994, "epoch": 0.05502344696320946, "grad_norm": 9.859935760498047, "learning_rate": 8.596822629080695e-06, "loss": 0.5696, "mean_token_accuracy": 0.825517275929451, "num_tokens": 21305648.0, "step": 17750 }, { "entropy": 1.7411428809165954, "epoch": 0.05505444608825916, "grad_norm": 11.464165687561035, "learning_rate": 8.601666182311344e-06, "loss": 0.5192, "mean_token_accuracy": 0.8340383976697922, "num_tokens": 21320159.0, "step": 17760 }, { "entropy": 1.919995127618313, "epoch": 0.05508544521330885, "grad_norm": 4.660171985626221, "learning_rate": 8.606509735541994e-06, "loss": 0.5619, "mean_token_accuracy": 0.8194687977433205, "num_tokens": 21331854.0, "step": 17770 }, { "entropy": 1.826239649951458, "epoch": 0.05511644433835855, "grad_norm": 6.208508014678955, "learning_rate": 8.611353288772644e-06, "loss": 0.519, "mean_token_accuracy": 0.8199943840503693, "num_tokens": 21344553.0, "step": 17780 }, { "entropy": 1.964565047621727, "epoch": 0.05514744346340825, "grad_norm": 11.809846878051758, "learning_rate": 8.616196842003294e-06, "loss": 0.6973, "mean_token_accuracy": 0.8042270794510842, "num_tokens": 21355485.0, "step": 17790 }, { "entropy": 1.9378721952438354, "epoch": 0.05517844258845794, "grad_norm": 9.962228775024414, "learning_rate": 8.621040395233944e-06, "loss": 0.6338, "mean_token_accuracy": 0.8136247247457504, "num_tokens": 21367049.0, "step": 17800 }, { "entropy": 1.894200675189495, "epoch": 0.05520944171350763, "grad_norm": 4.6598663330078125, "learning_rate": 8.625883948464594e-06, "loss": 0.5756, "mean_token_accuracy": 0.8180066585540772, "num_tokens": 21378413.0, "step": 17810 }, { "entropy": 1.828821636736393, "epoch": 0.05524044083855733, "grad_norm": 5.244216442108154, "learning_rate": 8.630727501695244e-06, "loss": 0.5283, "mean_token_accuracy": 0.8234935000538826, "num_tokens": 21391978.0, "step": 17820 }, { "entropy": 1.8749590143561363, "epoch": 0.055271439963607026, "grad_norm": 11.188740730285645, "learning_rate": 8.635571054925894e-06, "loss": 0.5804, "mean_token_accuracy": 0.8215572342276574, "num_tokens": 21403442.0, "step": 17830 }, { "entropy": 1.792897927761078, "epoch": 0.05530243908865672, "grad_norm": 5.151137828826904, "learning_rate": 8.640414608156543e-06, "loss": 0.4537, "mean_token_accuracy": 0.8246389016509056, "num_tokens": 21416621.0, "step": 17840 }, { "entropy": 1.9171807587146759, "epoch": 0.05533343821370642, "grad_norm": 10.319511413574219, "learning_rate": 8.645258161387195e-06, "loss": 0.7074, "mean_token_accuracy": 0.8059808582067489, "num_tokens": 21428271.0, "step": 17850 }, { "entropy": 1.8502112567424773, "epoch": 0.055364437338756116, "grad_norm": 9.720250129699707, "learning_rate": 8.650101714617845e-06, "loss": 0.636, "mean_token_accuracy": 0.8085378974676132, "num_tokens": 21439709.0, "step": 17860 }, { "entropy": 1.8581691205501556, "epoch": 0.05539543646380581, "grad_norm": 10.996453285217285, "learning_rate": 8.654945267848495e-06, "loss": 0.6044, "mean_token_accuracy": 0.8172203198075294, "num_tokens": 21451408.0, "step": 17870 }, { "entropy": 1.8972811639308929, "epoch": 0.0554264355888555, "grad_norm": 10.46535587310791, "learning_rate": 8.659788821079145e-06, "loss": 0.6667, "mean_token_accuracy": 0.8105733722448349, "num_tokens": 21463410.0, "step": 17880 }, { "entropy": 1.801578015089035, "epoch": 0.0554574347139052, "grad_norm": 11.458210945129395, "learning_rate": 8.664632374309795e-06, "loss": 0.4808, "mean_token_accuracy": 0.8232032969594002, "num_tokens": 21476435.0, "step": 17890 }, { "entropy": 1.8537754505872726, "epoch": 0.055488433838954895, "grad_norm": 9.625106811523438, "learning_rate": 8.669475927540444e-06, "loss": 0.5613, "mean_token_accuracy": 0.8271110355854034, "num_tokens": 21488620.0, "step": 17900 }, { "entropy": 1.920037829875946, "epoch": 0.05551943296400459, "grad_norm": 11.167927742004395, "learning_rate": 8.674319480771094e-06, "loss": 0.633, "mean_token_accuracy": 0.8028773024678231, "num_tokens": 21500165.0, "step": 17910 }, { "entropy": 1.8313481405377388, "epoch": 0.05555043208905429, "grad_norm": 12.061219215393066, "learning_rate": 8.679163034001744e-06, "loss": 0.5336, "mean_token_accuracy": 0.8166040554642677, "num_tokens": 21513677.0, "step": 17920 }, { "entropy": 2.0305935621261595, "epoch": 0.055581431214103985, "grad_norm": 10.752704620361328, "learning_rate": 8.684006587232394e-06, "loss": 0.6779, "mean_token_accuracy": 0.8040038585662842, "num_tokens": 21524307.0, "step": 17930 }, { "entropy": 1.8893893539905549, "epoch": 0.055612430339153675, "grad_norm": 11.609795570373535, "learning_rate": 8.688850140463044e-06, "loss": 0.6344, "mean_token_accuracy": 0.8086724013090134, "num_tokens": 21536735.0, "step": 17940 }, { "entropy": 1.8628504529595376, "epoch": 0.05564342946420337, "grad_norm": 7.905481815338135, "learning_rate": 8.693693693693694e-06, "loss": 0.5808, "mean_token_accuracy": 0.8129374206066131, "num_tokens": 21549205.0, "step": 17950 }, { "entropy": 1.884210042655468, "epoch": 0.05567442858925307, "grad_norm": 12.496581077575684, "learning_rate": 8.698537246924344e-06, "loss": 0.5238, "mean_token_accuracy": 0.8244456693530082, "num_tokens": 21560770.0, "step": 17960 }, { "entropy": 1.8549162834882735, "epoch": 0.055705427714302765, "grad_norm": 10.498296737670898, "learning_rate": 8.703380800154995e-06, "loss": 0.5502, "mean_token_accuracy": 0.8179799810051918, "num_tokens": 21572851.0, "step": 17970 }, { "entropy": 1.9428973540663719, "epoch": 0.05573642683935246, "grad_norm": 9.824682235717773, "learning_rate": 8.708224353385645e-06, "loss": 0.6371, "mean_token_accuracy": 0.8067533582448959, "num_tokens": 21584287.0, "step": 17980 }, { "entropy": 1.89179328083992, "epoch": 0.05576742596440216, "grad_norm": 5.299261093139648, "learning_rate": 8.713067906616295e-06, "loss": 0.5831, "mean_token_accuracy": 0.8168211251497268, "num_tokens": 21596202.0, "step": 17990 }, { "entropy": 1.8720185294747353, "epoch": 0.05579842508945185, "grad_norm": 9.72613525390625, "learning_rate": 8.717911459846945e-06, "loss": 0.5672, "mean_token_accuracy": 0.8193200185894967, "num_tokens": 21608507.0, "step": 18000 }, { "entropy": 1.90470210313797, "epoch": 0.055829424214501544, "grad_norm": 11.743047714233398, "learning_rate": 8.722755013077593e-06, "loss": 0.6317, "mean_token_accuracy": 0.8138886854052544, "num_tokens": 21619927.0, "step": 18010 }, { "entropy": 1.9255423635244369, "epoch": 0.05586042333955124, "grad_norm": 10.3324613571167, "learning_rate": 8.727598566308245e-06, "loss": 0.6288, "mean_token_accuracy": 0.8161659583449363, "num_tokens": 21631633.0, "step": 18020 }, { "entropy": 1.7495290905237197, "epoch": 0.05589142246460094, "grad_norm": 11.315978050231934, "learning_rate": 8.732442119538895e-06, "loss": 0.5321, "mean_token_accuracy": 0.8315317690372467, "num_tokens": 21645071.0, "step": 18030 }, { "entropy": 1.8392060235142709, "epoch": 0.055922421589650634, "grad_norm": 10.65888786315918, "learning_rate": 8.737285672769545e-06, "loss": 0.5339, "mean_token_accuracy": 0.8336151942610741, "num_tokens": 21657281.0, "step": 18040 }, { "entropy": 1.9446683064103127, "epoch": 0.05595342071470033, "grad_norm": 13.20337963104248, "learning_rate": 8.742129226000194e-06, "loss": 0.682, "mean_token_accuracy": 0.7979879945516586, "num_tokens": 21668367.0, "step": 18050 }, { "entropy": 1.8128132298588753, "epoch": 0.05598441983975002, "grad_norm": 11.751765251159668, "learning_rate": 8.746972779230844e-06, "loss": 0.585, "mean_token_accuracy": 0.8146183237433433, "num_tokens": 21681270.0, "step": 18060 }, { "entropy": 1.8214144945144652, "epoch": 0.05601541896479972, "grad_norm": 10.238035202026367, "learning_rate": 8.751816332461494e-06, "loss": 0.585, "mean_token_accuracy": 0.8190832883119583, "num_tokens": 21694477.0, "step": 18070 }, { "entropy": 1.8992233827710152, "epoch": 0.05604641808984941, "grad_norm": 3.5823521614074707, "learning_rate": 8.756659885692144e-06, "loss": 0.6326, "mean_token_accuracy": 0.8172223970293999, "num_tokens": 21706135.0, "step": 18080 }, { "entropy": 1.8768006905913353, "epoch": 0.05607741721489911, "grad_norm": 12.048982620239258, "learning_rate": 8.761503438922796e-06, "loss": 0.6044, "mean_token_accuracy": 0.8237545311450958, "num_tokens": 21717528.0, "step": 18090 }, { "entropy": 1.8304587841033935, "epoch": 0.056108416339948806, "grad_norm": 10.88693904876709, "learning_rate": 8.766346992153446e-06, "loss": 0.5422, "mean_token_accuracy": 0.8200173109769822, "num_tokens": 21730335.0, "step": 18100 }, { "entropy": 1.7798964753746986, "epoch": 0.0561394154649985, "grad_norm": 13.059136390686035, "learning_rate": 8.771190545384094e-06, "loss": 0.508, "mean_token_accuracy": 0.8268529579043389, "num_tokens": 21743306.0, "step": 18110 }, { "entropy": 1.9081214651465417, "epoch": 0.05617041459004819, "grad_norm": 11.699897766113281, "learning_rate": 8.776034098614744e-06, "loss": 0.6163, "mean_token_accuracy": 0.8113586112856865, "num_tokens": 21754558.0, "step": 18120 }, { "entropy": 1.8716801404953003, "epoch": 0.05620141371509789, "grad_norm": 11.486919403076172, "learning_rate": 8.780877651845394e-06, "loss": 0.6531, "mean_token_accuracy": 0.7999576240777969, "num_tokens": 21766487.0, "step": 18130 }, { "entropy": 1.8077737405896186, "epoch": 0.056232412840147586, "grad_norm": 10.865653991699219, "learning_rate": 8.785721205076045e-06, "loss": 0.5829, "mean_token_accuracy": 0.8171229988336564, "num_tokens": 21779437.0, "step": 18140 }, { "entropy": 1.93416608273983, "epoch": 0.05626341196519728, "grad_norm": 9.039263725280762, "learning_rate": 8.790564758306695e-06, "loss": 0.6632, "mean_token_accuracy": 0.8090454161167144, "num_tokens": 21790560.0, "step": 18150 }, { "entropy": 1.9330880522727967, "epoch": 0.05629441109024698, "grad_norm": 9.971395492553711, "learning_rate": 8.795408311537345e-06, "loss": 0.5888, "mean_token_accuracy": 0.8192548274993896, "num_tokens": 21802186.0, "step": 18160 }, { "entropy": 1.8966943442821502, "epoch": 0.056325410215296676, "grad_norm": 5.171525955200195, "learning_rate": 8.800251864767995e-06, "loss": 0.609, "mean_token_accuracy": 0.8174291670322418, "num_tokens": 21813141.0, "step": 18170 }, { "entropy": 1.9147083714604378, "epoch": 0.056356409340346365, "grad_norm": 10.910831451416016, "learning_rate": 8.805095417998645e-06, "loss": 0.6791, "mean_token_accuracy": 0.802092096209526, "num_tokens": 21824549.0, "step": 18180 }, { "entropy": 1.845213919878006, "epoch": 0.05638740846539606, "grad_norm": 11.23419189453125, "learning_rate": 8.809938971229295e-06, "loss": 0.541, "mean_token_accuracy": 0.836587692797184, "num_tokens": 21837064.0, "step": 18190 }, { "entropy": 1.9281988382339477, "epoch": 0.05641840759044576, "grad_norm": 12.443825721740723, "learning_rate": 8.814782524459944e-06, "loss": 0.6652, "mean_token_accuracy": 0.8060685142874717, "num_tokens": 21849025.0, "step": 18200 }, { "entropy": 1.985886165499687, "epoch": 0.056449406715495455, "grad_norm": 11.284612655639648, "learning_rate": 8.819626077690594e-06, "loss": 0.6496, "mean_token_accuracy": 0.8076219454407692, "num_tokens": 21859566.0, "step": 18210 }, { "entropy": 1.9329922825098038, "epoch": 0.05648040584054515, "grad_norm": 8.869474411010742, "learning_rate": 8.824469630921244e-06, "loss": 0.6058, "mean_token_accuracy": 0.810475055873394, "num_tokens": 21871056.0, "step": 18220 }, { "entropy": 1.920028594136238, "epoch": 0.05651140496559485, "grad_norm": 11.169597625732422, "learning_rate": 8.829313184151894e-06, "loss": 0.636, "mean_token_accuracy": 0.8107326775789261, "num_tokens": 21881630.0, "step": 18230 }, { "entropy": 1.8644825682044028, "epoch": 0.056542404090644545, "grad_norm": 10.988966941833496, "learning_rate": 8.834156737382544e-06, "loss": 0.5143, "mean_token_accuracy": 0.8243206977844239, "num_tokens": 21893921.0, "step": 18240 }, { "entropy": 1.8154290586709976, "epoch": 0.056573403215694235, "grad_norm": 6.226930141448975, "learning_rate": 8.839000290613194e-06, "loss": 0.5837, "mean_token_accuracy": 0.8170310199260712, "num_tokens": 21906253.0, "step": 18250 }, { "entropy": 1.859774151444435, "epoch": 0.05660440234074393, "grad_norm": 11.1119384765625, "learning_rate": 8.843843843843844e-06, "loss": 0.6233, "mean_token_accuracy": 0.8082350715994835, "num_tokens": 21918932.0, "step": 18260 }, { "entropy": 1.9023708343505858, "epoch": 0.05663540146579363, "grad_norm": 10.345152854919434, "learning_rate": 8.848687397074495e-06, "loss": 0.5928, "mean_token_accuracy": 0.8202927365899086, "num_tokens": 21930745.0, "step": 18270 }, { "entropy": 1.8009515717625617, "epoch": 0.056666400590843324, "grad_norm": 4.616584777832031, "learning_rate": 8.853530950305145e-06, "loss": 0.5181, "mean_token_accuracy": 0.8291145578026772, "num_tokens": 21943477.0, "step": 18280 }, { "entropy": 1.910208411514759, "epoch": 0.05669739971589302, "grad_norm": 10.986778259277344, "learning_rate": 8.858374503535795e-06, "loss": 0.6053, "mean_token_accuracy": 0.8097553431987763, "num_tokens": 21955368.0, "step": 18290 }, { "entropy": 1.8674479261040688, "epoch": 0.05672839884094272, "grad_norm": 12.478263854980469, "learning_rate": 8.863218056766445e-06, "loss": 0.592, "mean_token_accuracy": 0.8068441540002823, "num_tokens": 21968150.0, "step": 18300 }, { "entropy": 1.9316591426730156, "epoch": 0.05675939796599241, "grad_norm": 13.325005531311035, "learning_rate": 8.868061609997095e-06, "loss": 0.5848, "mean_token_accuracy": 0.8228493258357048, "num_tokens": 21979588.0, "step": 18310 }, { "entropy": 1.8889947533607483, "epoch": 0.056790397091042104, "grad_norm": 10.683235168457031, "learning_rate": 8.872905163227745e-06, "loss": 0.6101, "mean_token_accuracy": 0.8133319228887558, "num_tokens": 21991387.0, "step": 18320 }, { "entropy": 1.83803388774395, "epoch": 0.0568213962160918, "grad_norm": 9.95289421081543, "learning_rate": 8.877748716458395e-06, "loss": 0.5467, "mean_token_accuracy": 0.8165292799472809, "num_tokens": 22004539.0, "step": 18330 }, { "entropy": 1.8539226263761521, "epoch": 0.0568523953411415, "grad_norm": 11.232304573059082, "learning_rate": 8.882592269689044e-06, "loss": 0.5972, "mean_token_accuracy": 0.8155701532959938, "num_tokens": 22017151.0, "step": 18340 }, { "entropy": 1.8634928598999978, "epoch": 0.056883394466191194, "grad_norm": 10.557002067565918, "learning_rate": 8.887435822919694e-06, "loss": 0.5348, "mean_token_accuracy": 0.8263462707400322, "num_tokens": 22029875.0, "step": 18350 }, { "entropy": 1.8347053781151772, "epoch": 0.05691439359124089, "grad_norm": 5.2376389503479, "learning_rate": 8.892279376150344e-06, "loss": 0.553, "mean_token_accuracy": 0.82252157330513, "num_tokens": 22042520.0, "step": 18360 }, { "entropy": 1.9364128440618515, "epoch": 0.05694539271629058, "grad_norm": 5.532094955444336, "learning_rate": 8.897122929380994e-06, "loss": 0.673, "mean_token_accuracy": 0.800466650724411, "num_tokens": 22054370.0, "step": 18370 }, { "entropy": 1.9340432003140449, "epoch": 0.056976391841340276, "grad_norm": 11.368268966674805, "learning_rate": 8.901966482611644e-06, "loss": 0.6135, "mean_token_accuracy": 0.8022024169564247, "num_tokens": 22066153.0, "step": 18380 }, { "entropy": 1.8881090357899666, "epoch": 0.05700739096638997, "grad_norm": 5.1713972091674805, "learning_rate": 8.906810035842296e-06, "loss": 0.5272, "mean_token_accuracy": 0.8244768619537354, "num_tokens": 22078654.0, "step": 18390 }, { "entropy": 1.919769637286663, "epoch": 0.05703839009143967, "grad_norm": 8.57338809967041, "learning_rate": 8.911653589072945e-06, "loss": 0.6129, "mean_token_accuracy": 0.8175665631890296, "num_tokens": 22091226.0, "step": 18400 }, { "entropy": 1.7635185047984123, "epoch": 0.057069389216489366, "grad_norm": 3.2729835510253906, "learning_rate": 8.916497142303595e-06, "loss": 0.514, "mean_token_accuracy": 0.8288957670331001, "num_tokens": 22105037.0, "step": 18410 }, { "entropy": 1.84332295358181, "epoch": 0.05710038834153906, "grad_norm": 11.345158576965332, "learning_rate": 8.921340695534245e-06, "loss": 0.5933, "mean_token_accuracy": 0.8165907859802246, "num_tokens": 22117072.0, "step": 18420 }, { "entropy": 1.8966827645897866, "epoch": 0.05713138746658875, "grad_norm": 12.612526893615723, "learning_rate": 8.926184248764893e-06, "loss": 0.6189, "mean_token_accuracy": 0.8158532097935677, "num_tokens": 22127952.0, "step": 18430 }, { "entropy": 1.8704052582383155, "epoch": 0.05716238659163845, "grad_norm": 5.032566070556641, "learning_rate": 8.931027801995545e-06, "loss": 0.5832, "mean_token_accuracy": 0.8137423068284988, "num_tokens": 22139709.0, "step": 18440 }, { "entropy": 1.820935921370983, "epoch": 0.057193385716688146, "grad_norm": 10.067622184753418, "learning_rate": 8.935871355226195e-06, "loss": 0.5071, "mean_token_accuracy": 0.8368907868862152, "num_tokens": 22152377.0, "step": 18450 }, { "entropy": 1.896417185664177, "epoch": 0.05722438484173784, "grad_norm": 12.381418228149414, "learning_rate": 8.940714908456845e-06, "loss": 0.6965, "mean_token_accuracy": 0.8038837254047394, "num_tokens": 22163577.0, "step": 18460 }, { "entropy": 1.8900171846151352, "epoch": 0.05725538396678754, "grad_norm": 11.906807899475098, "learning_rate": 8.945558461687495e-06, "loss": 0.6449, "mean_token_accuracy": 0.8201625868678093, "num_tokens": 22174708.0, "step": 18470 }, { "entropy": 1.9111595183610917, "epoch": 0.057286383091837235, "grad_norm": 11.19156265258789, "learning_rate": 8.950402014918145e-06, "loss": 0.6555, "mean_token_accuracy": 0.802647915482521, "num_tokens": 22186789.0, "step": 18480 }, { "entropy": 1.9203668639063836, "epoch": 0.057317382216886925, "grad_norm": 11.311783790588379, "learning_rate": 8.955245568148794e-06, "loss": 0.6169, "mean_token_accuracy": 0.8133427366614342, "num_tokens": 22198408.0, "step": 18490 }, { "entropy": 1.9032350957393647, "epoch": 0.05734838134193662, "grad_norm": 13.073493957519531, "learning_rate": 8.960089121379444e-06, "loss": 0.6344, "mean_token_accuracy": 0.80216343998909, "num_tokens": 22210305.0, "step": 18500 }, { "entropy": 1.8219037756323815, "epoch": 0.05737938046698632, "grad_norm": 9.571296691894531, "learning_rate": 8.964932674610096e-06, "loss": 0.5245, "mean_token_accuracy": 0.8286261260509491, "num_tokens": 22222972.0, "step": 18510 }, { "entropy": 1.8522532120347024, "epoch": 0.057410379592036015, "grad_norm": 5.730122089385986, "learning_rate": 8.969776227840746e-06, "loss": 0.5858, "mean_token_accuracy": 0.8103675618767738, "num_tokens": 22235073.0, "step": 18520 }, { "entropy": 1.851724323630333, "epoch": 0.05744137871708571, "grad_norm": 9.87784194946289, "learning_rate": 8.974619781071394e-06, "loss": 0.5931, "mean_token_accuracy": 0.8163824722170829, "num_tokens": 22247790.0, "step": 18530 }, { "entropy": 1.8631928369402886, "epoch": 0.05747237784213541, "grad_norm": 5.00623893737793, "learning_rate": 8.979463334302044e-06, "loss": 0.559, "mean_token_accuracy": 0.8131022065877914, "num_tokens": 22260240.0, "step": 18540 }, { "entropy": 1.910364383459091, "epoch": 0.0575033769671851, "grad_norm": 11.106468200683594, "learning_rate": 8.984306887532694e-06, "loss": 0.6496, "mean_token_accuracy": 0.8063996240496636, "num_tokens": 22272059.0, "step": 18550 }, { "entropy": 1.9144617825746537, "epoch": 0.057534376092234794, "grad_norm": 10.869990348815918, "learning_rate": 8.989150440763345e-06, "loss": 0.6416, "mean_token_accuracy": 0.8138428092002868, "num_tokens": 22283072.0, "step": 18560 }, { "entropy": 1.9242763727903367, "epoch": 0.05756537521728449, "grad_norm": 10.430070877075195, "learning_rate": 8.993993993993995e-06, "loss": 0.6072, "mean_token_accuracy": 0.8113959297537804, "num_tokens": 22294697.0, "step": 18570 }, { "entropy": 1.9588421791791917, "epoch": 0.05759637434233419, "grad_norm": 12.86335277557373, "learning_rate": 8.998837547224645e-06, "loss": 0.6676, "mean_token_accuracy": 0.80771906375885, "num_tokens": 22305842.0, "step": 18580 }, { "entropy": 1.8290845677256584, "epoch": 0.057627373467383884, "grad_norm": 11.009787559509277, "learning_rate": 9.003681100455295e-06, "loss": 0.5679, "mean_token_accuracy": 0.819172665476799, "num_tokens": 22319673.0, "step": 18590 }, { "entropy": 1.8799395859241486, "epoch": 0.05765837259243358, "grad_norm": 5.414979457855225, "learning_rate": 9.008524653685945e-06, "loss": 0.5858, "mean_token_accuracy": 0.8109808668494225, "num_tokens": 22332734.0, "step": 18600 }, { "entropy": 1.9298516079783439, "epoch": 0.05768937171748328, "grad_norm": 4.051270961761475, "learning_rate": 9.013368206916595e-06, "loss": 0.6212, "mean_token_accuracy": 0.8061932161450386, "num_tokens": 22344081.0, "step": 18610 }, { "entropy": 1.9745507806539535, "epoch": 0.05772037084253297, "grad_norm": 10.910243034362793, "learning_rate": 9.018211760147245e-06, "loss": 0.7182, "mean_token_accuracy": 0.7929084002971649, "num_tokens": 22355535.0, "step": 18620 }, { "entropy": 1.880644341558218, "epoch": 0.057751369967582664, "grad_norm": 6.737995624542236, "learning_rate": 9.023055313377894e-06, "loss": 0.5881, "mean_token_accuracy": 0.8097090050578117, "num_tokens": 22368050.0, "step": 18630 }, { "entropy": 1.9336914867162704, "epoch": 0.05778236909263236, "grad_norm": 9.826704978942871, "learning_rate": 9.027898866608544e-06, "loss": 0.6545, "mean_token_accuracy": 0.8081561967730522, "num_tokens": 22379637.0, "step": 18640 }, { "entropy": 1.9320360347628593, "epoch": 0.05781336821768206, "grad_norm": 5.833460330963135, "learning_rate": 9.032742419839194e-06, "loss": 0.5905, "mean_token_accuracy": 0.8115993320941925, "num_tokens": 22391313.0, "step": 18650 }, { "entropy": 1.952963298559189, "epoch": 0.05784436734273175, "grad_norm": 5.079139709472656, "learning_rate": 9.037585973069844e-06, "loss": 0.6632, "mean_token_accuracy": 0.8034544378519058, "num_tokens": 22402538.0, "step": 18660 }, { "entropy": 1.7565592139959336, "epoch": 0.05787536646778145, "grad_norm": 4.124721050262451, "learning_rate": 9.042429526300494e-06, "loss": 0.5241, "mean_token_accuracy": 0.8233913615345955, "num_tokens": 22416764.0, "step": 18670 }, { "entropy": 1.900513444840908, "epoch": 0.05790636559283114, "grad_norm": 9.634764671325684, "learning_rate": 9.047273079531144e-06, "loss": 0.573, "mean_token_accuracy": 0.8233014553785324, "num_tokens": 22428670.0, "step": 18680 }, { "entropy": 1.9495866417884826, "epoch": 0.057937364717880836, "grad_norm": Infinity, "learning_rate": 9.052116632761795e-06, "loss": 0.6543, "mean_token_accuracy": 0.8013565137982368, "num_tokens": 22439439.0, "step": 18690 }, { "entropy": 1.8713518410921097, "epoch": 0.05796836384293053, "grad_norm": 9.244678497314453, "learning_rate": 9.056960185992445e-06, "loss": 0.5676, "mean_token_accuracy": 0.8261402577161789, "num_tokens": 22450954.0, "step": 18700 }, { "entropy": 1.867144940048456, "epoch": 0.05799936296798023, "grad_norm": 5.15731143951416, "learning_rate": 9.061803739223095e-06, "loss": 0.5672, "mean_token_accuracy": 0.814095051586628, "num_tokens": 22463821.0, "step": 18710 }, { "entropy": 1.8838742166757583, "epoch": 0.058030362093029926, "grad_norm": 10.645661354064941, "learning_rate": 9.066647292453745e-06, "loss": 0.5763, "mean_token_accuracy": 0.82436051517725, "num_tokens": 22475600.0, "step": 18720 }, { "entropy": 1.8638369679450988, "epoch": 0.05806136121807962, "grad_norm": 14.198400497436523, "learning_rate": 9.071490845684395e-06, "loss": 0.6381, "mean_token_accuracy": 0.8040808498859405, "num_tokens": 22487874.0, "step": 18730 }, { "entropy": 1.8371347174048425, "epoch": 0.05809236034312931, "grad_norm": 5.825725555419922, "learning_rate": 9.076334398915045e-06, "loss": 0.6015, "mean_token_accuracy": 0.8231277003884315, "num_tokens": 22500865.0, "step": 18740 }, { "entropy": 1.9247653126716613, "epoch": 0.05812335946817901, "grad_norm": 12.100114822387695, "learning_rate": 9.081177952145695e-06, "loss": 0.6471, "mean_token_accuracy": 0.8075411379337311, "num_tokens": 22511426.0, "step": 18750 }, { "entropy": 1.8342102706432342, "epoch": 0.058154358593228705, "grad_norm": 9.929917335510254, "learning_rate": 9.086021505376345e-06, "loss": 0.6312, "mean_token_accuracy": 0.7977308183908463, "num_tokens": 22524288.0, "step": 18760 }, { "entropy": 1.7855632066726685, "epoch": 0.0581853577182784, "grad_norm": 5.584056854248047, "learning_rate": 9.090865058606995e-06, "loss": 0.5442, "mean_token_accuracy": 0.8210761070251464, "num_tokens": 22537475.0, "step": 18770 }, { "entropy": 1.8851278007030488, "epoch": 0.0582163568433281, "grad_norm": 10.225055694580078, "learning_rate": 9.095708611837644e-06, "loss": 0.6466, "mean_token_accuracy": 0.8099891990423203, "num_tokens": 22548279.0, "step": 18780 }, { "entropy": 1.841400173306465, "epoch": 0.058247355968377795, "grad_norm": 9.732316017150879, "learning_rate": 9.100552165068294e-06, "loss": 0.5658, "mean_token_accuracy": 0.820391620695591, "num_tokens": 22560249.0, "step": 18790 }, { "entropy": 1.7526367500424385, "epoch": 0.058278355093427485, "grad_norm": 10.728232383728027, "learning_rate": 9.105395718298944e-06, "loss": 0.4948, "mean_token_accuracy": 0.8198321342468262, "num_tokens": 22573834.0, "step": 18800 }, { "entropy": 1.8276120245456695, "epoch": 0.05830935421847718, "grad_norm": 11.885202407836914, "learning_rate": 9.110239271529596e-06, "loss": 0.5502, "mean_token_accuracy": 0.8176509469747544, "num_tokens": 22586688.0, "step": 18810 }, { "entropy": 1.8741453379392623, "epoch": 0.05834035334352688, "grad_norm": 12.088459968566895, "learning_rate": 9.115082824760246e-06, "loss": 0.6949, "mean_token_accuracy": 0.7987207651138306, "num_tokens": 22598358.0, "step": 18820 }, { "entropy": 1.8715686663985251, "epoch": 0.058371352468576575, "grad_norm": 10.86611270904541, "learning_rate": 9.119926377990896e-06, "loss": 0.6264, "mean_token_accuracy": 0.8120866730809212, "num_tokens": 22609875.0, "step": 18830 }, { "entropy": 1.844074723124504, "epoch": 0.05840235159362627, "grad_norm": 10.710127830505371, "learning_rate": 9.124769931221545e-06, "loss": 0.6134, "mean_token_accuracy": 0.8119207620620728, "num_tokens": 22621314.0, "step": 18840 }, { "entropy": 1.8183260425925254, "epoch": 0.05843335071867597, "grad_norm": 9.820539474487305, "learning_rate": 9.129613484452194e-06, "loss": 0.6007, "mean_token_accuracy": 0.809662489593029, "num_tokens": 22633297.0, "step": 18850 }, { "entropy": 1.8575424775481224, "epoch": 0.05846434984372566, "grad_norm": 9.236701011657715, "learning_rate": 9.134457037682845e-06, "loss": 0.6365, "mean_token_accuracy": 0.8071271240711212, "num_tokens": 22645167.0, "step": 18860 }, { "entropy": 1.8752565145492555, "epoch": 0.058495348968775354, "grad_norm": 5.298765659332275, "learning_rate": 9.139300590913495e-06, "loss": 0.5903, "mean_token_accuracy": 0.8160139411687851, "num_tokens": 22656745.0, "step": 18870 }, { "entropy": 1.8490721896290778, "epoch": 0.05852634809382505, "grad_norm": 10.962682723999023, "learning_rate": 9.144144144144145e-06, "loss": 0.6161, "mean_token_accuracy": 0.8067515045404434, "num_tokens": 22669776.0, "step": 18880 }, { "entropy": 1.882719586789608, "epoch": 0.05855734721887475, "grad_norm": 10.184673309326172, "learning_rate": 9.148987697374795e-06, "loss": 0.6346, "mean_token_accuracy": 0.8162581637501717, "num_tokens": 22681064.0, "step": 18890 }, { "entropy": 1.7815841138362885, "epoch": 0.058588346343924444, "grad_norm": 9.868931770324707, "learning_rate": 9.153831250605445e-06, "loss": 0.4916, "mean_token_accuracy": 0.8312078416347504, "num_tokens": 22694642.0, "step": 18900 }, { "entropy": 1.8093548193573952, "epoch": 0.05861934546897414, "grad_norm": 10.930723190307617, "learning_rate": 9.158674803836095e-06, "loss": 0.5375, "mean_token_accuracy": 0.8200191363692284, "num_tokens": 22707548.0, "step": 18910 }, { "entropy": 1.8189376056194306, "epoch": 0.05865034459402383, "grad_norm": 9.895159721374512, "learning_rate": 9.163518357066745e-06, "loss": 0.6132, "mean_token_accuracy": 0.8140119895339012, "num_tokens": 22720201.0, "step": 18920 }, { "entropy": 1.8482043415307998, "epoch": 0.05868134371907353, "grad_norm": 10.34995174407959, "learning_rate": 9.168361910297396e-06, "loss": 0.5611, "mean_token_accuracy": 0.8194187000393868, "num_tokens": 22732586.0, "step": 18930 }, { "entropy": 1.8547095090150834, "epoch": 0.05871234284412322, "grad_norm": 9.405129432678223, "learning_rate": 9.173205463528046e-06, "loss": 0.6397, "mean_token_accuracy": 0.8073139742016793, "num_tokens": 22743872.0, "step": 18940 }, { "entropy": 1.7496973037719727, "epoch": 0.05874334196917292, "grad_norm": 11.039433479309082, "learning_rate": 9.178049016758694e-06, "loss": 0.537, "mean_token_accuracy": 0.8219654500484467, "num_tokens": 22756915.0, "step": 18950 }, { "entropy": 1.8445199981331826, "epoch": 0.05877434109422262, "grad_norm": 10.18836498260498, "learning_rate": 9.182892569989344e-06, "loss": 0.5484, "mean_token_accuracy": 0.8294688493013382, "num_tokens": 22768399.0, "step": 18960 }, { "entropy": 1.8422922030091287, "epoch": 0.05880534021927231, "grad_norm": 10.27499008178711, "learning_rate": 9.187736123219994e-06, "loss": 0.6632, "mean_token_accuracy": 0.815483920276165, "num_tokens": 22780846.0, "step": 18970 }, { "entropy": 1.9067127719521522, "epoch": 0.05883633934432201, "grad_norm": 9.372925758361816, "learning_rate": 9.192579676450646e-06, "loss": 0.6038, "mean_token_accuracy": 0.8227426365017891, "num_tokens": 22792760.0, "step": 18980 }, { "entropy": 1.9074788480997085, "epoch": 0.0588673384693717, "grad_norm": 5.241424560546875, "learning_rate": 9.197423229681295e-06, "loss": 0.5884, "mean_token_accuracy": 0.8211500853300094, "num_tokens": 22804075.0, "step": 18990 }, { "entropy": 1.911788022518158, "epoch": 0.058898337594421396, "grad_norm": 12.606864929199219, "learning_rate": 9.202266782911945e-06, "loss": 0.6369, "mean_token_accuracy": 0.8175441965460777, "num_tokens": 22814931.0, "step": 19000 }, { "entropy": 1.902238203585148, "epoch": 0.05892933671947109, "grad_norm": 5.6409831047058105, "learning_rate": 9.207110336142595e-06, "loss": 0.6392, "mean_token_accuracy": 0.8043625161051751, "num_tokens": 22826140.0, "step": 19010 }, { "entropy": 1.8769164964556695, "epoch": 0.05896033584452079, "grad_norm": 5.731686592102051, "learning_rate": 9.211953889373245e-06, "loss": 0.6231, "mean_token_accuracy": 0.8021165639162063, "num_tokens": 22837728.0, "step": 19020 }, { "entropy": 1.7606040745973588, "epoch": 0.058991334969570486, "grad_norm": 12.572894096374512, "learning_rate": 9.216797442603895e-06, "loss": 0.5168, "mean_token_accuracy": 0.8280571162700653, "num_tokens": 22851334.0, "step": 19030 }, { "entropy": 1.8437473088502885, "epoch": 0.05902233409462018, "grad_norm": 10.523193359375, "learning_rate": 9.221640995834545e-06, "loss": 0.5031, "mean_token_accuracy": 0.8213176369667053, "num_tokens": 22864325.0, "step": 19040 }, { "entropy": 1.9376160144805907, "epoch": 0.05905333321966987, "grad_norm": 9.312682151794434, "learning_rate": 9.226484549065195e-06, "loss": 0.6229, "mean_token_accuracy": 0.8095207557082176, "num_tokens": 22876271.0, "step": 19050 }, { "entropy": 1.8676683530211449, "epoch": 0.05908433234471957, "grad_norm": 10.493414878845215, "learning_rate": 9.231328102295845e-06, "loss": 0.5505, "mean_token_accuracy": 0.8218972474336624, "num_tokens": 22888674.0, "step": 19060 }, { "entropy": 1.9024165108799935, "epoch": 0.059115331469769265, "grad_norm": 11.642742156982422, "learning_rate": 9.236171655526494e-06, "loss": 0.5729, "mean_token_accuracy": 0.8063879519701004, "num_tokens": 22901132.0, "step": 19070 }, { "entropy": 1.8537215188145637, "epoch": 0.05914633059481896, "grad_norm": 10.604762077331543, "learning_rate": 9.241015208757144e-06, "loss": 0.5936, "mean_token_accuracy": 0.8061328649520874, "num_tokens": 22913819.0, "step": 19080 }, { "entropy": 1.9175572365522384, "epoch": 0.05917732971986866, "grad_norm": 5.126205921173096, "learning_rate": 9.245858761987794e-06, "loss": 0.6629, "mean_token_accuracy": 0.8014189884066582, "num_tokens": 22924724.0, "step": 19090 }, { "entropy": 1.9114294454455376, "epoch": 0.059208328844918355, "grad_norm": 10.768912315368652, "learning_rate": 9.250702315218446e-06, "loss": 0.6447, "mean_token_accuracy": 0.8088298216462135, "num_tokens": 22936371.0, "step": 19100 }, { "entropy": 1.9175557851791383, "epoch": 0.059239327969968045, "grad_norm": 9.46970272064209, "learning_rate": 9.255545868449096e-06, "loss": 0.6537, "mean_token_accuracy": 0.810999846458435, "num_tokens": 22948269.0, "step": 19110 }, { "entropy": 1.80474793612957, "epoch": 0.05927032709501774, "grad_norm": 7.381761074066162, "learning_rate": 9.260389421679746e-06, "loss": 0.559, "mean_token_accuracy": 0.8227289646863938, "num_tokens": 22961310.0, "step": 19120 }, { "entropy": 1.868805430829525, "epoch": 0.05930132622006744, "grad_norm": 10.440200805664062, "learning_rate": 9.265232974910395e-06, "loss": 0.548, "mean_token_accuracy": 0.8252261653542519, "num_tokens": 22973534.0, "step": 19130 }, { "entropy": 1.9009343415498734, "epoch": 0.059332325345117135, "grad_norm": 10.150739669799805, "learning_rate": 9.270076528141045e-06, "loss": 0.6062, "mean_token_accuracy": 0.8130706340074539, "num_tokens": 22984921.0, "step": 19140 }, { "entropy": 1.9186480671167374, "epoch": 0.05936332447016683, "grad_norm": 9.625990867614746, "learning_rate": 9.274920081371695e-06, "loss": 0.6403, "mean_token_accuracy": 0.8057694777846336, "num_tokens": 22996985.0, "step": 19150 }, { "entropy": 1.8890679344534873, "epoch": 0.05939432359521653, "grad_norm": 10.16703987121582, "learning_rate": 9.279763634602345e-06, "loss": 0.6015, "mean_token_accuracy": 0.8147070035338402, "num_tokens": 23008337.0, "step": 19160 }, { "entropy": 1.9546454161405564, "epoch": 0.05942532272026622, "grad_norm": 13.017064094543457, "learning_rate": 9.284607187832995e-06, "loss": 0.7113, "mean_token_accuracy": 0.7958240196108818, "num_tokens": 23019347.0, "step": 19170 }, { "entropy": 1.8713775292038917, "epoch": 0.059456321845315914, "grad_norm": 12.783992767333984, "learning_rate": 9.289450741063645e-06, "loss": 0.5894, "mean_token_accuracy": 0.8084977343678474, "num_tokens": 23031934.0, "step": 19180 }, { "entropy": 1.9044471591711045, "epoch": 0.05948732097036561, "grad_norm": 11.211695671081543, "learning_rate": 9.294294294294295e-06, "loss": 0.5655, "mean_token_accuracy": 0.8160697892308235, "num_tokens": 23043989.0, "step": 19190 }, { "entropy": 1.8023822262883187, "epoch": 0.05951832009541531, "grad_norm": 10.612577438354492, "learning_rate": 9.299137847524945e-06, "loss": 0.6033, "mean_token_accuracy": 0.8159924641251564, "num_tokens": 23058046.0, "step": 19200 }, { "entropy": 1.9452301010489463, "epoch": 0.059549319220465004, "grad_norm": 11.767992973327637, "learning_rate": 9.303981400755595e-06, "loss": 0.68, "mean_token_accuracy": 0.803168785572052, "num_tokens": 23069178.0, "step": 19210 }, { "entropy": 1.918524721264839, "epoch": 0.0595803183455147, "grad_norm": 10.314653396606445, "learning_rate": 9.308824953986244e-06, "loss": 0.6834, "mean_token_accuracy": 0.8066515281796456, "num_tokens": 23080400.0, "step": 19220 }, { "entropy": 1.9129308730363845, "epoch": 0.05961131747056439, "grad_norm": 12.398180961608887, "learning_rate": 9.313668507216896e-06, "loss": 0.6329, "mean_token_accuracy": 0.8124301135540009, "num_tokens": 23091734.0, "step": 19230 }, { "entropy": 1.9237210959196092, "epoch": 0.05964231659561409, "grad_norm": 10.278820037841797, "learning_rate": 9.318512060447546e-06, "loss": 0.6262, "mean_token_accuracy": 0.8095857813954354, "num_tokens": 23103397.0, "step": 19240 }, { "entropy": 1.8663773894309998, "epoch": 0.05967331572066378, "grad_norm": 10.0914888381958, "learning_rate": 9.323355613678196e-06, "loss": 0.5701, "mean_token_accuracy": 0.821681647002697, "num_tokens": 23115814.0, "step": 19250 }, { "entropy": 1.7783602967858314, "epoch": 0.05970431484571348, "grad_norm": 3.7728965282440186, "learning_rate": 9.328199166908846e-06, "loss": 0.5334, "mean_token_accuracy": 0.8306898340582848, "num_tokens": 23128899.0, "step": 19260 }, { "entropy": 1.894435779750347, "epoch": 0.059735313970763176, "grad_norm": 11.809426307678223, "learning_rate": 9.333042720139494e-06, "loss": 0.6599, "mean_token_accuracy": 0.8009930282831192, "num_tokens": 23140688.0, "step": 19270 }, { "entropy": 1.8965399265289307, "epoch": 0.05976631309581287, "grad_norm": 10.043713569641113, "learning_rate": 9.337886273370145e-06, "loss": 0.6145, "mean_token_accuracy": 0.8076204761862755, "num_tokens": 23152768.0, "step": 19280 }, { "entropy": 1.9386232048273087, "epoch": 0.05979731222086257, "grad_norm": 9.522479057312012, "learning_rate": 9.342729826600795e-06, "loss": 0.6379, "mean_token_accuracy": 0.8103454813361168, "num_tokens": 23164363.0, "step": 19290 }, { "entropy": 1.9121969774365426, "epoch": 0.05982831134591226, "grad_norm": 5.025160789489746, "learning_rate": 9.347573379831445e-06, "loss": 0.5836, "mean_token_accuracy": 0.8114672183990479, "num_tokens": 23176305.0, "step": 19300 }, { "entropy": 1.8917111858725548, "epoch": 0.059859310470961956, "grad_norm": 4.935334205627441, "learning_rate": 9.352416933062095e-06, "loss": 0.599, "mean_token_accuracy": 0.8276031494140625, "num_tokens": 23187205.0, "step": 19310 }, { "entropy": 1.96262284219265, "epoch": 0.05989030959601165, "grad_norm": 9.109956741333008, "learning_rate": 9.357260486292745e-06, "loss": 0.646, "mean_token_accuracy": 0.8043781608343125, "num_tokens": 23198670.0, "step": 19320 }, { "entropy": 1.9255134493112565, "epoch": 0.05992130872106135, "grad_norm": 9.983922958374023, "learning_rate": 9.362104039523395e-06, "loss": 0.6009, "mean_token_accuracy": 0.8162341311573982, "num_tokens": 23210792.0, "step": 19330 }, { "entropy": 1.9093994736671447, "epoch": 0.059952307846111046, "grad_norm": 9.770051002502441, "learning_rate": 9.366947592754045e-06, "loss": 0.663, "mean_token_accuracy": 0.8095744162797928, "num_tokens": 23222187.0, "step": 19340 }, { "entropy": 1.8983681246638298, "epoch": 0.05998330697116074, "grad_norm": 13.514256477355957, "learning_rate": 9.371791145984696e-06, "loss": 0.6243, "mean_token_accuracy": 0.7971692577004432, "num_tokens": 23233365.0, "step": 19350 }, { "entropy": 1.9183629781007767, "epoch": 0.06001430609621043, "grad_norm": 10.821701049804688, "learning_rate": 9.376634699215346e-06, "loss": 0.6502, "mean_token_accuracy": 0.8030115276575088, "num_tokens": 23243672.0, "step": 19360 }, { "entropy": 1.807152123749256, "epoch": 0.06004530522126013, "grad_norm": 4.91693639755249, "learning_rate": 9.381478252445994e-06, "loss": 0.4923, "mean_token_accuracy": 0.8239471137523651, "num_tokens": 23256613.0, "step": 19370 }, { "entropy": 1.869692163169384, "epoch": 0.060076304346309825, "grad_norm": 8.64077377319336, "learning_rate": 9.386321805676644e-06, "loss": 0.6068, "mean_token_accuracy": 0.8187321960926056, "num_tokens": 23268668.0, "step": 19380 }, { "entropy": 1.857068532705307, "epoch": 0.06010730347135952, "grad_norm": 11.18220329284668, "learning_rate": 9.391165358907294e-06, "loss": 0.6335, "mean_token_accuracy": 0.8069566667079926, "num_tokens": 23281042.0, "step": 19390 }, { "entropy": 1.7603770941495895, "epoch": 0.06013830259640922, "grad_norm": 4.09375524520874, "learning_rate": 9.396008912137946e-06, "loss": 0.5366, "mean_token_accuracy": 0.8220126062631607, "num_tokens": 23295383.0, "step": 19400 }, { "entropy": 1.8189956784248351, "epoch": 0.060169301721458915, "grad_norm": 13.177933692932129, "learning_rate": 9.400852465368596e-06, "loss": 0.5506, "mean_token_accuracy": 0.8236996352672576, "num_tokens": 23307499.0, "step": 19410 }, { "entropy": 1.7352744668722153, "epoch": 0.060200300846508605, "grad_norm": 10.09691333770752, "learning_rate": 9.405696018599245e-06, "loss": 0.4693, "mean_token_accuracy": 0.8295902729034423, "num_tokens": 23320862.0, "step": 19420 }, { "entropy": 1.8696271255612373, "epoch": 0.0602312999715583, "grad_norm": 11.082074165344238, "learning_rate": 9.410539571829895e-06, "loss": 0.5574, "mean_token_accuracy": 0.8225132539868355, "num_tokens": 23332219.0, "step": 19430 }, { "entropy": 1.8461137875914573, "epoch": 0.060262299096608, "grad_norm": 10.549184799194336, "learning_rate": 9.415383125060545e-06, "loss": 0.6182, "mean_token_accuracy": 0.8168609410524368, "num_tokens": 23343965.0, "step": 19440 }, { "entropy": 1.920869068801403, "epoch": 0.060293298221657694, "grad_norm": 4.992405891418457, "learning_rate": 9.420226678291195e-06, "loss": 0.6075, "mean_token_accuracy": 0.8106775835156441, "num_tokens": 23355438.0, "step": 19450 }, { "entropy": 1.8544103041291238, "epoch": 0.06032429734670739, "grad_norm": 10.945703506469727, "learning_rate": 9.425070231521845e-06, "loss": 0.5264, "mean_token_accuracy": 0.8255405530333519, "num_tokens": 23368404.0, "step": 19460 }, { "entropy": 1.841069608926773, "epoch": 0.06035529647175709, "grad_norm": 11.322453498840332, "learning_rate": 9.429913784752495e-06, "loss": 0.5469, "mean_token_accuracy": 0.8234514787793159, "num_tokens": 23381143.0, "step": 19470 }, { "entropy": 1.8469559505581856, "epoch": 0.06038629559680678, "grad_norm": 3.433302164077759, "learning_rate": 9.434757337983145e-06, "loss": 0.5658, "mean_token_accuracy": 0.8232303768396377, "num_tokens": 23393701.0, "step": 19480 }, { "entropy": 1.965156337618828, "epoch": 0.060417294721856474, "grad_norm": 11.884709358215332, "learning_rate": 9.439600891213795e-06, "loss": 0.6732, "mean_token_accuracy": 0.8007081165909767, "num_tokens": 23404106.0, "step": 19490 }, { "entropy": 1.9104407191276551, "epoch": 0.06044829384690617, "grad_norm": 9.23556900024414, "learning_rate": 9.444444444444445e-06, "loss": 0.6494, "mean_token_accuracy": 0.8021333515644073, "num_tokens": 23415825.0, "step": 19500 }, { "entropy": 1.885290040075779, "epoch": 0.06047929297195587, "grad_norm": 10.439990043640137, "learning_rate": 9.449287997675094e-06, "loss": 0.5873, "mean_token_accuracy": 0.8280000373721123, "num_tokens": 23426924.0, "step": 19510 }, { "entropy": 1.8925482839345933, "epoch": 0.060510292097005564, "grad_norm": 10.890645027160645, "learning_rate": 9.454131550905746e-06, "loss": 0.5907, "mean_token_accuracy": 0.8103524506092071, "num_tokens": 23439002.0, "step": 19520 }, { "entropy": 1.9133965462446212, "epoch": 0.06054129122205526, "grad_norm": 11.458105087280273, "learning_rate": 9.458975104136396e-06, "loss": 0.6338, "mean_token_accuracy": 0.8142278388142585, "num_tokens": 23450330.0, "step": 19530 }, { "entropy": 1.9326462358236314, "epoch": 0.06057229034710495, "grad_norm": 9.607535362243652, "learning_rate": 9.463818657367046e-06, "loss": 0.686, "mean_token_accuracy": 0.7996918767690658, "num_tokens": 23461629.0, "step": 19540 }, { "entropy": 1.876935575157404, "epoch": 0.060603289472154646, "grad_norm": 3.0272324085235596, "learning_rate": 9.468662210597696e-06, "loss": 0.5273, "mean_token_accuracy": 0.8154638946056366, "num_tokens": 23474503.0, "step": 19550 }, { "entropy": 1.9294023901224135, "epoch": 0.06063428859720434, "grad_norm": 9.758378982543945, "learning_rate": 9.473505763828346e-06, "loss": 0.6173, "mean_token_accuracy": 0.8036470055580139, "num_tokens": 23486313.0, "step": 19560 }, { "entropy": 1.9118182629346847, "epoch": 0.06066528772225404, "grad_norm": 8.618282318115234, "learning_rate": 9.478349317058995e-06, "loss": 0.6425, "mean_token_accuracy": 0.8147132098674774, "num_tokens": 23497631.0, "step": 19570 }, { "entropy": 1.8255969345569611, "epoch": 0.060696286847303736, "grad_norm": 4.955978870391846, "learning_rate": 9.483192870289645e-06, "loss": 0.5395, "mean_token_accuracy": 0.8196621656417846, "num_tokens": 23511011.0, "step": 19580 }, { "entropy": 1.7804076254367829, "epoch": 0.06072728597235343, "grad_norm": 9.806567192077637, "learning_rate": 9.488036423520295e-06, "loss": 0.5345, "mean_token_accuracy": 0.8275730326771736, "num_tokens": 23523377.0, "step": 19590 }, { "entropy": 1.7767413407564163, "epoch": 0.06075828509740312, "grad_norm": 5.016587257385254, "learning_rate": 9.492879976750945e-06, "loss": 0.5298, "mean_token_accuracy": 0.8214924201369286, "num_tokens": 23537122.0, "step": 19600 }, { "entropy": 1.8715672463178634, "epoch": 0.06078928422245282, "grad_norm": 6.352642059326172, "learning_rate": 9.497723529981595e-06, "loss": 0.5998, "mean_token_accuracy": 0.8125947907567024, "num_tokens": 23548737.0, "step": 19610 }, { "entropy": 1.7911435902118682, "epoch": 0.060820283347502516, "grad_norm": 11.929505348205566, "learning_rate": 9.502567083212245e-06, "loss": 0.546, "mean_token_accuracy": 0.8234824985265732, "num_tokens": 23561524.0, "step": 19620 }, { "entropy": 1.8728584364056586, "epoch": 0.06085128247255221, "grad_norm": 9.866854667663574, "learning_rate": 9.507410636442895e-06, "loss": 0.5745, "mean_token_accuracy": 0.8147120550274849, "num_tokens": 23573298.0, "step": 19630 }, { "entropy": 1.7628555655479432, "epoch": 0.06088228159760191, "grad_norm": 4.673979759216309, "learning_rate": 9.512254189673545e-06, "loss": 0.551, "mean_token_accuracy": 0.823185084760189, "num_tokens": 23585490.0, "step": 19640 }, { "entropy": 1.9162281841039657, "epoch": 0.060913280722651605, "grad_norm": 9.883894920349121, "learning_rate": 9.517097742904196e-06, "loss": 0.6322, "mean_token_accuracy": 0.8094197094440461, "num_tokens": 23596459.0, "step": 19650 }, { "entropy": 1.7629586443305016, "epoch": 0.0609442798477013, "grad_norm": 9.179272651672363, "learning_rate": 9.521941296134846e-06, "loss": 0.5226, "mean_token_accuracy": 0.8276288509368896, "num_tokens": 23609459.0, "step": 19660 }, { "entropy": 1.8585124626755714, "epoch": 0.06097527897275099, "grad_norm": 10.178589820861816, "learning_rate": 9.526784849365496e-06, "loss": 0.6334, "mean_token_accuracy": 0.8119170770049096, "num_tokens": 23622078.0, "step": 19670 }, { "entropy": 1.8904533594846726, "epoch": 0.06100627809780069, "grad_norm": 9.675638198852539, "learning_rate": 9.531628402596146e-06, "loss": 0.6203, "mean_token_accuracy": 0.8164917901158333, "num_tokens": 23632839.0, "step": 19680 }, { "entropy": 1.8115496248006822, "epoch": 0.061037277222850385, "grad_norm": 12.059412956237793, "learning_rate": 9.536471955826794e-06, "loss": 0.5708, "mean_token_accuracy": 0.8198250159621239, "num_tokens": 23644698.0, "step": 19690 }, { "entropy": 1.8150453560054303, "epoch": 0.06106827634790008, "grad_norm": 10.134939193725586, "learning_rate": 9.541315509057446e-06, "loss": 0.6001, "mean_token_accuracy": 0.815633225440979, "num_tokens": 23657908.0, "step": 19700 }, { "entropy": 1.7574081301689148, "epoch": 0.06109927547294978, "grad_norm": 5.226987838745117, "learning_rate": 9.546159062288096e-06, "loss": 0.5158, "mean_token_accuracy": 0.8180340453982353, "num_tokens": 23671715.0, "step": 19710 }, { "entropy": 1.8007687643170356, "epoch": 0.061130274597999475, "grad_norm": 9.646768569946289, "learning_rate": 9.551002615518745e-06, "loss": 0.5652, "mean_token_accuracy": 0.8148997142910958, "num_tokens": 23684332.0, "step": 19720 }, { "entropy": 1.836924096941948, "epoch": 0.061161273723049164, "grad_norm": 9.189767837524414, "learning_rate": 9.555846168749395e-06, "loss": 0.568, "mean_token_accuracy": 0.817747424542904, "num_tokens": 23696452.0, "step": 19730 }, { "entropy": 1.8786251202225686, "epoch": 0.06119227284809886, "grad_norm": 12.79636287689209, "learning_rate": 9.560689721980045e-06, "loss": 0.6144, "mean_token_accuracy": 0.8185794189572334, "num_tokens": 23707815.0, "step": 19740 }, { "entropy": 1.8025352910161019, "epoch": 0.06122327197314856, "grad_norm": 11.08669376373291, "learning_rate": 9.565533275210695e-06, "loss": 0.5575, "mean_token_accuracy": 0.8157659009099006, "num_tokens": 23720644.0, "step": 19750 }, { "entropy": 1.9162618890404701, "epoch": 0.061254271098198254, "grad_norm": 10.929869651794434, "learning_rate": 9.570376828441345e-06, "loss": 0.6926, "mean_token_accuracy": 0.8083868056535721, "num_tokens": 23731943.0, "step": 19760 }, { "entropy": 1.8641342878341676, "epoch": 0.06128527022324795, "grad_norm": 9.528030395507812, "learning_rate": 9.575220381671997e-06, "loss": 0.5878, "mean_token_accuracy": 0.8118364483118057, "num_tokens": 23744425.0, "step": 19770 }, { "entropy": 1.8606672033667564, "epoch": 0.06131626934829765, "grad_norm": 12.20190715789795, "learning_rate": 9.580063934902646e-06, "loss": 0.6066, "mean_token_accuracy": 0.8095232203602791, "num_tokens": 23755827.0, "step": 19780 }, { "entropy": 1.9077710419893266, "epoch": 0.06134726847334734, "grad_norm": 9.677120208740234, "learning_rate": 9.584907488133295e-06, "loss": 0.6099, "mean_token_accuracy": 0.8066776558756829, "num_tokens": 23767808.0, "step": 19790 }, { "entropy": 1.8880801230669022, "epoch": 0.061378267598397034, "grad_norm": 10.036674499511719, "learning_rate": 9.589751041363944e-06, "loss": 0.5946, "mean_token_accuracy": 0.8205381706357002, "num_tokens": 23779742.0, "step": 19800 }, { "entropy": 1.9239039212465285, "epoch": 0.06140926672344673, "grad_norm": 10.33579158782959, "learning_rate": 9.594594594594594e-06, "loss": 0.6357, "mean_token_accuracy": 0.8081588789820671, "num_tokens": 23790560.0, "step": 19810 }, { "entropy": 1.8777620539069175, "epoch": 0.06144026584849643, "grad_norm": 9.710867881774902, "learning_rate": 9.599438147825246e-06, "loss": 0.6344, "mean_token_accuracy": 0.8112061858177185, "num_tokens": 23802654.0, "step": 19820 }, { "entropy": 1.8921529993414878, "epoch": 0.06147126497354612, "grad_norm": 10.47066879272461, "learning_rate": 9.604281701055896e-06, "loss": 0.6444, "mean_token_accuracy": 0.8064588844776154, "num_tokens": 23813660.0, "step": 19830 }, { "entropy": 1.8871847927570342, "epoch": 0.06150226409859582, "grad_norm": 10.314284324645996, "learning_rate": 9.609125254286546e-06, "loss": 0.5919, "mean_token_accuracy": 0.8220043256878853, "num_tokens": 23825033.0, "step": 19840 }, { "entropy": 1.8442516967654228, "epoch": 0.06153326322364551, "grad_norm": 2.9218690395355225, "learning_rate": 9.613968807517196e-06, "loss": 0.5303, "mean_token_accuracy": 0.8329377219080925, "num_tokens": 23837886.0, "step": 19850 }, { "entropy": 1.9648214638233186, "epoch": 0.061564262348695206, "grad_norm": 10.593524932861328, "learning_rate": 9.618812360747845e-06, "loss": 0.6819, "mean_token_accuracy": 0.8005811557173729, "num_tokens": 23848949.0, "step": 19860 }, { "entropy": 1.9761517822742463, "epoch": 0.0615952614737449, "grad_norm": 11.34875774383545, "learning_rate": 9.623655913978495e-06, "loss": 0.6411, "mean_token_accuracy": 0.8136752307415008, "num_tokens": 23860198.0, "step": 19870 }, { "entropy": 1.935739828646183, "epoch": 0.0616262605987946, "grad_norm": 10.864781379699707, "learning_rate": 9.628499467209145e-06, "loss": 0.6602, "mean_token_accuracy": 0.8089005783200264, "num_tokens": 23872637.0, "step": 19880 }, { "entropy": 1.8924452632665634, "epoch": 0.061657259723844296, "grad_norm": 10.361483573913574, "learning_rate": 9.633343020439797e-06, "loss": 0.5566, "mean_token_accuracy": 0.8306096389889717, "num_tokens": 23885123.0, "step": 19890 }, { "entropy": 1.9373007863759995, "epoch": 0.06168825884889399, "grad_norm": 9.866893768310547, "learning_rate": 9.638186573670445e-06, "loss": 0.5791, "mean_token_accuracy": 0.8127657011151314, "num_tokens": 23897519.0, "step": 19900 }, { "entropy": 1.9933653771877289, "epoch": 0.06171925797394368, "grad_norm": 10.006440162658691, "learning_rate": 9.643030126901095e-06, "loss": 0.6684, "mean_token_accuracy": 0.810961103439331, "num_tokens": 23908829.0, "step": 19910 }, { "entropy": 1.9257026076316834, "epoch": 0.06175025709899338, "grad_norm": 10.654083251953125, "learning_rate": 9.647873680131745e-06, "loss": 0.6223, "mean_token_accuracy": 0.8109946802258492, "num_tokens": 23920767.0, "step": 19920 }, { "entropy": 1.956542044878006, "epoch": 0.061781256224043075, "grad_norm": 11.350798606872559, "learning_rate": 9.652717233362395e-06, "loss": 0.6668, "mean_token_accuracy": 0.8026480153203011, "num_tokens": 23932164.0, "step": 19930 }, { "entropy": 2.0048277229070663, "epoch": 0.06181225534909277, "grad_norm": 11.242584228515625, "learning_rate": 9.657560786593046e-06, "loss": 0.6457, "mean_token_accuracy": 0.8089631497859955, "num_tokens": 23942822.0, "step": 19940 }, { "entropy": 1.9254214867949486, "epoch": 0.06184325447414247, "grad_norm": 10.975452423095703, "learning_rate": 9.662404339823696e-06, "loss": 0.6117, "mean_token_accuracy": 0.8136309564113617, "num_tokens": 23954872.0, "step": 19950 }, { "entropy": 1.8969217911362648, "epoch": 0.061874253599192165, "grad_norm": 9.684366226196289, "learning_rate": 9.667247893054346e-06, "loss": 0.6079, "mean_token_accuracy": 0.8263826936483383, "num_tokens": 23967198.0, "step": 19960 }, { "entropy": 1.9214407287538051, "epoch": 0.061905252724241855, "grad_norm": 10.67751693725586, "learning_rate": 9.672091446284996e-06, "loss": 0.582, "mean_token_accuracy": 0.8223884150385856, "num_tokens": 23979194.0, "step": 19970 }, { "entropy": 1.8657239809632302, "epoch": 0.06193625184929155, "grad_norm": 9.526717185974121, "learning_rate": 9.676934999515646e-06, "loss": 0.538, "mean_token_accuracy": 0.8274316728115082, "num_tokens": 23991363.0, "step": 19980 }, { "entropy": 1.9447234928607942, "epoch": 0.06196725097434125, "grad_norm": 11.365060806274414, "learning_rate": 9.681778552746296e-06, "loss": 0.6874, "mean_token_accuracy": 0.8004522323608398, "num_tokens": 24002188.0, "step": 19990 }, { "entropy": 1.8590040877461433, "epoch": 0.061998250099390945, "grad_norm": 6.042444705963135, "learning_rate": 9.686622105976946e-06, "loss": 0.5923, "mean_token_accuracy": 0.814041730761528, "num_tokens": 24015280.0, "step": 20000 }, { "entropy": 1.9083186939358712, "epoch": 0.06202924922444064, "grad_norm": 10.619732856750488, "learning_rate": 9.691465659207595e-06, "loss": 0.5909, "mean_token_accuracy": 0.8160168409347535, "num_tokens": 24027453.0, "step": 20010 }, { "entropy": 1.922300359606743, "epoch": 0.06206024834949034, "grad_norm": 12.882160186767578, "learning_rate": 9.696309212438245e-06, "loss": 0.649, "mean_token_accuracy": 0.7888608485460281, "num_tokens": 24038637.0, "step": 20020 }, { "entropy": 1.9450786724686622, "epoch": 0.062091247474540034, "grad_norm": 9.845813751220703, "learning_rate": 9.701152765668895e-06, "loss": 0.645, "mean_token_accuracy": 0.8010739460587502, "num_tokens": 24050162.0, "step": 20030 }, { "entropy": 1.9187857955694199, "epoch": 0.062122246599589724, "grad_norm": 9.616793632507324, "learning_rate": 9.705996318899545e-06, "loss": 0.6361, "mean_token_accuracy": 0.8148136526346207, "num_tokens": 24061516.0, "step": 20040 }, { "entropy": 1.90914705991745, "epoch": 0.06215324572463942, "grad_norm": 12.597700119018555, "learning_rate": 9.710839872130195e-06, "loss": 0.6398, "mean_token_accuracy": 0.8118062347173691, "num_tokens": 24072040.0, "step": 20050 }, { "entropy": 1.7790238752961158, "epoch": 0.06218424484968912, "grad_norm": 14.254045486450195, "learning_rate": 9.715683425360845e-06, "loss": 0.5597, "mean_token_accuracy": 0.824961057305336, "num_tokens": 24084516.0, "step": 20060 }, { "entropy": 1.8951515421271323, "epoch": 0.062215243974738814, "grad_norm": 10.398776054382324, "learning_rate": 9.720526978591496e-06, "loss": 0.6339, "mean_token_accuracy": 0.8136864483356476, "num_tokens": 24096174.0, "step": 20070 }, { "entropy": 1.8427141726016998, "epoch": 0.06224624309978851, "grad_norm": 9.789461135864258, "learning_rate": 9.725370531822146e-06, "loss": 0.5943, "mean_token_accuracy": 0.8145849913358688, "num_tokens": 24107922.0, "step": 20080 }, { "entropy": 1.9320272147655486, "epoch": 0.06227724222483821, "grad_norm": 11.617203712463379, "learning_rate": 9.730214085052796e-06, "loss": 0.668, "mean_token_accuracy": 0.802889634668827, "num_tokens": 24119219.0, "step": 20090 }, { "entropy": 1.8822171539068222, "epoch": 0.0623082413498879, "grad_norm": 10.93297004699707, "learning_rate": 9.735057638283446e-06, "loss": 0.5582, "mean_token_accuracy": 0.8173771098256111, "num_tokens": 24131388.0, "step": 20100 }, { "entropy": 1.9237861156463623, "epoch": 0.06233924047493759, "grad_norm": 10.885433197021484, "learning_rate": 9.739901191514094e-06, "loss": 0.6236, "mean_token_accuracy": 0.8068635702133179, "num_tokens": 24143423.0, "step": 20110 }, { "entropy": 1.8605427652597428, "epoch": 0.06237023959998729, "grad_norm": 10.937981605529785, "learning_rate": 9.744744744744746e-06, "loss": 0.5793, "mean_token_accuracy": 0.8124633759260178, "num_tokens": 24155500.0, "step": 20120 }, { "entropy": 1.921093289554119, "epoch": 0.06240123872503699, "grad_norm": 4.816215515136719, "learning_rate": 9.749588297975396e-06, "loss": 0.6207, "mean_token_accuracy": 0.8212477296590805, "num_tokens": 24167119.0, "step": 20130 }, { "entropy": 1.822756864130497, "epoch": 0.06243223785008668, "grad_norm": 10.295876502990723, "learning_rate": 9.754431851206046e-06, "loss": 0.5307, "mean_token_accuracy": 0.8297939330339432, "num_tokens": 24179336.0, "step": 20140 }, { "entropy": 1.8902019761502742, "epoch": 0.06246323697513638, "grad_norm": 9.802702903747559, "learning_rate": 9.759275404436695e-06, "loss": 0.592, "mean_token_accuracy": 0.8155349537730217, "num_tokens": 24191945.0, "step": 20150 }, { "entropy": 1.9736180931329728, "epoch": 0.06249423610018607, "grad_norm": 10.913537979125977, "learning_rate": 9.764118957667345e-06, "loss": 0.646, "mean_token_accuracy": 0.8018572524189949, "num_tokens": 24203220.0, "step": 20160 }, { "entropy": 1.8472725585103036, "epoch": 0.06252523522523577, "grad_norm": 11.09897518157959, "learning_rate": 9.768962510897995e-06, "loss": 0.6617, "mean_token_accuracy": 0.8151746213436126, "num_tokens": 24216924.0, "step": 20170 }, { "entropy": 1.8363325014710425, "epoch": 0.06255623435028547, "grad_norm": 9.55675220489502, "learning_rate": 9.773806064128645e-06, "loss": 0.5394, "mean_token_accuracy": 0.8332865908741951, "num_tokens": 24229749.0, "step": 20180 }, { "entropy": 1.8290324866771699, "epoch": 0.06258723347533515, "grad_norm": 12.252378463745117, "learning_rate": 9.778649617359297e-06, "loss": 0.5426, "mean_token_accuracy": 0.8365808725357056, "num_tokens": 24242819.0, "step": 20190 }, { "entropy": 1.9081189304590225, "epoch": 0.06261823260038485, "grad_norm": 10.117230415344238, "learning_rate": 9.783493170589947e-06, "loss": 0.6745, "mean_token_accuracy": 0.8055532008409501, "num_tokens": 24253826.0, "step": 20200 }, { "entropy": 1.8748064145445824, "epoch": 0.06264923172543455, "grad_norm": 9.556116104125977, "learning_rate": 9.788336723820595e-06, "loss": 0.6262, "mean_token_accuracy": 0.8184110090136528, "num_tokens": 24264932.0, "step": 20210 }, { "entropy": 1.8884572684764862, "epoch": 0.06268023085048424, "grad_norm": 4.309450149536133, "learning_rate": 9.793180277051245e-06, "loss": 0.6162, "mean_token_accuracy": 0.8134237229824066, "num_tokens": 24277309.0, "step": 20220 }, { "entropy": 1.946840487420559, "epoch": 0.06271122997553394, "grad_norm": 10.026110649108887, "learning_rate": 9.798023830281895e-06, "loss": 0.6271, "mean_token_accuracy": 0.8044777557253837, "num_tokens": 24288780.0, "step": 20230 }, { "entropy": 2.013564696907997, "epoch": 0.06274222910058364, "grad_norm": 9.077942848205566, "learning_rate": 9.802867383512546e-06, "loss": 0.6781, "mean_token_accuracy": 0.800484599173069, "num_tokens": 24299775.0, "step": 20240 }, { "entropy": 1.8675778850913047, "epoch": 0.06277322822563333, "grad_norm": 9.851303100585938, "learning_rate": 9.807710936743196e-06, "loss": 0.5568, "mean_token_accuracy": 0.8174749821424484, "num_tokens": 24312128.0, "step": 20250 }, { "entropy": 1.9423525124788283, "epoch": 0.06280422735068303, "grad_norm": 7.952591419219971, "learning_rate": 9.812554489973846e-06, "loss": 0.6073, "mean_token_accuracy": 0.8145397856831551, "num_tokens": 24323096.0, "step": 20260 }, { "entropy": 1.7513898000121118, "epoch": 0.06283522647573273, "grad_norm": 5.437192916870117, "learning_rate": 9.817398043204496e-06, "loss": 0.5386, "mean_token_accuracy": 0.8165362849831581, "num_tokens": 24337460.0, "step": 20270 }, { "entropy": 1.907211183011532, "epoch": 0.06286622560078242, "grad_norm": 5.400768756866455, "learning_rate": 9.822241596435146e-06, "loss": 0.6175, "mean_token_accuracy": 0.8163802400231361, "num_tokens": 24349174.0, "step": 20280 }, { "entropy": 1.93508680164814, "epoch": 0.06289722472583212, "grad_norm": 10.331210136413574, "learning_rate": 9.827085149665796e-06, "loss": 0.6279, "mean_token_accuracy": 0.8010757058858872, "num_tokens": 24361061.0, "step": 20290 }, { "entropy": 1.9249700546264648, "epoch": 0.06292822385088181, "grad_norm": 9.38033390045166, "learning_rate": 9.831928702896445e-06, "loss": 0.608, "mean_token_accuracy": 0.8222104609012604, "num_tokens": 24372277.0, "step": 20300 }, { "entropy": 1.8913897737860679, "epoch": 0.0629592229759315, "grad_norm": 4.759194374084473, "learning_rate": 9.836772256127097e-06, "loss": 0.6044, "mean_token_accuracy": 0.8175428286194801, "num_tokens": 24384034.0, "step": 20310 }, { "entropy": 1.8855891197919845, "epoch": 0.0629902221009812, "grad_norm": 4.686741828918457, "learning_rate": 9.841615809357745e-06, "loss": 0.6091, "mean_token_accuracy": 0.8151167094707489, "num_tokens": 24395926.0, "step": 20320 }, { "entropy": 1.9041914016008377, "epoch": 0.06302122122603089, "grad_norm": 4.964253902435303, "learning_rate": 9.846459362588395e-06, "loss": 0.6008, "mean_token_accuracy": 0.8038349270820617, "num_tokens": 24407924.0, "step": 20330 }, { "entropy": 1.8939720645546914, "epoch": 0.06305222035108059, "grad_norm": 10.10213851928711, "learning_rate": 9.851302915819045e-06, "loss": 0.6206, "mean_token_accuracy": 0.8080206617712975, "num_tokens": 24419254.0, "step": 20340 }, { "entropy": 1.8911166504025458, "epoch": 0.06308321947613028, "grad_norm": 4.227776050567627, "learning_rate": 9.856146469049695e-06, "loss": 0.6281, "mean_token_accuracy": 0.8054097011685372, "num_tokens": 24431388.0, "step": 20350 }, { "entropy": 1.8960236981511116, "epoch": 0.06311421860117998, "grad_norm": 12.084880828857422, "learning_rate": 9.860990022280346e-06, "loss": 0.601, "mean_token_accuracy": 0.8084520295262336, "num_tokens": 24443385.0, "step": 20360 }, { "entropy": 1.871785145998001, "epoch": 0.06314521772622968, "grad_norm": 10.692331314086914, "learning_rate": 9.865833575510996e-06, "loss": 0.5794, "mean_token_accuracy": 0.8134460240602494, "num_tokens": 24454988.0, "step": 20370 }, { "entropy": 1.9693672031164169, "epoch": 0.06317621685127937, "grad_norm": 10.991315841674805, "learning_rate": 9.870677128741646e-06, "loss": 0.7165, "mean_token_accuracy": 0.7969113975763321, "num_tokens": 24465991.0, "step": 20380 }, { "entropy": 1.9215772941708564, "epoch": 0.06320721597632907, "grad_norm": 11.294339179992676, "learning_rate": 9.875520681972296e-06, "loss": 0.6661, "mean_token_accuracy": 0.7995204910635948, "num_tokens": 24478784.0, "step": 20390 }, { "entropy": 1.750515715777874, "epoch": 0.06323821510137877, "grad_norm": 9.189525604248047, "learning_rate": 9.880364235202946e-06, "loss": 0.4688, "mean_token_accuracy": 0.8383323296904563, "num_tokens": 24492223.0, "step": 20400 }, { "entropy": 1.8770269468426704, "epoch": 0.06326921422642846, "grad_norm": 9.287210464477539, "learning_rate": 9.885207788433596e-06, "loss": 0.6492, "mean_token_accuracy": 0.8062692806124687, "num_tokens": 24503912.0, "step": 20410 }, { "entropy": 1.8530636951327324, "epoch": 0.06330021335147816, "grad_norm": 7.70045280456543, "learning_rate": 9.890051341664246e-06, "loss": 0.5643, "mean_token_accuracy": 0.8267319366335869, "num_tokens": 24515605.0, "step": 20420 }, { "entropy": 1.9133968889713286, "epoch": 0.06333121247652786, "grad_norm": 10.080760955810547, "learning_rate": 9.894894894894896e-06, "loss": 0.6142, "mean_token_accuracy": 0.8130926743149758, "num_tokens": 24527065.0, "step": 20430 }, { "entropy": 1.876038283109665, "epoch": 0.06336221160157754, "grad_norm": 9.016529083251953, "learning_rate": 9.899738448125546e-06, "loss": 0.6295, "mean_token_accuracy": 0.8140005797147751, "num_tokens": 24538739.0, "step": 20440 }, { "entropy": 1.8368092089891435, "epoch": 0.06339321072662724, "grad_norm": 9.778054237365723, "learning_rate": 9.904582001356195e-06, "loss": 0.6006, "mean_token_accuracy": 0.81819748878479, "num_tokens": 24551215.0, "step": 20450 }, { "entropy": 1.7856343001127244, "epoch": 0.06342420985167693, "grad_norm": 4.523138523101807, "learning_rate": 9.909425554586845e-06, "loss": 0.509, "mean_token_accuracy": 0.8145459353923797, "num_tokens": 24564545.0, "step": 20460 }, { "entropy": 1.8864993780851365, "epoch": 0.06345520897672663, "grad_norm": 9.66295051574707, "learning_rate": 9.914269107817495e-06, "loss": 0.6304, "mean_token_accuracy": 0.8107953786849975, "num_tokens": 24576201.0, "step": 20470 }, { "entropy": 1.8306699201464653, "epoch": 0.06348620810177633, "grad_norm": 8.954712867736816, "learning_rate": 9.919112661048145e-06, "loss": 0.5188, "mean_token_accuracy": 0.8272392004728317, "num_tokens": 24588462.0, "step": 20480 }, { "entropy": 1.8582230091094971, "epoch": 0.06351720722682602, "grad_norm": 5.900847911834717, "learning_rate": 9.923956214278797e-06, "loss": 0.6333, "mean_token_accuracy": 0.8117007941007615, "num_tokens": 24601048.0, "step": 20490 }, { "entropy": 1.9099309206008912, "epoch": 0.06354820635187572, "grad_norm": 10.398176193237305, "learning_rate": 9.928799767509447e-06, "loss": 0.6342, "mean_token_accuracy": 0.8157493248581886, "num_tokens": 24612082.0, "step": 20500 }, { "entropy": 1.8192737758159638, "epoch": 0.06357920547692542, "grad_norm": 4.901766300201416, "learning_rate": 9.933643320740096e-06, "loss": 0.5962, "mean_token_accuracy": 0.8077544063329697, "num_tokens": 24624244.0, "step": 20510 }, { "entropy": 1.9380394339561462, "epoch": 0.06361020460197511, "grad_norm": 9.72773551940918, "learning_rate": 9.938486873970746e-06, "loss": 0.6725, "mean_token_accuracy": 0.8050031334161758, "num_tokens": 24635227.0, "step": 20520 }, { "entropy": 1.8201100319623946, "epoch": 0.06364120372702481, "grad_norm": 5.236706256866455, "learning_rate": 9.943330427201394e-06, "loss": 0.5545, "mean_token_accuracy": 0.8271597489714623, "num_tokens": 24647349.0, "step": 20530 }, { "entropy": 1.955614548921585, "epoch": 0.0636722028520745, "grad_norm": 13.138460159301758, "learning_rate": 9.948173980432046e-06, "loss": 0.7057, "mean_token_accuracy": 0.796806488931179, "num_tokens": 24658536.0, "step": 20540 }, { "entropy": 1.8944236859679222, "epoch": 0.0637032019771242, "grad_norm": 7.109180927276611, "learning_rate": 9.953017533662696e-06, "loss": 0.6351, "mean_token_accuracy": 0.8019287914037705, "num_tokens": 24670364.0, "step": 20550 }, { "entropy": 1.8803847134113312, "epoch": 0.06373420110217388, "grad_norm": 9.590316772460938, "learning_rate": 9.957861086893346e-06, "loss": 0.6256, "mean_token_accuracy": 0.8110292464494705, "num_tokens": 24682351.0, "step": 20560 }, { "entropy": 1.9103557452559472, "epoch": 0.06376520022722358, "grad_norm": 10.402158737182617, "learning_rate": 9.962704640123996e-06, "loss": 0.6655, "mean_token_accuracy": 0.7996529176831245, "num_tokens": 24693410.0, "step": 20570 }, { "entropy": 1.8738579228520393, "epoch": 0.06379619935227328, "grad_norm": 8.206809997558594, "learning_rate": 9.967548193354646e-06, "loss": 0.5871, "mean_token_accuracy": 0.8173448964953423, "num_tokens": 24705775.0, "step": 20580 }, { "entropy": 1.8843411058187485, "epoch": 0.06382719847732297, "grad_norm": 11.01453971862793, "learning_rate": 9.972391746585295e-06, "loss": 0.6117, "mean_token_accuracy": 0.808614219725132, "num_tokens": 24717660.0, "step": 20590 }, { "entropy": 1.8635393604636192, "epoch": 0.06385819760237267, "grad_norm": 10.689123153686523, "learning_rate": 9.977235299815945e-06, "loss": 0.6426, "mean_token_accuracy": 0.8036311730742455, "num_tokens": 24729686.0, "step": 20600 }, { "entropy": 1.9377967566251755, "epoch": 0.06388919672742237, "grad_norm": 11.218433380126953, "learning_rate": 9.982078853046597e-06, "loss": 0.6821, "mean_token_accuracy": 0.8068268105387688, "num_tokens": 24739992.0, "step": 20610 }, { "entropy": 1.8930637896060944, "epoch": 0.06392019585247206, "grad_norm": 10.52428150177002, "learning_rate": 9.986922406277247e-06, "loss": 0.6418, "mean_token_accuracy": 0.802625036239624, "num_tokens": 24751815.0, "step": 20620 }, { "entropy": 1.9082047209143638, "epoch": 0.06395119497752176, "grad_norm": 9.473612785339355, "learning_rate": 9.991765959507895e-06, "loss": 0.612, "mean_token_accuracy": 0.8163775265216827, "num_tokens": 24763588.0, "step": 20630 }, { "entropy": 1.8783060640096665, "epoch": 0.06398219410257146, "grad_norm": 10.244823455810547, "learning_rate": 9.996609512738545e-06, "loss": 0.6235, "mean_token_accuracy": 0.8123615190386773, "num_tokens": 24776091.0, "step": 20640 }, { "entropy": 1.8689485356211661, "epoch": 0.06401319322762115, "grad_norm": 9.722251892089844, "learning_rate": 9.999273546183343e-06, "loss": 0.5782, "mean_token_accuracy": 0.8126691862940788, "num_tokens": 24787537.0, "step": 20650 }, { "entropy": 1.869950420409441, "epoch": 0.06404419235267085, "grad_norm": 9.074020385742188, "learning_rate": 9.99685317639837e-06, "loss": 0.5865, "mean_token_accuracy": 0.8139035999774933, "num_tokens": 24799896.0, "step": 20660 }, { "entropy": 1.8778914123773576, "epoch": 0.06407519147772055, "grad_norm": 11.21394157409668, "learning_rate": 9.99443456334761e-06, "loss": 0.543, "mean_token_accuracy": 0.8123081102967262, "num_tokens": 24811525.0, "step": 20670 }, { "entropy": 1.9493975609540939, "epoch": 0.06410619060277023, "grad_norm": 13.748980522155762, "learning_rate": 9.992017704906994e-06, "loss": 0.647, "mean_token_accuracy": 0.8159363105893135, "num_tokens": 24822680.0, "step": 20680 }, { "entropy": 1.8906102269887923, "epoch": 0.06413718972781993, "grad_norm": 8.22630786895752, "learning_rate": 9.989602598956046e-06, "loss": 0.6267, "mean_token_accuracy": 0.8106621414422989, "num_tokens": 24834924.0, "step": 20690 }, { "entropy": 1.7157309882342815, "epoch": 0.06416818885286962, "grad_norm": 7.910831928253174, "learning_rate": 9.987189243377873e-06, "loss": 0.4686, "mean_token_accuracy": 0.8414781808853149, "num_tokens": 24849152.0, "step": 20700 }, { "entropy": 1.9214319348335267, "epoch": 0.06419918797791932, "grad_norm": 10.313517570495605, "learning_rate": 9.984777636059161e-06, "loss": 0.7036, "mean_token_accuracy": 0.7919689938426018, "num_tokens": 24860357.0, "step": 20710 }, { "entropy": 1.8385179117321968, "epoch": 0.06423018710296902, "grad_norm": 12.135076522827148, "learning_rate": 9.98236777489017e-06, "loss": 0.6188, "mean_token_accuracy": 0.8177133709192276, "num_tokens": 24872095.0, "step": 20720 }, { "entropy": 1.777934755384922, "epoch": 0.06426118622801871, "grad_norm": 4.797874450683594, "learning_rate": 9.979959657764716e-06, "loss": 0.4933, "mean_token_accuracy": 0.8323575839400291, "num_tokens": 24884741.0, "step": 20730 }, { "entropy": 1.8482744053006173, "epoch": 0.06429218535306841, "grad_norm": 9.592241287231445, "learning_rate": 9.977553282580177e-06, "loss": 0.5549, "mean_token_accuracy": 0.8222592979669571, "num_tokens": 24897187.0, "step": 20740 }, { "entropy": 1.8500724270939828, "epoch": 0.0643231844781181, "grad_norm": 12.197331428527832, "learning_rate": 9.975148647237474e-06, "loss": 0.6756, "mean_token_accuracy": 0.7995992138981819, "num_tokens": 24908381.0, "step": 20750 }, { "entropy": 1.75242570489645, "epoch": 0.0643541836031678, "grad_norm": 10.444792747497559, "learning_rate": 9.972745749641067e-06, "loss": 0.5345, "mean_token_accuracy": 0.8243147745728493, "num_tokens": 24920923.0, "step": 20760 }, { "entropy": 1.8526248589158059, "epoch": 0.0643851827282175, "grad_norm": 9.078262329101562, "learning_rate": 9.97034458769895e-06, "loss": 0.6349, "mean_token_accuracy": 0.8068570986390113, "num_tokens": 24932578.0, "step": 20770 }, { "entropy": 1.9244740456342697, "epoch": 0.0644161818532672, "grad_norm": 10.673161506652832, "learning_rate": 9.967945159322642e-06, "loss": 0.687, "mean_token_accuracy": 0.7971266448497772, "num_tokens": 24942881.0, "step": 20780 }, { "entropy": 1.8756011351943016, "epoch": 0.06444718097831689, "grad_norm": 9.983181953430176, "learning_rate": 9.965547462427177e-06, "loss": 0.6225, "mean_token_accuracy": 0.8142784401774407, "num_tokens": 24954564.0, "step": 20790 }, { "entropy": 1.8296712294220925, "epoch": 0.06447818010336659, "grad_norm": 10.166830062866211, "learning_rate": 9.963151494931094e-06, "loss": 0.6427, "mean_token_accuracy": 0.8073472276329994, "num_tokens": 24967386.0, "step": 20800 }, { "entropy": 1.8981720179319381, "epoch": 0.06450917922841627, "grad_norm": 11.244156837463379, "learning_rate": 9.960757254756438e-06, "loss": 0.669, "mean_token_accuracy": 0.8046011224389076, "num_tokens": 24978303.0, "step": 20810 }, { "entropy": 1.7593111276626587, "epoch": 0.06454017835346597, "grad_norm": 11.301054954528809, "learning_rate": 9.958364739828752e-06, "loss": 0.5735, "mean_token_accuracy": 0.8244746640324593, "num_tokens": 24991174.0, "step": 20820 }, { "entropy": 1.7012663453817367, "epoch": 0.06457117747851567, "grad_norm": 11.489786148071289, "learning_rate": 9.955973948077055e-06, "loss": 0.4713, "mean_token_accuracy": 0.8441385194659233, "num_tokens": 25003945.0, "step": 20830 }, { "entropy": 1.8124509736895562, "epoch": 0.06460217660356536, "grad_norm": 10.668350219726562, "learning_rate": 9.953584877433851e-06, "loss": 0.6364, "mean_token_accuracy": 0.8089659824967385, "num_tokens": 25016118.0, "step": 20840 }, { "entropy": 1.745204885303974, "epoch": 0.06463317572861506, "grad_norm": 10.901769638061523, "learning_rate": 9.951197525835119e-06, "loss": 0.5265, "mean_token_accuracy": 0.8224120557308197, "num_tokens": 25029471.0, "step": 20850 }, { "entropy": 1.835092043876648, "epoch": 0.06466417485366475, "grad_norm": 10.224356651306152, "learning_rate": 9.94881189122029e-06, "loss": 0.6312, "mean_token_accuracy": 0.8145354777574539, "num_tokens": 25041531.0, "step": 20860 }, { "entropy": 1.7869165703654288, "epoch": 0.06469517397871445, "grad_norm": 12.071456909179688, "learning_rate": 9.946427971532263e-06, "loss": 0.6134, "mean_token_accuracy": 0.8095423832535744, "num_tokens": 25054431.0, "step": 20870 }, { "entropy": 1.8208570063114167, "epoch": 0.06472617310376415, "grad_norm": 10.90512466430664, "learning_rate": 9.944045764717379e-06, "loss": 0.6059, "mean_token_accuracy": 0.813726843893528, "num_tokens": 25066679.0, "step": 20880 }, { "entropy": 1.7698472633957862, "epoch": 0.06475717222881384, "grad_norm": 4.871018409729004, "learning_rate": 9.941665268725422e-06, "loss": 0.5526, "mean_token_accuracy": 0.8185109004378319, "num_tokens": 25078924.0, "step": 20890 }, { "entropy": 1.7922344714403153, "epoch": 0.06478817135386354, "grad_norm": Infinity, "learning_rate": 9.939286481509611e-06, "loss": 0.5971, "mean_token_accuracy": 0.8192875310778618, "num_tokens": 25091264.0, "step": 20900 }, { "entropy": 1.873502266407013, "epoch": 0.06481917047891324, "grad_norm": 10.781414985656738, "learning_rate": 9.93690940102659e-06, "loss": 0.6287, "mean_token_accuracy": 0.8078038066625595, "num_tokens": 25102391.0, "step": 20910 }, { "entropy": 1.8319077044725418, "epoch": 0.06485016960396293, "grad_norm": 5.344398021697998, "learning_rate": 9.934534025236426e-06, "loss": 0.586, "mean_token_accuracy": 0.8086917147040367, "num_tokens": 25114388.0, "step": 20920 }, { "entropy": 1.8269810006022453, "epoch": 0.06488116872901262, "grad_norm": 10.4851655960083, "learning_rate": 9.93216035210259e-06, "loss": 0.6322, "mean_token_accuracy": 0.8202467620372772, "num_tokens": 25126354.0, "step": 20930 }, { "entropy": 1.842079259455204, "epoch": 0.06491216785406231, "grad_norm": 9.595065116882324, "learning_rate": 9.929788379591967e-06, "loss": 0.6143, "mean_token_accuracy": 0.81163260191679, "num_tokens": 25138345.0, "step": 20940 }, { "entropy": 1.8531109601259232, "epoch": 0.06494316697911201, "grad_norm": 10.80106258392334, "learning_rate": 9.92741810567483e-06, "loss": 0.5955, "mean_token_accuracy": 0.8235780015587807, "num_tokens": 25149452.0, "step": 20950 }, { "entropy": 1.7707980051636696, "epoch": 0.0649741661041617, "grad_norm": 9.250375747680664, "learning_rate": 9.925049528324852e-06, "loss": 0.5531, "mean_token_accuracy": 0.825781124830246, "num_tokens": 25162412.0, "step": 20960 }, { "entropy": 1.7839957982301713, "epoch": 0.0650051652292114, "grad_norm": 11.089311599731445, "learning_rate": 9.922682645519076e-06, "loss": 0.535, "mean_token_accuracy": 0.8361950904130936, "num_tokens": 25174698.0, "step": 20970 }, { "entropy": 1.8392813488841058, "epoch": 0.0650361643542611, "grad_norm": 9.709254264831543, "learning_rate": 9.920317455237932e-06, "loss": 0.6204, "mean_token_accuracy": 0.8103036060929298, "num_tokens": 25187089.0, "step": 20980 }, { "entropy": 1.89919556081295, "epoch": 0.0650671634793108, "grad_norm": 10.977981567382812, "learning_rate": 9.917953955465215e-06, "loss": 0.69, "mean_token_accuracy": 0.7884634464979172, "num_tokens": 25198974.0, "step": 20990 }, { "entropy": 1.8912572488188744, "epoch": 0.0650981626043605, "grad_norm": 11.527606964111328, "learning_rate": 9.915592144188078e-06, "loss": 0.6459, "mean_token_accuracy": 0.8056445106863975, "num_tokens": 25209971.0, "step": 21000 }, { "entropy": 1.8872391551733017, "epoch": 0.06512916172941019, "grad_norm": 10.813753128051758, "learning_rate": 9.913232019397025e-06, "loss": 0.7104, "mean_token_accuracy": 0.7989979892969131, "num_tokens": 25220791.0, "step": 21010 }, { "entropy": 1.8582836613059044, "epoch": 0.06516016085445989, "grad_norm": 9.749269485473633, "learning_rate": 9.910873579085914e-06, "loss": 0.6316, "mean_token_accuracy": 0.8038420900702477, "num_tokens": 25233097.0, "step": 21020 }, { "entropy": 1.9419476687908173, "epoch": 0.06519115997950958, "grad_norm": 10.637131690979004, "learning_rate": 9.908516821251943e-06, "loss": 0.6322, "mean_token_accuracy": 0.811190040409565, "num_tokens": 25244036.0, "step": 21030 }, { "entropy": 1.8555924728512765, "epoch": 0.06522215910455928, "grad_norm": 11.770044326782227, "learning_rate": 9.906161743895632e-06, "loss": 0.6444, "mean_token_accuracy": 0.8096898928284645, "num_tokens": 25255598.0, "step": 21040 }, { "entropy": 1.8110544815659524, "epoch": 0.06525315822960896, "grad_norm": 9.333264350891113, "learning_rate": 9.903808345020833e-06, "loss": 0.5622, "mean_token_accuracy": 0.8238018527626991, "num_tokens": 25268260.0, "step": 21050 }, { "entropy": 1.8922196328639984, "epoch": 0.06528415735465866, "grad_norm": 10.881926536560059, "learning_rate": 9.901456622634717e-06, "loss": 0.6427, "mean_token_accuracy": 0.8115344852209091, "num_tokens": 25279383.0, "step": 21060 }, { "entropy": 1.8002661630511283, "epoch": 0.06531515647970836, "grad_norm": 11.18062686920166, "learning_rate": 9.899106574747767e-06, "loss": 0.5344, "mean_token_accuracy": 0.8290015637874604, "num_tokens": 25292228.0, "step": 21070 }, { "entropy": 1.8496287435293197, "epoch": 0.06534615560475805, "grad_norm": 8.944875717163086, "learning_rate": 9.896758199373761e-06, "loss": 0.6023, "mean_token_accuracy": 0.8110730588436127, "num_tokens": 25304999.0, "step": 21080 }, { "entropy": 1.887920793890953, "epoch": 0.06537715472980775, "grad_norm": 5.810867786407471, "learning_rate": 9.894411494529786e-06, "loss": 0.5847, "mean_token_accuracy": 0.8183253973722457, "num_tokens": 25316919.0, "step": 21090 }, { "entropy": 1.7966985195875167, "epoch": 0.06540815385485745, "grad_norm": 10.770297050476074, "learning_rate": 9.892066458236207e-06, "loss": 0.5133, "mean_token_accuracy": 0.8241160228848458, "num_tokens": 25329663.0, "step": 21100 }, { "entropy": 1.936796745657921, "epoch": 0.06543915297990714, "grad_norm": 10.986133575439453, "learning_rate": 9.88972308851668e-06, "loss": 0.6237, "mean_token_accuracy": 0.7997463896870614, "num_tokens": 25341721.0, "step": 21110 }, { "entropy": 1.917391985654831, "epoch": 0.06547015210495684, "grad_norm": 11.787429809570312, "learning_rate": 9.887381383398138e-06, "loss": 0.5681, "mean_token_accuracy": 0.8246586143970489, "num_tokens": 25353957.0, "step": 21120 }, { "entropy": 1.8626626536250115, "epoch": 0.06550115123000654, "grad_norm": 10.935876846313477, "learning_rate": 9.885041340910771e-06, "loss": 0.5545, "mean_token_accuracy": 0.8301349982619286, "num_tokens": 25366548.0, "step": 21130 }, { "entropy": 1.9012475624680518, "epoch": 0.06553215035505623, "grad_norm": 11.089349746704102, "learning_rate": 9.882702959088042e-06, "loss": 0.6256, "mean_token_accuracy": 0.8115785628557205, "num_tokens": 25378165.0, "step": 21140 }, { "entropy": 1.9432383313775063, "epoch": 0.06556314948010593, "grad_norm": 6.057243824005127, "learning_rate": 9.880366235966667e-06, "loss": 0.6253, "mean_token_accuracy": 0.8162486657500267, "num_tokens": 25389601.0, "step": 21150 }, { "entropy": 1.8184719279408454, "epoch": 0.06559414860515562, "grad_norm": 9.748690605163574, "learning_rate": 9.878031169586607e-06, "loss": 0.5692, "mean_token_accuracy": 0.8212730750441551, "num_tokens": 25402464.0, "step": 21160 }, { "entropy": 1.8200145453214644, "epoch": 0.06562514773020532, "grad_norm": 10.591629981994629, "learning_rate": 9.875697757991068e-06, "loss": 0.5807, "mean_token_accuracy": 0.8190927669405937, "num_tokens": 25415261.0, "step": 21170 }, { "entropy": 1.911136743426323, "epoch": 0.065656146855255, "grad_norm": 9.886494636535645, "learning_rate": 9.87336599922648e-06, "loss": 0.6362, "mean_token_accuracy": 0.8133341312408447, "num_tokens": 25426211.0, "step": 21180 }, { "entropy": 1.859015080332756, "epoch": 0.0656871459803047, "grad_norm": 11.79246711730957, "learning_rate": 9.871035891342516e-06, "loss": 0.5876, "mean_token_accuracy": 0.8148143604397774, "num_tokens": 25438384.0, "step": 21190 }, { "entropy": 1.845600600540638, "epoch": 0.0657181451053544, "grad_norm": 11.987141609191895, "learning_rate": 9.868707432392061e-06, "loss": 0.5756, "mean_token_accuracy": 0.8216294556856155, "num_tokens": 25451164.0, "step": 21200 }, { "entropy": 1.970012903213501, "epoch": 0.0657491442304041, "grad_norm": 10.808934211730957, "learning_rate": 9.866380620431211e-06, "loss": 0.674, "mean_token_accuracy": 0.8027332067489624, "num_tokens": 25461708.0, "step": 21210 }, { "entropy": 1.8186683028936386, "epoch": 0.06578014335545379, "grad_norm": 5.0801897048950195, "learning_rate": 9.86405545351927e-06, "loss": 0.5119, "mean_token_accuracy": 0.8230162873864174, "num_tokens": 25475658.0, "step": 21220 }, { "entropy": 1.9227222502231598, "epoch": 0.06581114248050349, "grad_norm": 10.78120231628418, "learning_rate": 9.861731929718746e-06, "loss": 0.6277, "mean_token_accuracy": 0.816400445997715, "num_tokens": 25487133.0, "step": 21230 }, { "entropy": 1.921273510158062, "epoch": 0.06584214160555318, "grad_norm": 11.297560691833496, "learning_rate": 9.859410047095337e-06, "loss": 0.5938, "mean_token_accuracy": 0.8111256033182144, "num_tokens": 25498732.0, "step": 21240 }, { "entropy": 1.8860565900802613, "epoch": 0.06587314073060288, "grad_norm": 12.347557067871094, "learning_rate": 9.857089803717928e-06, "loss": 0.5763, "mean_token_accuracy": 0.814181549847126, "num_tokens": 25510149.0, "step": 21250 }, { "entropy": 1.8615178689360619, "epoch": 0.06590413985565258, "grad_norm": 13.131210327148438, "learning_rate": 9.854771197658584e-06, "loss": 0.6138, "mean_token_accuracy": 0.8099151000380516, "num_tokens": 25522335.0, "step": 21260 }, { "entropy": 1.9241080656647682, "epoch": 0.06593513898070227, "grad_norm": 4.803411960601807, "learning_rate": 9.852454226992548e-06, "loss": 0.5768, "mean_token_accuracy": 0.8204170763492584, "num_tokens": 25533747.0, "step": 21270 }, { "entropy": 1.889383627474308, "epoch": 0.06596613810575197, "grad_norm": 10.4091796875, "learning_rate": 9.850138889798216e-06, "loss": 0.6602, "mean_token_accuracy": 0.8120569944381714, "num_tokens": 25545456.0, "step": 21280 }, { "entropy": 1.9471757873892783, "epoch": 0.06599713723080167, "grad_norm": 9.847548484802246, "learning_rate": 9.847825184157157e-06, "loss": 0.6242, "mean_token_accuracy": 0.808134414255619, "num_tokens": 25556403.0, "step": 21290 }, { "entropy": 1.8040209040045738, "epoch": 0.06602813635585135, "grad_norm": 9.907560348510742, "learning_rate": 9.845513108154088e-06, "loss": 0.52, "mean_token_accuracy": 0.8381869062781334, "num_tokens": 25569237.0, "step": 21300 }, { "entropy": 1.8439020454883575, "epoch": 0.06605913548090105, "grad_norm": 10.755191802978516, "learning_rate": 9.843202659876867e-06, "loss": 0.5805, "mean_token_accuracy": 0.8121224045753479, "num_tokens": 25581127.0, "step": 21310 }, { "entropy": 1.9237766891717911, "epoch": 0.06609013460595074, "grad_norm": 9.386770248413086, "learning_rate": 9.8408938374165e-06, "loss": 0.6702, "mean_token_accuracy": 0.8007935658097267, "num_tokens": 25592005.0, "step": 21320 }, { "entropy": 1.9236200451850891, "epoch": 0.06612113373100044, "grad_norm": 11.110610961914062, "learning_rate": 9.838586638867122e-06, "loss": 0.6142, "mean_token_accuracy": 0.8141868889331818, "num_tokens": 25603901.0, "step": 21330 }, { "entropy": 1.8876248642802238, "epoch": 0.06615213285605014, "grad_norm": 11.1986665725708, "learning_rate": 9.836281062325994e-06, "loss": 0.5891, "mean_token_accuracy": 0.8205958276987075, "num_tokens": 25615461.0, "step": 21340 }, { "entropy": 1.8958319693803787, "epoch": 0.06618313198109983, "grad_norm": 10.968427658081055, "learning_rate": 9.833977105893499e-06, "loss": 0.6026, "mean_token_accuracy": 0.8049457132816314, "num_tokens": 25627052.0, "step": 21350 }, { "entropy": 1.7655820041894912, "epoch": 0.06621413110614953, "grad_norm": 11.009693145751953, "learning_rate": 9.831674767673128e-06, "loss": 0.4757, "mean_token_accuracy": 0.8243117600679397, "num_tokens": 25640796.0, "step": 21360 }, { "entropy": 1.9570647537708283, "epoch": 0.06624513023119923, "grad_norm": 4.816678524017334, "learning_rate": 9.829374045771485e-06, "loss": 0.6455, "mean_token_accuracy": 0.8167396351695061, "num_tokens": 25652130.0, "step": 21370 }, { "entropy": 1.8768548294901848, "epoch": 0.06627612935624892, "grad_norm": 9.173911094665527, "learning_rate": 9.827074938298272e-06, "loss": 0.563, "mean_token_accuracy": 0.8292611509561538, "num_tokens": 25664530.0, "step": 21380 }, { "entropy": 1.836962741613388, "epoch": 0.06630712848129862, "grad_norm": 11.056595802307129, "learning_rate": 9.824777443366282e-06, "loss": 0.5817, "mean_token_accuracy": 0.8175490334630012, "num_tokens": 25676479.0, "step": 21390 }, { "entropy": 1.862608587741852, "epoch": 0.06633812760634832, "grad_norm": 10.60038948059082, "learning_rate": 9.822481559091401e-06, "loss": 0.6626, "mean_token_accuracy": 0.8113431319594383, "num_tokens": 25687935.0, "step": 21400 }, { "entropy": 1.8528019905090332, "epoch": 0.06636912673139801, "grad_norm": 13.311474800109863, "learning_rate": 9.820187283592584e-06, "loss": 0.5789, "mean_token_accuracy": 0.8175876498222351, "num_tokens": 25700014.0, "step": 21410 }, { "entropy": 1.903902080655098, "epoch": 0.0664001258564477, "grad_norm": 9.098358154296875, "learning_rate": 9.817894614991875e-06, "loss": 0.6484, "mean_token_accuracy": 0.8050385147333146, "num_tokens": 25711005.0, "step": 21420 }, { "entropy": 1.8798560991883277, "epoch": 0.06643112498149739, "grad_norm": 10.936999320983887, "learning_rate": 9.815603551414374e-06, "loss": 0.5749, "mean_token_accuracy": 0.822943688929081, "num_tokens": 25722335.0, "step": 21430 }, { "entropy": 1.7517326176166534, "epoch": 0.06646212410654709, "grad_norm": 10.7169189453125, "learning_rate": 9.813314090988247e-06, "loss": 0.5203, "mean_token_accuracy": 0.8164523139595985, "num_tokens": 25736104.0, "step": 21440 }, { "entropy": 1.9384674742817878, "epoch": 0.06649312323159678, "grad_norm": 10.496194839477539, "learning_rate": 9.811026231844714e-06, "loss": 0.6222, "mean_token_accuracy": 0.8058070495724678, "num_tokens": 25747103.0, "step": 21450 }, { "entropy": 1.9147820815443992, "epoch": 0.06652412235664648, "grad_norm": 11.332496643066406, "learning_rate": 9.808739972118045e-06, "loss": 0.6591, "mean_token_accuracy": 0.7969366267323494, "num_tokens": 25759932.0, "step": 21460 }, { "entropy": 1.9171302869915963, "epoch": 0.06655512148169618, "grad_norm": 10.07258129119873, "learning_rate": 9.806455309945553e-06, "loss": 0.596, "mean_token_accuracy": 0.8198638454079628, "num_tokens": 25771193.0, "step": 21470 }, { "entropy": 1.663824899494648, "epoch": 0.06658612060674587, "grad_norm": 4.567380428314209, "learning_rate": 9.804172243467576e-06, "loss": 0.4525, "mean_token_accuracy": 0.8362728267908096, "num_tokens": 25785955.0, "step": 21480 }, { "entropy": 1.819168321788311, "epoch": 0.06661711973179557, "grad_norm": 6.046937465667725, "learning_rate": 9.8018907708275e-06, "loss": 0.6468, "mean_token_accuracy": 0.8072507992386818, "num_tokens": 25798694.0, "step": 21490 }, { "entropy": 1.8685800537467003, "epoch": 0.06664811885684527, "grad_norm": 9.6455659866333, "learning_rate": 9.799610890171714e-06, "loss": 0.5722, "mean_token_accuracy": 0.8169274970889091, "num_tokens": 25810540.0, "step": 21500 }, { "entropy": 1.894263032078743, "epoch": 0.06667911798189496, "grad_norm": 11.558295249938965, "learning_rate": 9.797332599649637e-06, "loss": 0.636, "mean_token_accuracy": 0.8053765878081321, "num_tokens": 25822467.0, "step": 21510 }, { "entropy": 1.9177772477269173, "epoch": 0.06671011710694466, "grad_norm": 10.580260276794434, "learning_rate": 9.795055897413697e-06, "loss": 0.684, "mean_token_accuracy": 0.800214584171772, "num_tokens": 25834010.0, "step": 21520 }, { "entropy": 1.831970725953579, "epoch": 0.06674111623199436, "grad_norm": 10.084224700927734, "learning_rate": 9.792780781619318e-06, "loss": 0.5683, "mean_token_accuracy": 0.821027934551239, "num_tokens": 25846769.0, "step": 21530 }, { "entropy": 1.9271451473236083, "epoch": 0.06677211535704405, "grad_norm": 11.508028030395508, "learning_rate": 9.790507250424926e-06, "loss": 0.643, "mean_token_accuracy": 0.7898375526070595, "num_tokens": 25857730.0, "step": 21540 }, { "entropy": 1.9292464286088944, "epoch": 0.06680311448209374, "grad_norm": 9.729171752929688, "learning_rate": 9.788235301991947e-06, "loss": 0.6296, "mean_token_accuracy": 0.8045027181506157, "num_tokens": 25869739.0, "step": 21550 }, { "entropy": 1.9347792953252791, "epoch": 0.06683411360714343, "grad_norm": 9.912041664123535, "learning_rate": 9.785964934484776e-06, "loss": 0.7101, "mean_token_accuracy": 0.7909634590148926, "num_tokens": 25880786.0, "step": 21560 }, { "entropy": 1.8601661220192909, "epoch": 0.06686511273219313, "grad_norm": 10.346519470214844, "learning_rate": 9.783696146070801e-06, "loss": 0.5925, "mean_token_accuracy": 0.8147589430212975, "num_tokens": 25892319.0, "step": 21570 }, { "entropy": 1.7954956993460656, "epoch": 0.06689611185724283, "grad_norm": 4.684342384338379, "learning_rate": 9.781428934920377e-06, "loss": 0.5335, "mean_token_accuracy": 0.8224958389997482, "num_tokens": 25905020.0, "step": 21580 }, { "entropy": 1.8782492965459823, "epoch": 0.06692711098229252, "grad_norm": 12.385150909423828, "learning_rate": 9.77916329920682e-06, "loss": 0.6179, "mean_token_accuracy": 0.8114234715700149, "num_tokens": 25916570.0, "step": 21590 }, { "entropy": 1.8336504146456718, "epoch": 0.06695811010734222, "grad_norm": 3.51712965965271, "learning_rate": 9.776899237106418e-06, "loss": 0.5324, "mean_token_accuracy": 0.8270602032542229, "num_tokens": 25929984.0, "step": 21600 }, { "entropy": 1.8505462616682054, "epoch": 0.06698910923239192, "grad_norm": 9.539434432983398, "learning_rate": 9.774636746798405e-06, "loss": 0.635, "mean_token_accuracy": 0.8051048040390014, "num_tokens": 25941877.0, "step": 21610 }, { "entropy": 1.7775013580918313, "epoch": 0.06702010835744161, "grad_norm": 11.277313232421875, "learning_rate": 9.77237582646497e-06, "loss": 0.5501, "mean_token_accuracy": 0.823443454504013, "num_tokens": 25955271.0, "step": 21620 }, { "entropy": 1.8188173368573188, "epoch": 0.06705110748249131, "grad_norm": 5.36696720123291, "learning_rate": 9.770116474291232e-06, "loss": 0.5572, "mean_token_accuracy": 0.8193252950906753, "num_tokens": 25967242.0, "step": 21630 }, { "entropy": 1.819683888554573, "epoch": 0.067082106607541, "grad_norm": 10.795523643493652, "learning_rate": 9.767858688465254e-06, "loss": 0.6162, "mean_token_accuracy": 0.8111403912305832, "num_tokens": 25979320.0, "step": 21640 }, { "entropy": 1.7906116724014283, "epoch": 0.0671131057325907, "grad_norm": 9.500042915344238, "learning_rate": 9.765602467178033e-06, "loss": 0.5362, "mean_token_accuracy": 0.822031632065773, "num_tokens": 25992253.0, "step": 21650 }, { "entropy": 1.8245372220873832, "epoch": 0.0671441048576404, "grad_norm": 8.064600944519043, "learning_rate": 9.763347808623481e-06, "loss": 0.5656, "mean_token_accuracy": 0.817147271335125, "num_tokens": 26003957.0, "step": 21660 }, { "entropy": 1.8516656443476678, "epoch": 0.06717510398269008, "grad_norm": 10.404606819152832, "learning_rate": 9.761094710998432e-06, "loss": 0.5836, "mean_token_accuracy": 0.8104123935103417, "num_tokens": 26015881.0, "step": 21670 }, { "entropy": 1.827203567326069, "epoch": 0.06720610310773978, "grad_norm": 13.826399803161621, "learning_rate": 9.75884317250263e-06, "loss": 0.5807, "mean_token_accuracy": 0.8180100679397583, "num_tokens": 26028572.0, "step": 21680 }, { "entropy": 1.7154892578721046, "epoch": 0.06723710223278948, "grad_norm": 4.944169044494629, "learning_rate": 9.756593191338725e-06, "loss": 0.3884, "mean_token_accuracy": 0.848066033422947, "num_tokens": 26043181.0, "step": 21690 }, { "entropy": 1.8305002465844153, "epoch": 0.06726810135783917, "grad_norm": 11.294840812683105, "learning_rate": 9.754344765712266e-06, "loss": 0.5811, "mean_token_accuracy": 0.8094679519534111, "num_tokens": 26055431.0, "step": 21700 }, { "entropy": 1.8822925835847855, "epoch": 0.06729910048288887, "grad_norm": 11.073433876037598, "learning_rate": 9.752097893831698e-06, "loss": 0.6668, "mean_token_accuracy": 0.8047523692250251, "num_tokens": 26067040.0, "step": 21710 }, { "entropy": 1.8371146380901338, "epoch": 0.06733009960793856, "grad_norm": 9.113365173339844, "learning_rate": 9.749852573908346e-06, "loss": 0.5507, "mean_token_accuracy": 0.8240691289305687, "num_tokens": 26080151.0, "step": 21720 }, { "entropy": 1.8499283462762832, "epoch": 0.06736109873298826, "grad_norm": 11.711502075195312, "learning_rate": 9.747608804156427e-06, "loss": 0.5783, "mean_token_accuracy": 0.8179016202688217, "num_tokens": 26092394.0, "step": 21730 }, { "entropy": 1.7742941290140153, "epoch": 0.06739209785803796, "grad_norm": 10.995613098144531, "learning_rate": 9.745366582793027e-06, "loss": 0.5917, "mean_token_accuracy": 0.8143564939498902, "num_tokens": 26105790.0, "step": 21740 }, { "entropy": 1.727923959493637, "epoch": 0.06742309698308765, "grad_norm": 3.2797532081604004, "learning_rate": 9.7431259080381e-06, "loss": 0.4503, "mean_token_accuracy": 0.8418777331709861, "num_tokens": 26119446.0, "step": 21750 }, { "entropy": 1.8147297531366349, "epoch": 0.06745409610813735, "grad_norm": 8.432580947875977, "learning_rate": 9.740886778114467e-06, "loss": 0.5166, "mean_token_accuracy": 0.8339284613728524, "num_tokens": 26131348.0, "step": 21760 }, { "entropy": 1.8820328325033189, "epoch": 0.06748509523318705, "grad_norm": 5.698887825012207, "learning_rate": 9.738649191247806e-06, "loss": 0.6172, "mean_token_accuracy": 0.809808611869812, "num_tokens": 26143041.0, "step": 21770 }, { "entropy": 1.8815367594361305, "epoch": 0.06751609435823674, "grad_norm": 8.804140090942383, "learning_rate": 9.736413145666649e-06, "loss": 0.594, "mean_token_accuracy": 0.820245711505413, "num_tokens": 26154978.0, "step": 21780 }, { "entropy": 1.9202604204416276, "epoch": 0.06754709348328644, "grad_norm": 9.845060348510742, "learning_rate": 9.734178639602368e-06, "loss": 0.6266, "mean_token_accuracy": 0.8070004045963287, "num_tokens": 26166500.0, "step": 21790 }, { "entropy": 1.8778617054224014, "epoch": 0.06757809260833612, "grad_norm": 9.954373359680176, "learning_rate": 9.731945671289185e-06, "loss": 0.6406, "mean_token_accuracy": 0.8081085130572319, "num_tokens": 26178330.0, "step": 21800 }, { "entropy": 1.9139564037322998, "epoch": 0.06760909173338582, "grad_norm": 10.78117847442627, "learning_rate": 9.72971423896414e-06, "loss": 0.6322, "mean_token_accuracy": 0.8143807515501976, "num_tokens": 26189448.0, "step": 21810 }, { "entropy": 1.8785874828696252, "epoch": 0.06764009085843552, "grad_norm": 5.0894389152526855, "learning_rate": 9.727484340867116e-06, "loss": 0.5966, "mean_token_accuracy": 0.8152860775589943, "num_tokens": 26201634.0, "step": 21820 }, { "entropy": 1.8347378268837928, "epoch": 0.06767108998348521, "grad_norm": 4.857226371765137, "learning_rate": 9.725255975240813e-06, "loss": 0.4972, "mean_token_accuracy": 0.8315959751605988, "num_tokens": 26213558.0, "step": 21830 }, { "entropy": 1.845048761367798, "epoch": 0.06770208910853491, "grad_norm": 11.329482078552246, "learning_rate": 9.723029140330748e-06, "loss": 0.5478, "mean_token_accuracy": 0.8252388536930084, "num_tokens": 26225961.0, "step": 21840 }, { "entropy": 1.8393208265304566, "epoch": 0.0677330882335846, "grad_norm": 10.937715530395508, "learning_rate": 9.72080383438525e-06, "loss": 0.6227, "mean_token_accuracy": 0.8155218675732613, "num_tokens": 26238570.0, "step": 21850 }, { "entropy": 1.835605874657631, "epoch": 0.0677640873586343, "grad_norm": 9.71265983581543, "learning_rate": 9.718580055655447e-06, "loss": 0.5487, "mean_token_accuracy": 0.8284683138132095, "num_tokens": 26249804.0, "step": 21860 }, { "entropy": 1.9305734172463418, "epoch": 0.067795086483684, "grad_norm": 9.470818519592285, "learning_rate": 9.716357802395276e-06, "loss": 0.6421, "mean_token_accuracy": 0.8088799849152565, "num_tokens": 26261058.0, "step": 21870 }, { "entropy": 1.8729921713471414, "epoch": 0.0678260856087337, "grad_norm": 11.346515655517578, "learning_rate": 9.714137072861461e-06, "loss": 0.6013, "mean_token_accuracy": 0.8200590804219245, "num_tokens": 26273172.0, "step": 21880 }, { "entropy": 1.8179200991988183, "epoch": 0.0678570847337834, "grad_norm": 9.96755313873291, "learning_rate": 9.711917865313517e-06, "loss": 0.5983, "mean_token_accuracy": 0.813332186639309, "num_tokens": 26286190.0, "step": 21890 }, { "entropy": 1.807481935620308, "epoch": 0.06788808385883309, "grad_norm": 9.412008285522461, "learning_rate": 9.709700178013736e-06, "loss": 0.5088, "mean_token_accuracy": 0.823253333568573, "num_tokens": 26299950.0, "step": 21900 }, { "entropy": 1.8429307714104652, "epoch": 0.06791908298388279, "grad_norm": 10.125129699707031, "learning_rate": 9.707484009227192e-06, "loss": 0.5312, "mean_token_accuracy": 0.8161771893501282, "num_tokens": 26312676.0, "step": 21910 }, { "entropy": 1.932323306798935, "epoch": 0.06795008210893247, "grad_norm": 10.303938865661621, "learning_rate": 9.705269357221728e-06, "loss": 0.6293, "mean_token_accuracy": 0.8122246876358986, "num_tokens": 26324476.0, "step": 21920 }, { "entropy": 1.8644244894385338, "epoch": 0.06798108123398217, "grad_norm": 9.893197059631348, "learning_rate": 9.703056220267948e-06, "loss": 0.5923, "mean_token_accuracy": 0.8110254645347595, "num_tokens": 26336986.0, "step": 21930 }, { "entropy": 1.9435497313737868, "epoch": 0.06801208035903186, "grad_norm": 10.371121406555176, "learning_rate": 9.700844596639224e-06, "loss": 0.6676, "mean_token_accuracy": 0.8060028344392777, "num_tokens": 26348032.0, "step": 21940 }, { "entropy": 1.818027876317501, "epoch": 0.06804307948408156, "grad_norm": 10.99317741394043, "learning_rate": 9.698634484611671e-06, "loss": 0.4973, "mean_token_accuracy": 0.8351524174213409, "num_tokens": 26360903.0, "step": 21950 }, { "entropy": 2.044380483031273, "epoch": 0.06807407860913126, "grad_norm": 11.019384384155273, "learning_rate": 9.696425882464162e-06, "loss": 0.7252, "mean_token_accuracy": 0.7958701580762864, "num_tokens": 26371825.0, "step": 21960 }, { "entropy": 1.9928947612643242, "epoch": 0.06810507773418095, "grad_norm": 9.34325122833252, "learning_rate": 9.694218788478302e-06, "loss": 0.6251, "mean_token_accuracy": 0.8070891216397286, "num_tokens": 26382919.0, "step": 21970 }, { "entropy": 1.8768664389848708, "epoch": 0.06813607685923065, "grad_norm": 9.70384407043457, "learning_rate": 9.692013200938443e-06, "loss": 0.5142, "mean_token_accuracy": 0.8396727994084359, "num_tokens": 26395459.0, "step": 21980 }, { "entropy": 1.8494936734437943, "epoch": 0.06816707598428035, "grad_norm": 11.369257926940918, "learning_rate": 9.689809118131661e-06, "loss": 0.5583, "mean_token_accuracy": 0.8196944579482078, "num_tokens": 26407374.0, "step": 21990 }, { "entropy": 1.8978412017226218, "epoch": 0.06819807510933004, "grad_norm": 11.591387748718262, "learning_rate": 9.68760653834776e-06, "loss": 0.6132, "mean_token_accuracy": 0.8112616434693336, "num_tokens": 26419627.0, "step": 22000 }, { "entropy": 1.8775578573346139, "epoch": 0.06822907423437974, "grad_norm": 5.471284866333008, "learning_rate": 9.685405459879265e-06, "loss": 0.5619, "mean_token_accuracy": 0.8241677179932594, "num_tokens": 26431836.0, "step": 22010 }, { "entropy": 1.9307914853096009, "epoch": 0.06826007335942944, "grad_norm": 9.451096534729004, "learning_rate": 9.683205881021414e-06, "loss": 0.6088, "mean_token_accuracy": 0.8177047148346901, "num_tokens": 26442888.0, "step": 22020 }, { "entropy": 1.915931698679924, "epoch": 0.06829107248447913, "grad_norm": 10.326228141784668, "learning_rate": 9.681007800072153e-06, "loss": 0.6463, "mean_token_accuracy": 0.8002750858664512, "num_tokens": 26454901.0, "step": 22030 }, { "entropy": 1.9367796674370765, "epoch": 0.06832207160952881, "grad_norm": 10.724263191223145, "learning_rate": 9.678811215332136e-06, "loss": 0.6072, "mean_token_accuracy": 0.8125025644898415, "num_tokens": 26466644.0, "step": 22040 }, { "entropy": 1.8923877745866775, "epoch": 0.06835307073457851, "grad_norm": 9.895198822021484, "learning_rate": 9.676616125104707e-06, "loss": 0.5792, "mean_token_accuracy": 0.8232117027044297, "num_tokens": 26478470.0, "step": 22050 }, { "entropy": 1.9004954114556312, "epoch": 0.06838406985962821, "grad_norm": 9.4879150390625, "learning_rate": 9.674422527695905e-06, "loss": 0.5998, "mean_token_accuracy": 0.8110221117734909, "num_tokens": 26490224.0, "step": 22060 }, { "entropy": 1.9001964166760446, "epoch": 0.0684150689846779, "grad_norm": 5.616433620452881, "learning_rate": 9.672230421414466e-06, "loss": 0.625, "mean_token_accuracy": 0.8154096305370331, "num_tokens": 26501366.0, "step": 22070 }, { "entropy": 1.9321230724453926, "epoch": 0.0684460681097276, "grad_norm": 9.737174987792969, "learning_rate": 9.670039804571791e-06, "loss": 0.6395, "mean_token_accuracy": 0.8030791565775871, "num_tokens": 26512519.0, "step": 22080 }, { "entropy": 1.9279805243015289, "epoch": 0.0684770672347773, "grad_norm": 9.636087417602539, "learning_rate": 9.667850675481966e-06, "loss": 0.5453, "mean_token_accuracy": 0.8268870130181313, "num_tokens": 26524253.0, "step": 22090 }, { "entropy": 1.8550891995429992, "epoch": 0.068508066359827, "grad_norm": 4.487429141998291, "learning_rate": 9.665663032461747e-06, "loss": 0.5258, "mean_token_accuracy": 0.8270553275942802, "num_tokens": 26536731.0, "step": 22100 }, { "entropy": 1.9424635365605354, "epoch": 0.06853906548487669, "grad_norm": 10.442728996276855, "learning_rate": 9.663476873830555e-06, "loss": 0.5829, "mean_token_accuracy": 0.8206922665238381, "num_tokens": 26547996.0, "step": 22110 }, { "entropy": 1.9067403554916382, "epoch": 0.06857006460992639, "grad_norm": 9.885056495666504, "learning_rate": 9.661292197910468e-06, "loss": 0.626, "mean_token_accuracy": 0.8120665445923805, "num_tokens": 26560444.0, "step": 22120 }, { "entropy": 1.9578855365514756, "epoch": 0.06860106373497608, "grad_norm": 8.84913158416748, "learning_rate": 9.659109003026222e-06, "loss": 0.6182, "mean_token_accuracy": 0.8090391054749488, "num_tokens": 26572313.0, "step": 22130 }, { "entropy": 2.025720348954201, "epoch": 0.06863206286002578, "grad_norm": 9.901925086975098, "learning_rate": 9.656927287505196e-06, "loss": 0.7, "mean_token_accuracy": 0.8067919075489044, "num_tokens": 26583097.0, "step": 22140 }, { "entropy": 2.0064867049455644, "epoch": 0.06866306198507548, "grad_norm": 10.104426383972168, "learning_rate": 9.654747049677416e-06, "loss": 0.6419, "mean_token_accuracy": 0.8101355582475662, "num_tokens": 26594726.0, "step": 22150 }, { "entropy": 1.9431262537837029, "epoch": 0.06869406111012517, "grad_norm": 10.839469909667969, "learning_rate": 9.652568287875552e-06, "loss": 0.5512, "mean_token_accuracy": 0.8269416391849518, "num_tokens": 26606536.0, "step": 22160 }, { "entropy": 2.012579309940338, "epoch": 0.06872506023517486, "grad_norm": 12.621667861938477, "learning_rate": 9.650391000434892e-06, "loss": 0.6522, "mean_token_accuracy": 0.8193601354956627, "num_tokens": 26617239.0, "step": 22170 }, { "entropy": 1.8841126412153244, "epoch": 0.06875605936022455, "grad_norm": 5.069724082946777, "learning_rate": 9.648215185693367e-06, "loss": 0.5399, "mean_token_accuracy": 0.8241742476820946, "num_tokens": 26629571.0, "step": 22180 }, { "entropy": 1.938437856733799, "epoch": 0.06878705848527425, "grad_norm": 10.184552192687988, "learning_rate": 9.646040841991519e-06, "loss": 0.6364, "mean_token_accuracy": 0.812780536711216, "num_tokens": 26640715.0, "step": 22190 }, { "entropy": 1.8110738307237626, "epoch": 0.06881805761032395, "grad_norm": 10.346161842346191, "learning_rate": 9.64386796767251e-06, "loss": 0.534, "mean_token_accuracy": 0.8277497068047523, "num_tokens": 26655037.0, "step": 22200 }, { "entropy": 1.9208206847310065, "epoch": 0.06884905673537364, "grad_norm": 5.2261762619018555, "learning_rate": 9.641696561082117e-06, "loss": 0.5982, "mean_token_accuracy": 0.8199071541428566, "num_tokens": 26667082.0, "step": 22210 }, { "entropy": 1.8494412750005722, "epoch": 0.06888005586042334, "grad_norm": 10.328657150268555, "learning_rate": 9.639526620568718e-06, "loss": 0.505, "mean_token_accuracy": 0.8268266186118126, "num_tokens": 26680702.0, "step": 22220 }, { "entropy": 2.001448130607605, "epoch": 0.06891105498547304, "grad_norm": 10.208023071289062, "learning_rate": 9.637358144483292e-06, "loss": 0.6457, "mean_token_accuracy": 0.8080255687236786, "num_tokens": 26692418.0, "step": 22230 }, { "entropy": 1.9539388507604598, "epoch": 0.06894205411052273, "grad_norm": 5.077010631561279, "learning_rate": 9.635191131179423e-06, "loss": 0.5923, "mean_token_accuracy": 0.8029374584555626, "num_tokens": 26705460.0, "step": 22240 }, { "entropy": 2.013536052405834, "epoch": 0.06897305323557243, "grad_norm": 8.475672721862793, "learning_rate": 9.633025579013265e-06, "loss": 0.6455, "mean_token_accuracy": 0.8068231120705605, "num_tokens": 26716216.0, "step": 22250 }, { "entropy": 1.9996852800250053, "epoch": 0.06900405236062213, "grad_norm": 11.207576751708984, "learning_rate": 9.630861486343582e-06, "loss": 0.6351, "mean_token_accuracy": 0.8085829988121986, "num_tokens": 26726768.0, "step": 22260 }, { "entropy": 1.8975500375032426, "epoch": 0.06903505148567182, "grad_norm": 9.219078063964844, "learning_rate": 9.628698851531698e-06, "loss": 0.5485, "mean_token_accuracy": 0.8229989439249039, "num_tokens": 26739895.0, "step": 22270 }, { "entropy": 1.9055659562349319, "epoch": 0.06906605061072152, "grad_norm": 9.515271186828613, "learning_rate": 9.62653767294152e-06, "loss": 0.5738, "mean_token_accuracy": 0.8302743345499038, "num_tokens": 26751846.0, "step": 22280 }, { "entropy": 1.928154693543911, "epoch": 0.0690970497357712, "grad_norm": 4.5626654624938965, "learning_rate": 9.624377948939526e-06, "loss": 0.578, "mean_token_accuracy": 0.818497559428215, "num_tokens": 26764244.0, "step": 22290 }, { "entropy": 1.9365676030516625, "epoch": 0.0691280488608209, "grad_norm": 10.669965744018555, "learning_rate": 9.622219677894753e-06, "loss": 0.5561, "mean_token_accuracy": 0.8216531693935394, "num_tokens": 26777119.0, "step": 22300 }, { "entropy": 1.9759273901581764, "epoch": 0.0691590479858706, "grad_norm": 9.490134239196777, "learning_rate": 9.620062858178804e-06, "loss": 0.5882, "mean_token_accuracy": 0.8160349875688553, "num_tokens": 26789166.0, "step": 22310 }, { "entropy": 1.930388794839382, "epoch": 0.06919004711092029, "grad_norm": 11.474687576293945, "learning_rate": 9.617907488165825e-06, "loss": 0.6161, "mean_token_accuracy": 0.8070300281047821, "num_tokens": 26800730.0, "step": 22320 }, { "entropy": 1.9698352724313737, "epoch": 0.06922104623596999, "grad_norm": 10.189593315124512, "learning_rate": 9.615753566232525e-06, "loss": 0.6158, "mean_token_accuracy": 0.8105901405215263, "num_tokens": 26812034.0, "step": 22330 }, { "entropy": 1.8571903064846993, "epoch": 0.06925204536101968, "grad_norm": 12.529641151428223, "learning_rate": 9.613601090758144e-06, "loss": 0.5555, "mean_token_accuracy": 0.8161864310503006, "num_tokens": 26825051.0, "step": 22340 }, { "entropy": 1.9772468566894532, "epoch": 0.06928304448606938, "grad_norm": 4.139663219451904, "learning_rate": 9.611450060124465e-06, "loss": 0.6634, "mean_token_accuracy": 0.8000595793128014, "num_tokens": 26836544.0, "step": 22350 }, { "entropy": 1.8732447177171707, "epoch": 0.06931404361111908, "grad_norm": 11.927010536193848, "learning_rate": 9.609300472715811e-06, "loss": 0.5409, "mean_token_accuracy": 0.8216081500053406, "num_tokens": 26849615.0, "step": 22360 }, { "entropy": 1.93919685035944, "epoch": 0.06934504273616877, "grad_norm": 10.041327476501465, "learning_rate": 9.607152326919017e-06, "loss": 0.6048, "mean_token_accuracy": 0.8051743671298027, "num_tokens": 26861482.0, "step": 22370 }, { "entropy": 1.970766744017601, "epoch": 0.06937604186121847, "grad_norm": 10.42013931274414, "learning_rate": 9.605005621123464e-06, "loss": 0.6534, "mean_token_accuracy": 0.8068610474467277, "num_tokens": 26873126.0, "step": 22380 }, { "entropy": 1.9374214485287666, "epoch": 0.06940704098626817, "grad_norm": 9.681081771850586, "learning_rate": 9.602860353721028e-06, "loss": 0.6223, "mean_token_accuracy": 0.8023192837834359, "num_tokens": 26885153.0, "step": 22390 }, { "entropy": 1.9398547366261483, "epoch": 0.06943804011131786, "grad_norm": 9.45241928100586, "learning_rate": 9.600716523106113e-06, "loss": 0.6011, "mean_token_accuracy": 0.82165118008852, "num_tokens": 26896187.0, "step": 22400 }, { "entropy": 1.9944514393806458, "epoch": 0.06946903923636755, "grad_norm": 9.26956844329834, "learning_rate": 9.598574127675626e-06, "loss": 0.6097, "mean_token_accuracy": 0.8134081959724426, "num_tokens": 26907111.0, "step": 22410 }, { "entropy": 1.9747971966862679, "epoch": 0.06950003836141724, "grad_norm": 10.627933502197266, "learning_rate": 9.59643316582898e-06, "loss": 0.6869, "mean_token_accuracy": 0.7921556413173676, "num_tokens": 26918762.0, "step": 22420 }, { "entropy": 1.975240734219551, "epoch": 0.06953103748646694, "grad_norm": 11.592079162597656, "learning_rate": 9.594293635968081e-06, "loss": 0.6539, "mean_token_accuracy": 0.800179885327816, "num_tokens": 26930609.0, "step": 22430 }, { "entropy": 1.9276101559400558, "epoch": 0.06956203661151664, "grad_norm": 10.43637466430664, "learning_rate": 9.59215553649733e-06, "loss": 0.6321, "mean_token_accuracy": 0.8026261925697327, "num_tokens": 26942087.0, "step": 22440 }, { "entropy": 1.9058784544467926, "epoch": 0.06959303573656633, "grad_norm": 10.7579984664917, "learning_rate": 9.590018865823617e-06, "loss": 0.5575, "mean_token_accuracy": 0.8225520506501198, "num_tokens": 26954256.0, "step": 22450 }, { "entropy": 1.928253923356533, "epoch": 0.06962403486161603, "grad_norm": 9.777754783630371, "learning_rate": 9.587883622356315e-06, "loss": 0.5975, "mean_token_accuracy": 0.8160154640674591, "num_tokens": 26966403.0, "step": 22460 }, { "entropy": 1.8594375014305116, "epoch": 0.06965503398666573, "grad_norm": 4.670237064361572, "learning_rate": 9.585749804507275e-06, "loss": 0.5684, "mean_token_accuracy": 0.8113547876477242, "num_tokens": 26979420.0, "step": 22470 }, { "entropy": 1.8346465826034546, "epoch": 0.06968603311171542, "grad_norm": 10.510963439941406, "learning_rate": 9.58361741069082e-06, "loss": 0.5274, "mean_token_accuracy": 0.8267633840441704, "num_tokens": 26991602.0, "step": 22480 }, { "entropy": 1.855713202059269, "epoch": 0.06971703223676512, "grad_norm": 10.206911087036133, "learning_rate": 9.581486439323741e-06, "loss": 0.6001, "mean_token_accuracy": 0.8163058936595917, "num_tokens": 27003931.0, "step": 22490 }, { "entropy": 1.9139684349298478, "epoch": 0.06974803136181482, "grad_norm": 10.377299308776855, "learning_rate": 9.579356888825293e-06, "loss": 0.622, "mean_token_accuracy": 0.8105499014258385, "num_tokens": 27016144.0, "step": 22500 }, { "entropy": 1.8956247389316558, "epoch": 0.06977903048686451, "grad_norm": 11.713562965393066, "learning_rate": 9.57722875761719e-06, "loss": 0.5887, "mean_token_accuracy": 0.8171533569693565, "num_tokens": 27028209.0, "step": 22510 }, { "entropy": 1.8996628910303115, "epoch": 0.06981002961191421, "grad_norm": 10.300981521606445, "learning_rate": 9.5751020441236e-06, "loss": 0.6273, "mean_token_accuracy": 0.816479966044426, "num_tokens": 27039827.0, "step": 22520 }, { "entropy": 1.8548725992441177, "epoch": 0.0698410287369639, "grad_norm": 9.6876220703125, "learning_rate": 9.572976746771132e-06, "loss": 0.529, "mean_token_accuracy": 0.8360526219010354, "num_tokens": 27051785.0, "step": 22530 }, { "entropy": 1.8720781534910202, "epoch": 0.06987202786201359, "grad_norm": 11.248559951782227, "learning_rate": 9.570852863988847e-06, "loss": 0.5231, "mean_token_accuracy": 0.8292858317494393, "num_tokens": 27063846.0, "step": 22540 }, { "entropy": 1.928021316230297, "epoch": 0.06990302698706329, "grad_norm": 12.513522148132324, "learning_rate": 9.568730394208245e-06, "loss": 0.6212, "mean_token_accuracy": 0.8208240970969201, "num_tokens": 27075131.0, "step": 22550 }, { "entropy": 1.9080385088920593, "epoch": 0.06993402611211298, "grad_norm": 13.877976417541504, "learning_rate": 9.566609335863253e-06, "loss": 0.5778, "mean_token_accuracy": 0.8138008996844291, "num_tokens": 27088488.0, "step": 22560 }, { "entropy": 1.8622148722410201, "epoch": 0.06996502523716268, "grad_norm": 9.615792274475098, "learning_rate": 9.564489687390232e-06, "loss": 0.5473, "mean_token_accuracy": 0.8177649453282356, "num_tokens": 27101651.0, "step": 22570 }, { "entropy": 1.9193992719054223, "epoch": 0.06999602436221238, "grad_norm": 10.330147743225098, "learning_rate": 9.562371447227966e-06, "loss": 0.6293, "mean_token_accuracy": 0.8139034882187843, "num_tokens": 27112881.0, "step": 22580 }, { "entropy": 1.8690322995185853, "epoch": 0.07002702348726207, "grad_norm": 9.160511016845703, "learning_rate": 9.560254613817653e-06, "loss": 0.5615, "mean_token_accuracy": 0.8230489745736123, "num_tokens": 27124763.0, "step": 22590 }, { "entropy": 1.8515427842736245, "epoch": 0.07005802261231177, "grad_norm": 9.302170753479004, "learning_rate": 9.558139185602919e-06, "loss": 0.6005, "mean_token_accuracy": 0.8178195133805275, "num_tokens": 27136756.0, "step": 22600 }, { "entropy": 1.9153072491288186, "epoch": 0.07008902173736146, "grad_norm": 4.208265781402588, "learning_rate": 9.556025161029786e-06, "loss": 0.5976, "mean_token_accuracy": 0.816322073340416, "num_tokens": 27148060.0, "step": 22610 }, { "entropy": 1.958570632338524, "epoch": 0.07012002086241116, "grad_norm": 11.623967170715332, "learning_rate": 9.553912538546687e-06, "loss": 0.663, "mean_token_accuracy": 0.8184066906571388, "num_tokens": 27158765.0, "step": 22620 }, { "entropy": 1.8749097064137459, "epoch": 0.07015101998746086, "grad_norm": 10.970122337341309, "learning_rate": 9.55180131660445e-06, "loss": 0.5946, "mean_token_accuracy": 0.81072598695755, "num_tokens": 27170806.0, "step": 22630 }, { "entropy": 1.831792925298214, "epoch": 0.07018201911251055, "grad_norm": 11.485851287841797, "learning_rate": 9.54969149365631e-06, "loss": 0.5266, "mean_token_accuracy": 0.8216899335384369, "num_tokens": 27184157.0, "step": 22640 }, { "entropy": 1.9119883999228477, "epoch": 0.07021301823756025, "grad_norm": 11.170741081237793, "learning_rate": 9.547583068157877e-06, "loss": 0.6163, "mean_token_accuracy": 0.8011090368032455, "num_tokens": 27196415.0, "step": 22650 }, { "entropy": 1.8825739532709123, "epoch": 0.07024401736260993, "grad_norm": 8.26751708984375, "learning_rate": 9.54547603856716e-06, "loss": 0.5673, "mean_token_accuracy": 0.8156488195061684, "num_tokens": 27209042.0, "step": 22660 }, { "entropy": 1.9585013464093208, "epoch": 0.07027501648765963, "grad_norm": 13.17714786529541, "learning_rate": 9.54337040334454e-06, "loss": 0.66, "mean_token_accuracy": 0.8024971842765808, "num_tokens": 27219980.0, "step": 22670 }, { "entropy": 1.9092792928218842, "epoch": 0.07030601561270933, "grad_norm": 8.935144424438477, "learning_rate": 9.54126616095278e-06, "loss": 0.6069, "mean_token_accuracy": 0.8157609686255455, "num_tokens": 27231294.0, "step": 22680 }, { "entropy": 1.8577600359916686, "epoch": 0.07033701473775902, "grad_norm": 11.60800838470459, "learning_rate": 9.539163309857014e-06, "loss": 0.559, "mean_token_accuracy": 0.8194443017244339, "num_tokens": 27243917.0, "step": 22690 }, { "entropy": 1.9067883223295212, "epoch": 0.07036801386280872, "grad_norm": 9.336089134216309, "learning_rate": 9.537061848524734e-06, "loss": 0.6081, "mean_token_accuracy": 0.8156686320900917, "num_tokens": 27255668.0, "step": 22700 }, { "entropy": 1.8172156438231468, "epoch": 0.07039901298785842, "grad_norm": 5.0443644523620605, "learning_rate": 9.534961775425811e-06, "loss": 0.504, "mean_token_accuracy": 0.8269337728619576, "num_tokens": 27269533.0, "step": 22710 }, { "entropy": 1.8444980800151825, "epoch": 0.07043001211290811, "grad_norm": 9.690264701843262, "learning_rate": 9.532863089032457e-06, "loss": 0.554, "mean_token_accuracy": 0.8164748609066009, "num_tokens": 27282775.0, "step": 22720 }, { "entropy": 2.0001364931464196, "epoch": 0.07046101123795781, "grad_norm": 11.46723747253418, "learning_rate": 9.530765787819247e-06, "loss": 0.6126, "mean_token_accuracy": 0.8077256262302399, "num_tokens": 27294314.0, "step": 22730 }, { "entropy": 2.0072741538286207, "epoch": 0.0704920103630075, "grad_norm": 9.978995323181152, "learning_rate": 9.528669870263103e-06, "loss": 0.6748, "mean_token_accuracy": 0.7956588104367256, "num_tokens": 27305909.0, "step": 22740 }, { "entropy": 2.031547136604786, "epoch": 0.0705230094880572, "grad_norm": 9.277776718139648, "learning_rate": 9.526575334843284e-06, "loss": 0.6595, "mean_token_accuracy": 0.8040603443980217, "num_tokens": 27317574.0, "step": 22750 }, { "entropy": 1.887120671570301, "epoch": 0.0705540086131069, "grad_norm": 10.967029571533203, "learning_rate": 9.524482180041396e-06, "loss": 0.5722, "mean_token_accuracy": 0.809663151204586, "num_tokens": 27330161.0, "step": 22760 }, { "entropy": 1.8155884474515915, "epoch": 0.0705850077381566, "grad_norm": 10.948394775390625, "learning_rate": 9.522390404341375e-06, "loss": 0.5795, "mean_token_accuracy": 0.8259642213582993, "num_tokens": 27343602.0, "step": 22770 }, { "entropy": 1.9294879660010338, "epoch": 0.07061600686320628, "grad_norm": 9.418436050415039, "learning_rate": 9.52030000622949e-06, "loss": 0.5636, "mean_token_accuracy": 0.8182932585477829, "num_tokens": 27355422.0, "step": 22780 }, { "entropy": 1.9679240599274634, "epoch": 0.07064700598825598, "grad_norm": 5.147764682769775, "learning_rate": 9.51821098419433e-06, "loss": 0.5615, "mean_token_accuracy": 0.8287713721394538, "num_tokens": 27367573.0, "step": 22790 }, { "entropy": 1.9291693687438964, "epoch": 0.07067800511330567, "grad_norm": 4.149558067321777, "learning_rate": 9.516123336726806e-06, "loss": 0.6414, "mean_token_accuracy": 0.8070420354604722, "num_tokens": 27379593.0, "step": 22800 }, { "entropy": 2.0455012649297712, "epoch": 0.07070900423835537, "grad_norm": 11.943114280700684, "learning_rate": 9.514037062320148e-06, "loss": 0.7181, "mean_token_accuracy": 0.7937376946210861, "num_tokens": 27390417.0, "step": 22810 }, { "entropy": 1.928147019445896, "epoch": 0.07074000336340507, "grad_norm": 10.236185073852539, "learning_rate": 9.511952159469895e-06, "loss": 0.5803, "mean_token_accuracy": 0.8198776602745056, "num_tokens": 27402550.0, "step": 22820 }, { "entropy": 2.0119119971990584, "epoch": 0.07077100248845476, "grad_norm": 8.378026962280273, "learning_rate": 9.509868626673897e-06, "loss": 0.6267, "mean_token_accuracy": 0.821306724846363, "num_tokens": 27413355.0, "step": 22830 }, { "entropy": 1.9993405610322952, "epoch": 0.07080200161350446, "grad_norm": 9.925045013427734, "learning_rate": 9.507786462432295e-06, "loss": 0.6015, "mean_token_accuracy": 0.8195290580391884, "num_tokens": 27424504.0, "step": 22840 }, { "entropy": 1.9374178424477577, "epoch": 0.07083300073855416, "grad_norm": 4.789980888366699, "learning_rate": 9.505705665247544e-06, "loss": 0.6441, "mean_token_accuracy": 0.8104717791080475, "num_tokens": 27436791.0, "step": 22850 }, { "entropy": 1.9383593767881393, "epoch": 0.07086399986360385, "grad_norm": 10.263440132141113, "learning_rate": 9.503626233624376e-06, "loss": 0.627, "mean_token_accuracy": 0.8090912804007531, "num_tokens": 27448858.0, "step": 22860 }, { "entropy": 2.0473968207836153, "epoch": 0.07089499898865355, "grad_norm": 12.426820755004883, "learning_rate": 9.501548166069823e-06, "loss": 0.7145, "mean_token_accuracy": 0.7896327748894691, "num_tokens": 27459568.0, "step": 22870 }, { "entropy": 1.9629662334918976, "epoch": 0.07092599811370325, "grad_norm": 9.21473503112793, "learning_rate": 9.499471461093198e-06, "loss": 0.6637, "mean_token_accuracy": 0.8080029338598251, "num_tokens": 27472413.0, "step": 22880 }, { "entropy": 1.9620086148381233, "epoch": 0.07095699723875294, "grad_norm": 11.438567161560059, "learning_rate": 9.497396117206091e-06, "loss": 0.6037, "mean_token_accuracy": 0.8183909133076668, "num_tokens": 27484336.0, "step": 22890 }, { "entropy": 1.9684356123209, "epoch": 0.07098799636380264, "grad_norm": 10.009490966796875, "learning_rate": 9.49532213292237e-06, "loss": 0.6147, "mean_token_accuracy": 0.8186342790722847, "num_tokens": 27495646.0, "step": 22900 }, { "entropy": 1.8952196419239045, "epoch": 0.07101899548885232, "grad_norm": 10.987327575683594, "learning_rate": 9.493249506758174e-06, "loss": 0.5656, "mean_token_accuracy": 0.8256588101387023, "num_tokens": 27507145.0, "step": 22910 }, { "entropy": 1.9499731242656708, "epoch": 0.07104999461390202, "grad_norm": 12.571517944335938, "learning_rate": 9.491178237231904e-06, "loss": 0.6393, "mean_token_accuracy": 0.8107420906424523, "num_tokens": 27518536.0, "step": 22920 }, { "entropy": 1.9733731657266618, "epoch": 0.07108099373895171, "grad_norm": 11.42594051361084, "learning_rate": 9.48910832286423e-06, "loss": 0.6315, "mean_token_accuracy": 0.8104237765073776, "num_tokens": 27529692.0, "step": 22930 }, { "entropy": 1.9351124539971352, "epoch": 0.07111199286400141, "grad_norm": 12.896414756774902, "learning_rate": 9.48703976217807e-06, "loss": 0.6321, "mean_token_accuracy": 0.8083524718880654, "num_tokens": 27541473.0, "step": 22940 }, { "entropy": 2.000078111886978, "epoch": 0.07114299198905111, "grad_norm": 10.5455904006958, "learning_rate": 9.484972553698609e-06, "loss": 0.7026, "mean_token_accuracy": 0.7948870256543159, "num_tokens": 27552542.0, "step": 22950 }, { "entropy": 1.87677208930254, "epoch": 0.0711739911141008, "grad_norm": 11.3195161819458, "learning_rate": 9.482906695953262e-06, "loss": 0.5929, "mean_token_accuracy": 0.819133634865284, "num_tokens": 27565263.0, "step": 22960 }, { "entropy": 1.8816159687936307, "epoch": 0.0712049902391505, "grad_norm": 5.181549072265625, "learning_rate": 9.480842187471707e-06, "loss": 0.5527, "mean_token_accuracy": 0.8206304356455802, "num_tokens": 27577618.0, "step": 22970 }, { "entropy": 1.8843361094594002, "epoch": 0.0712359893642002, "grad_norm": 9.86581039428711, "learning_rate": 9.478779026785849e-06, "loss": 0.5401, "mean_token_accuracy": 0.8279527604579926, "num_tokens": 27589842.0, "step": 22980 }, { "entropy": 1.9283671468496322, "epoch": 0.0712669884892499, "grad_norm": 8.319514274597168, "learning_rate": 9.476717212429832e-06, "loss": 0.6132, "mean_token_accuracy": 0.8075576022267341, "num_tokens": 27601806.0, "step": 22990 }, { "entropy": 1.9603848546743392, "epoch": 0.07129798761429959, "grad_norm": 10.907440185546875, "learning_rate": 9.474656742940032e-06, "loss": 0.6049, "mean_token_accuracy": 0.817129735648632, "num_tokens": 27613900.0, "step": 23000 }, { "entropy": 1.923836286365986, "epoch": 0.07132898673934929, "grad_norm": 10.593782424926758, "learning_rate": 9.472597616855047e-06, "loss": 0.6263, "mean_token_accuracy": 0.8129079177975654, "num_tokens": 27626469.0, "step": 23010 }, { "entropy": 1.9737991467118263, "epoch": 0.07135998586439898, "grad_norm": 9.002423286437988, "learning_rate": 9.470539832715709e-06, "loss": 0.6858, "mean_token_accuracy": 0.8034425944089889, "num_tokens": 27638941.0, "step": 23020 }, { "entropy": 2.0140700817108153, "epoch": 0.07139098498944867, "grad_norm": 9.269657135009766, "learning_rate": 9.468483389065051e-06, "loss": 0.7441, "mean_token_accuracy": 0.7920542553067207, "num_tokens": 27650333.0, "step": 23030 }, { "entropy": 1.9881491467356682, "epoch": 0.07142198411449836, "grad_norm": 9.53425407409668, "learning_rate": 9.466428284448339e-06, "loss": 0.605, "mean_token_accuracy": 0.815938925743103, "num_tokens": 27661706.0, "step": 23040 }, { "entropy": 1.9040154725313188, "epoch": 0.07145298323954806, "grad_norm": 11.15352725982666, "learning_rate": 9.464374517413028e-06, "loss": 0.5775, "mean_token_accuracy": 0.8235550940036773, "num_tokens": 27674006.0, "step": 23050 }, { "entropy": 1.8507097408175468, "epoch": 0.07148398236459776, "grad_norm": 4.413679599761963, "learning_rate": 9.462322086508796e-06, "loss": 0.613, "mean_token_accuracy": 0.822106608748436, "num_tokens": 27686803.0, "step": 23060 }, { "entropy": 1.8648405969142914, "epoch": 0.07151498148964745, "grad_norm": 8.988687515258789, "learning_rate": 9.460270990287506e-06, "loss": 0.623, "mean_token_accuracy": 0.8131396234035492, "num_tokens": 27699064.0, "step": 23070 }, { "entropy": 1.9689454659819603, "epoch": 0.07154598061469715, "grad_norm": 12.583850860595703, "learning_rate": 9.458221227303229e-06, "loss": 0.6461, "mean_token_accuracy": 0.8088790848851204, "num_tokens": 27710056.0, "step": 23080 }, { "entropy": 1.920731896162033, "epoch": 0.07157697973974685, "grad_norm": 7.8442254066467285, "learning_rate": 9.456172796112224e-06, "loss": 0.588, "mean_token_accuracy": 0.8172019869089127, "num_tokens": 27722078.0, "step": 23090 }, { "entropy": 1.8544889241456985, "epoch": 0.07160797886479654, "grad_norm": 4.247855186462402, "learning_rate": 9.454125695272939e-06, "loss": 0.5021, "mean_token_accuracy": 0.8311522156000137, "num_tokens": 27734016.0, "step": 23100 }, { "entropy": 1.905607244372368, "epoch": 0.07163897798984624, "grad_norm": 11.669919967651367, "learning_rate": 9.452079923346001e-06, "loss": 0.5869, "mean_token_accuracy": 0.8104119807481766, "num_tokens": 27745967.0, "step": 23110 }, { "entropy": 1.8860177457332612, "epoch": 0.07166997711489594, "grad_norm": 8.93431568145752, "learning_rate": 9.45003547889422e-06, "loss": 0.5465, "mean_token_accuracy": 0.8315723642706871, "num_tokens": 27758582.0, "step": 23120 }, { "entropy": 1.9184095725417136, "epoch": 0.07170097623994563, "grad_norm": 9.845928192138672, "learning_rate": 9.447992360482587e-06, "loss": 0.5926, "mean_token_accuracy": 0.8231678128242492, "num_tokens": 27770200.0, "step": 23130 }, { "entropy": 1.9659554988145829, "epoch": 0.07173197536499533, "grad_norm": 6.174232006072998, "learning_rate": 9.445950566678251e-06, "loss": 0.6704, "mean_token_accuracy": 0.8078634425997734, "num_tokens": 27781380.0, "step": 23140 }, { "entropy": 1.8729309558868408, "epoch": 0.07176297449004501, "grad_norm": 11.597721099853516, "learning_rate": 9.443910096050535e-06, "loss": 0.584, "mean_token_accuracy": 0.8244254574179649, "num_tokens": 27793522.0, "step": 23150 }, { "entropy": 1.9304191544651985, "epoch": 0.07179397361509471, "grad_norm": 4.9579596519470215, "learning_rate": 9.441870947170925e-06, "loss": 0.6446, "mean_token_accuracy": 0.812780550122261, "num_tokens": 27805347.0, "step": 23160 }, { "entropy": 1.9905481100082398, "epoch": 0.0718249727401444, "grad_norm": 8.615478515625, "learning_rate": 9.439833118613064e-06, "loss": 0.6513, "mean_token_accuracy": 0.8182663396000862, "num_tokens": 27816623.0, "step": 23170 }, { "entropy": 1.9626749917864799, "epoch": 0.0718559718651941, "grad_norm": 10.739019393920898, "learning_rate": 9.437796608952747e-06, "loss": 0.5874, "mean_token_accuracy": 0.8244615375995636, "num_tokens": 27828120.0, "step": 23180 }, { "entropy": 1.9498226195573807, "epoch": 0.0718869709902438, "grad_norm": 10.651679039001465, "learning_rate": 9.435761416767925e-06, "loss": 0.6655, "mean_token_accuracy": 0.8038348644971848, "num_tokens": 27839640.0, "step": 23190 }, { "entropy": 1.8966133877635003, "epoch": 0.0719179701152935, "grad_norm": 8.705821990966797, "learning_rate": 9.433727540638685e-06, "loss": 0.6362, "mean_token_accuracy": 0.8108318716287612, "num_tokens": 27851113.0, "step": 23200 }, { "entropy": 1.8680199176073073, "epoch": 0.07194896924034319, "grad_norm": 5.520404815673828, "learning_rate": 9.431694979147263e-06, "loss": 0.5776, "mean_token_accuracy": 0.8188697442412376, "num_tokens": 27862805.0, "step": 23210 }, { "entropy": 1.9000079199671744, "epoch": 0.07197996836539289, "grad_norm": 9.184073448181152, "learning_rate": 9.429663730878031e-06, "loss": 0.5792, "mean_token_accuracy": 0.8288420543074608, "num_tokens": 27875156.0, "step": 23220 }, { "entropy": 1.8455702617764473, "epoch": 0.07201096749044258, "grad_norm": 8.574012756347656, "learning_rate": 9.427633794417493e-06, "loss": 0.4801, "mean_token_accuracy": 0.8355481848120689, "num_tokens": 27888621.0, "step": 23230 }, { "entropy": 1.9175620287656785, "epoch": 0.07204196661549228, "grad_norm": 9.760550498962402, "learning_rate": 9.425605168354284e-06, "loss": 0.6302, "mean_token_accuracy": 0.8198355168104172, "num_tokens": 27899563.0, "step": 23240 }, { "entropy": 1.8596885740756988, "epoch": 0.07207296574054198, "grad_norm": 5.949073791503906, "learning_rate": 9.423577851279158e-06, "loss": 0.5272, "mean_token_accuracy": 0.8252129018306732, "num_tokens": 27913195.0, "step": 23250 }, { "entropy": 1.9171173721551895, "epoch": 0.07210396486559167, "grad_norm": 9.470051765441895, "learning_rate": 9.421551841784998e-06, "loss": 0.6161, "mean_token_accuracy": 0.8131398126482964, "num_tokens": 27925430.0, "step": 23260 }, { "entropy": 1.9633556708693505, "epoch": 0.07213496399064137, "grad_norm": 8.595085144042969, "learning_rate": 9.4195271384668e-06, "loss": 0.5934, "mean_token_accuracy": 0.8224080309271813, "num_tokens": 27936940.0, "step": 23270 }, { "entropy": 1.937751042842865, "epoch": 0.07216596311569105, "grad_norm": 5.916355609893799, "learning_rate": 9.417503739921671e-06, "loss": 0.6028, "mean_token_accuracy": 0.8086767017841339, "num_tokens": 27948799.0, "step": 23280 }, { "entropy": 1.9640158087015152, "epoch": 0.07219696224074075, "grad_norm": 10.401774406433105, "learning_rate": 9.415481644748828e-06, "loss": 0.6664, "mean_token_accuracy": 0.8089988082647324, "num_tokens": 27959298.0, "step": 23290 }, { "entropy": 1.8880821034312247, "epoch": 0.07222796136579045, "grad_norm": 10.773529052734375, "learning_rate": 9.413460851549596e-06, "loss": 0.6015, "mean_token_accuracy": 0.808536796271801, "num_tokens": 27971177.0, "step": 23300 }, { "entropy": 1.9368318811058998, "epoch": 0.07225896049084014, "grad_norm": 8.663378715515137, "learning_rate": 9.411441358927392e-06, "loss": 0.5692, "mean_token_accuracy": 0.8213057667016983, "num_tokens": 27983758.0, "step": 23310 }, { "entropy": 1.9660808324813843, "epoch": 0.07228995961588984, "grad_norm": 9.40843677520752, "learning_rate": 9.40942316548774e-06, "loss": 0.6292, "mean_token_accuracy": 0.8099093735218048, "num_tokens": 27994875.0, "step": 23320 }, { "entropy": 1.9370207205414771, "epoch": 0.07232095874093954, "grad_norm": 8.460911750793457, "learning_rate": 9.407406269838248e-06, "loss": 0.616, "mean_token_accuracy": 0.8174383148550988, "num_tokens": 28005892.0, "step": 23330 }, { "entropy": 1.9526633992791176, "epoch": 0.07235195786598923, "grad_norm": 9.294373512268066, "learning_rate": 9.405390670588613e-06, "loss": 0.6369, "mean_token_accuracy": 0.7995718091726303, "num_tokens": 28017932.0, "step": 23340 }, { "entropy": 1.9452801957726478, "epoch": 0.07238295699103893, "grad_norm": 9.387127876281738, "learning_rate": 9.403376366350623e-06, "loss": 0.6482, "mean_token_accuracy": 0.8082485362887383, "num_tokens": 28029626.0, "step": 23350 }, { "entropy": 1.8676959410309792, "epoch": 0.07241395611608863, "grad_norm": 9.851313591003418, "learning_rate": 9.401363355738139e-06, "loss": 0.5982, "mean_token_accuracy": 0.8089207410812378, "num_tokens": 28041602.0, "step": 23360 }, { "entropy": 1.8555365659296512, "epoch": 0.07244495524113832, "grad_norm": 9.005014419555664, "learning_rate": 9.399351637367101e-06, "loss": 0.5462, "mean_token_accuracy": 0.8123884066939354, "num_tokens": 28054923.0, "step": 23370 }, { "entropy": 1.8651632323861123, "epoch": 0.07247595436618802, "grad_norm": 4.899734020233154, "learning_rate": 9.397341209855522e-06, "loss": 0.5624, "mean_token_accuracy": 0.8213669985532761, "num_tokens": 28067132.0, "step": 23380 }, { "entropy": 1.9289941102266313, "epoch": 0.07250695349123772, "grad_norm": 5.1334228515625, "learning_rate": 9.395332071823485e-06, "loss": 0.6144, "mean_token_accuracy": 0.8181710079312324, "num_tokens": 28078505.0, "step": 23390 }, { "entropy": 1.8720559775829315, "epoch": 0.0725379526162874, "grad_norm": 10.077095031738281, "learning_rate": 9.393324221893131e-06, "loss": 0.5644, "mean_token_accuracy": 0.818347430229187, "num_tokens": 28090333.0, "step": 23400 }, { "entropy": 1.853566548228264, "epoch": 0.0725689517413371, "grad_norm": 4.725801467895508, "learning_rate": 9.391317658688664e-06, "loss": 0.5701, "mean_token_accuracy": 0.8122278556227684, "num_tokens": 28103819.0, "step": 23410 }, { "entropy": 1.9520633488893508, "epoch": 0.07259995086638679, "grad_norm": 9.427369117736816, "learning_rate": 9.389312380836351e-06, "loss": 0.6409, "mean_token_accuracy": 0.8200425282120705, "num_tokens": 28114738.0, "step": 23420 }, { "entropy": 1.8956316590309144, "epoch": 0.07263094999143649, "grad_norm": 7.797948360443115, "learning_rate": 9.3873083869645e-06, "loss": 0.5618, "mean_token_accuracy": 0.8178161531686783, "num_tokens": 28127067.0, "step": 23430 }, { "entropy": 1.9396205574274064, "epoch": 0.07266194911648619, "grad_norm": 10.093338012695312, "learning_rate": 9.38530567570348e-06, "loss": 0.6226, "mean_token_accuracy": 0.8116014674305916, "num_tokens": 28138551.0, "step": 23440 }, { "entropy": 2.0014042764902116, "epoch": 0.07269294824153588, "grad_norm": 8.427999496459961, "learning_rate": 9.383304245685689e-06, "loss": 0.6412, "mean_token_accuracy": 0.8133781552314758, "num_tokens": 28149878.0, "step": 23450 }, { "entropy": 1.9623918294906617, "epoch": 0.07272394736658558, "grad_norm": 8.854625701904297, "learning_rate": 9.381304095545583e-06, "loss": 0.6584, "mean_token_accuracy": 0.8096672505140304, "num_tokens": 28160988.0, "step": 23460 }, { "entropy": 1.9225846633315087, "epoch": 0.07275494649163528, "grad_norm": 12.70853042602539, "learning_rate": 9.379305223919642e-06, "loss": 0.6568, "mean_token_accuracy": 0.8003104001283645, "num_tokens": 28172035.0, "step": 23470 }, { "entropy": 1.8282870531082154, "epoch": 0.07278594561668497, "grad_norm": 8.768390655517578, "learning_rate": 9.377307629446383e-06, "loss": 0.5394, "mean_token_accuracy": 0.8221458449959755, "num_tokens": 28184093.0, "step": 23480 }, { "entropy": 1.8909123882651329, "epoch": 0.07281694474173467, "grad_norm": 20.700326919555664, "learning_rate": 9.375311310766353e-06, "loss": 0.5838, "mean_token_accuracy": 0.8204225182533265, "num_tokens": 28195961.0, "step": 23490 }, { "entropy": 1.8430864825844764, "epoch": 0.07284794386678436, "grad_norm": 5.11720609664917, "learning_rate": 9.373316266522123e-06, "loss": 0.4796, "mean_token_accuracy": 0.8319470316171647, "num_tokens": 28208526.0, "step": 23500 }, { "entropy": 1.8916525810956955, "epoch": 0.07287894299183406, "grad_norm": 11.949532508850098, "learning_rate": 9.371322495358281e-06, "loss": 0.6055, "mean_token_accuracy": 0.8044298276305198, "num_tokens": 28220641.0, "step": 23510 }, { "entropy": 1.979685339331627, "epoch": 0.07290994211688374, "grad_norm": 10.18620777130127, "learning_rate": 9.369329995921444e-06, "loss": 0.6482, "mean_token_accuracy": 0.8135093718767166, "num_tokens": 28231519.0, "step": 23520 }, { "entropy": 1.9537073016166686, "epoch": 0.07294094124193344, "grad_norm": 9.836045265197754, "learning_rate": 9.36733876686023e-06, "loss": 0.6507, "mean_token_accuracy": 0.8180084466934204, "num_tokens": 28242672.0, "step": 23530 }, { "entropy": 1.9465522095561028, "epoch": 0.07297194036698314, "grad_norm": 4.765397071838379, "learning_rate": 9.365348806825274e-06, "loss": 0.6548, "mean_token_accuracy": 0.8062368303537368, "num_tokens": 28254108.0, "step": 23540 }, { "entropy": 1.9817388027906417, "epoch": 0.07300293949203283, "grad_norm": 9.245866775512695, "learning_rate": 9.36336011446921e-06, "loss": 0.6389, "mean_token_accuracy": 0.8076906755566597, "num_tokens": 28265018.0, "step": 23550 }, { "entropy": 1.8746733576059342, "epoch": 0.07303393861708253, "grad_norm": 10.122679710388184, "learning_rate": 9.36137268844668e-06, "loss": 0.5419, "mean_token_accuracy": 0.8259296610951423, "num_tokens": 28277414.0, "step": 23560 }, { "entropy": 1.9215095147490502, "epoch": 0.07306493774213223, "grad_norm": 9.02968692779541, "learning_rate": 9.359386527414325e-06, "loss": 0.5584, "mean_token_accuracy": 0.8263942644000053, "num_tokens": 28289205.0, "step": 23570 }, { "entropy": 1.8805431053042412, "epoch": 0.07309593686718192, "grad_norm": 8.153913497924805, "learning_rate": 9.35740163003077e-06, "loss": 0.5737, "mean_token_accuracy": 0.8194609686732293, "num_tokens": 28300813.0, "step": 23580 }, { "entropy": 1.8591163650155067, "epoch": 0.07312693599223162, "grad_norm": 11.29405403137207, "learning_rate": 9.355417994956647e-06, "loss": 0.6421, "mean_token_accuracy": 0.7990970879793167, "num_tokens": 28313650.0, "step": 23590 }, { "entropy": 1.887549029290676, "epoch": 0.07315793511728132, "grad_norm": 4.2539472579956055, "learning_rate": 9.353435620854559e-06, "loss": 0.539, "mean_token_accuracy": 0.8247372597455979, "num_tokens": 28325816.0, "step": 23600 }, { "entropy": 1.966456551849842, "epoch": 0.07318893424233101, "grad_norm": 9.739566802978516, "learning_rate": 9.3514545063891e-06, "loss": 0.6504, "mean_token_accuracy": 0.8081991970539093, "num_tokens": 28337088.0, "step": 23610 }, { "entropy": 1.9840751081705092, "epoch": 0.07321993336738071, "grad_norm": 11.179567337036133, "learning_rate": 9.349474650226844e-06, "loss": 0.6291, "mean_token_accuracy": 0.8135482758283615, "num_tokens": 28347375.0, "step": 23620 }, { "entropy": 1.9408112317323685, "epoch": 0.0732509324924304, "grad_norm": 9.25151538848877, "learning_rate": 9.347496051036333e-06, "loss": 0.6223, "mean_token_accuracy": 0.8124508634209633, "num_tokens": 28358563.0, "step": 23630 }, { "entropy": 1.947693009674549, "epoch": 0.0732819316174801, "grad_norm": 9.259462356567383, "learning_rate": 9.345518707488087e-06, "loss": 0.6084, "mean_token_accuracy": 0.8231876268982887, "num_tokens": 28370399.0, "step": 23640 }, { "entropy": 1.846913254261017, "epoch": 0.07331293074252979, "grad_norm": 8.472787857055664, "learning_rate": 9.343542618254596e-06, "loss": 0.5402, "mean_token_accuracy": 0.822886273264885, "num_tokens": 28382814.0, "step": 23650 }, { "entropy": 1.848525333404541, "epoch": 0.07334392986757948, "grad_norm": 11.601837158203125, "learning_rate": 9.341567782010304e-06, "loss": 0.5389, "mean_token_accuracy": 0.8181129440665245, "num_tokens": 28395944.0, "step": 23660 }, { "entropy": 1.778299406170845, "epoch": 0.07337492899262918, "grad_norm": 10.347676277160645, "learning_rate": 9.339594197431625e-06, "loss": 0.548, "mean_token_accuracy": 0.8195615544915199, "num_tokens": 28409215.0, "step": 23670 }, { "entropy": 1.8473999500274658, "epoch": 0.07340592811767888, "grad_norm": 11.07874584197998, "learning_rate": 9.337621863196925e-06, "loss": 0.5769, "mean_token_accuracy": 0.8226366356015206, "num_tokens": 28421381.0, "step": 23680 }, { "entropy": 1.8470186293125153, "epoch": 0.07343692724272857, "grad_norm": 11.499205589294434, "learning_rate": 9.335650777986526e-06, "loss": 0.5876, "mean_token_accuracy": 0.824992710351944, "num_tokens": 28433681.0, "step": 23690 }, { "entropy": 1.9232969626784324, "epoch": 0.07346792636777827, "grad_norm": 11.156820297241211, "learning_rate": 9.333680940482696e-06, "loss": 0.5953, "mean_token_accuracy": 0.8178783282637596, "num_tokens": 28445840.0, "step": 23700 }, { "entropy": 1.9458463191986084, "epoch": 0.07349892549282797, "grad_norm": 5.818225860595703, "learning_rate": 9.331712349369646e-06, "loss": 0.6437, "mean_token_accuracy": 0.8080273911356926, "num_tokens": 28457345.0, "step": 23710 }, { "entropy": 1.9871521532535552, "epoch": 0.07352992461787766, "grad_norm": 9.459403991699219, "learning_rate": 9.329745003333538e-06, "loss": 0.6474, "mean_token_accuracy": 0.8077614203095436, "num_tokens": 28468578.0, "step": 23720 }, { "entropy": 1.9817648902535439, "epoch": 0.07356092374292736, "grad_norm": 8.348322868347168, "learning_rate": 9.327778901062466e-06, "loss": 0.6367, "mean_token_accuracy": 0.8168795883655549, "num_tokens": 28479788.0, "step": 23730 }, { "entropy": 1.9540872916579246, "epoch": 0.07359192286797706, "grad_norm": 8.266214370727539, "learning_rate": 9.325814041246455e-06, "loss": 0.6205, "mean_token_accuracy": 0.8153672978281975, "num_tokens": 28491967.0, "step": 23740 }, { "entropy": 1.960209146142006, "epoch": 0.07362292199302675, "grad_norm": 11.05349349975586, "learning_rate": 9.323850422577469e-06, "loss": 0.6019, "mean_token_accuracy": 0.811015397310257, "num_tokens": 28503930.0, "step": 23750 }, { "entropy": 1.9480343893170358, "epoch": 0.07365392111807645, "grad_norm": 10.689523696899414, "learning_rate": 9.321888043749389e-06, "loss": 0.5604, "mean_token_accuracy": 0.8198656231164932, "num_tokens": 28515451.0, "step": 23760 }, { "entropy": 1.994300290942192, "epoch": 0.07368492024312613, "grad_norm": 8.471319198608398, "learning_rate": 9.319926903458033e-06, "loss": 0.631, "mean_token_accuracy": 0.8167888432741165, "num_tokens": 28526353.0, "step": 23770 }, { "entropy": 1.9629247948527335, "epoch": 0.07371591936817583, "grad_norm": 10.409026145935059, "learning_rate": 9.317967000401127e-06, "loss": 0.6331, "mean_token_accuracy": 0.8050680428743362, "num_tokens": 28537484.0, "step": 23780 }, { "entropy": 1.8861139222979546, "epoch": 0.07374691849322552, "grad_norm": 8.942246437072754, "learning_rate": 9.316008333278319e-06, "loss": 0.5364, "mean_token_accuracy": 0.8210712686181069, "num_tokens": 28550244.0, "step": 23790 }, { "entropy": 1.867449736595154, "epoch": 0.07377791761827522, "grad_norm": 12.16384506225586, "learning_rate": 9.314050900791163e-06, "loss": 0.5982, "mean_token_accuracy": 0.8131821945309639, "num_tokens": 28562527.0, "step": 23800 }, { "entropy": 1.8669994980096818, "epoch": 0.07380891674332492, "grad_norm": 9.032928466796875, "learning_rate": 9.312094701643134e-06, "loss": 0.5414, "mean_token_accuracy": 0.8275477036833763, "num_tokens": 28575138.0, "step": 23810 }, { "entropy": 2.010842078924179, "epoch": 0.07383991586837461, "grad_norm": 9.27695369720459, "learning_rate": 9.3101397345396e-06, "loss": 0.635, "mean_token_accuracy": 0.8104411855340004, "num_tokens": 28586985.0, "step": 23820 }, { "entropy": 1.9500906214118003, "epoch": 0.07387091499342431, "grad_norm": 3.9518938064575195, "learning_rate": 9.30818599818784e-06, "loss": 0.5668, "mean_token_accuracy": 0.8293194532394409, "num_tokens": 28598926.0, "step": 23830 }, { "entropy": 1.9337919741868972, "epoch": 0.07390191411847401, "grad_norm": 10.783734321594238, "learning_rate": 9.306233491297024e-06, "loss": 0.5621, "mean_token_accuracy": 0.8271838411688804, "num_tokens": 28610265.0, "step": 23840 }, { "entropy": 1.8781434014439582, "epoch": 0.0739329132435237, "grad_norm": 4.472099781036377, "learning_rate": 9.30428221257822e-06, "loss": 0.5585, "mean_token_accuracy": 0.8306781485676765, "num_tokens": 28622824.0, "step": 23850 }, { "entropy": 1.9807551354169846, "epoch": 0.0739639123685734, "grad_norm": 11.769026756286621, "learning_rate": 9.302332160744387e-06, "loss": 0.6261, "mean_token_accuracy": 0.8115956127643585, "num_tokens": 28633746.0, "step": 23860 }, { "entropy": 1.9240091010928153, "epoch": 0.0739949114936231, "grad_norm": 11.441720962524414, "learning_rate": 9.300383334510372e-06, "loss": 0.6157, "mean_token_accuracy": 0.8130579099059105, "num_tokens": 28646025.0, "step": 23870 }, { "entropy": 1.9416207402944565, "epoch": 0.0740259106186728, "grad_norm": 9.635127067565918, "learning_rate": 9.298435732592904e-06, "loss": 0.5847, "mean_token_accuracy": 0.8167257070541382, "num_tokens": 28658461.0, "step": 23880 }, { "entropy": 1.9926012963056565, "epoch": 0.07405690974372248, "grad_norm": 9.58889102935791, "learning_rate": 9.296489353710593e-06, "loss": 0.6111, "mean_token_accuracy": 0.8202344119548798, "num_tokens": 28669851.0, "step": 23890 }, { "entropy": 1.917009449005127, "epoch": 0.07408790886877217, "grad_norm": 12.779633522033691, "learning_rate": 9.294544196583929e-06, "loss": 0.5674, "mean_token_accuracy": 0.8271616026759148, "num_tokens": 28682219.0, "step": 23900 }, { "entropy": 1.9119545385241508, "epoch": 0.07411890799382187, "grad_norm": 4.876058578491211, "learning_rate": 9.29260025993527e-06, "loss": 0.5712, "mean_token_accuracy": 0.8157782420516014, "num_tokens": 28694730.0, "step": 23910 }, { "entropy": 1.982969084382057, "epoch": 0.07414990711887157, "grad_norm": 4.22367525100708, "learning_rate": 9.290657542488846e-06, "loss": 0.5934, "mean_token_accuracy": 0.8123695835471153, "num_tokens": 28706446.0, "step": 23920 }, { "entropy": 1.965246671438217, "epoch": 0.07418090624392126, "grad_norm": 11.446246147155762, "learning_rate": 9.28871604297075e-06, "loss": 0.6086, "mean_token_accuracy": 0.8144690200686455, "num_tokens": 28718411.0, "step": 23930 }, { "entropy": 1.9847482353448869, "epoch": 0.07421190536897096, "grad_norm": 11.718853950500488, "learning_rate": 9.28677576010895e-06, "loss": 0.6281, "mean_token_accuracy": 0.8073780536651611, "num_tokens": 28730646.0, "step": 23940 }, { "entropy": 1.9997831106185913, "epoch": 0.07424290449402066, "grad_norm": 9.053925514221191, "learning_rate": 9.284836692633257e-06, "loss": 0.6105, "mean_token_accuracy": 0.8186374083161354, "num_tokens": 28741168.0, "step": 23950 }, { "entropy": 1.8356414943933488, "epoch": 0.07427390361907035, "grad_norm": 9.540495872497559, "learning_rate": 9.282898839275347e-06, "loss": 0.5233, "mean_token_accuracy": 0.8257902413606644, "num_tokens": 28755165.0, "step": 23960 }, { "entropy": 1.9523360833525658, "epoch": 0.07430490274412005, "grad_norm": 11.081901550292969, "learning_rate": 9.280962198768745e-06, "loss": 0.6224, "mean_token_accuracy": 0.8144389301538467, "num_tokens": 28766577.0, "step": 23970 }, { "entropy": 1.962296548485756, "epoch": 0.07433590186916975, "grad_norm": 9.727673530578613, "learning_rate": 9.279026769848825e-06, "loss": 0.6022, "mean_token_accuracy": 0.8120512589812279, "num_tokens": 28777780.0, "step": 23980 }, { "entropy": 1.9339822575449943, "epoch": 0.07436690099421944, "grad_norm": 12.94129467010498, "learning_rate": 9.277092551252813e-06, "loss": 0.6079, "mean_token_accuracy": 0.8044057339429855, "num_tokens": 28789644.0, "step": 23990 }, { "entropy": 1.9828555971384048, "epoch": 0.07439790011926914, "grad_norm": 11.137704849243164, "learning_rate": 9.275159541719766e-06, "loss": 0.6334, "mean_token_accuracy": 0.8016148954629898, "num_tokens": 28800391.0, "step": 24000 }, { "entropy": 1.8854010567069053, "epoch": 0.07442889924431884, "grad_norm": 9.98965835571289, "learning_rate": 9.273227739990585e-06, "loss": 0.5532, "mean_token_accuracy": 0.8219901576638222, "num_tokens": 28812899.0, "step": 24010 }, { "entropy": 1.9441875860095024, "epoch": 0.07445989836936852, "grad_norm": 9.923999786376953, "learning_rate": 9.271297144808003e-06, "loss": 0.637, "mean_token_accuracy": 0.8106958836317062, "num_tokens": 28824358.0, "step": 24020 }, { "entropy": 1.889090073108673, "epoch": 0.07449089749441822, "grad_norm": 8.807568550109863, "learning_rate": 9.269367754916594e-06, "loss": 0.5404, "mean_token_accuracy": 0.83165952116251, "num_tokens": 28835906.0, "step": 24030 }, { "entropy": 1.8731929019093514, "epoch": 0.07452189661946791, "grad_norm": 11.27502727508545, "learning_rate": 9.267439569062747e-06, "loss": 0.5855, "mean_token_accuracy": 0.8130838423967361, "num_tokens": 28848120.0, "step": 24040 }, { "entropy": 1.9439046129584312, "epoch": 0.07455289574451761, "grad_norm": 10.14338207244873, "learning_rate": 9.26551258599468e-06, "loss": 0.5926, "mean_token_accuracy": 0.817124992609024, "num_tokens": 28860425.0, "step": 24050 }, { "entropy": 1.8763557612895965, "epoch": 0.0745838948695673, "grad_norm": 9.435752868652344, "learning_rate": 9.26358680446244e-06, "loss": 0.5757, "mean_token_accuracy": 0.82074084430933, "num_tokens": 28873156.0, "step": 24060 }, { "entropy": 1.8747549593448638, "epoch": 0.074614893994617, "grad_norm": 12.689223289489746, "learning_rate": 9.26166222321788e-06, "loss": 0.5661, "mean_token_accuracy": 0.817321227490902, "num_tokens": 28885693.0, "step": 24070 }, { "entropy": 1.7927760377526283, "epoch": 0.0746458931196667, "grad_norm": 8.860281944274902, "learning_rate": 9.25973884101468e-06, "loss": 0.5196, "mean_token_accuracy": 0.8320321753621102, "num_tokens": 28898315.0, "step": 24080 }, { "entropy": 1.8106766402721406, "epoch": 0.0746768922447164, "grad_norm": 11.25741958618164, "learning_rate": 9.257816656608314e-06, "loss": 0.554, "mean_token_accuracy": 0.8215843066573143, "num_tokens": 28912120.0, "step": 24090 }, { "entropy": 1.951988846063614, "epoch": 0.07470789136976609, "grad_norm": 9.571369171142578, "learning_rate": 9.25589566875608e-06, "loss": 0.6411, "mean_token_accuracy": 0.8124495342373848, "num_tokens": 28924042.0, "step": 24100 }, { "entropy": 1.850806337594986, "epoch": 0.07473889049481579, "grad_norm": 9.427949905395508, "learning_rate": 9.253975876217073e-06, "loss": 0.5342, "mean_token_accuracy": 0.8303135067224503, "num_tokens": 28936552.0, "step": 24110 }, { "entropy": 1.9311017155647279, "epoch": 0.07476988961986548, "grad_norm": 5.728914260864258, "learning_rate": 9.25205727775219e-06, "loss": 0.6528, "mean_token_accuracy": 0.8051609516143798, "num_tokens": 28948713.0, "step": 24120 }, { "entropy": 1.9310711935162543, "epoch": 0.07480088874491518, "grad_norm": 11.202091217041016, "learning_rate": 9.250139872124125e-06, "loss": 0.6041, "mean_token_accuracy": 0.8168716624379158, "num_tokens": 28959606.0, "step": 24130 }, { "entropy": 1.8889696165919303, "epoch": 0.07483188786996486, "grad_norm": 3.5714454650878906, "learning_rate": 9.248223658097366e-06, "loss": 0.547, "mean_token_accuracy": 0.8295445218682289, "num_tokens": 28971784.0, "step": 24140 }, { "entropy": 1.9066083252429962, "epoch": 0.07486288699501456, "grad_norm": 15.336698532104492, "learning_rate": 9.246308634438193e-06, "loss": 0.6225, "mean_token_accuracy": 0.814862783253193, "num_tokens": 28983340.0, "step": 24150 }, { "entropy": 1.9325315952301025, "epoch": 0.07489388612006426, "grad_norm": 10.859710693359375, "learning_rate": 9.244394799914674e-06, "loss": 0.6306, "mean_token_accuracy": 0.8129459515213966, "num_tokens": 28995466.0, "step": 24160 }, { "entropy": 1.9299397438764572, "epoch": 0.07492488524511395, "grad_norm": 11.286776542663574, "learning_rate": 9.242482153296657e-06, "loss": 0.6152, "mean_token_accuracy": 0.8135070979595185, "num_tokens": 29007025.0, "step": 24170 }, { "entropy": 1.9263689830899238, "epoch": 0.07495588437016365, "grad_norm": 8.885149002075195, "learning_rate": 9.240570693355777e-06, "loss": 0.6203, "mean_token_accuracy": 0.8149414286017418, "num_tokens": 29019269.0, "step": 24180 }, { "entropy": 1.9538878142833709, "epoch": 0.07498688349521335, "grad_norm": 12.976593971252441, "learning_rate": 9.238660418865444e-06, "loss": 0.6581, "mean_token_accuracy": 0.8068120896816253, "num_tokens": 29030437.0, "step": 24190 }, { "entropy": 1.9112654134631157, "epoch": 0.07501788262026304, "grad_norm": 9.866148948669434, "learning_rate": 9.236751328600838e-06, "loss": 0.5783, "mean_token_accuracy": 0.8209733456373215, "num_tokens": 29042237.0, "step": 24200 }, { "entropy": 2.0098152339458464, "epoch": 0.07504888174531274, "grad_norm": 9.860359191894531, "learning_rate": 9.234843421338919e-06, "loss": 0.691, "mean_token_accuracy": 0.7986131757497787, "num_tokens": 29053227.0, "step": 24210 }, { "entropy": 1.914099135249853, "epoch": 0.07507988087036244, "grad_norm": 3.684596300125122, "learning_rate": 9.232936695858406e-06, "loss": 0.5496, "mean_token_accuracy": 0.8275950834155082, "num_tokens": 29066785.0, "step": 24220 }, { "entropy": 1.8749552696943284, "epoch": 0.07511087999541213, "grad_norm": 9.360295295715332, "learning_rate": 9.231031150939787e-06, "loss": 0.5658, "mean_token_accuracy": 0.8239286541938782, "num_tokens": 29080125.0, "step": 24230 }, { "entropy": 1.9456191077828406, "epoch": 0.07514187912046183, "grad_norm": 9.981027603149414, "learning_rate": 9.229126785365307e-06, "loss": 0.6293, "mean_token_accuracy": 0.8097566932439804, "num_tokens": 29091466.0, "step": 24240 }, { "entropy": 1.9858911633491516, "epoch": 0.07517287824551153, "grad_norm": 10.95433521270752, "learning_rate": 9.227223597918977e-06, "loss": 0.6356, "mean_token_accuracy": 0.8043408066034317, "num_tokens": 29102658.0, "step": 24250 }, { "entropy": 1.9025256112217903, "epoch": 0.07520387737056121, "grad_norm": 4.23534631729126, "learning_rate": 9.225321587386555e-06, "loss": 0.5672, "mean_token_accuracy": 0.8235425055027008, "num_tokens": 29115098.0, "step": 24260 }, { "entropy": 1.9474715083837508, "epoch": 0.0752348764956109, "grad_norm": 8.94451904296875, "learning_rate": 9.22342075255555e-06, "loss": 0.5524, "mean_token_accuracy": 0.82459936439991, "num_tokens": 29127073.0, "step": 24270 }, { "entropy": 2.0239467322826385, "epoch": 0.0752658756206606, "grad_norm": 9.617466926574707, "learning_rate": 9.221521092215226e-06, "loss": 0.6738, "mean_token_accuracy": 0.8019678488373756, "num_tokens": 29137792.0, "step": 24280 }, { "entropy": 1.9810176610946655, "epoch": 0.0752968747457103, "grad_norm": 9.926680564880371, "learning_rate": 9.219622605156583e-06, "loss": 0.6936, "mean_token_accuracy": 0.8082947731018066, "num_tokens": 29148654.0, "step": 24290 }, { "entropy": 1.9755667805671693, "epoch": 0.07532787387076, "grad_norm": 9.290923118591309, "learning_rate": 9.217725290172373e-06, "loss": 0.6299, "mean_token_accuracy": 0.8143254667520523, "num_tokens": 29159751.0, "step": 24300 }, { "entropy": 1.9506172388792038, "epoch": 0.07535887299580969, "grad_norm": 4.7398362159729, "learning_rate": 9.215829146057074e-06, "loss": 0.6397, "mean_token_accuracy": 0.8142923697829246, "num_tokens": 29171860.0, "step": 24310 }, { "entropy": 1.964981135725975, "epoch": 0.07538987212085939, "grad_norm": 10.409062385559082, "learning_rate": 9.213934171606911e-06, "loss": 0.585, "mean_token_accuracy": 0.8187711656093597, "num_tokens": 29183756.0, "step": 24320 }, { "entropy": 1.8626610353589057, "epoch": 0.07542087124590909, "grad_norm": 16.499589920043945, "learning_rate": 9.212040365619834e-06, "loss": 0.5001, "mean_token_accuracy": 0.8303863450884819, "num_tokens": 29196691.0, "step": 24330 }, { "entropy": 1.9400402843952178, "epoch": 0.07545187037095878, "grad_norm": 10.061908721923828, "learning_rate": 9.210147726895522e-06, "loss": 0.6201, "mean_token_accuracy": 0.8127806216478348, "num_tokens": 29208279.0, "step": 24340 }, { "entropy": 1.9271167308092116, "epoch": 0.07548286949600848, "grad_norm": 4.295543193817139, "learning_rate": 9.208256254235383e-06, "loss": 0.5521, "mean_token_accuracy": 0.8224424123764038, "num_tokens": 29219803.0, "step": 24350 }, { "entropy": 1.9891034051775933, "epoch": 0.07551386862105817, "grad_norm": 12.840479850769043, "learning_rate": 9.206365946442545e-06, "loss": 0.6032, "mean_token_accuracy": 0.8187379658222198, "num_tokens": 29230787.0, "step": 24360 }, { "entropy": 1.9009444043040276, "epoch": 0.07554486774610787, "grad_norm": 10.091814041137695, "learning_rate": 9.204476802321853e-06, "loss": 0.5682, "mean_token_accuracy": 0.8159004956483841, "num_tokens": 29243031.0, "step": 24370 }, { "entropy": 1.9343391075730323, "epoch": 0.07557586687115757, "grad_norm": 11.57731819152832, "learning_rate": 9.202588820679873e-06, "loss": 0.5871, "mean_token_accuracy": 0.8243474781513214, "num_tokens": 29254685.0, "step": 24380 }, { "entropy": 1.795560409873724, "epoch": 0.07560686599620725, "grad_norm": 2.2424561977386475, "learning_rate": 9.20070200032488e-06, "loss": 0.5271, "mean_token_accuracy": 0.8258658021688461, "num_tokens": 29268202.0, "step": 24390 }, { "entropy": 1.896498514711857, "epoch": 0.07563786512125695, "grad_norm": 11.616437911987305, "learning_rate": 9.198816340066862e-06, "loss": 0.6379, "mean_token_accuracy": 0.8165041267871856, "num_tokens": 29280463.0, "step": 24400 }, { "entropy": 1.9246143698692322, "epoch": 0.07566886424630664, "grad_norm": 10.278308868408203, "learning_rate": 9.19693183871751e-06, "loss": 0.5943, "mean_token_accuracy": 0.8151498556137085, "num_tokens": 29292665.0, "step": 24410 }, { "entropy": 2.0228449046611785, "epoch": 0.07569986337135634, "grad_norm": 9.317980766296387, "learning_rate": 9.19504849509022e-06, "loss": 0.6758, "mean_token_accuracy": 0.7997980415821075, "num_tokens": 29304324.0, "step": 24420 }, { "entropy": 1.9563984453678132, "epoch": 0.07573086249640604, "grad_norm": 4.3756422996521, "learning_rate": 9.19316630800009e-06, "loss": 0.6041, "mean_token_accuracy": 0.8073103711009025, "num_tokens": 29316485.0, "step": 24430 }, { "entropy": 1.9286874875426292, "epoch": 0.07576186162145573, "grad_norm": 9.686562538146973, "learning_rate": 9.191285276263909e-06, "loss": 0.5309, "mean_token_accuracy": 0.8244198396801948, "num_tokens": 29329320.0, "step": 24440 }, { "entropy": 1.9644469603896142, "epoch": 0.07579286074650543, "grad_norm": 8.78291130065918, "learning_rate": 9.18940539870017e-06, "loss": 0.5696, "mean_token_accuracy": 0.8178132340312004, "num_tokens": 29340967.0, "step": 24450 }, { "entropy": 1.981376151740551, "epoch": 0.07582385987155513, "grad_norm": 10.671409606933594, "learning_rate": 9.187526674129046e-06, "loss": 0.6293, "mean_token_accuracy": 0.807844452559948, "num_tokens": 29353256.0, "step": 24460 }, { "entropy": 1.9730531215667724, "epoch": 0.07585485899660482, "grad_norm": 9.958746910095215, "learning_rate": 9.185649101372406e-06, "loss": 0.5865, "mean_token_accuracy": 0.8276485189795494, "num_tokens": 29363717.0, "step": 24470 }, { "entropy": 1.9136406406760216, "epoch": 0.07588585812165452, "grad_norm": 5.672607898712158, "learning_rate": 9.1837726792538e-06, "loss": 0.5799, "mean_token_accuracy": 0.8139689520001412, "num_tokens": 29375747.0, "step": 24480 }, { "entropy": 1.8405794501304626, "epoch": 0.07591685724670422, "grad_norm": 9.598546028137207, "learning_rate": 9.18189740659846e-06, "loss": 0.5324, "mean_token_accuracy": 0.825704300403595, "num_tokens": 29388350.0, "step": 24490 }, { "entropy": 1.9111091524362565, "epoch": 0.07594785637175391, "grad_norm": 11.881855964660645, "learning_rate": 9.180023282233297e-06, "loss": 0.6201, "mean_token_accuracy": 0.8082716703414917, "num_tokens": 29400465.0, "step": 24500 }, { "entropy": 1.9688442632555962, "epoch": 0.0759788554968036, "grad_norm": 9.742838859558105, "learning_rate": 9.178150304986897e-06, "loss": 0.5825, "mean_token_accuracy": 0.8214067354798317, "num_tokens": 29411568.0, "step": 24510 }, { "entropy": 1.8937147334218025, "epoch": 0.07600985462185329, "grad_norm": 10.891999244689941, "learning_rate": 9.17627847368952e-06, "loss": 0.5447, "mean_token_accuracy": 0.8176762789487839, "num_tokens": 29424053.0, "step": 24520 }, { "entropy": 1.831982211768627, "epoch": 0.07604085374690299, "grad_norm": 9.699090003967285, "learning_rate": 9.174407787173092e-06, "loss": 0.4613, "mean_token_accuracy": 0.8441127195954323, "num_tokens": 29436498.0, "step": 24530 }, { "entropy": 1.829953595995903, "epoch": 0.07607185287195269, "grad_norm": 10.976648330688477, "learning_rate": 9.172538244271205e-06, "loss": 0.5677, "mean_token_accuracy": 0.8100987046957016, "num_tokens": 29449302.0, "step": 24540 }, { "entropy": 1.8781290456652642, "epoch": 0.07610285199700238, "grad_norm": 10.623594284057617, "learning_rate": 9.170669843819118e-06, "loss": 0.6207, "mean_token_accuracy": 0.8065455496311188, "num_tokens": 29461309.0, "step": 24550 }, { "entropy": 1.9357499971985817, "epoch": 0.07613385112205208, "grad_norm": 5.401109218597412, "learning_rate": 9.16880258465375e-06, "loss": 0.6238, "mean_token_accuracy": 0.809730452299118, "num_tokens": 29473381.0, "step": 24560 }, { "entropy": 1.9581341043114662, "epoch": 0.07616485024710178, "grad_norm": 9.8287935256958, "learning_rate": 9.166936465613671e-06, "loss": 0.606, "mean_token_accuracy": 0.8168662905693054, "num_tokens": 29484514.0, "step": 24570 }, { "entropy": 1.8992894351482392, "epoch": 0.07619584937215147, "grad_norm": 10.469278335571289, "learning_rate": 9.165071485539113e-06, "loss": 0.5448, "mean_token_accuracy": 0.8262490287423134, "num_tokens": 29496684.0, "step": 24580 }, { "entropy": 1.856746034324169, "epoch": 0.07622684849720117, "grad_norm": 7.977566719055176, "learning_rate": 9.163207643271953e-06, "loss": 0.5472, "mean_token_accuracy": 0.8371643677353859, "num_tokens": 29508911.0, "step": 24590 }, { "entropy": 1.8318717867136, "epoch": 0.07625784762225087, "grad_norm": 7.776499271392822, "learning_rate": 9.161344937655717e-06, "loss": 0.5138, "mean_token_accuracy": 0.8247979655861855, "num_tokens": 29522385.0, "step": 24600 }, { "entropy": 1.9623229920864105, "epoch": 0.07628884674730056, "grad_norm": 11.194042205810547, "learning_rate": 9.159483367535581e-06, "loss": 0.5956, "mean_token_accuracy": 0.8133369132876396, "num_tokens": 29533515.0, "step": 24610 }, { "entropy": 1.8496048122644424, "epoch": 0.07631984587235026, "grad_norm": 9.94607162475586, "learning_rate": 9.157622931758355e-06, "loss": 0.4642, "mean_token_accuracy": 0.8247012510895729, "num_tokens": 29546981.0, "step": 24620 }, { "entropy": 1.8561659142374993, "epoch": 0.07635084499739994, "grad_norm": 4.343288421630859, "learning_rate": 9.155763629172494e-06, "loss": 0.5875, "mean_token_accuracy": 0.8228087961673737, "num_tokens": 29559906.0, "step": 24630 }, { "entropy": 1.8643113687634467, "epoch": 0.07638184412244964, "grad_norm": 3.739152193069458, "learning_rate": 9.153905458628086e-06, "loss": 0.5515, "mean_token_accuracy": 0.8250072717666626, "num_tokens": 29572877.0, "step": 24640 }, { "entropy": 1.9850792646408082, "epoch": 0.07641284324749933, "grad_norm": 11.358891487121582, "learning_rate": 9.152048418976852e-06, "loss": 0.656, "mean_token_accuracy": 0.8045062452554703, "num_tokens": 29584254.0, "step": 24650 }, { "entropy": 1.930943602323532, "epoch": 0.07644384237254903, "grad_norm": 9.818647384643555, "learning_rate": 9.150192509072147e-06, "loss": 0.5624, "mean_token_accuracy": 0.811825430393219, "num_tokens": 29596694.0, "step": 24660 }, { "entropy": 1.963192854821682, "epoch": 0.07647484149759873, "grad_norm": 10.888368606567383, "learning_rate": 9.148337727768948e-06, "loss": 0.6148, "mean_token_accuracy": 0.8172325447201729, "num_tokens": 29607626.0, "step": 24670 }, { "entropy": 1.8728375658392906, "epoch": 0.07650584062264842, "grad_norm": 9.656998634338379, "learning_rate": 9.146484073923858e-06, "loss": 0.6045, "mean_token_accuracy": 0.8192928373813629, "num_tokens": 29620518.0, "step": 24680 }, { "entropy": 1.9853510811924935, "epoch": 0.07653683974769812, "grad_norm": 9.287243843078613, "learning_rate": 9.144631546395098e-06, "loss": 0.6036, "mean_token_accuracy": 0.814111416041851, "num_tokens": 29632155.0, "step": 24690 }, { "entropy": 1.974936306476593, "epoch": 0.07656783887274782, "grad_norm": 9.676168441772461, "learning_rate": 9.142780144042515e-06, "loss": 0.6892, "mean_token_accuracy": 0.8018039375543594, "num_tokens": 29643855.0, "step": 24700 }, { "entropy": 1.9820726066827774, "epoch": 0.07659883799779751, "grad_norm": 10.02213191986084, "learning_rate": 9.140929865727566e-06, "loss": 0.5892, "mean_token_accuracy": 0.8112702906131745, "num_tokens": 29655657.0, "step": 24710 }, { "entropy": 1.909799675643444, "epoch": 0.07662983712284721, "grad_norm": 9.104336738586426, "learning_rate": 9.139080710313316e-06, "loss": 0.5391, "mean_token_accuracy": 0.8222946509718895, "num_tokens": 29667862.0, "step": 24720 }, { "entropy": 1.992139181494713, "epoch": 0.07666083624789691, "grad_norm": 4.088809013366699, "learning_rate": 9.137232676664449e-06, "loss": 0.66, "mean_token_accuracy": 0.7951893359422684, "num_tokens": 29679790.0, "step": 24730 }, { "entropy": 1.9576111137866974, "epoch": 0.0766918353729466, "grad_norm": 11.400434494018555, "learning_rate": 9.135385763647246e-06, "loss": 0.6328, "mean_token_accuracy": 0.8115091070532798, "num_tokens": 29691260.0, "step": 24740 }, { "entropy": 1.8499963372945785, "epoch": 0.0767228344979963, "grad_norm": 9.526920318603516, "learning_rate": 9.1335399701296e-06, "loss": 0.5439, "mean_token_accuracy": 0.8311129599809647, "num_tokens": 29704733.0, "step": 24750 }, { "entropy": 1.848706914484501, "epoch": 0.07675383362304598, "grad_norm": 9.934120178222656, "learning_rate": 9.131695294980995e-06, "loss": 0.5474, "mean_token_accuracy": 0.831969042122364, "num_tokens": 29717478.0, "step": 24760 }, { "entropy": 1.912044520676136, "epoch": 0.07678483274809568, "grad_norm": 10.17774772644043, "learning_rate": 9.129851737072522e-06, "loss": 0.63, "mean_token_accuracy": 0.8104033455252647, "num_tokens": 29728251.0, "step": 24770 }, { "entropy": 2.015045040845871, "epoch": 0.07681583187314538, "grad_norm": 7.955008506774902, "learning_rate": 9.128009295276862e-06, "loss": 0.6655, "mean_token_accuracy": 0.8075503215193749, "num_tokens": 29739613.0, "step": 24780 }, { "entropy": 1.8380137085914612, "epoch": 0.07684683099819507, "grad_norm": 6.014031887054443, "learning_rate": 9.126167968468289e-06, "loss": 0.5133, "mean_token_accuracy": 0.8174324378371238, "num_tokens": 29752547.0, "step": 24790 }, { "entropy": 2.001574045419693, "epoch": 0.07687783012324477, "grad_norm": 10.860368728637695, "learning_rate": 9.124327755522661e-06, "loss": 0.6676, "mean_token_accuracy": 0.808248932659626, "num_tokens": 29762867.0, "step": 24800 }, { "entropy": 2.0134127497673036, "epoch": 0.07690882924829447, "grad_norm": 9.547008514404297, "learning_rate": 9.122488655317434e-06, "loss": 0.7027, "mean_token_accuracy": 0.804673321545124, "num_tokens": 29773812.0, "step": 24810 }, { "entropy": 1.8855068862438202, "epoch": 0.07693982837334416, "grad_norm": 11.452570915222168, "learning_rate": 9.120650666731632e-06, "loss": 0.56, "mean_token_accuracy": 0.8221283480525017, "num_tokens": 29785964.0, "step": 24820 }, { "entropy": 1.8937954485416413, "epoch": 0.07697082749839386, "grad_norm": 9.164377212524414, "learning_rate": 9.118813788645872e-06, "loss": 0.6028, "mean_token_accuracy": 0.8206618830561638, "num_tokens": 29797873.0, "step": 24830 }, { "entropy": 1.8926649376749993, "epoch": 0.07700182662344356, "grad_norm": 10.76089096069336, "learning_rate": 9.116978019942341e-06, "loss": 0.5414, "mean_token_accuracy": 0.8190899342298508, "num_tokens": 29809898.0, "step": 24840 }, { "entropy": 1.739300973713398, "epoch": 0.07703282574849325, "grad_norm": 4.010213851928711, "learning_rate": 9.115143359504806e-06, "loss": 0.4485, "mean_token_accuracy": 0.8370007425546646, "num_tokens": 29824298.0, "step": 24850 }, { "entropy": 1.8673224583268166, "epoch": 0.07706382487354295, "grad_norm": 7.60786247253418, "learning_rate": 9.113309806218598e-06, "loss": 0.5623, "mean_token_accuracy": 0.8228920385241508, "num_tokens": 29836384.0, "step": 24860 }, { "entropy": 1.8416576564311982, "epoch": 0.07709482399859265, "grad_norm": 10.18614387512207, "learning_rate": 9.111477358970625e-06, "loss": 0.5068, "mean_token_accuracy": 0.8315700188279151, "num_tokens": 29848465.0, "step": 24870 }, { "entropy": 1.833722086250782, "epoch": 0.07712582312364233, "grad_norm": 9.657119750976562, "learning_rate": 9.109646016649358e-06, "loss": 0.5351, "mean_token_accuracy": 0.822961401939392, "num_tokens": 29860315.0, "step": 24880 }, { "entropy": 1.9155391678214073, "epoch": 0.07715682224869203, "grad_norm": 9.408291816711426, "learning_rate": 9.107815778144829e-06, "loss": 0.6015, "mean_token_accuracy": 0.8233883902430534, "num_tokens": 29871950.0, "step": 24890 }, { "entropy": 1.8746279999613762, "epoch": 0.07718782137374172, "grad_norm": 9.93213176727295, "learning_rate": 9.105986642348637e-06, "loss": 0.6101, "mean_token_accuracy": 0.8215365827083587, "num_tokens": 29883850.0, "step": 24900 }, { "entropy": 1.997341087460518, "epoch": 0.07721882049879142, "grad_norm": 10.22244930267334, "learning_rate": 9.104158608153925e-06, "loss": 0.6552, "mean_token_accuracy": 0.8083247482776642, "num_tokens": 29894752.0, "step": 24910 }, { "entropy": 1.9021727859973907, "epoch": 0.07724981962384112, "grad_norm": 10.58688735961914, "learning_rate": 9.10233167445541e-06, "loss": 0.5976, "mean_token_accuracy": 0.8210710018873215, "num_tokens": 29906184.0, "step": 24920 }, { "entropy": 1.9085007548332213, "epoch": 0.07728081874889081, "grad_norm": 10.130495071411133, "learning_rate": 9.100505840149343e-06, "loss": 0.5777, "mean_token_accuracy": 0.8177463442087174, "num_tokens": 29918911.0, "step": 24930 }, { "entropy": 1.949122653901577, "epoch": 0.07731181787394051, "grad_norm": 10.247212409973145, "learning_rate": 9.09868110413354e-06, "loss": 0.5698, "mean_token_accuracy": 0.8257071673870087, "num_tokens": 29930418.0, "step": 24940 }, { "entropy": 1.8943613111972808, "epoch": 0.0773428169989902, "grad_norm": 8.908804893493652, "learning_rate": 9.096857465307348e-06, "loss": 0.5782, "mean_token_accuracy": 0.8173254355788231, "num_tokens": 29942796.0, "step": 24950 }, { "entropy": 1.8852560982108115, "epoch": 0.0773738161240399, "grad_norm": 9.335540771484375, "learning_rate": 9.095034922571667e-06, "loss": 0.5485, "mean_token_accuracy": 0.8281083196401596, "num_tokens": 29954762.0, "step": 24960 }, { "entropy": 1.9339411437511445, "epoch": 0.0774048152490896, "grad_norm": 8.73486042022705, "learning_rate": 9.093213474828937e-06, "loss": 0.6839, "mean_token_accuracy": 0.8056422159075737, "num_tokens": 29966209.0, "step": 24970 }, { "entropy": 1.8483040496706962, "epoch": 0.0774358143741393, "grad_norm": 16.429473876953125, "learning_rate": 9.091393120983134e-06, "loss": 0.5784, "mean_token_accuracy": 0.8213301405310631, "num_tokens": 29979277.0, "step": 24980 }, { "entropy": 1.9246578827500342, "epoch": 0.07746681349918899, "grad_norm": 11.819490432739258, "learning_rate": 9.089573859939769e-06, "loss": 0.6234, "mean_token_accuracy": 0.8150523856282235, "num_tokens": 29991214.0, "step": 24990 }, { "entropy": 1.8754414036870002, "epoch": 0.07749781262423869, "grad_norm": 3.053318500518799, "learning_rate": 9.087755690605889e-06, "loss": 0.5299, "mean_token_accuracy": 0.8337288603186608, "num_tokens": 30004088.0, "step": 25000 }, { "entropy": 1.958650803565979, "epoch": 0.07752881174928837, "grad_norm": 9.075653076171875, "learning_rate": 9.085938611890065e-06, "loss": 0.6506, "mean_token_accuracy": 0.8138946458697319, "num_tokens": 30015387.0, "step": 25010 }, { "entropy": 1.831257238984108, "epoch": 0.07755981087433807, "grad_norm": 7.205061912536621, "learning_rate": 9.084122622702402e-06, "loss": 0.5113, "mean_token_accuracy": 0.8284579709172248, "num_tokens": 30028069.0, "step": 25020 }, { "entropy": 1.921176479756832, "epoch": 0.07759080999938776, "grad_norm": 9.383119583129883, "learning_rate": 9.082307721954523e-06, "loss": 0.5628, "mean_token_accuracy": 0.8308942928910256, "num_tokens": 30039720.0, "step": 25030 }, { "entropy": 1.881229367852211, "epoch": 0.07762180912443746, "grad_norm": 9.154582023620605, "learning_rate": 9.080493908559574e-06, "loss": 0.5715, "mean_token_accuracy": 0.8158265233039856, "num_tokens": 30052310.0, "step": 25040 }, { "entropy": 1.9333179131150247, "epoch": 0.07765280824948716, "grad_norm": 10.319096565246582, "learning_rate": 9.078681181432226e-06, "loss": 0.656, "mean_token_accuracy": 0.8071909308433532, "num_tokens": 30063377.0, "step": 25050 }, { "entropy": 1.9245691820979118, "epoch": 0.07768380737453685, "grad_norm": 9.286566734313965, "learning_rate": 9.076869539488652e-06, "loss": 0.5793, "mean_token_accuracy": 0.8230229288339614, "num_tokens": 30074509.0, "step": 25060 }, { "entropy": 1.9185208335518837, "epoch": 0.07771480649958655, "grad_norm": 10.67673110961914, "learning_rate": 9.075058981646555e-06, "loss": 0.6388, "mean_token_accuracy": 0.810713978111744, "num_tokens": 30086905.0, "step": 25070 }, { "entropy": 1.9412970080971719, "epoch": 0.07774580562463625, "grad_norm": 8.668137550354004, "learning_rate": 9.073249506825138e-06, "loss": 0.5973, "mean_token_accuracy": 0.8281246423721313, "num_tokens": 30098285.0, "step": 25080 }, { "entropy": 1.9020920038223266, "epoch": 0.07777680474968594, "grad_norm": 4.553186416625977, "learning_rate": 9.071441113945115e-06, "loss": 0.5749, "mean_token_accuracy": 0.8125508442521095, "num_tokens": 30109763.0, "step": 25090 }, { "entropy": 1.9090612187981606, "epoch": 0.07780780387473564, "grad_norm": 9.096549987792969, "learning_rate": 9.069633801928704e-06, "loss": 0.5595, "mean_token_accuracy": 0.8202937185764313, "num_tokens": 30121903.0, "step": 25100 }, { "entropy": 1.9104878604412079, "epoch": 0.07783880299978534, "grad_norm": 10.058632850646973, "learning_rate": 9.067827569699626e-06, "loss": 0.6086, "mean_token_accuracy": 0.8223781570792198, "num_tokens": 30134136.0, "step": 25110 }, { "entropy": 1.9066944912075996, "epoch": 0.07786980212483503, "grad_norm": 7.3937835693359375, "learning_rate": 9.066022416183104e-06, "loss": 0.6247, "mean_token_accuracy": 0.8184296682476997, "num_tokens": 30145673.0, "step": 25120 }, { "entropy": 1.977784439921379, "epoch": 0.07790080124988472, "grad_norm": 8.76796817779541, "learning_rate": 9.064218340305854e-06, "loss": 0.6401, "mean_token_accuracy": 0.8132604837417603, "num_tokens": 30156471.0, "step": 25130 }, { "entropy": 1.886358195543289, "epoch": 0.07793180037493441, "grad_norm": 8.512279510498047, "learning_rate": 9.06241534099609e-06, "loss": 0.5172, "mean_token_accuracy": 0.8291906744241715, "num_tokens": 30168512.0, "step": 25140 }, { "entropy": 1.9338295564055443, "epoch": 0.07796279949998411, "grad_norm": 8.981732368469238, "learning_rate": 9.060613417183516e-06, "loss": 0.6373, "mean_token_accuracy": 0.8155599415302277, "num_tokens": 30180559.0, "step": 25150 }, { "entropy": 1.851930246502161, "epoch": 0.0779937986250338, "grad_norm": 11.297672271728516, "learning_rate": 9.058812567799327e-06, "loss": 0.4845, "mean_token_accuracy": 0.8301180645823478, "num_tokens": 30194350.0, "step": 25160 }, { "entropy": 1.932929702103138, "epoch": 0.0780247977500835, "grad_norm": 4.530979156494141, "learning_rate": 9.0570127917762e-06, "loss": 0.5372, "mean_token_accuracy": 0.823866055905819, "num_tokens": 30206582.0, "step": 25170 }, { "entropy": 1.9074819549918174, "epoch": 0.0780557968751332, "grad_norm": 4.458383560180664, "learning_rate": 9.055214088048302e-06, "loss": 0.5628, "mean_token_accuracy": 0.8161739498376847, "num_tokens": 30218782.0, "step": 25180 }, { "entropy": 1.9132570594549179, "epoch": 0.0780867960001829, "grad_norm": Infinity, "learning_rate": 9.053416455551274e-06, "loss": 0.5894, "mean_token_accuracy": 0.8196236163377761, "num_tokens": 30230586.0, "step": 25190 }, { "entropy": 1.8898057445883751, "epoch": 0.07811779512523259, "grad_norm": 10.808612823486328, "learning_rate": 9.051619893222242e-06, "loss": 0.5555, "mean_token_accuracy": 0.8312712505459785, "num_tokens": 30242839.0, "step": 25200 }, { "entropy": 1.8855190083384514, "epoch": 0.07814879425028229, "grad_norm": 10.371384620666504, "learning_rate": 9.049824399999807e-06, "loss": 0.5713, "mean_token_accuracy": 0.8088703706860543, "num_tokens": 30255525.0, "step": 25210 }, { "entropy": 1.8693858250975608, "epoch": 0.07817979337533199, "grad_norm": 11.21320629119873, "learning_rate": 9.048029974824037e-06, "loss": 0.5543, "mean_token_accuracy": 0.826404581964016, "num_tokens": 30268078.0, "step": 25220 }, { "entropy": 1.9001759216189384, "epoch": 0.07821079250038168, "grad_norm": 8.04379940032959, "learning_rate": 9.046236616636477e-06, "loss": 0.6188, "mean_token_accuracy": 0.8241765573620796, "num_tokens": 30280040.0, "step": 25230 }, { "entropy": 1.9273234009742737, "epoch": 0.07824179162543138, "grad_norm": 11.019646644592285, "learning_rate": 9.044444324380139e-06, "loss": 0.5832, "mean_token_accuracy": 0.8227365925908089, "num_tokens": 30290832.0, "step": 25240 }, { "entropy": 1.9361531496047975, "epoch": 0.07827279075048106, "grad_norm": 3.8891756534576416, "learning_rate": 9.042653096999496e-06, "loss": 0.6237, "mean_token_accuracy": 0.8081121280789375, "num_tokens": 30302299.0, "step": 25250 }, { "entropy": 1.96910317838192, "epoch": 0.07830378987553076, "grad_norm": 8.430257797241211, "learning_rate": 9.04086293344049e-06, "loss": 0.5955, "mean_token_accuracy": 0.8174209460616112, "num_tokens": 30313745.0, "step": 25260 }, { "entropy": 1.8507313832640648, "epoch": 0.07833478900058045, "grad_norm": 10.676153182983398, "learning_rate": 9.039073832650518e-06, "loss": 0.5944, "mean_token_accuracy": 0.8205841019749641, "num_tokens": 30326883.0, "step": 25270 }, { "entropy": 1.963607743382454, "epoch": 0.07836578812563015, "grad_norm": 10.588327407836914, "learning_rate": 9.037285793578439e-06, "loss": 0.6089, "mean_token_accuracy": 0.8108539909124375, "num_tokens": 30338102.0, "step": 25280 }, { "entropy": 1.9715974554419518, "epoch": 0.07839678725067985, "grad_norm": 9.336774826049805, "learning_rate": 9.035498815174564e-06, "loss": 0.6256, "mean_token_accuracy": 0.8076756104826928, "num_tokens": 30349489.0, "step": 25290 }, { "entropy": 1.9550383672118188, "epoch": 0.07842778637572954, "grad_norm": 9.92357063293457, "learning_rate": 9.033712896390654e-06, "loss": 0.6597, "mean_token_accuracy": 0.803634025156498, "num_tokens": 30361182.0, "step": 25300 }, { "entropy": 1.9395698383450508, "epoch": 0.07845878550077924, "grad_norm": 8.71617603302002, "learning_rate": 9.031928036179925e-06, "loss": 0.631, "mean_token_accuracy": 0.8065466418862343, "num_tokens": 30372996.0, "step": 25310 }, { "entropy": 1.847324576973915, "epoch": 0.07848978462582894, "grad_norm": 5.6480584144592285, "learning_rate": 9.030144233497038e-06, "loss": 0.5279, "mean_token_accuracy": 0.8144370660185813, "num_tokens": 30386024.0, "step": 25320 }, { "entropy": 1.8323638439178467, "epoch": 0.07852078375087863, "grad_norm": 10.176698684692383, "learning_rate": 9.028361487298097e-06, "loss": 0.5583, "mean_token_accuracy": 0.8260697081685067, "num_tokens": 30399072.0, "step": 25330 }, { "entropy": 1.9654901623725891, "epoch": 0.07855178287592833, "grad_norm": 10.919507026672363, "learning_rate": 9.026579796540651e-06, "loss": 0.6562, "mean_token_accuracy": 0.8032170712947846, "num_tokens": 30410445.0, "step": 25340 }, { "entropy": 1.8728495821356774, "epoch": 0.07858278200097803, "grad_norm": 9.558503150939941, "learning_rate": 9.024799160183686e-06, "loss": 0.5595, "mean_token_accuracy": 0.82089733928442, "num_tokens": 30422438.0, "step": 25350 }, { "entropy": 1.8767936065793038, "epoch": 0.07861378112602772, "grad_norm": 6.74028205871582, "learning_rate": 9.023019577187625e-06, "loss": 0.5351, "mean_token_accuracy": 0.8307792738080024, "num_tokens": 30435068.0, "step": 25360 }, { "entropy": 1.9264275386929512, "epoch": 0.07864478025107742, "grad_norm": 9.8104829788208, "learning_rate": 9.021241046514326e-06, "loss": 0.5865, "mean_token_accuracy": 0.8222647801041603, "num_tokens": 30446563.0, "step": 25370 }, { "entropy": 1.93300940990448, "epoch": 0.0786757793761271, "grad_norm": 9.438268661499023, "learning_rate": 9.019463567127084e-06, "loss": 0.6245, "mean_token_accuracy": 0.813123632967472, "num_tokens": 30457702.0, "step": 25380 }, { "entropy": 1.9117926597595214, "epoch": 0.0787067785011768, "grad_norm": 10.080045700073242, "learning_rate": 9.017687137990611e-06, "loss": 0.6594, "mean_token_accuracy": 0.8094756618142128, "num_tokens": 30469833.0, "step": 25390 }, { "entropy": 1.9238570496439933, "epoch": 0.0787377776262265, "grad_norm": 9.324179649353027, "learning_rate": 9.01591175807106e-06, "loss": 0.6385, "mean_token_accuracy": 0.8032417684793473, "num_tokens": 30481722.0, "step": 25400 }, { "entropy": 1.9335221163928509, "epoch": 0.07876877675127619, "grad_norm": 10.291169166564941, "learning_rate": 9.014137426335997e-06, "loss": 0.6627, "mean_token_accuracy": 0.8041873052716255, "num_tokens": 30494711.0, "step": 25410 }, { "entropy": 1.8992304280400276, "epoch": 0.07879977587632589, "grad_norm": 9.40573787689209, "learning_rate": 9.012364141754415e-06, "loss": 0.5454, "mean_token_accuracy": 0.8209843635559082, "num_tokens": 30506481.0, "step": 25420 }, { "entropy": 1.9901362270116807, "epoch": 0.07883077500137559, "grad_norm": 11.049663543701172, "learning_rate": 9.010591903296726e-06, "loss": 0.652, "mean_token_accuracy": 0.8064600333571434, "num_tokens": 30517738.0, "step": 25430 }, { "entropy": 1.9595788344740868, "epoch": 0.07886177412642528, "grad_norm": 9.368852615356445, "learning_rate": 9.008820709934756e-06, "loss": 0.6615, "mean_token_accuracy": 0.7978012830018997, "num_tokens": 30529230.0, "step": 25440 }, { "entropy": 1.8642796009778977, "epoch": 0.07889277325147498, "grad_norm": 5.745087146759033, "learning_rate": 9.00705056064175e-06, "loss": 0.4942, "mean_token_accuracy": 0.8303910657763481, "num_tokens": 30542462.0, "step": 25450 }, { "entropy": 1.890346498787403, "epoch": 0.07892377237652468, "grad_norm": 12.036408424377441, "learning_rate": 9.00528145439236e-06, "loss": 0.5508, "mean_token_accuracy": 0.8230859890580178, "num_tokens": 30554735.0, "step": 25460 }, { "entropy": 1.9690846800804138, "epoch": 0.07895477150157437, "grad_norm": 10.138855934143066, "learning_rate": 9.00351339016265e-06, "loss": 0.6098, "mean_token_accuracy": 0.8199924737215042, "num_tokens": 30566201.0, "step": 25470 }, { "entropy": 2.0111054688692094, "epoch": 0.07898577062662407, "grad_norm": 8.961263656616211, "learning_rate": 9.001746366930088e-06, "loss": 0.6215, "mean_token_accuracy": 0.8159183651208878, "num_tokens": 30577578.0, "step": 25480 }, { "entropy": 1.9971545487642288, "epoch": 0.07901676975167377, "grad_norm": 8.601164817810059, "learning_rate": 8.99998038367355e-06, "loss": 0.6461, "mean_token_accuracy": 0.8124226480722427, "num_tokens": 30589416.0, "step": 25490 }, { "entropy": 2.0272182136774064, "epoch": 0.07904776887672345, "grad_norm": 9.910305976867676, "learning_rate": 8.99821543937331e-06, "loss": 0.7675, "mean_token_accuracy": 0.8009614288806916, "num_tokens": 30601049.0, "step": 25500 }, { "entropy": 1.9942003041505814, "epoch": 0.07907876800177314, "grad_norm": 9.830111503601074, "learning_rate": 8.996451533011044e-06, "loss": 0.62, "mean_token_accuracy": 0.8120215192437172, "num_tokens": 30611769.0, "step": 25510 }, { "entropy": 1.929137869179249, "epoch": 0.07910976712682284, "grad_norm": 10.363052368164062, "learning_rate": 8.994688663569825e-06, "loss": 0.6134, "mean_token_accuracy": 0.8164379015564919, "num_tokens": 30623572.0, "step": 25520 }, { "entropy": 1.9560409665107727, "epoch": 0.07914076625187254, "grad_norm": 10.759134292602539, "learning_rate": 8.992926830034117e-06, "loss": 0.609, "mean_token_accuracy": 0.8174306198954582, "num_tokens": 30634807.0, "step": 25530 }, { "entropy": 2.0076246559619904, "epoch": 0.07917176537692223, "grad_norm": 10.531962394714355, "learning_rate": 8.991166031389779e-06, "loss": 0.7254, "mean_token_accuracy": 0.7958659037947655, "num_tokens": 30645607.0, "step": 25540 }, { "entropy": 1.8035003036260604, "epoch": 0.07920276450197193, "grad_norm": 8.862020492553711, "learning_rate": 8.989406266624054e-06, "loss": 0.4378, "mean_token_accuracy": 0.8288759797811508, "num_tokens": 30659160.0, "step": 25550 }, { "entropy": 1.9678311094641685, "epoch": 0.07923376362702163, "grad_norm": 9.012432098388672, "learning_rate": 8.987647534725585e-06, "loss": 0.6402, "mean_token_accuracy": 0.8036282330751419, "num_tokens": 30670411.0, "step": 25560 }, { "entropy": 1.9711192145943641, "epoch": 0.07926476275207132, "grad_norm": 10.310850143432617, "learning_rate": 8.985889834684384e-06, "loss": 0.6003, "mean_token_accuracy": 0.8189939826726913, "num_tokens": 30682267.0, "step": 25570 }, { "entropy": 1.792180709540844, "epoch": 0.07929576187712102, "grad_norm": 12.734272956848145, "learning_rate": 8.984133165491855e-06, "loss": 0.4795, "mean_token_accuracy": 0.8344605669379235, "num_tokens": 30695155.0, "step": 25580 }, { "entropy": 1.9018129274249076, "epoch": 0.07932676100217072, "grad_norm": 9.23828125, "learning_rate": 8.982377526140776e-06, "loss": 0.6423, "mean_token_accuracy": 0.8115654811263084, "num_tokens": 30708151.0, "step": 25590 }, { "entropy": 1.9214498043060302, "epoch": 0.07935776012722041, "grad_norm": 4.616427421569824, "learning_rate": 8.98062291562531e-06, "loss": 0.5874, "mean_token_accuracy": 0.817217455804348, "num_tokens": 30719730.0, "step": 25600 }, { "entropy": 1.8673983976244926, "epoch": 0.07938875925227011, "grad_norm": 9.74424934387207, "learning_rate": 8.978869332940982e-06, "loss": 0.5426, "mean_token_accuracy": 0.8221541255712509, "num_tokens": 30732691.0, "step": 25610 }, { "entropy": 1.8662855371832847, "epoch": 0.0794197583773198, "grad_norm": 10.79581069946289, "learning_rate": 8.977116777084705e-06, "loss": 0.5544, "mean_token_accuracy": 0.8190583303570748, "num_tokens": 30745114.0, "step": 25620 }, { "entropy": 2.0325272887945176, "epoch": 0.07945075750236949, "grad_norm": 10.473299026489258, "learning_rate": 8.97536524705475e-06, "loss": 0.677, "mean_token_accuracy": 0.8067957848310471, "num_tokens": 30756265.0, "step": 25630 }, { "entropy": 1.974963715672493, "epoch": 0.07948175662741919, "grad_norm": 8.578821182250977, "learning_rate": 8.97361474185076e-06, "loss": 0.5835, "mean_token_accuracy": 0.8203337907791137, "num_tokens": 30767729.0, "step": 25640 }, { "entropy": 1.9518730476498605, "epoch": 0.07951275575246888, "grad_norm": 9.881929397583008, "learning_rate": 8.971865260473745e-06, "loss": 0.6164, "mean_token_accuracy": 0.8127556294202805, "num_tokens": 30779093.0, "step": 25650 }, { "entropy": 1.9101403415203095, "epoch": 0.07954375487751858, "grad_norm": 9.09304428100586, "learning_rate": 8.970116801926072e-06, "loss": 0.5419, "mean_token_accuracy": 0.833724494278431, "num_tokens": 30790837.0, "step": 25660 }, { "entropy": 1.8745130628347397, "epoch": 0.07957475400256828, "grad_norm": 9.2150239944458, "learning_rate": 8.968369365211478e-06, "loss": 0.5464, "mean_token_accuracy": 0.8339875742793084, "num_tokens": 30802641.0, "step": 25670 }, { "entropy": 1.94149309694767, "epoch": 0.07960575312761797, "grad_norm": 9.093975067138672, "learning_rate": 8.966622949335044e-06, "loss": 0.6163, "mean_token_accuracy": 0.8239960089325905, "num_tokens": 30813720.0, "step": 25680 }, { "entropy": 1.8855563715100288, "epoch": 0.07963675225266767, "grad_norm": 4.550987243652344, "learning_rate": 8.964877553303222e-06, "loss": 0.5868, "mean_token_accuracy": 0.8233138158917427, "num_tokens": 30825831.0, "step": 25690 }, { "entropy": 1.9300659999251366, "epoch": 0.07966775137771737, "grad_norm": 10.610151290893555, "learning_rate": 8.963133176123809e-06, "loss": 0.5674, "mean_token_accuracy": 0.8244445592164993, "num_tokens": 30837291.0, "step": 25700 }, { "entropy": 1.9518328681588173, "epoch": 0.07969875050276706, "grad_norm": 10.140637397766113, "learning_rate": 8.96138981680595e-06, "loss": 0.6133, "mean_token_accuracy": 0.8174855709075928, "num_tokens": 30848725.0, "step": 25710 }, { "entropy": 1.9481216147542, "epoch": 0.07972974962781676, "grad_norm": 5.342643737792969, "learning_rate": 8.959647474360146e-06, "loss": 0.6286, "mean_token_accuracy": 0.8104046359658241, "num_tokens": 30860275.0, "step": 25720 }, { "entropy": 1.9668898478150367, "epoch": 0.07976074875286646, "grad_norm": 9.690641403198242, "learning_rate": 8.95790614779824e-06, "loss": 0.644, "mean_token_accuracy": 0.8048096477985383, "num_tokens": 30872020.0, "step": 25730 }, { "entropy": 1.895005388557911, "epoch": 0.07979174787791615, "grad_norm": 13.030941009521484, "learning_rate": 8.956165836133419e-06, "loss": 0.6495, "mean_token_accuracy": 0.8069834470748901, "num_tokens": 30884853.0, "step": 25740 }, { "entropy": 1.9691629260778427, "epoch": 0.07982274700296584, "grad_norm": 11.10064697265625, "learning_rate": 8.954426538380212e-06, "loss": 0.6367, "mean_token_accuracy": 0.8148859232664108, "num_tokens": 30895623.0, "step": 25750 }, { "entropy": 1.9379816070199012, "epoch": 0.07985374612801553, "grad_norm": 7.397519588470459, "learning_rate": 8.952688253554488e-06, "loss": 0.6139, "mean_token_accuracy": 0.820967635512352, "num_tokens": 30907044.0, "step": 25760 }, { "entropy": 1.8707928493618966, "epoch": 0.07988474525306523, "grad_norm": 8.89204216003418, "learning_rate": 8.950950980673451e-06, "loss": 0.5938, "mean_token_accuracy": 0.8097473427653312, "num_tokens": 30919270.0, "step": 25770 }, { "entropy": 1.8771313697099685, "epoch": 0.07991574437811493, "grad_norm": 9.017126083374023, "learning_rate": 8.949214718755644e-06, "loss": 0.6379, "mean_token_accuracy": 0.8157517641782761, "num_tokens": 30931930.0, "step": 25780 }, { "entropy": 1.9514517337083817, "epoch": 0.07994674350316462, "grad_norm": 9.412887573242188, "learning_rate": 8.947479466820933e-06, "loss": 0.6404, "mean_token_accuracy": 0.807883444428444, "num_tokens": 30943436.0, "step": 25790 }, { "entropy": 1.9551150798797607, "epoch": 0.07997774262821432, "grad_norm": 10.580997467041016, "learning_rate": 8.945745223890525e-06, "loss": 0.635, "mean_token_accuracy": 0.8118805930018425, "num_tokens": 30955093.0, "step": 25800 }, { "entropy": 1.9215492516756059, "epoch": 0.08000874175326401, "grad_norm": 4.973093509674072, "learning_rate": 8.944011988986943e-06, "loss": 0.6226, "mean_token_accuracy": 0.8189863070845604, "num_tokens": 30966468.0, "step": 25810 }, { "entropy": 1.9604627519845963, "epoch": 0.08003974087831371, "grad_norm": 11.397422790527344, "learning_rate": 8.942279761134045e-06, "loss": 0.6099, "mean_token_accuracy": 0.8068236276507378, "num_tokens": 30978392.0, "step": 25820 }, { "entropy": 1.8480182617902756, "epoch": 0.08007074000336341, "grad_norm": 4.0751261711120605, "learning_rate": 8.940548539357008e-06, "loss": 0.5905, "mean_token_accuracy": 0.81248699426651, "num_tokens": 30992369.0, "step": 25830 }, { "entropy": 1.8566480681300164, "epoch": 0.0801017391284131, "grad_norm": 6.9857916831970215, "learning_rate": 8.938818322682328e-06, "loss": 0.5375, "mean_token_accuracy": 0.8251223266124725, "num_tokens": 31005133.0, "step": 25840 }, { "entropy": 1.951548993587494, "epoch": 0.0801327382534628, "grad_norm": 9.003771781921387, "learning_rate": 8.937089110137822e-06, "loss": 0.6049, "mean_token_accuracy": 0.812137958407402, "num_tokens": 31017000.0, "step": 25850 }, { "entropy": 1.9318700328469276, "epoch": 0.0801637373785125, "grad_norm": 10.689835548400879, "learning_rate": 8.935360900752618e-06, "loss": 0.5753, "mean_token_accuracy": 0.8234897822141647, "num_tokens": 31028427.0, "step": 25860 }, { "entropy": 1.9848022490739823, "epoch": 0.08019473650356218, "grad_norm": 9.83168888092041, "learning_rate": 8.933633693557168e-06, "loss": 0.6557, "mean_token_accuracy": 0.8102818682789803, "num_tokens": 31039440.0, "step": 25870 }, { "entropy": 1.8783807411789895, "epoch": 0.08022573562861188, "grad_norm": 10.153984069824219, "learning_rate": 8.931907487583224e-06, "loss": 0.5498, "mean_token_accuracy": 0.8286775290966034, "num_tokens": 31051086.0, "step": 25880 }, { "entropy": 1.9235938981175422, "epoch": 0.08025673475366157, "grad_norm": 10.353632926940918, "learning_rate": 8.930182281863854e-06, "loss": 0.6119, "mean_token_accuracy": 0.8167414903640747, "num_tokens": 31062372.0, "step": 25890 }, { "entropy": 1.9412924006581307, "epoch": 0.08028773387871127, "grad_norm": 9.35558032989502, "learning_rate": 8.928458075433428e-06, "loss": 0.6887, "mean_token_accuracy": 0.7980827406048775, "num_tokens": 31074103.0, "step": 25900 }, { "entropy": 1.9117850810289383, "epoch": 0.08031873300376097, "grad_norm": 8.50671100616455, "learning_rate": 8.926734867327626e-06, "loss": 0.5881, "mean_token_accuracy": 0.80945935100317, "num_tokens": 31086274.0, "step": 25910 }, { "entropy": 1.9611097499728203, "epoch": 0.08034973212881066, "grad_norm": 8.947006225585938, "learning_rate": 8.925012656583428e-06, "loss": 0.6327, "mean_token_accuracy": 0.8106518238782883, "num_tokens": 31097609.0, "step": 25920 }, { "entropy": 1.941481387615204, "epoch": 0.08038073125386036, "grad_norm": 4.701364040374756, "learning_rate": 8.923291442239114e-06, "loss": 0.5451, "mean_token_accuracy": 0.8272410377860069, "num_tokens": 31109738.0, "step": 25930 }, { "entropy": 1.9850254267454148, "epoch": 0.08041173037891006, "grad_norm": 10.166258811950684, "learning_rate": 8.921571223334262e-06, "loss": 0.6038, "mean_token_accuracy": 0.8136166602373123, "num_tokens": 31120962.0, "step": 25940 }, { "entropy": 1.8603035241365433, "epoch": 0.08044272950395975, "grad_norm": 9.798973083496094, "learning_rate": 8.919851998909738e-06, "loss": 0.5282, "mean_token_accuracy": 0.833336153626442, "num_tokens": 31133382.0, "step": 25950 }, { "entropy": 1.9528677210211753, "epoch": 0.08047372862900945, "grad_norm": 10.009119033813477, "learning_rate": 8.918133768007718e-06, "loss": 0.6604, "mean_token_accuracy": 0.806072898209095, "num_tokens": 31144362.0, "step": 25960 }, { "entropy": 1.8188411056995393, "epoch": 0.08050472775405915, "grad_norm": 8.478353500366211, "learning_rate": 8.916416529671652e-06, "loss": 0.5355, "mean_token_accuracy": 0.8259258419275284, "num_tokens": 31157130.0, "step": 25970 }, { "entropy": 1.855710941553116, "epoch": 0.08053572687910884, "grad_norm": 4.531567096710205, "learning_rate": 8.914700282946292e-06, "loss": 0.5377, "mean_token_accuracy": 0.826113897562027, "num_tokens": 31169756.0, "step": 25980 }, { "entropy": 1.8293490082025528, "epoch": 0.08056672600415853, "grad_norm": 4.700442790985107, "learning_rate": 8.912985026877668e-06, "loss": 0.5051, "mean_token_accuracy": 0.8252740427851677, "num_tokens": 31182565.0, "step": 25990 }, { "entropy": 1.8819095849990846, "epoch": 0.08059772512920822, "grad_norm": 10.227608680725098, "learning_rate": 8.911270760513097e-06, "loss": 0.5533, "mean_token_accuracy": 0.8177149787545204, "num_tokens": 31194457.0, "step": 26000 }, { "entropy": 1.9495192840695381, "epoch": 0.08062872425425792, "grad_norm": 10.328841209411621, "learning_rate": 8.909557482901179e-06, "loss": 0.5876, "mean_token_accuracy": 0.8159580215811729, "num_tokens": 31205886.0, "step": 26010 }, { "entropy": 1.9724682122468948, "epoch": 0.08065972337930762, "grad_norm": 10.351836204528809, "learning_rate": 8.907845193091793e-06, "loss": 0.6892, "mean_token_accuracy": 0.7933256924152374, "num_tokens": 31216798.0, "step": 26020 }, { "entropy": 1.8941216632723807, "epoch": 0.08069072250435731, "grad_norm": 11.160825729370117, "learning_rate": 8.906133890136095e-06, "loss": 0.5391, "mean_token_accuracy": 0.8280467942357064, "num_tokens": 31229398.0, "step": 26030 }, { "entropy": 1.9749949797987938, "epoch": 0.08072172162940701, "grad_norm": 7.923883438110352, "learning_rate": 8.90442357308652e-06, "loss": 0.6518, "mean_token_accuracy": 0.8020041942596435, "num_tokens": 31240647.0, "step": 26040 }, { "entropy": 1.9157247439026832, "epoch": 0.0807527207544567, "grad_norm": 8.247703552246094, "learning_rate": 8.902714240996773e-06, "loss": 0.5723, "mean_token_accuracy": 0.8275446742773056, "num_tokens": 31253058.0, "step": 26050 }, { "entropy": 1.9141668871045112, "epoch": 0.0807837198795064, "grad_norm": 4.465513706207275, "learning_rate": 8.901005892921827e-06, "loss": 0.5543, "mean_token_accuracy": 0.81632010191679, "num_tokens": 31265597.0, "step": 26060 }, { "entropy": 1.998997524380684, "epoch": 0.0808147190045561, "grad_norm": 9.465398788452148, "learning_rate": 8.899298527917932e-06, "loss": 0.625, "mean_token_accuracy": 0.8183861076831818, "num_tokens": 31276394.0, "step": 26070 }, { "entropy": 1.9701352685689926, "epoch": 0.0808457181296058, "grad_norm": 9.879446029663086, "learning_rate": 8.897592145042599e-06, "loss": 0.6342, "mean_token_accuracy": 0.8082750916481019, "num_tokens": 31287446.0, "step": 26080 }, { "entropy": 1.8816271275281906, "epoch": 0.08087671725465549, "grad_norm": 10.568394660949707, "learning_rate": 8.895886743354607e-06, "loss": 0.538, "mean_token_accuracy": 0.8256219759583473, "num_tokens": 31299916.0, "step": 26090 }, { "entropy": 1.9000601902604104, "epoch": 0.08090771637970519, "grad_norm": 9.578839302062988, "learning_rate": 8.89418232191399e-06, "loss": 0.57, "mean_token_accuracy": 0.8156951382756233, "num_tokens": 31313478.0, "step": 26100 }, { "entropy": 1.9269771069288253, "epoch": 0.08093871550475489, "grad_norm": 5.241694927215576, "learning_rate": 8.892478879782055e-06, "loss": 0.5895, "mean_token_accuracy": 0.822375500202179, "num_tokens": 31325006.0, "step": 26110 }, { "entropy": 1.915390558540821, "epoch": 0.08096971462980457, "grad_norm": 10.392666816711426, "learning_rate": 8.89077641602135e-06, "loss": 0.5677, "mean_token_accuracy": 0.8208850100636482, "num_tokens": 31336358.0, "step": 26120 }, { "entropy": 1.8701285541057586, "epoch": 0.08100071375485426, "grad_norm": 8.39531135559082, "learning_rate": 8.889074929695697e-06, "loss": 0.5223, "mean_token_accuracy": 0.8237580001354218, "num_tokens": 31349262.0, "step": 26130 }, { "entropy": 1.9644879460334779, "epoch": 0.08103171287990396, "grad_norm": 8.174306869506836, "learning_rate": 8.887374419870162e-06, "loss": 0.6391, "mean_token_accuracy": 0.8160557880997658, "num_tokens": 31360978.0, "step": 26140 }, { "entropy": 1.870239832997322, "epoch": 0.08106271200495366, "grad_norm": 4.718651294708252, "learning_rate": 8.885674885611059e-06, "loss": 0.5204, "mean_token_accuracy": 0.8182685613632202, "num_tokens": 31374020.0, "step": 26150 }, { "entropy": 1.8591389670968055, "epoch": 0.08109371113000335, "grad_norm": 8.56635570526123, "learning_rate": 8.883976325985959e-06, "loss": 0.5134, "mean_token_accuracy": 0.8392164379358291, "num_tokens": 31386295.0, "step": 26160 }, { "entropy": 2.010419914126396, "epoch": 0.08112471025505305, "grad_norm": 10.94079303741455, "learning_rate": 8.882278740063671e-06, "loss": 0.7017, "mean_token_accuracy": 0.804059025645256, "num_tokens": 31397492.0, "step": 26170 }, { "entropy": 1.9494641929864884, "epoch": 0.08115570938010275, "grad_norm": 3.204777717590332, "learning_rate": 8.880582126914265e-06, "loss": 0.6059, "mean_token_accuracy": 0.8109059870243073, "num_tokens": 31408976.0, "step": 26180 }, { "entropy": 1.9177653357386588, "epoch": 0.08118670850515244, "grad_norm": 9.458409309387207, "learning_rate": 8.878886485609038e-06, "loss": 0.5732, "mean_token_accuracy": 0.8205662295222282, "num_tokens": 31421691.0, "step": 26190 }, { "entropy": 1.941348561644554, "epoch": 0.08121770763020214, "grad_norm": 10.314962387084961, "learning_rate": 8.877191815220537e-06, "loss": 0.609, "mean_token_accuracy": 0.8177227556705475, "num_tokens": 31433019.0, "step": 26200 }, { "entropy": 1.8470569260418415, "epoch": 0.08124870675525184, "grad_norm": 10.691038131713867, "learning_rate": 8.87549811482254e-06, "loss": 0.4794, "mean_token_accuracy": 0.8288433074951171, "num_tokens": 31446595.0, "step": 26210 }, { "entropy": 1.9533916339278221, "epoch": 0.08127970588030153, "grad_norm": 4.70416784286499, "learning_rate": 8.873805383490072e-06, "loss": 0.5795, "mean_token_accuracy": 0.8235921949148178, "num_tokens": 31458289.0, "step": 26220 }, { "entropy": 1.9918930500745773, "epoch": 0.08131070500535123, "grad_norm": 10.034157752990723, "learning_rate": 8.872113620299381e-06, "loss": 0.6885, "mean_token_accuracy": 0.8030604913830757, "num_tokens": 31469163.0, "step": 26230 }, { "entropy": 1.8951246917247773, "epoch": 0.08134170413040091, "grad_norm": 8.459050178527832, "learning_rate": 8.870422824327956e-06, "loss": 0.4934, "mean_token_accuracy": 0.8330509856343269, "num_tokens": 31482597.0, "step": 26240 }, { "entropy": 1.901189935207367, "epoch": 0.08137270325545061, "grad_norm": 4.526102542877197, "learning_rate": 8.868732994654511e-06, "loss": 0.5578, "mean_token_accuracy": 0.8234093725681305, "num_tokens": 31495707.0, "step": 26250 }, { "entropy": 1.8694349378347397, "epoch": 0.0814037023805003, "grad_norm": 8.740363121032715, "learning_rate": 8.86704413035899e-06, "loss": 0.5515, "mean_token_accuracy": 0.8234608098864555, "num_tokens": 31507761.0, "step": 26260 }, { "entropy": 1.90776207447052, "epoch": 0.08143470150555, "grad_norm": 8.299092292785645, "learning_rate": 8.865356230522566e-06, "loss": 0.5869, "mean_token_accuracy": 0.8166111215949059, "num_tokens": 31519733.0, "step": 26270 }, { "entropy": 1.988849925994873, "epoch": 0.0814657006305997, "grad_norm": 10.71021842956543, "learning_rate": 8.86366929422763e-06, "loss": 0.6454, "mean_token_accuracy": 0.8144510120153428, "num_tokens": 31530442.0, "step": 26280 }, { "entropy": 1.9403239041566849, "epoch": 0.0814966997556494, "grad_norm": 10.783156394958496, "learning_rate": 8.861983320557797e-06, "loss": 0.619, "mean_token_accuracy": 0.8142760202288628, "num_tokens": 31541855.0, "step": 26290 }, { "entropy": 1.9194020926952362, "epoch": 0.08152769888069909, "grad_norm": 5.271920204162598, "learning_rate": 8.860298308597903e-06, "loss": 0.6006, "mean_token_accuracy": 0.8099767789244652, "num_tokens": 31553683.0, "step": 26300 }, { "entropy": 1.9054380610585213, "epoch": 0.08155869800574879, "grad_norm": 12.495100975036621, "learning_rate": 8.858614257434004e-06, "loss": 0.5515, "mean_token_accuracy": 0.8234122693538666, "num_tokens": 31566209.0, "step": 26310 }, { "entropy": 1.9283027410507203, "epoch": 0.08158969713079849, "grad_norm": 10.337760925292969, "learning_rate": 8.856931166153366e-06, "loss": 0.557, "mean_token_accuracy": 0.827317263185978, "num_tokens": 31578676.0, "step": 26320 }, { "entropy": 1.9200524538755417, "epoch": 0.08162069625584818, "grad_norm": 9.985169410705566, "learning_rate": 8.855249033844469e-06, "loss": 0.5736, "mean_token_accuracy": 0.8291199162602425, "num_tokens": 31591513.0, "step": 26330 }, { "entropy": 1.8813577458262443, "epoch": 0.08165169538089788, "grad_norm": 10.70927906036377, "learning_rate": 8.85356785959701e-06, "loss": 0.5232, "mean_token_accuracy": 0.8120563924312592, "num_tokens": 31604610.0, "step": 26340 }, { "entropy": 1.9429839253425598, "epoch": 0.08168269450594758, "grad_norm": 7.395925998687744, "learning_rate": 8.851887642501889e-06, "loss": 0.5473, "mean_token_accuracy": 0.8196773245930672, "num_tokens": 31616846.0, "step": 26350 }, { "entropy": 1.8979680463671684, "epoch": 0.08171369363099726, "grad_norm": 10.052593231201172, "learning_rate": 8.850208381651215e-06, "loss": 0.5895, "mean_token_accuracy": 0.8154531285166741, "num_tokens": 31629253.0, "step": 26360 }, { "entropy": 1.894950045645237, "epoch": 0.08174469275604695, "grad_norm": 10.194618225097656, "learning_rate": 8.848530076138306e-06, "loss": 0.5183, "mean_token_accuracy": 0.827742937207222, "num_tokens": 31642045.0, "step": 26370 }, { "entropy": 1.9756985664367677, "epoch": 0.08177569188109665, "grad_norm": 9.307488441467285, "learning_rate": 8.846852725057679e-06, "loss": 0.6164, "mean_token_accuracy": 0.8118034496903419, "num_tokens": 31653833.0, "step": 26380 }, { "entropy": 1.977025419473648, "epoch": 0.08180669100614635, "grad_norm": 8.45008659362793, "learning_rate": 8.845176327505053e-06, "loss": 0.5422, "mean_token_accuracy": 0.8323731452226639, "num_tokens": 31664531.0, "step": 26390 }, { "entropy": 1.8568702943623066, "epoch": 0.08183769013119604, "grad_norm": 9.432966232299805, "learning_rate": 8.843500882577342e-06, "loss": 0.4926, "mean_token_accuracy": 0.83240677267313, "num_tokens": 31677924.0, "step": 26400 }, { "entropy": 1.9465472564101218, "epoch": 0.08186868925624574, "grad_norm": 2.6528499126434326, "learning_rate": 8.841826389372667e-06, "loss": 0.57, "mean_token_accuracy": 0.819884067773819, "num_tokens": 31689433.0, "step": 26410 }, { "entropy": 1.9157054662704467, "epoch": 0.08189968838129544, "grad_norm": 4.077538013458252, "learning_rate": 8.840152846990336e-06, "loss": 0.5505, "mean_token_accuracy": 0.8310910239815712, "num_tokens": 31701899.0, "step": 26420 }, { "entropy": 1.9014649242162704, "epoch": 0.08193068750634513, "grad_norm": 9.67369556427002, "learning_rate": 8.838480254530852e-06, "loss": 0.5669, "mean_token_accuracy": 0.8307635292410851, "num_tokens": 31713964.0, "step": 26430 }, { "entropy": 1.944985429942608, "epoch": 0.08196168663139483, "grad_norm": 10.318603515625, "learning_rate": 8.836808611095908e-06, "loss": 0.589, "mean_token_accuracy": 0.8197949156165123, "num_tokens": 31725749.0, "step": 26440 }, { "entropy": 1.9587625324726106, "epoch": 0.08199268575644453, "grad_norm": 9.986980438232422, "learning_rate": 8.835137915788388e-06, "loss": 0.6011, "mean_token_accuracy": 0.816116102039814, "num_tokens": 31738281.0, "step": 26450 }, { "entropy": 1.9987529665231705, "epoch": 0.08202368488149422, "grad_norm": 9.245450973510742, "learning_rate": 8.83346816771236e-06, "loss": 0.637, "mean_token_accuracy": 0.8139676377177238, "num_tokens": 31749103.0, "step": 26460 }, { "entropy": 1.9783283829689027, "epoch": 0.08205468400654392, "grad_norm": 13.349959373474121, "learning_rate": 8.831799365973078e-06, "loss": 0.6254, "mean_token_accuracy": 0.8167249724268913, "num_tokens": 31760327.0, "step": 26470 }, { "entropy": 1.9021371573209762, "epoch": 0.08208568313159362, "grad_norm": 4.939640998840332, "learning_rate": 8.83013150967698e-06, "loss": 0.5441, "mean_token_accuracy": 0.8197105556726456, "num_tokens": 31772545.0, "step": 26480 }, { "entropy": 1.8698833830654622, "epoch": 0.0821166822566433, "grad_norm": 9.22522258758545, "learning_rate": 8.828464597931686e-06, "loss": 0.5702, "mean_token_accuracy": 0.8225023493170738, "num_tokens": 31785337.0, "step": 26490 }, { "entropy": 1.8712293177843093, "epoch": 0.082147681381693, "grad_norm": 5.361429214477539, "learning_rate": 8.82679862984599e-06, "loss": 0.5483, "mean_token_accuracy": 0.8150455951690674, "num_tokens": 31797319.0, "step": 26500 }, { "entropy": 1.9659972831606864, "epoch": 0.0821786805067427, "grad_norm": 9.854397773742676, "learning_rate": 8.825133604529864e-06, "loss": 0.5934, "mean_token_accuracy": 0.8131707355380058, "num_tokens": 31808978.0, "step": 26510 }, { "entropy": 1.9501794785261155, "epoch": 0.08220967963179239, "grad_norm": 8.959293365478516, "learning_rate": 8.823469521094459e-06, "loss": 0.6272, "mean_token_accuracy": 0.8212861150503159, "num_tokens": 31821515.0, "step": 26520 }, { "entropy": 1.9314236760139465, "epoch": 0.08224067875684209, "grad_norm": 10.00307846069336, "learning_rate": 8.821806378652095e-06, "loss": 0.5826, "mean_token_accuracy": 0.8202818527817726, "num_tokens": 31832945.0, "step": 26530 }, { "entropy": 1.9181413248181343, "epoch": 0.08227167788189178, "grad_norm": 12.737542152404785, "learning_rate": 8.820144176316263e-06, "loss": 0.6233, "mean_token_accuracy": 0.808757683634758, "num_tokens": 31844982.0, "step": 26540 }, { "entropy": 1.9316234186291694, "epoch": 0.08230267700694148, "grad_norm": 4.668799877166748, "learning_rate": 8.818482913201624e-06, "loss": 0.5973, "mean_token_accuracy": 0.81618000715971, "num_tokens": 31857158.0, "step": 26550 }, { "entropy": 1.9605771273374557, "epoch": 0.08233367613199118, "grad_norm": 10.899618148803711, "learning_rate": 8.816822588424007e-06, "loss": 0.5998, "mean_token_accuracy": 0.8206842705607414, "num_tokens": 31868956.0, "step": 26560 }, { "entropy": 1.9107506170868873, "epoch": 0.08236467525704087, "grad_norm": 10.741750717163086, "learning_rate": 8.815163201100404e-06, "loss": 0.564, "mean_token_accuracy": 0.8113860443234444, "num_tokens": 31881384.0, "step": 26570 }, { "entropy": 1.940708489716053, "epoch": 0.08239567438209057, "grad_norm": 10.441813468933105, "learning_rate": 8.813504750348967e-06, "loss": 0.584, "mean_token_accuracy": 0.8169033840298653, "num_tokens": 31892982.0, "step": 26580 }, { "entropy": 1.9362832650542259, "epoch": 0.08242667350714027, "grad_norm": 9.666556358337402, "learning_rate": 8.811847235289013e-06, "loss": 0.6264, "mean_token_accuracy": 0.8026597559452057, "num_tokens": 31905412.0, "step": 26590 }, { "entropy": 1.9204092472791672, "epoch": 0.08245767263218996, "grad_norm": 8.493167877197266, "learning_rate": 8.81019065504102e-06, "loss": 0.6364, "mean_token_accuracy": 0.8150411576032639, "num_tokens": 31917332.0, "step": 26600 }, { "entropy": 1.9749287247657776, "epoch": 0.08248867175723965, "grad_norm": 9.949286460876465, "learning_rate": 8.808535008726616e-06, "loss": 0.6233, "mean_token_accuracy": 0.8155123621225357, "num_tokens": 31928762.0, "step": 26610 }, { "entropy": 2.0281356394290926, "epoch": 0.08251967088228934, "grad_norm": 8.458736419677734, "learning_rate": 8.806880295468594e-06, "loss": 0.6129, "mean_token_accuracy": 0.817957803606987, "num_tokens": 31939691.0, "step": 26620 }, { "entropy": 1.9947254791855813, "epoch": 0.08255067000733904, "grad_norm": 10.101909637451172, "learning_rate": 8.805226514390884e-06, "loss": 0.5895, "mean_token_accuracy": 0.8237784549593925, "num_tokens": 31950789.0, "step": 26630 }, { "entropy": 1.9266445934772491, "epoch": 0.08258166913238874, "grad_norm": 9.334656715393066, "learning_rate": 8.803573664618587e-06, "loss": 0.5351, "mean_token_accuracy": 0.8326213672757149, "num_tokens": 31962662.0, "step": 26640 }, { "entropy": 1.9811443358659744, "epoch": 0.08261266825743843, "grad_norm": 9.728004455566406, "learning_rate": 8.801921745277938e-06, "loss": 0.6228, "mean_token_accuracy": 0.8104926988482475, "num_tokens": 31973878.0, "step": 26650 }, { "entropy": 1.9071193888783455, "epoch": 0.08264366738248813, "grad_norm": 9.647929191589355, "learning_rate": 8.800270755496327e-06, "loss": 0.5784, "mean_token_accuracy": 0.8240789666771888, "num_tokens": 31985601.0, "step": 26660 }, { "entropy": 1.8988082259893417, "epoch": 0.08267466650753783, "grad_norm": 5.0942888259887695, "learning_rate": 8.798620694402286e-06, "loss": 0.5738, "mean_token_accuracy": 0.819773106276989, "num_tokens": 31999208.0, "step": 26670 }, { "entropy": 1.9712405875325203, "epoch": 0.08270566563258752, "grad_norm": 8.369913101196289, "learning_rate": 8.796971561125492e-06, "loss": 0.6086, "mean_token_accuracy": 0.8169108390808105, "num_tokens": 32010782.0, "step": 26680 }, { "entropy": 1.868271279335022, "epoch": 0.08273666475763722, "grad_norm": 8.779671669006348, "learning_rate": 8.795323354796762e-06, "loss": 0.5195, "mean_token_accuracy": 0.8314287766814232, "num_tokens": 32023787.0, "step": 26690 }, { "entropy": 1.9078305020928383, "epoch": 0.08276766388268691, "grad_norm": 9.038331985473633, "learning_rate": 8.79367607454805e-06, "loss": 0.5642, "mean_token_accuracy": 0.8266749441623688, "num_tokens": 32036579.0, "step": 26700 }, { "entropy": 1.9659512534737587, "epoch": 0.08279866300773661, "grad_norm": 10.663753509521484, "learning_rate": 8.792029719512458e-06, "loss": 0.5953, "mean_token_accuracy": 0.8178058177232742, "num_tokens": 32048734.0, "step": 26710 }, { "entropy": 1.8505152672529221, "epoch": 0.08282966213278631, "grad_norm": 3.9875552654266357, "learning_rate": 8.79038428882421e-06, "loss": 0.5194, "mean_token_accuracy": 0.8262492626905441, "num_tokens": 32061882.0, "step": 26720 }, { "entropy": 1.913353630900383, "epoch": 0.08286066125783599, "grad_norm": 10.27352523803711, "learning_rate": 8.788739781618678e-06, "loss": 0.5767, "mean_token_accuracy": 0.8218828424811363, "num_tokens": 32074037.0, "step": 26730 }, { "entropy": 1.9119721353054047, "epoch": 0.08289166038288569, "grad_norm": 9.0624361038208, "learning_rate": 8.78709619703235e-06, "loss": 0.5735, "mean_token_accuracy": 0.8198665246367455, "num_tokens": 32086164.0, "step": 26740 }, { "entropy": 1.8503568902611733, "epoch": 0.08292265950793538, "grad_norm": 5.034000396728516, "learning_rate": 8.785453534202857e-06, "loss": 0.5473, "mean_token_accuracy": 0.8217618718743325, "num_tokens": 32099236.0, "step": 26750 }, { "entropy": 1.9170492082834243, "epoch": 0.08295365863298508, "grad_norm": 11.386276245117188, "learning_rate": 8.78381179226895e-06, "loss": 0.5962, "mean_token_accuracy": 0.8131098374724388, "num_tokens": 32111036.0, "step": 26760 }, { "entropy": 2.001404981315136, "epoch": 0.08298465775803478, "grad_norm": 9.401607513427734, "learning_rate": 8.782170970370514e-06, "loss": 0.6513, "mean_token_accuracy": 0.8121735200285911, "num_tokens": 32122745.0, "step": 26770 }, { "entropy": 1.9728690326213836, "epoch": 0.08301565688308447, "grad_norm": 9.115731239318848, "learning_rate": 8.78053106764855e-06, "loss": 0.6129, "mean_token_accuracy": 0.8155710816383361, "num_tokens": 32133822.0, "step": 26780 }, { "entropy": 1.952053852379322, "epoch": 0.08304665600813417, "grad_norm": 9.419805526733398, "learning_rate": 8.778892083245187e-06, "loss": 0.6176, "mean_token_accuracy": 0.814921198785305, "num_tokens": 32144770.0, "step": 26790 }, { "entropy": 1.9340526655316352, "epoch": 0.08307765513318387, "grad_norm": 4.676142692565918, "learning_rate": 8.77725401630367e-06, "loss": 0.6304, "mean_token_accuracy": 0.7994276747107506, "num_tokens": 32156515.0, "step": 26800 }, { "entropy": 2.002475252747536, "epoch": 0.08310865425823356, "grad_norm": 5.354742527008057, "learning_rate": 8.775616865968369e-06, "loss": 0.7259, "mean_token_accuracy": 0.7925671577453614, "num_tokens": 32167752.0, "step": 26810 }, { "entropy": 1.8802199259400367, "epoch": 0.08313965338328326, "grad_norm": 8.681201934814453, "learning_rate": 8.773980631384764e-06, "loss": 0.5398, "mean_token_accuracy": 0.8213910296559334, "num_tokens": 32181313.0, "step": 26820 }, { "entropy": 1.873046538233757, "epoch": 0.08317065250833296, "grad_norm": 8.961077690124512, "learning_rate": 8.772345311699455e-06, "loss": 0.558, "mean_token_accuracy": 0.8188267648220062, "num_tokens": 32194510.0, "step": 26830 }, { "entropy": 1.8043782696127892, "epoch": 0.08320165163338265, "grad_norm": 10.20888614654541, "learning_rate": 8.770710906060152e-06, "loss": 0.5254, "mean_token_accuracy": 0.8352167114615441, "num_tokens": 32207477.0, "step": 26840 }, { "entropy": 1.962397077679634, "epoch": 0.08323265075843235, "grad_norm": 8.981500625610352, "learning_rate": 8.769077413615676e-06, "loss": 0.6166, "mean_token_accuracy": 0.8194408491253853, "num_tokens": 32218668.0, "step": 26850 }, { "entropy": 1.9099099516868592, "epoch": 0.08326364988348203, "grad_norm": 9.33209228515625, "learning_rate": 8.76744483351596e-06, "loss": 0.5514, "mean_token_accuracy": 0.8329472854733467, "num_tokens": 32230093.0, "step": 26860 }, { "entropy": 1.8509590789675712, "epoch": 0.08329464900853173, "grad_norm": 10.720943450927734, "learning_rate": 8.76581316491204e-06, "loss": 0.5368, "mean_token_accuracy": 0.8222440704703331, "num_tokens": 32242430.0, "step": 26870 }, { "entropy": 1.7675331860780716, "epoch": 0.08332564813358143, "grad_norm": 5.470025539398193, "learning_rate": 8.764182406956064e-06, "loss": 0.4998, "mean_token_accuracy": 0.8318305298686027, "num_tokens": 32255794.0, "step": 26880 }, { "entropy": 1.8293626666069032, "epoch": 0.08335664725863112, "grad_norm": 6.280379772186279, "learning_rate": 8.762552558801276e-06, "loss": 0.5219, "mean_token_accuracy": 0.8191826656460762, "num_tokens": 32269026.0, "step": 26890 }, { "entropy": 1.8529397904872895, "epoch": 0.08338764638368082, "grad_norm": 9.650927543640137, "learning_rate": 8.760923619602028e-06, "loss": 0.5539, "mean_token_accuracy": 0.8250053748488426, "num_tokens": 32282150.0, "step": 26900 }, { "entropy": 1.8897050678730012, "epoch": 0.08341864550873052, "grad_norm": 4.701704978942871, "learning_rate": 8.75929558851377e-06, "loss": 0.564, "mean_token_accuracy": 0.8231826663017273, "num_tokens": 32293837.0, "step": 26910 }, { "entropy": 1.8501434102654457, "epoch": 0.08344964463378021, "grad_norm": 10.15555191040039, "learning_rate": 8.757668464693049e-06, "loss": 0.5392, "mean_token_accuracy": 0.8285914018750191, "num_tokens": 32306371.0, "step": 26920 }, { "entropy": 1.9136781513690948, "epoch": 0.08348064375882991, "grad_norm": 9.14686107635498, "learning_rate": 8.756042247297512e-06, "loss": 0.5924, "mean_token_accuracy": 0.8250029042363167, "num_tokens": 32318441.0, "step": 26930 }, { "entropy": 1.9553735256195068, "epoch": 0.0835116428838796, "grad_norm": 4.363174915313721, "learning_rate": 8.754416935485893e-06, "loss": 0.6237, "mean_token_accuracy": 0.8168588444590569, "num_tokens": 32329443.0, "step": 26940 }, { "entropy": 1.839969304203987, "epoch": 0.0835426420089293, "grad_norm": 9.89714241027832, "learning_rate": 8.75279252841803e-06, "loss": 0.4824, "mean_token_accuracy": 0.8351193264126777, "num_tokens": 32341869.0, "step": 26950 }, { "entropy": 1.9383182168006896, "epoch": 0.083573641133979, "grad_norm": 10.119979858398438, "learning_rate": 8.751169025254838e-06, "loss": 0.593, "mean_token_accuracy": 0.8233784183859825, "num_tokens": 32352867.0, "step": 26960 }, { "entropy": 1.8974655985832214, "epoch": 0.0836046402590287, "grad_norm": 9.094542503356934, "learning_rate": 8.749546425158334e-06, "loss": 0.5511, "mean_token_accuracy": 0.8213450491428376, "num_tokens": 32365021.0, "step": 26970 }, { "entropy": 1.8861443296074867, "epoch": 0.08363563938407838, "grad_norm": 9.384647369384766, "learning_rate": 8.747924727291615e-06, "loss": 0.5612, "mean_token_accuracy": 0.8222412168979645, "num_tokens": 32376467.0, "step": 26980 }, { "entropy": 1.9021775022149086, "epoch": 0.08366663850912807, "grad_norm": 8.179243087768555, "learning_rate": 8.746303930818864e-06, "loss": 0.5683, "mean_token_accuracy": 0.8159692242741585, "num_tokens": 32388816.0, "step": 26990 }, { "entropy": 1.9401640132069589, "epoch": 0.08369763763417777, "grad_norm": 4.109061241149902, "learning_rate": 8.744684034905353e-06, "loss": 0.5723, "mean_token_accuracy": 0.8248335152864457, "num_tokens": 32400534.0, "step": 27000 }, { "entropy": 1.8398920968174934, "epoch": 0.08372863675922747, "grad_norm": 10.484107971191406, "learning_rate": 8.743065038717426e-06, "loss": 0.5011, "mean_token_accuracy": 0.8244616940617562, "num_tokens": 32413551.0, "step": 27010 }, { "entropy": 1.9067912593483924, "epoch": 0.08375963588427716, "grad_norm": 4.866901397705078, "learning_rate": 8.741446941422514e-06, "loss": 0.5775, "mean_token_accuracy": 0.8243859261274338, "num_tokens": 32425381.0, "step": 27020 }, { "entropy": 1.839262606203556, "epoch": 0.08379063500932686, "grad_norm": 11.031305313110352, "learning_rate": 8.739829742189128e-06, "loss": 0.5304, "mean_token_accuracy": 0.8227956235408783, "num_tokens": 32438069.0, "step": 27030 }, { "entropy": 1.8895979836583137, "epoch": 0.08382163413437656, "grad_norm": 8.922094345092773, "learning_rate": 8.738213440186849e-06, "loss": 0.5385, "mean_token_accuracy": 0.8271268501877784, "num_tokens": 32449380.0, "step": 27040 }, { "entropy": 1.945242629945278, "epoch": 0.08385263325942625, "grad_norm": 4.774547576904297, "learning_rate": 8.736598034586335e-06, "loss": 0.609, "mean_token_accuracy": 0.8129522934556007, "num_tokens": 32460665.0, "step": 27050 }, { "entropy": 1.9358567342162132, "epoch": 0.08388363238447595, "grad_norm": 5.142744064331055, "learning_rate": 8.734983524559322e-06, "loss": 0.6, "mean_token_accuracy": 0.8112214103341102, "num_tokens": 32472553.0, "step": 27060 }, { "entropy": 1.909812480211258, "epoch": 0.08391463150952565, "grad_norm": 8.57243824005127, "learning_rate": 8.733369909278609e-06, "loss": 0.5495, "mean_token_accuracy": 0.8224716767668724, "num_tokens": 32485563.0, "step": 27070 }, { "entropy": 2.0142262816429137, "epoch": 0.08394563063457534, "grad_norm": 10.417340278625488, "learning_rate": 8.731757187918067e-06, "loss": 0.6399, "mean_token_accuracy": 0.8098721027374267, "num_tokens": 32496322.0, "step": 27080 }, { "entropy": 1.8523387983441353, "epoch": 0.08397662975962504, "grad_norm": 4.585434436798096, "learning_rate": 8.730145359652638e-06, "loss": 0.5058, "mean_token_accuracy": 0.8351857841014863, "num_tokens": 32509323.0, "step": 27090 }, { "entropy": 1.8732295900583267, "epoch": 0.08400762888467472, "grad_norm": 4.835305690765381, "learning_rate": 8.728534423658325e-06, "loss": 0.5411, "mean_token_accuracy": 0.8177412390708924, "num_tokens": 32521503.0, "step": 27100 }, { "entropy": 2.0055402636528017, "epoch": 0.08403862800972442, "grad_norm": 13.625639915466309, "learning_rate": 8.726924379112201e-06, "loss": 0.6815, "mean_token_accuracy": 0.8050232946872711, "num_tokens": 32532199.0, "step": 27110 }, { "entropy": 1.8963544055819512, "epoch": 0.08406962713477412, "grad_norm": 9.827751159667969, "learning_rate": 8.725315225192391e-06, "loss": 0.5774, "mean_token_accuracy": 0.8158974751830101, "num_tokens": 32544370.0, "step": 27120 }, { "entropy": 1.957877866923809, "epoch": 0.08410062625982381, "grad_norm": 4.588069915771484, "learning_rate": 8.723706961078094e-06, "loss": 0.5492, "mean_token_accuracy": 0.8149573966860771, "num_tokens": 32555644.0, "step": 27130 }, { "entropy": 1.9230869844555856, "epoch": 0.08413162538487351, "grad_norm": 8.736885070800781, "learning_rate": 8.722099585949552e-06, "loss": 0.5671, "mean_token_accuracy": 0.8175855800509453, "num_tokens": 32568234.0, "step": 27140 }, { "entropy": 1.9423416420817374, "epoch": 0.0841626245099232, "grad_norm": 10.53701400756836, "learning_rate": 8.720493098988078e-06, "loss": 0.6455, "mean_token_accuracy": 0.8082656994462013, "num_tokens": 32580063.0, "step": 27150 }, { "entropy": 2.0061039671301844, "epoch": 0.0841936236349729, "grad_norm": 9.278496742248535, "learning_rate": 8.718887499376033e-06, "loss": 0.6392, "mean_token_accuracy": 0.8064600452780724, "num_tokens": 32591372.0, "step": 27160 }, { "entropy": 1.9022212833166123, "epoch": 0.0842246227600226, "grad_norm": 4.704682350158691, "learning_rate": 8.717282786296834e-06, "loss": 0.5109, "mean_token_accuracy": 0.8284071788191796, "num_tokens": 32604362.0, "step": 27170 }, { "entropy": 1.858892096579075, "epoch": 0.0842556218850723, "grad_norm": 10.126176834106445, "learning_rate": 8.715678958934944e-06, "loss": 0.5473, "mean_token_accuracy": 0.8314953356981277, "num_tokens": 32616980.0, "step": 27180 }, { "entropy": 1.8593590274453162, "epoch": 0.08428662101012199, "grad_norm": 10.302115440368652, "learning_rate": 8.714076016475885e-06, "loss": 0.5496, "mean_token_accuracy": 0.8241918250918389, "num_tokens": 32629349.0, "step": 27190 }, { "entropy": 1.8986007198691368, "epoch": 0.08431762013517169, "grad_norm": 3.36916446685791, "learning_rate": 8.712473958106222e-06, "loss": 0.5957, "mean_token_accuracy": 0.8168155118823052, "num_tokens": 32641755.0, "step": 27200 }, { "entropy": 1.8978888988494873, "epoch": 0.08434861926022139, "grad_norm": 9.608877182006836, "learning_rate": 8.710872783013563e-06, "loss": 0.5693, "mean_token_accuracy": 0.8153728187084198, "num_tokens": 32653282.0, "step": 27210 }, { "entropy": 1.9542479366064072, "epoch": 0.08437961838527108, "grad_norm": 4.4257307052612305, "learning_rate": 8.709272490386569e-06, "loss": 0.6091, "mean_token_accuracy": 0.806995865702629, "num_tokens": 32664764.0, "step": 27220 }, { "entropy": 1.9367131859064102, "epoch": 0.08441061751032077, "grad_norm": 8.415755271911621, "learning_rate": 8.707673079414937e-06, "loss": 0.5859, "mean_token_accuracy": 0.8194612860679626, "num_tokens": 32675941.0, "step": 27230 }, { "entropy": 1.9739065006375314, "epoch": 0.08444161663537046, "grad_norm": 9.644390106201172, "learning_rate": 8.706074549289411e-06, "loss": 0.6278, "mean_token_accuracy": 0.8121867910027504, "num_tokens": 32687272.0, "step": 27240 }, { "entropy": 1.9072382494807243, "epoch": 0.08447261576042016, "grad_norm": 8.06834888458252, "learning_rate": 8.704476899201766e-06, "loss": 0.5399, "mean_token_accuracy": 0.8260136678814888, "num_tokens": 32698817.0, "step": 27250 }, { "entropy": 1.9281651645898819, "epoch": 0.08450361488546985, "grad_norm": 10.1871976852417, "learning_rate": 8.702880128344827e-06, "loss": 0.5762, "mean_token_accuracy": 0.82305498868227, "num_tokens": 32710416.0, "step": 27260 }, { "entropy": 1.9203084349632262, "epoch": 0.08453461401051955, "grad_norm": 11.28444766998291, "learning_rate": 8.701284235912444e-06, "loss": 0.5659, "mean_token_accuracy": 0.8235008075833321, "num_tokens": 32722557.0, "step": 27270 }, { "entropy": 1.8537453413009644, "epoch": 0.08456561313556925, "grad_norm": 10.914976119995117, "learning_rate": 8.699689221099508e-06, "loss": 0.6454, "mean_token_accuracy": 0.8067585110664368, "num_tokens": 32735577.0, "step": 27280 }, { "entropy": 1.9035761684179306, "epoch": 0.08459661226061894, "grad_norm": 11.605010986328125, "learning_rate": 8.698095083101939e-06, "loss": 0.5898, "mean_token_accuracy": 0.8235433489084244, "num_tokens": 32747673.0, "step": 27290 }, { "entropy": 1.8419725641608238, "epoch": 0.08462761138566864, "grad_norm": 7.603212833404541, "learning_rate": 8.69650182111669e-06, "loss": 0.4914, "mean_token_accuracy": 0.8381339445710182, "num_tokens": 32760397.0, "step": 27300 }, { "entropy": 1.9057568103075027, "epoch": 0.08465861051071834, "grad_norm": 7.779855251312256, "learning_rate": 8.694909434341743e-06, "loss": 0.5506, "mean_token_accuracy": 0.8320439457893372, "num_tokens": 32772961.0, "step": 27310 }, { "entropy": 1.9533362567424775, "epoch": 0.08468960963576803, "grad_norm": 9.11958122253418, "learning_rate": 8.693317921976107e-06, "loss": 0.6301, "mean_token_accuracy": 0.8056516513228417, "num_tokens": 32785213.0, "step": 27320 }, { "entropy": 1.9149364829063416, "epoch": 0.08472060876081773, "grad_norm": 8.80299186706543, "learning_rate": 8.69172728321982e-06, "loss": 0.5535, "mean_token_accuracy": 0.8212979912757874, "num_tokens": 32798167.0, "step": 27330 }, { "entropy": 1.8609929516911508, "epoch": 0.08475160788586743, "grad_norm": 4.60377311706543, "learning_rate": 8.690137517273937e-06, "loss": 0.5088, "mean_token_accuracy": 0.8336883813142777, "num_tokens": 32810453.0, "step": 27340 }, { "entropy": 1.9059410750865937, "epoch": 0.08478260701091711, "grad_norm": 10.732145309448242, "learning_rate": 8.688548623340543e-06, "loss": 0.6174, "mean_token_accuracy": 0.8180224969983101, "num_tokens": 32823220.0, "step": 27350 }, { "entropy": 1.9536007195711136, "epoch": 0.08481360613596681, "grad_norm": 8.611103057861328, "learning_rate": 8.68696060062274e-06, "loss": 0.6304, "mean_token_accuracy": 0.8148555681109428, "num_tokens": 32834092.0, "step": 27360 }, { "entropy": 1.8823547706007957, "epoch": 0.0848446052610165, "grad_norm": 8.003419876098633, "learning_rate": 8.685373448324655e-06, "loss": 0.5494, "mean_token_accuracy": 0.8131691709160804, "num_tokens": 32846184.0, "step": 27370 }, { "entropy": 1.8589939393103123, "epoch": 0.0848756043860662, "grad_norm": 9.513988494873047, "learning_rate": 8.683787165651419e-06, "loss": 0.5296, "mean_token_accuracy": 0.8292522057890892, "num_tokens": 32859312.0, "step": 27380 }, { "entropy": 1.9123133316636085, "epoch": 0.0849066035111159, "grad_norm": 9.80207633972168, "learning_rate": 8.682201751809196e-06, "loss": 0.6113, "mean_token_accuracy": 0.8205790624022484, "num_tokens": 32870081.0, "step": 27390 }, { "entropy": 1.8975965559482575, "epoch": 0.0849376026361656, "grad_norm": 9.128314971923828, "learning_rate": 8.680617206005148e-06, "loss": 0.5771, "mean_token_accuracy": 0.8289038851857186, "num_tokens": 32881096.0, "step": 27400 }, { "entropy": 1.9172597080469131, "epoch": 0.08496860176121529, "grad_norm": 10.668315887451172, "learning_rate": 8.679033527447462e-06, "loss": 0.5846, "mean_token_accuracy": 0.8238224640488625, "num_tokens": 32893038.0, "step": 27410 }, { "entropy": 1.8307318970561028, "epoch": 0.08499960088626499, "grad_norm": 4.935861587524414, "learning_rate": 8.67745071534533e-06, "loss": 0.531, "mean_token_accuracy": 0.8222571074962616, "num_tokens": 32906388.0, "step": 27420 }, { "entropy": 1.9008280768990518, "epoch": 0.08503060001131468, "grad_norm": 4.503681182861328, "learning_rate": 8.675868768908956e-06, "loss": 0.5765, "mean_token_accuracy": 0.8253033936023713, "num_tokens": 32918054.0, "step": 27430 }, { "entropy": 1.8841553077101707, "epoch": 0.08506159913636438, "grad_norm": 8.340595245361328, "learning_rate": 8.674287687349546e-06, "loss": 0.5451, "mean_token_accuracy": 0.8274075031280518, "num_tokens": 32929483.0, "step": 27440 }, { "entropy": 1.8089099466800689, "epoch": 0.08509259826141408, "grad_norm": 4.920588493347168, "learning_rate": 8.672707469879315e-06, "loss": 0.4781, "mean_token_accuracy": 0.8309017539024353, "num_tokens": 32942288.0, "step": 27450 }, { "entropy": 1.8523821212351321, "epoch": 0.08512359738646377, "grad_norm": 10.191000938415527, "learning_rate": 8.67112811571149e-06, "loss": 0.5767, "mean_token_accuracy": 0.8139420121908187, "num_tokens": 32954595.0, "step": 27460 }, { "entropy": 1.878308503329754, "epoch": 0.08515459651151346, "grad_norm": 5.282871246337891, "learning_rate": 8.669549624060287e-06, "loss": 0.5801, "mean_token_accuracy": 0.8135267734527588, "num_tokens": 32966387.0, "step": 27470 }, { "entropy": 1.8660302713513375, "epoch": 0.08518559563656315, "grad_norm": 7.17582893371582, "learning_rate": 8.66797199414093e-06, "loss": 0.6105, "mean_token_accuracy": 0.8228353872895241, "num_tokens": 32978739.0, "step": 27480 }, { "entropy": 1.855283497273922, "epoch": 0.08521659476161285, "grad_norm": 9.164436340332031, "learning_rate": 8.666395225169643e-06, "loss": 0.5588, "mean_token_accuracy": 0.8205430552363395, "num_tokens": 32991000.0, "step": 27490 }, { "entropy": 1.916937005519867, "epoch": 0.08524759388666255, "grad_norm": 5.297219276428223, "learning_rate": 8.664819316363645e-06, "loss": 0.6027, "mean_token_accuracy": 0.8212128937244415, "num_tokens": 33002576.0, "step": 27500 }, { "entropy": 1.9633919060230256, "epoch": 0.08527859301171224, "grad_norm": 10.939813613891602, "learning_rate": 8.663244266941157e-06, "loss": 0.6718, "mean_token_accuracy": 0.8004844337701797, "num_tokens": 33013283.0, "step": 27510 }, { "entropy": 1.8730249777436256, "epoch": 0.08530959213676194, "grad_norm": 5.108644962310791, "learning_rate": 8.661670076121382e-06, "loss": 0.6221, "mean_token_accuracy": 0.810503962635994, "num_tokens": 33025788.0, "step": 27520 }, { "entropy": 1.9476985320448876, "epoch": 0.08534059126181164, "grad_norm": 9.54196548461914, "learning_rate": 8.66009674312453e-06, "loss": 0.6355, "mean_token_accuracy": 0.800178411602974, "num_tokens": 33036893.0, "step": 27530 }, { "entropy": 1.9610710114240646, "epoch": 0.08537159038686133, "grad_norm": 8.600343704223633, "learning_rate": 8.658524267171792e-06, "loss": 0.6988, "mean_token_accuracy": 0.8083438903093338, "num_tokens": 33048666.0, "step": 27540 }, { "entropy": 1.8692133530974389, "epoch": 0.08540258951191103, "grad_norm": 10.251091957092285, "learning_rate": 8.656952647485353e-06, "loss": 0.561, "mean_token_accuracy": 0.8302003368735313, "num_tokens": 33061005.0, "step": 27550 }, { "entropy": 1.827210921049118, "epoch": 0.08543358863696073, "grad_norm": 9.993083000183105, "learning_rate": 8.655381883288387e-06, "loss": 0.6069, "mean_token_accuracy": 0.8159284323453904, "num_tokens": 33074475.0, "step": 27560 }, { "entropy": 1.8685495540499688, "epoch": 0.08546458776201042, "grad_norm": 10.750574111938477, "learning_rate": 8.653811973805048e-06, "loss": 0.5993, "mean_token_accuracy": 0.8224136650562286, "num_tokens": 33086089.0, "step": 27570 }, { "entropy": 1.7942854017019272, "epoch": 0.08549558688706012, "grad_norm": 4.533649921417236, "learning_rate": 8.652242918260485e-06, "loss": 0.5276, "mean_token_accuracy": 0.8248818814754486, "num_tokens": 33100203.0, "step": 27580 }, { "entropy": 1.8375580973923207, "epoch": 0.08552658601210981, "grad_norm": 4.471881866455078, "learning_rate": 8.650674715880821e-06, "loss": 0.5732, "mean_token_accuracy": 0.8313728928565979, "num_tokens": 33113514.0, "step": 27590 }, { "entropy": 1.9055291563272476, "epoch": 0.0855575851371595, "grad_norm": 8.167454719543457, "learning_rate": 8.649107365893162e-06, "loss": 0.5533, "mean_token_accuracy": 0.8212564826011658, "num_tokens": 33125377.0, "step": 27600 }, { "entropy": 1.882322046160698, "epoch": 0.0855885842622092, "grad_norm": 9.761778831481934, "learning_rate": 8.647540867525599e-06, "loss": 0.5904, "mean_token_accuracy": 0.8162475794553756, "num_tokens": 33137354.0, "step": 27610 }, { "entropy": 1.8892990604043007, "epoch": 0.08561958338725889, "grad_norm": 9.397979736328125, "learning_rate": 8.645975220007197e-06, "loss": 0.6297, "mean_token_accuracy": 0.8177220180630684, "num_tokens": 33149211.0, "step": 27620 }, { "entropy": 1.9387479245662689, "epoch": 0.08565058251230859, "grad_norm": 10.300222396850586, "learning_rate": 8.644410422567995e-06, "loss": 0.6018, "mean_token_accuracy": 0.8117999002337456, "num_tokens": 33160821.0, "step": 27630 }, { "entropy": 1.942291297018528, "epoch": 0.08568158163735828, "grad_norm": 11.509066581726074, "learning_rate": 8.642846474439016e-06, "loss": 0.5863, "mean_token_accuracy": 0.8200451254844665, "num_tokens": 33172491.0, "step": 27640 }, { "entropy": 1.954495507478714, "epoch": 0.08571258076240798, "grad_norm": 8.897340774536133, "learning_rate": 8.641283374852245e-06, "loss": 0.5856, "mean_token_accuracy": 0.8219591468572617, "num_tokens": 33184078.0, "step": 27650 }, { "entropy": 1.8060628071427345, "epoch": 0.08574357988745768, "grad_norm": 8.12181282043457, "learning_rate": 8.639721123040653e-06, "loss": 0.49, "mean_token_accuracy": 0.8268592938780784, "num_tokens": 33198036.0, "step": 27660 }, { "entropy": 1.7999069422483445, "epoch": 0.08577457901250737, "grad_norm": 4.471950054168701, "learning_rate": 8.638159718238167e-06, "loss": 0.4485, "mean_token_accuracy": 0.8397438317537308, "num_tokens": 33212045.0, "step": 27670 }, { "entropy": 1.9364047437906264, "epoch": 0.08580557813755707, "grad_norm": 13.346952438354492, "learning_rate": 8.636599159679694e-06, "loss": 0.6085, "mean_token_accuracy": 0.8085330918431282, "num_tokens": 33223752.0, "step": 27680 }, { "entropy": 1.8672024756669998, "epoch": 0.08583657726260677, "grad_norm": 5.810976028442383, "learning_rate": 8.635039446601096e-06, "loss": 0.5647, "mean_token_accuracy": 0.8230971753597259, "num_tokens": 33236114.0, "step": 27690 }, { "entropy": 1.8852705493569375, "epoch": 0.08586757638765646, "grad_norm": 5.571743011474609, "learning_rate": 8.633480578239217e-06, "loss": 0.5606, "mean_token_accuracy": 0.8273131296038627, "num_tokens": 33248369.0, "step": 27700 }, { "entropy": 1.9336923122406007, "epoch": 0.08589857551270616, "grad_norm": 11.164706230163574, "learning_rate": 8.63192255383185e-06, "loss": 0.5876, "mean_token_accuracy": 0.8112567231059075, "num_tokens": 33260442.0, "step": 27710 }, { "entropy": 1.9060131937265397, "epoch": 0.08592957463775584, "grad_norm": 10.689841270446777, "learning_rate": 8.630365372617761e-06, "loss": 0.5697, "mean_token_accuracy": 0.8298930734395981, "num_tokens": 33272270.0, "step": 27720 }, { "entropy": 1.9362266033887863, "epoch": 0.08596057376280554, "grad_norm": 9.383460998535156, "learning_rate": 8.62880903383667e-06, "loss": 0.6129, "mean_token_accuracy": 0.8200540199875832, "num_tokens": 33283981.0, "step": 27730 }, { "entropy": 1.9314474761486053, "epoch": 0.08599157288785524, "grad_norm": 10.130805969238281, "learning_rate": 8.627253536729257e-06, "loss": 0.5401, "mean_token_accuracy": 0.8344287112355232, "num_tokens": 33296450.0, "step": 27740 }, { "entropy": 1.944339656829834, "epoch": 0.08602257201290493, "grad_norm": 8.83008861541748, "learning_rate": 8.625698880537165e-06, "loss": 0.6563, "mean_token_accuracy": 0.8038821011781693, "num_tokens": 33307571.0, "step": 27750 }, { "entropy": 1.9081848710775375, "epoch": 0.08605357113795463, "grad_norm": 10.480369567871094, "learning_rate": 8.62414506450299e-06, "loss": 0.5913, "mean_token_accuracy": 0.817870119214058, "num_tokens": 33319700.0, "step": 27760 }, { "entropy": 1.84724163711071, "epoch": 0.08608457026300433, "grad_norm": 3.8803551197052, "learning_rate": 8.622592087870282e-06, "loss": 0.5173, "mean_token_accuracy": 0.8268263578414917, "num_tokens": 33331971.0, "step": 27770 }, { "entropy": 1.934218955039978, "epoch": 0.08611556938805402, "grad_norm": 9.977314949035645, "learning_rate": 8.621039949883543e-06, "loss": 0.6326, "mean_token_accuracy": 0.8152559250593185, "num_tokens": 33342967.0, "step": 27780 }, { "entropy": 1.8323631912469864, "epoch": 0.08614656851310372, "grad_norm": 12.091215133666992, "learning_rate": 8.619488649788232e-06, "loss": 0.5562, "mean_token_accuracy": 0.8163023605942726, "num_tokens": 33356235.0, "step": 27790 }, { "entropy": 1.8991770133376122, "epoch": 0.08617756763815342, "grad_norm": 7.5659074783325195, "learning_rate": 8.617938186830752e-06, "loss": 0.5869, "mean_token_accuracy": 0.828110148012638, "num_tokens": 33368988.0, "step": 27800 }, { "entropy": 1.8398756310343742, "epoch": 0.08620856676320311, "grad_norm": 8.638795852661133, "learning_rate": 8.616388560258459e-06, "loss": 0.5014, "mean_token_accuracy": 0.8359197080135345, "num_tokens": 33381666.0, "step": 27810 }, { "entropy": 1.9264942169189454, "epoch": 0.08623956588825281, "grad_norm": 9.415760040283203, "learning_rate": 8.61483976931965e-06, "loss": 0.5762, "mean_token_accuracy": 0.8242600187659264, "num_tokens": 33393125.0, "step": 27820 }, { "entropy": 1.841173852980137, "epoch": 0.0862705650133025, "grad_norm": 6.242795944213867, "learning_rate": 8.613291813263578e-06, "loss": 0.4799, "mean_token_accuracy": 0.8402754053473472, "num_tokens": 33405715.0, "step": 27830 }, { "entropy": 1.9486567616462707, "epoch": 0.08630156413835219, "grad_norm": 8.530654907226562, "learning_rate": 8.61174469134043e-06, "loss": 0.621, "mean_token_accuracy": 0.8173292249441146, "num_tokens": 33417314.0, "step": 27840 }, { "entropy": 1.8448371395468712, "epoch": 0.08633256326340188, "grad_norm": 5.01255464553833, "learning_rate": 8.610198402801338e-06, "loss": 0.4891, "mean_token_accuracy": 0.8294866099953652, "num_tokens": 33431319.0, "step": 27850 }, { "entropy": 1.8705769084393977, "epoch": 0.08636356238845158, "grad_norm": 12.166780471801758, "learning_rate": 8.60865294689838e-06, "loss": 0.6069, "mean_token_accuracy": 0.8117830008268356, "num_tokens": 33444973.0, "step": 27860 }, { "entropy": 1.9853614136576652, "epoch": 0.08639456151350128, "grad_norm": 9.717790603637695, "learning_rate": 8.607108322884566e-06, "loss": 0.61, "mean_token_accuracy": 0.8108811169862747, "num_tokens": 33456479.0, "step": 27870 }, { "entropy": 1.918572799861431, "epoch": 0.08642556063855097, "grad_norm": 4.174164295196533, "learning_rate": 8.605564530013847e-06, "loss": 0.5333, "mean_token_accuracy": 0.829804676771164, "num_tokens": 33468303.0, "step": 27880 }, { "entropy": 1.9406263917684554, "epoch": 0.08645655976360067, "grad_norm": 8.830784797668457, "learning_rate": 8.604021567541113e-06, "loss": 0.6364, "mean_token_accuracy": 0.8110644370317459, "num_tokens": 33480915.0, "step": 27890 }, { "entropy": 1.9681098356842994, "epoch": 0.08648755888865037, "grad_norm": 9.740206718444824, "learning_rate": 8.602479434722184e-06, "loss": 0.623, "mean_token_accuracy": 0.8138311117887497, "num_tokens": 33491964.0, "step": 27900 }, { "entropy": 1.9589456513524055, "epoch": 0.08651855801370006, "grad_norm": 9.058309555053711, "learning_rate": 8.600938130813817e-06, "loss": 0.6245, "mean_token_accuracy": 0.814877088367939, "num_tokens": 33503440.0, "step": 27910 }, { "entropy": 1.787692840397358, "epoch": 0.08654955713874976, "grad_norm": 9.325605392456055, "learning_rate": 8.5993976550737e-06, "loss": 0.4343, "mean_token_accuracy": 0.8466595977544784, "num_tokens": 33517709.0, "step": 27920 }, { "entropy": 1.9136761993169784, "epoch": 0.08658055626379946, "grad_norm": 11.386701583862305, "learning_rate": 8.59785800676045e-06, "loss": 0.5628, "mean_token_accuracy": 0.8248209357261658, "num_tokens": 33529539.0, "step": 27930 }, { "entropy": 1.8507066249847413, "epoch": 0.08661155538884915, "grad_norm": 9.556991577148438, "learning_rate": 8.596319185133614e-06, "loss": 0.521, "mean_token_accuracy": 0.8304374098777771, "num_tokens": 33542002.0, "step": 27940 }, { "entropy": 1.9005508705973626, "epoch": 0.08664255451389885, "grad_norm": 4.965198516845703, "learning_rate": 8.594781189453666e-06, "loss": 0.5964, "mean_token_accuracy": 0.8208310827612877, "num_tokens": 33553517.0, "step": 27950 }, { "entropy": 1.9146973922848702, "epoch": 0.08667355363894855, "grad_norm": 4.9398932456970215, "learning_rate": 8.593244018982006e-06, "loss": 0.5812, "mean_token_accuracy": 0.8191220477223397, "num_tokens": 33565833.0, "step": 27960 }, { "entropy": 1.8798900321125984, "epoch": 0.08670455276399823, "grad_norm": 8.898138046264648, "learning_rate": 8.59170767298096e-06, "loss": 0.6295, "mean_token_accuracy": 0.8155390247702599, "num_tokens": 33578565.0, "step": 27970 }, { "entropy": 1.8256605304777622, "epoch": 0.08673555188904793, "grad_norm": 9.652932167053223, "learning_rate": 8.590172150713773e-06, "loss": 0.4694, "mean_token_accuracy": 0.8326058983802795, "num_tokens": 33591102.0, "step": 27980 }, { "entropy": 1.8813355445861817, "epoch": 0.08676655101409762, "grad_norm": 9.704791069030762, "learning_rate": 8.588637451444612e-06, "loss": 0.5626, "mean_token_accuracy": 0.8161522090435028, "num_tokens": 33603187.0, "step": 27990 }, { "entropy": 1.926676908135414, "epoch": 0.08679755013914732, "grad_norm": 10.068709373474121, "learning_rate": 8.587103574438569e-06, "loss": 0.644, "mean_token_accuracy": 0.8118028625845909, "num_tokens": 33614612.0, "step": 28000 }, { "entropy": 1.880669642984867, "epoch": 0.08682854926419702, "grad_norm": 8.847538948059082, "learning_rate": 8.585570518961651e-06, "loss": 0.5402, "mean_token_accuracy": 0.827701772749424, "num_tokens": 33626936.0, "step": 28010 }, { "entropy": 1.9508138015866279, "epoch": 0.08685954838924671, "grad_norm": 11.043745994567871, "learning_rate": 8.58403828428078e-06, "loss": 0.5966, "mean_token_accuracy": 0.8156369537115097, "num_tokens": 33638555.0, "step": 28020 }, { "entropy": 1.9032750859856606, "epoch": 0.08689054751429641, "grad_norm": 10.193146705627441, "learning_rate": 8.582506869663793e-06, "loss": 0.5462, "mean_token_accuracy": 0.8273995086550713, "num_tokens": 33650029.0, "step": 28030 }, { "entropy": 1.8772203177213669, "epoch": 0.0869215466393461, "grad_norm": 4.957283020019531, "learning_rate": 8.580976274379448e-06, "loss": 0.5746, "mean_token_accuracy": 0.8220570385456085, "num_tokens": 33662687.0, "step": 28040 }, { "entropy": 2.005272647738457, "epoch": 0.0869525457643958, "grad_norm": 8.876986503601074, "learning_rate": 8.579446497697407e-06, "loss": 0.6347, "mean_token_accuracy": 0.809257960319519, "num_tokens": 33673133.0, "step": 28050 }, { "entropy": 1.897057008743286, "epoch": 0.0869835448894455, "grad_norm": 4.687375068664551, "learning_rate": 8.57791753888825e-06, "loss": 0.5549, "mean_token_accuracy": 0.8260800316929817, "num_tokens": 33685382.0, "step": 28060 }, { "entropy": 1.890799196064472, "epoch": 0.0870145440144952, "grad_norm": 11.013267517089844, "learning_rate": 8.576389397223463e-06, "loss": 0.5619, "mean_token_accuracy": 0.8162667453289032, "num_tokens": 33698473.0, "step": 28070 }, { "entropy": 1.8947896853089332, "epoch": 0.08704554313954489, "grad_norm": 9.438980102539062, "learning_rate": 8.574862071975438e-06, "loss": 0.495, "mean_token_accuracy": 0.8390587836503982, "num_tokens": 33710540.0, "step": 28080 }, { "entropy": 1.833718155324459, "epoch": 0.08707654226459458, "grad_norm": 9.037769317626953, "learning_rate": 8.57333556241748e-06, "loss": 0.5269, "mean_token_accuracy": 0.8308623015880585, "num_tokens": 33723318.0, "step": 28090 }, { "entropy": 1.9327234029769897, "epoch": 0.08710754138964427, "grad_norm": 8.257326126098633, "learning_rate": 8.571809867823794e-06, "loss": 0.6263, "mean_token_accuracy": 0.8134735986590386, "num_tokens": 33734540.0, "step": 28100 }, { "entropy": 1.9399518385529517, "epoch": 0.08713854051469397, "grad_norm": 8.866934776306152, "learning_rate": 8.57028498746949e-06, "loss": 0.5604, "mean_token_accuracy": 0.8291549518704414, "num_tokens": 33745641.0, "step": 28110 }, { "entropy": 1.8807589188218117, "epoch": 0.08716953963974367, "grad_norm": 9.743132591247559, "learning_rate": 8.568760920630582e-06, "loss": 0.5934, "mean_token_accuracy": 0.8171397164463997, "num_tokens": 33757675.0, "step": 28120 }, { "entropy": 1.8112970426678658, "epoch": 0.08720053876479336, "grad_norm": 12.117599487304688, "learning_rate": 8.567237666583983e-06, "loss": 0.5077, "mean_token_accuracy": 0.8401974871754646, "num_tokens": 33769971.0, "step": 28130 }, { "entropy": 1.8710318520665168, "epoch": 0.08723153788984306, "grad_norm": 10.758975982666016, "learning_rate": 8.565715224607507e-06, "loss": 0.5972, "mean_token_accuracy": 0.8087448701262474, "num_tokens": 33782332.0, "step": 28140 }, { "entropy": 1.9059083193540574, "epoch": 0.08726253701489275, "grad_norm": 9.423166275024414, "learning_rate": 8.564193593979863e-06, "loss": 0.566, "mean_token_accuracy": 0.8211286991834641, "num_tokens": 33794157.0, "step": 28150 }, { "entropy": 1.8057139664888382, "epoch": 0.08729353613994245, "grad_norm": 4.398457050323486, "learning_rate": 8.562672773980662e-06, "loss": 0.5061, "mean_token_accuracy": 0.8441807180643082, "num_tokens": 33806432.0, "step": 28160 }, { "entropy": 1.8262229442596436, "epoch": 0.08732453526499215, "grad_norm": 8.604774475097656, "learning_rate": 8.561152763890406e-06, "loss": 0.519, "mean_token_accuracy": 0.8298747554421425, "num_tokens": 33818953.0, "step": 28170 }, { "entropy": 1.856579264998436, "epoch": 0.08735553439004184, "grad_norm": 9.146932601928711, "learning_rate": 8.559633562990491e-06, "loss": 0.5448, "mean_token_accuracy": 0.8260318920016289, "num_tokens": 33831302.0, "step": 28180 }, { "entropy": 1.9090891823172569, "epoch": 0.08738653351509154, "grad_norm": 10.917224884033203, "learning_rate": 8.558115170563206e-06, "loss": 0.6262, "mean_token_accuracy": 0.7991869211196899, "num_tokens": 33843087.0, "step": 28190 }, { "entropy": 1.8630158439278603, "epoch": 0.08741753264014124, "grad_norm": 8.141376495361328, "learning_rate": 8.556597585891731e-06, "loss": 0.5709, "mean_token_accuracy": 0.8259263783693314, "num_tokens": 33855443.0, "step": 28200 }, { "entropy": 1.9804678469896317, "epoch": 0.08744853176519093, "grad_norm": 9.60744857788086, "learning_rate": 8.555080808260135e-06, "loss": 0.6753, "mean_token_accuracy": 0.8023278743028641, "num_tokens": 33866719.0, "step": 28210 }, { "entropy": 1.933399637043476, "epoch": 0.08747953089024062, "grad_norm": 7.662554740905762, "learning_rate": 8.55356483695338e-06, "loss": 0.5787, "mean_token_accuracy": 0.8204610332846641, "num_tokens": 33877680.0, "step": 28220 }, { "entropy": 1.936586219072342, "epoch": 0.08751053001529031, "grad_norm": 9.148261070251465, "learning_rate": 8.552049671257301e-06, "loss": 0.6002, "mean_token_accuracy": 0.810947285592556, "num_tokens": 33890103.0, "step": 28230 }, { "entropy": 1.9918580889701842, "epoch": 0.08754152914034001, "grad_norm": 9.620087623596191, "learning_rate": 8.550535310458639e-06, "loss": 0.6424, "mean_token_accuracy": 0.8145117297768593, "num_tokens": 33901085.0, "step": 28240 }, { "entropy": 1.9204299062490464, "epoch": 0.0875725282653897, "grad_norm": 9.934686660766602, "learning_rate": 8.549021753844996e-06, "loss": 0.5643, "mean_token_accuracy": 0.82352195084095, "num_tokens": 33913338.0, "step": 28250 }, { "entropy": 1.8327893808484077, "epoch": 0.0876035273904394, "grad_norm": 5.132232666015625, "learning_rate": 8.547509000704874e-06, "loss": 0.4515, "mean_token_accuracy": 0.8444669589400291, "num_tokens": 33926686.0, "step": 28260 }, { "entropy": 1.8552611097693443, "epoch": 0.0876345265154891, "grad_norm": 10.373604774475098, "learning_rate": 8.545997050327648e-06, "loss": 0.5785, "mean_token_accuracy": 0.8225721046328545, "num_tokens": 33939038.0, "step": 28270 }, { "entropy": 1.9178115367889403, "epoch": 0.0876655256405388, "grad_norm": 11.540125846862793, "learning_rate": 8.544485902003573e-06, "loss": 0.6322, "mean_token_accuracy": 0.8020992532372475, "num_tokens": 33950992.0, "step": 28280 }, { "entropy": 1.852571301162243, "epoch": 0.0876965247655885, "grad_norm": 10.074957847595215, "learning_rate": 8.542975555023786e-06, "loss": 0.5433, "mean_token_accuracy": 0.8297949224710465, "num_tokens": 33963043.0, "step": 28290 }, { "entropy": 1.815762387216091, "epoch": 0.08772752389063819, "grad_norm": 5.080463886260986, "learning_rate": 8.541466008680297e-06, "loss": 0.5416, "mean_token_accuracy": 0.8305084735155106, "num_tokens": 33976042.0, "step": 28300 }, { "entropy": 1.7797249928116798, "epoch": 0.08775852301568789, "grad_norm": 11.101983070373535, "learning_rate": 8.539957262265988e-06, "loss": 0.5854, "mean_token_accuracy": 0.8253014713525773, "num_tokens": 33989173.0, "step": 28310 }, { "entropy": 1.9091511026024819, "epoch": 0.08778952214073758, "grad_norm": 6.8226141929626465, "learning_rate": 8.538449315074628e-06, "loss": 0.6012, "mean_token_accuracy": 0.8207783192396164, "num_tokens": 34000588.0, "step": 28320 }, { "entropy": 1.8104485064744948, "epoch": 0.08782052126578728, "grad_norm": 8.398052215576172, "learning_rate": 8.536942166400845e-06, "loss": 0.456, "mean_token_accuracy": 0.8447774976491929, "num_tokens": 34013462.0, "step": 28330 }, { "entropy": 1.8076317757368088, "epoch": 0.08785152039083696, "grad_norm": 8.194868087768555, "learning_rate": 8.535435815540142e-06, "loss": 0.5373, "mean_token_accuracy": 0.8241015315055847, "num_tokens": 34025542.0, "step": 28340 }, { "entropy": 1.8683214783668518, "epoch": 0.08788251951588666, "grad_norm": 9.149380683898926, "learning_rate": 8.533930261788897e-06, "loss": 0.5971, "mean_token_accuracy": 0.8132390677928925, "num_tokens": 34037377.0, "step": 28350 }, { "entropy": 1.8220268458127975, "epoch": 0.08791351864093636, "grad_norm": 4.427048683166504, "learning_rate": 8.532425504444351e-06, "loss": 0.5305, "mean_token_accuracy": 0.82462307959795, "num_tokens": 34050269.0, "step": 28360 }, { "entropy": 1.804908536374569, "epoch": 0.08794451776598605, "grad_norm": 4.890988349914551, "learning_rate": 8.530921542804612e-06, "loss": 0.5341, "mean_token_accuracy": 0.8353535622358322, "num_tokens": 34062946.0, "step": 28370 }, { "entropy": 1.832678309082985, "epoch": 0.08797551689103575, "grad_norm": 9.909698486328125, "learning_rate": 8.52941837616866e-06, "loss": 0.5177, "mean_token_accuracy": 0.8312997654080391, "num_tokens": 34075013.0, "step": 28380 }, { "entropy": 1.7965776398777962, "epoch": 0.08800651601608545, "grad_norm": 2.5267937183380127, "learning_rate": 8.527916003836331e-06, "loss": 0.5494, "mean_token_accuracy": 0.8231059461832047, "num_tokens": 34087851.0, "step": 28390 }, { "entropy": 1.8382470428943634, "epoch": 0.08803751514113514, "grad_norm": 9.315332412719727, "learning_rate": 8.52641442510833e-06, "loss": 0.5739, "mean_token_accuracy": 0.8272110924124718, "num_tokens": 34099777.0, "step": 28400 }, { "entropy": 1.7765842095017432, "epoch": 0.08806851426618484, "grad_norm": 9.057583808898926, "learning_rate": 8.524913639286219e-06, "loss": 0.4539, "mean_token_accuracy": 0.8396439641714096, "num_tokens": 34111989.0, "step": 28410 }, { "entropy": 1.9059796720743178, "epoch": 0.08809951339123454, "grad_norm": 8.058761596679688, "learning_rate": 8.523413645672424e-06, "loss": 0.6247, "mean_token_accuracy": 0.811768627166748, "num_tokens": 34123482.0, "step": 28420 }, { "entropy": 1.8845535546541214, "epoch": 0.08813051251628423, "grad_norm": 7.467669486999512, "learning_rate": 8.52191444357023e-06, "loss": 0.6146, "mean_token_accuracy": 0.8204743906855583, "num_tokens": 34134946.0, "step": 28430 }, { "entropy": 1.789374603331089, "epoch": 0.08816151164133393, "grad_norm": 8.87814712524414, "learning_rate": 8.520416032283778e-06, "loss": 0.5555, "mean_token_accuracy": 0.8291935101151466, "num_tokens": 34147310.0, "step": 28440 }, { "entropy": 1.9232223629951477, "epoch": 0.08819251076638362, "grad_norm": 10.156554222106934, "learning_rate": 8.518918411118063e-06, "loss": 0.6747, "mean_token_accuracy": 0.8002053126692772, "num_tokens": 34157822.0, "step": 28450 }, { "entropy": 1.8766271471977234, "epoch": 0.08822350989143331, "grad_norm": 9.71645450592041, "learning_rate": 8.51742157937894e-06, "loss": 0.6254, "mean_token_accuracy": 0.8169643610715867, "num_tokens": 34168930.0, "step": 28460 }, { "entropy": 1.9275714561343193, "epoch": 0.088254509016483, "grad_norm": 10.583243370056152, "learning_rate": 8.515925536373112e-06, "loss": 0.6441, "mean_token_accuracy": 0.8056027069687843, "num_tokens": 34181366.0, "step": 28470 }, { "entropy": 1.8285674542188644, "epoch": 0.0882855081415327, "grad_norm": 12.124537467956543, "learning_rate": 8.51443028140814e-06, "loss": 0.5686, "mean_token_accuracy": 0.81962631046772, "num_tokens": 34193201.0, "step": 28480 }, { "entropy": 1.8924355298280715, "epoch": 0.0883165072665824, "grad_norm": 8.544037818908691, "learning_rate": 8.512935813792427e-06, "loss": 0.5901, "mean_token_accuracy": 0.8254889070987701, "num_tokens": 34203925.0, "step": 28490 }, { "entropy": 1.8146496564149857, "epoch": 0.0883475063916321, "grad_norm": 9.721524238586426, "learning_rate": 8.511442132835237e-06, "loss": 0.533, "mean_token_accuracy": 0.8225750029087067, "num_tokens": 34216079.0, "step": 28500 }, { "entropy": 1.8287780940532685, "epoch": 0.08837850551668179, "grad_norm": 10.420062065124512, "learning_rate": 8.509949237846672e-06, "loss": 0.6115, "mean_token_accuracy": 0.8089086845517158, "num_tokens": 34228708.0, "step": 28510 }, { "entropy": 1.8081783071160316, "epoch": 0.08840950464173149, "grad_norm": 5.157535076141357, "learning_rate": 8.508457128137686e-06, "loss": 0.5269, "mean_token_accuracy": 0.8270225182175637, "num_tokens": 34241771.0, "step": 28520 }, { "entropy": 1.8582287296652793, "epoch": 0.08844050376678118, "grad_norm": 4.54521369934082, "learning_rate": 8.506965803020078e-06, "loss": 0.5714, "mean_token_accuracy": 0.8173837080597878, "num_tokens": 34253510.0, "step": 28530 }, { "entropy": 1.769499270617962, "epoch": 0.08847150289183088, "grad_norm": 11.29238224029541, "learning_rate": 8.50547526180649e-06, "loss": 0.5211, "mean_token_accuracy": 0.8185042932629585, "num_tokens": 34266457.0, "step": 28540 }, { "entropy": 1.8402513667941094, "epoch": 0.08850250201688058, "grad_norm": 10.66517162322998, "learning_rate": 8.503985503810404e-06, "loss": 0.5411, "mean_token_accuracy": 0.8161203816533089, "num_tokens": 34278814.0, "step": 28550 }, { "entropy": 1.9259721651673316, "epoch": 0.08853350114193027, "grad_norm": 11.027581214904785, "learning_rate": 8.502496528346151e-06, "loss": 0.6318, "mean_token_accuracy": 0.8031986460089684, "num_tokens": 34290769.0, "step": 28560 }, { "entropy": 1.8921418815851212, "epoch": 0.08856450026697997, "grad_norm": 8.231386184692383, "learning_rate": 8.501008334728893e-06, "loss": 0.573, "mean_token_accuracy": 0.8195232212543487, "num_tokens": 34301680.0, "step": 28570 }, { "entropy": 1.9299334168434144, "epoch": 0.08859549939202967, "grad_norm": 12.117446899414062, "learning_rate": 8.49952092227464e-06, "loss": 0.6296, "mean_token_accuracy": 0.8005195692181587, "num_tokens": 34313378.0, "step": 28580 }, { "entropy": 1.8172454446554185, "epoch": 0.08862649851707935, "grad_norm": 8.979788780212402, "learning_rate": 8.498034290300233e-06, "loss": 0.5726, "mean_token_accuracy": 0.8196427300572395, "num_tokens": 34326259.0, "step": 28590 }, { "entropy": 1.8738806433975697, "epoch": 0.08865749764212905, "grad_norm": 9.108292579650879, "learning_rate": 8.496548438123347e-06, "loss": 0.5925, "mean_token_accuracy": 0.8214808851480484, "num_tokens": 34337849.0, "step": 28600 }, { "entropy": 1.8880690574645995, "epoch": 0.08868849676717874, "grad_norm": 10.18707275390625, "learning_rate": 8.495063365062501e-06, "loss": 0.5741, "mean_token_accuracy": 0.8180713891983032, "num_tokens": 34349609.0, "step": 28610 }, { "entropy": 1.7710354149341583, "epoch": 0.08871949589222844, "grad_norm": 2.5999815464019775, "learning_rate": 8.493579070437038e-06, "loss": 0.4607, "mean_token_accuracy": 0.8418709754943847, "num_tokens": 34363340.0, "step": 28620 }, { "entropy": 1.7987074330449104, "epoch": 0.08875049501727814, "grad_norm": 4.221140384674072, "learning_rate": 8.492095553567142e-06, "loss": 0.5227, "mean_token_accuracy": 0.8264068886637688, "num_tokens": 34376348.0, "step": 28630 }, { "entropy": 1.8191845625638963, "epoch": 0.08878149414232783, "grad_norm": 8.881943702697754, "learning_rate": 8.49061281377382e-06, "loss": 0.615, "mean_token_accuracy": 0.8144074931740761, "num_tokens": 34389472.0, "step": 28640 }, { "entropy": 1.910823555290699, "epoch": 0.08881249326737753, "grad_norm": 10.219226837158203, "learning_rate": 8.489130850378912e-06, "loss": 0.5835, "mean_token_accuracy": 0.8214432463049889, "num_tokens": 34400353.0, "step": 28650 }, { "entropy": 1.817768232524395, "epoch": 0.08884349239242723, "grad_norm": 9.067606925964355, "learning_rate": 8.487649662705087e-06, "loss": 0.5141, "mean_token_accuracy": 0.8307969868183136, "num_tokens": 34412601.0, "step": 28660 }, { "entropy": 1.7433785900473595, "epoch": 0.08887449151747692, "grad_norm": 9.326837539672852, "learning_rate": 8.48616925007584e-06, "loss": 0.4671, "mean_token_accuracy": 0.8333866909146309, "num_tokens": 34425813.0, "step": 28670 }, { "entropy": 1.8864496439695357, "epoch": 0.08890549064252662, "grad_norm": 8.599132537841797, "learning_rate": 8.484689611815491e-06, "loss": 0.6327, "mean_token_accuracy": 0.8141407266259193, "num_tokens": 34437369.0, "step": 28680 }, { "entropy": 1.8374801024794578, "epoch": 0.08893648976757632, "grad_norm": 9.512642860412598, "learning_rate": 8.483210747249186e-06, "loss": 0.5685, "mean_token_accuracy": 0.8316513374447823, "num_tokens": 34449662.0, "step": 28690 }, { "entropy": 1.7934413641691207, "epoch": 0.08896748889262601, "grad_norm": 5.296740531921387, "learning_rate": 8.481732655702892e-06, "loss": 0.5281, "mean_token_accuracy": 0.8272898524999619, "num_tokens": 34462445.0, "step": 28700 }, { "entropy": 1.8716158524155617, "epoch": 0.0889984880176757, "grad_norm": 9.011677742004395, "learning_rate": 8.4802553365034e-06, "loss": 0.5302, "mean_token_accuracy": 0.8340117856860161, "num_tokens": 34474062.0, "step": 28710 }, { "entropy": 1.801230075955391, "epoch": 0.08902948714272539, "grad_norm": 8.55329418182373, "learning_rate": 8.478778788978323e-06, "loss": 0.5594, "mean_token_accuracy": 0.8217922821640968, "num_tokens": 34487372.0, "step": 28720 }, { "entropy": 1.8363505557179451, "epoch": 0.08906048626777509, "grad_norm": 9.565589904785156, "learning_rate": 8.477303012456088e-06, "loss": 0.5462, "mean_token_accuracy": 0.825613497197628, "num_tokens": 34499424.0, "step": 28730 }, { "entropy": 1.8904439568519593, "epoch": 0.08909148539282478, "grad_norm": 9.289710998535156, "learning_rate": 8.47582800626594e-06, "loss": 0.5842, "mean_token_accuracy": 0.8117494031786918, "num_tokens": 34510549.0, "step": 28740 }, { "entropy": 1.876707810163498, "epoch": 0.08912248451787448, "grad_norm": 10.488570213317871, "learning_rate": 8.474353769737951e-06, "loss": 0.584, "mean_token_accuracy": 0.8244671374559402, "num_tokens": 34522326.0, "step": 28750 }, { "entropy": 1.8384212747216224, "epoch": 0.08915348364292418, "grad_norm": 4.727441787719727, "learning_rate": 8.472880302202995e-06, "loss": 0.5507, "mean_token_accuracy": 0.8281288787722587, "num_tokens": 34533723.0, "step": 28760 }, { "entropy": 1.8969373285770417, "epoch": 0.08918448276797387, "grad_norm": 10.660612106323242, "learning_rate": 8.471407602992768e-06, "loss": 0.6065, "mean_token_accuracy": 0.8140498995780945, "num_tokens": 34545247.0, "step": 28770 }, { "entropy": 1.8668588057160378, "epoch": 0.08921548189302357, "grad_norm": 5.187861919403076, "learning_rate": 8.469935671439776e-06, "loss": 0.5975, "mean_token_accuracy": 0.8087451472878456, "num_tokens": 34557397.0, "step": 28780 }, { "entropy": 1.9032288029789926, "epoch": 0.08924648101807327, "grad_norm": 5.01014518737793, "learning_rate": 8.468464506877338e-06, "loss": 0.5857, "mean_token_accuracy": 0.8231427073478699, "num_tokens": 34568813.0, "step": 28790 }, { "entropy": 1.83003838211298, "epoch": 0.08927748014312296, "grad_norm": 7.490168571472168, "learning_rate": 8.46699410863958e-06, "loss": 0.5445, "mean_token_accuracy": 0.8283394366502762, "num_tokens": 34581102.0, "step": 28800 }, { "entropy": 1.8458102241158485, "epoch": 0.08930847926817266, "grad_norm": 11.137908935546875, "learning_rate": 8.465524476061445e-06, "loss": 0.5456, "mean_token_accuracy": 0.8272756159305572, "num_tokens": 34593458.0, "step": 28810 }, { "entropy": 1.7978875115513802, "epoch": 0.08933947839322236, "grad_norm": 13.252009391784668, "learning_rate": 8.464055608478673e-06, "loss": 0.5037, "mean_token_accuracy": 0.8363189071416854, "num_tokens": 34607214.0, "step": 28820 }, { "entropy": 1.8788623362779617, "epoch": 0.08937047751827204, "grad_norm": 8.798412322998047, "learning_rate": 8.46258750522782e-06, "loss": 0.5202, "mean_token_accuracy": 0.8359053075313568, "num_tokens": 34618731.0, "step": 28830 }, { "entropy": 1.8706136047840118, "epoch": 0.08940147664332174, "grad_norm": 10.804203033447266, "learning_rate": 8.46112016564624e-06, "loss": 0.6323, "mean_token_accuracy": 0.8170384958386421, "num_tokens": 34629767.0, "step": 28840 }, { "entropy": 1.9193976387381553, "epoch": 0.08943247576837143, "grad_norm": 9.608490943908691, "learning_rate": 8.459653589072098e-06, "loss": 0.5879, "mean_token_accuracy": 0.823236158490181, "num_tokens": 34641783.0, "step": 28850 }, { "entropy": 1.953177237510681, "epoch": 0.08946347489342113, "grad_norm": 9.958290100097656, "learning_rate": 8.458187774844355e-06, "loss": 0.6343, "mean_token_accuracy": 0.8059092834591866, "num_tokens": 34652879.0, "step": 28860 }, { "entropy": 1.9278223618865014, "epoch": 0.08949447401847083, "grad_norm": 9.370678901672363, "learning_rate": 8.456722722302779e-06, "loss": 0.5779, "mean_token_accuracy": 0.8173175677657127, "num_tokens": 34664517.0, "step": 28870 }, { "entropy": 1.743288952112198, "epoch": 0.08952547314352052, "grad_norm": 4.402005672454834, "learning_rate": 8.45525843078793e-06, "loss": 0.4175, "mean_token_accuracy": 0.8388036683201789, "num_tokens": 34678318.0, "step": 28880 }, { "entropy": 1.82712120115757, "epoch": 0.08955647226857022, "grad_norm": 5.668634414672852, "learning_rate": 8.453794899641178e-06, "loss": 0.5317, "mean_token_accuracy": 0.8243148773908615, "num_tokens": 34690574.0, "step": 28890 }, { "entropy": 1.794793924689293, "epoch": 0.08958747139361992, "grad_norm": 4.694520950317383, "learning_rate": 8.452332128204687e-06, "loss": 0.5341, "mean_token_accuracy": 0.8249142169952393, "num_tokens": 34702705.0, "step": 28900 }, { "entropy": 1.9069462090730667, "epoch": 0.08961847051866961, "grad_norm": 9.739960670471191, "learning_rate": 8.450870115821412e-06, "loss": 0.6399, "mean_token_accuracy": 0.8125949770212173, "num_tokens": 34713359.0, "step": 28910 }, { "entropy": 1.843550091981888, "epoch": 0.08964946964371931, "grad_norm": 9.711644172668457, "learning_rate": 8.449408861835107e-06, "loss": 0.5467, "mean_token_accuracy": 0.8326569676399231, "num_tokens": 34724846.0, "step": 28920 }, { "entropy": 1.8499679207801818, "epoch": 0.089680468768769, "grad_norm": 9.517023086547852, "learning_rate": 8.447948365590324e-06, "loss": 0.6048, "mean_token_accuracy": 0.8209212452173233, "num_tokens": 34736090.0, "step": 28930 }, { "entropy": 1.8778686568140983, "epoch": 0.0897114678938187, "grad_norm": 7.4227423667907715, "learning_rate": 8.446488626432398e-06, "loss": 0.5703, "mean_token_accuracy": 0.8157725527882576, "num_tokens": 34748597.0, "step": 28940 }, { "entropy": 1.901959116756916, "epoch": 0.0897424670188684, "grad_norm": 8.503968238830566, "learning_rate": 8.445029643707466e-06, "loss": 0.6077, "mean_token_accuracy": 0.8245082676410675, "num_tokens": 34760085.0, "step": 28950 }, { "entropy": 1.9186773270368576, "epoch": 0.08977346614391808, "grad_norm": 9.638235092163086, "learning_rate": 8.443571416762454e-06, "loss": 0.5811, "mean_token_accuracy": 0.8147833585739136, "num_tokens": 34771714.0, "step": 28960 }, { "entropy": 1.9039877220988273, "epoch": 0.08980446526896778, "grad_norm": 10.257258415222168, "learning_rate": 8.442113944945066e-06, "loss": 0.5929, "mean_token_accuracy": 0.8272214874625206, "num_tokens": 34783065.0, "step": 28970 }, { "entropy": 1.8442776799201965, "epoch": 0.08983546439401748, "grad_norm": 8.371413230895996, "learning_rate": 8.440657227603809e-06, "loss": 0.5034, "mean_token_accuracy": 0.8354337051510811, "num_tokens": 34796313.0, "step": 28980 }, { "entropy": 1.8650359570980073, "epoch": 0.08986646351906717, "grad_norm": 8.93613052368164, "learning_rate": 8.439201264087966e-06, "loss": 0.4865, "mean_token_accuracy": 0.8383944362401963, "num_tokens": 34809266.0, "step": 28990 }, { "entropy": 1.898645070195198, "epoch": 0.08989746264411687, "grad_norm": 4.809848785400391, "learning_rate": 8.437746053747611e-06, "loss": 0.5454, "mean_token_accuracy": 0.8344621181488037, "num_tokens": 34821442.0, "step": 29000 }, { "entropy": 1.8691908940672874, "epoch": 0.08992846176916656, "grad_norm": 4.506728649139404, "learning_rate": 8.436291595933597e-06, "loss": 0.5207, "mean_token_accuracy": 0.8261950254440308, "num_tokens": 34834092.0, "step": 29010 }, { "entropy": 1.989156812429428, "epoch": 0.08995946089421626, "grad_norm": 10.102865219116211, "learning_rate": 8.434837889997567e-06, "loss": 0.6544, "mean_token_accuracy": 0.8023478895425796, "num_tokens": 34844589.0, "step": 29020 }, { "entropy": 1.8639222919940948, "epoch": 0.08999046001926596, "grad_norm": 8.607808113098145, "learning_rate": 8.433384935291941e-06, "loss": 0.5202, "mean_token_accuracy": 0.8342802464962006, "num_tokens": 34856930.0, "step": 29030 }, { "entropy": 1.888324649631977, "epoch": 0.09002145914431565, "grad_norm": 15.61713695526123, "learning_rate": 8.43193273116992e-06, "loss": 0.5623, "mean_token_accuracy": 0.8299177408218383, "num_tokens": 34868535.0, "step": 29040 }, { "entropy": 1.9326095387339592, "epoch": 0.09005245826936535, "grad_norm": 13.876715660095215, "learning_rate": 8.430481276985486e-06, "loss": 0.606, "mean_token_accuracy": 0.8128352046012879, "num_tokens": 34879434.0, "step": 29050 }, { "entropy": 1.8903497770428657, "epoch": 0.09008345739441505, "grad_norm": 10.24538803100586, "learning_rate": 8.429030572093397e-06, "loss": 0.556, "mean_token_accuracy": 0.8233076304197311, "num_tokens": 34891052.0, "step": 29060 }, { "entropy": 1.8975743114948274, "epoch": 0.09011445651946474, "grad_norm": 11.500408172607422, "learning_rate": 8.427580615849188e-06, "loss": 0.5778, "mean_token_accuracy": 0.8229522377252578, "num_tokens": 34903040.0, "step": 29070 }, { "entropy": 1.8994533449411393, "epoch": 0.09014545564451443, "grad_norm": 11.263349533081055, "learning_rate": 8.426131407609173e-06, "loss": 0.5816, "mean_token_accuracy": 0.820560471713543, "num_tokens": 34914402.0, "step": 29080 }, { "entropy": 1.8676754549145698, "epoch": 0.09017645476956412, "grad_norm": 9.663436889648438, "learning_rate": 8.42468294673044e-06, "loss": 0.5948, "mean_token_accuracy": 0.8182600021362305, "num_tokens": 34926419.0, "step": 29090 }, { "entropy": 1.8573566570878028, "epoch": 0.09020745389461382, "grad_norm": 11.30569076538086, "learning_rate": 8.423235232570846e-06, "loss": 0.5602, "mean_token_accuracy": 0.8217246904969215, "num_tokens": 34938924.0, "step": 29100 }, { "entropy": 1.9108622312545775, "epoch": 0.09023845301966352, "grad_norm": 5.698962211608887, "learning_rate": 8.421788264489021e-06, "loss": 0.5796, "mean_token_accuracy": 0.8229934841394424, "num_tokens": 34950788.0, "step": 29110 }, { "entropy": 1.777287058532238, "epoch": 0.09026945214471321, "grad_norm": 9.155726432800293, "learning_rate": 8.420342041844372e-06, "loss": 0.518, "mean_token_accuracy": 0.834735095500946, "num_tokens": 34963772.0, "step": 29120 }, { "entropy": 1.8479400515556335, "epoch": 0.09030045126976291, "grad_norm": 4.175968170166016, "learning_rate": 8.418896563997072e-06, "loss": 0.5232, "mean_token_accuracy": 0.8213765308260917, "num_tokens": 34976155.0, "step": 29130 }, { "entropy": 1.8892390161752701, "epoch": 0.0903314503948126, "grad_norm": 10.586323738098145, "learning_rate": 8.41745183030806e-06, "loss": 0.5572, "mean_token_accuracy": 0.8211356267333031, "num_tokens": 34988223.0, "step": 29140 }, { "entropy": 1.8357836604118347, "epoch": 0.0903624495198623, "grad_norm": 10.024797439575195, "learning_rate": 8.416007840139042e-06, "loss": 0.5248, "mean_token_accuracy": 0.8338177174329757, "num_tokens": 35001045.0, "step": 29150 }, { "entropy": 1.887669287621975, "epoch": 0.090393448644912, "grad_norm": 9.929082870483398, "learning_rate": 8.4145645928525e-06, "loss": 0.6165, "mean_token_accuracy": 0.8130573540925979, "num_tokens": 35012268.0, "step": 29160 }, { "entropy": 1.8210702255368232, "epoch": 0.0904244477699617, "grad_norm": 4.600307464599609, "learning_rate": 8.413122087811668e-06, "loss": 0.5054, "mean_token_accuracy": 0.8326424300670624, "num_tokens": 35025165.0, "step": 29170 }, { "entropy": 1.9049372583627702, "epoch": 0.0904554468950114, "grad_norm": 9.302005767822266, "learning_rate": 8.411680324380554e-06, "loss": 0.5807, "mean_token_accuracy": 0.8166706204414368, "num_tokens": 35036999.0, "step": 29180 }, { "entropy": 1.8811736673116684, "epoch": 0.09048644602006109, "grad_norm": 11.642059326171875, "learning_rate": 8.410239301923921e-06, "loss": 0.5831, "mean_token_accuracy": 0.8235431507229805, "num_tokens": 35048718.0, "step": 29190 }, { "entropy": 1.7818408221006394, "epoch": 0.09051744514511077, "grad_norm": 3.9471521377563477, "learning_rate": 8.408799019807298e-06, "loss": 0.4817, "mean_token_accuracy": 0.8274667426943779, "num_tokens": 35062426.0, "step": 29200 }, { "entropy": 1.9234388038516044, "epoch": 0.09054844427016047, "grad_norm": 9.527081489562988, "learning_rate": 8.407359477396974e-06, "loss": 0.5952, "mean_token_accuracy": 0.824344064295292, "num_tokens": 35073342.0, "step": 29210 }, { "entropy": 1.9060495615005493, "epoch": 0.09057944339521017, "grad_norm": 4.219698905944824, "learning_rate": 8.405920674059997e-06, "loss": 0.5991, "mean_token_accuracy": 0.8162388294935227, "num_tokens": 35085769.0, "step": 29220 }, { "entropy": 1.9729074537754059, "epoch": 0.09061044252025986, "grad_norm": 12.57748794555664, "learning_rate": 8.404482609164172e-06, "loss": 0.678, "mean_token_accuracy": 0.8029696837067604, "num_tokens": 35096711.0, "step": 29230 }, { "entropy": 1.9721817374229431, "epoch": 0.09064144164530956, "grad_norm": 9.833843231201172, "learning_rate": 8.40304528207806e-06, "loss": 0.5768, "mean_token_accuracy": 0.8235157504677773, "num_tokens": 35107468.0, "step": 29240 }, { "entropy": 1.8956667900085449, "epoch": 0.09067244077035926, "grad_norm": 8.549565315246582, "learning_rate": 8.40160869217098e-06, "loss": 0.5324, "mean_token_accuracy": 0.8300123199820518, "num_tokens": 35119143.0, "step": 29250 }, { "entropy": 1.856950616836548, "epoch": 0.09070343989540895, "grad_norm": 8.162071228027344, "learning_rate": 8.400172838813004e-06, "loss": 0.4855, "mean_token_accuracy": 0.8311971753835679, "num_tokens": 35131680.0, "step": 29260 }, { "entropy": 1.92002714574337, "epoch": 0.09073443902045865, "grad_norm": 9.52449893951416, "learning_rate": 8.398737721374958e-06, "loss": 0.5886, "mean_token_accuracy": 0.8099859327077865, "num_tokens": 35143630.0, "step": 29270 }, { "entropy": 1.8672298848628999, "epoch": 0.09076543814550835, "grad_norm": 10.40442180633545, "learning_rate": 8.39730333922842e-06, "loss": 0.5327, "mean_token_accuracy": 0.8219729751348496, "num_tokens": 35155669.0, "step": 29280 }, { "entropy": 1.8741114243865014, "epoch": 0.09079643727055804, "grad_norm": 9.638392448425293, "learning_rate": 8.39586969174572e-06, "loss": 0.6016, "mean_token_accuracy": 0.8221320033073425, "num_tokens": 35168182.0, "step": 29290 }, { "entropy": 1.8563538879156112, "epoch": 0.09082743639560774, "grad_norm": 11.128931999206543, "learning_rate": 8.394436778299934e-06, "loss": 0.5658, "mean_token_accuracy": 0.8322840392589569, "num_tokens": 35180920.0, "step": 29300 }, { "entropy": 1.940355482697487, "epoch": 0.09085843552065744, "grad_norm": 12.628369331359863, "learning_rate": 8.393004598264892e-06, "loss": 0.6027, "mean_token_accuracy": 0.8121664762496948, "num_tokens": 35191979.0, "step": 29310 }, { "entropy": 1.881583635509014, "epoch": 0.09088943464570713, "grad_norm": 10.32827091217041, "learning_rate": 8.391573151015169e-06, "loss": 0.5926, "mean_token_accuracy": 0.8216746702790261, "num_tokens": 35204154.0, "step": 29320 }, { "entropy": 1.9963495016098023, "epoch": 0.09092043377075681, "grad_norm": 10.914651870727539, "learning_rate": 8.390142435926085e-06, "loss": 0.6708, "mean_token_accuracy": 0.802430622279644, "num_tokens": 35215370.0, "step": 29330 }, { "entropy": 1.807043470442295, "epoch": 0.09095143289580651, "grad_norm": 6.392316818237305, "learning_rate": 8.38871245237371e-06, "loss": 0.4936, "mean_token_accuracy": 0.83642118871212, "num_tokens": 35228636.0, "step": 29340 }, { "entropy": 1.8242978394031524, "epoch": 0.09098243202085621, "grad_norm": 8.627922058105469, "learning_rate": 8.387283199734848e-06, "loss": 0.4963, "mean_token_accuracy": 0.833967824280262, "num_tokens": 35241156.0, "step": 29350 }, { "entropy": 1.937102773785591, "epoch": 0.0910134311459059, "grad_norm": 7.6476640701293945, "learning_rate": 8.38585467738706e-06, "loss": 0.6429, "mean_token_accuracy": 0.810026279091835, "num_tokens": 35252420.0, "step": 29360 }, { "entropy": 1.8407186821103096, "epoch": 0.0910444302709556, "grad_norm": 9.836170196533203, "learning_rate": 8.38442688470864e-06, "loss": 0.5336, "mean_token_accuracy": 0.8297666862607003, "num_tokens": 35264790.0, "step": 29370 }, { "entropy": 1.8077852353453636, "epoch": 0.0910754293960053, "grad_norm": 9.396247863769531, "learning_rate": 8.382999821078624e-06, "loss": 0.5106, "mean_token_accuracy": 0.8240919351577759, "num_tokens": 35277676.0, "step": 29380 }, { "entropy": 1.8391135558485985, "epoch": 0.091106428521055, "grad_norm": 4.910935401916504, "learning_rate": 8.381573485876786e-06, "loss": 0.4858, "mean_token_accuracy": 0.8325628146529198, "num_tokens": 35290859.0, "step": 29390 }, { "entropy": 1.8129886105656623, "epoch": 0.09113742764610469, "grad_norm": 9.58067798614502, "learning_rate": 8.380147878483645e-06, "loss": 0.5201, "mean_token_accuracy": 0.826962910592556, "num_tokens": 35303398.0, "step": 29400 }, { "entropy": 1.85167246311903, "epoch": 0.09116842677115439, "grad_norm": 8.074714660644531, "learning_rate": 8.378722998280448e-06, "loss": 0.5415, "mean_token_accuracy": 0.8280769392848015, "num_tokens": 35315344.0, "step": 29410 }, { "entropy": 1.992096221446991, "epoch": 0.09119942589620408, "grad_norm": 9.145973205566406, "learning_rate": 8.377298844649186e-06, "loss": 0.7051, "mean_token_accuracy": 0.7987665429711341, "num_tokens": 35326107.0, "step": 29420 }, { "entropy": 1.8843266785144805, "epoch": 0.09123042502125378, "grad_norm": 9.272721290588379, "learning_rate": 8.375875416972584e-06, "loss": 0.5242, "mean_token_accuracy": 0.83641587048769, "num_tokens": 35338238.0, "step": 29430 }, { "entropy": 1.8167478024959565, "epoch": 0.09126142414630348, "grad_norm": 3.9986329078674316, "learning_rate": 8.374452714634094e-06, "loss": 0.5826, "mean_token_accuracy": 0.8207257181406021, "num_tokens": 35351268.0, "step": 29440 }, { "entropy": 1.9153858974575997, "epoch": 0.09129242327135316, "grad_norm": 10.753203392028809, "learning_rate": 8.373030737017907e-06, "loss": 0.5577, "mean_token_accuracy": 0.8246575996279717, "num_tokens": 35362594.0, "step": 29450 }, { "entropy": 1.9501359462738037, "epoch": 0.09132342239640286, "grad_norm": 9.741477966308594, "learning_rate": 8.371609483508947e-06, "loss": 0.6487, "mean_token_accuracy": 0.8103729501366616, "num_tokens": 35373596.0, "step": 29460 }, { "entropy": 1.932808554172516, "epoch": 0.09135442152145255, "grad_norm": 11.1822509765625, "learning_rate": 8.370188953492866e-06, "loss": 0.6056, "mean_token_accuracy": 0.821463891863823, "num_tokens": 35385127.0, "step": 29470 }, { "entropy": 1.8480829164385795, "epoch": 0.09138542064650225, "grad_norm": 8.782395362854004, "learning_rate": 8.368769146356043e-06, "loss": 0.5412, "mean_token_accuracy": 0.8359311327338219, "num_tokens": 35397793.0, "step": 29480 }, { "entropy": 1.9216597527265549, "epoch": 0.09141641977155195, "grad_norm": 10.21785831451416, "learning_rate": 8.36735006148559e-06, "loss": 0.6182, "mean_token_accuracy": 0.8218403205275535, "num_tokens": 35408887.0, "step": 29490 }, { "entropy": 1.854940627515316, "epoch": 0.09144741889660164, "grad_norm": 9.738880157470703, "learning_rate": 8.365931698269346e-06, "loss": 0.6001, "mean_token_accuracy": 0.8242891728878021, "num_tokens": 35421648.0, "step": 29500 }, { "entropy": 1.8743022330105306, "epoch": 0.09147841802165134, "grad_norm": 10.232392311096191, "learning_rate": 8.36451405609587e-06, "loss": 0.5753, "mean_token_accuracy": 0.8169502720236779, "num_tokens": 35434473.0, "step": 29510 }, { "entropy": 1.8739581793546676, "epoch": 0.09150941714670104, "grad_norm": 9.921628952026367, "learning_rate": 8.363097134354453e-06, "loss": 0.5563, "mean_token_accuracy": 0.8249258384108543, "num_tokens": 35446211.0, "step": 29520 }, { "entropy": 1.9153602778911591, "epoch": 0.09154041627175073, "grad_norm": 9.796987533569336, "learning_rate": 8.361680932435107e-06, "loss": 0.5789, "mean_token_accuracy": 0.8256885960698128, "num_tokens": 35457049.0, "step": 29530 }, { "entropy": 1.8282124564051627, "epoch": 0.09157141539680043, "grad_norm": 8.886029243469238, "learning_rate": 8.360265449728567e-06, "loss": 0.5452, "mean_token_accuracy": 0.8246500983834266, "num_tokens": 35470118.0, "step": 29540 }, { "entropy": 1.8801322914659977, "epoch": 0.09160241452185013, "grad_norm": 6.403395652770996, "learning_rate": 8.358850685626288e-06, "loss": 0.5331, "mean_token_accuracy": 0.8278462216258049, "num_tokens": 35483031.0, "step": 29550 }, { "entropy": 1.8667038604617119, "epoch": 0.09163341364689982, "grad_norm": 8.779369354248047, "learning_rate": 8.357436639520454e-06, "loss": 0.6143, "mean_token_accuracy": 0.8238617271184921, "num_tokens": 35494985.0, "step": 29560 }, { "entropy": 1.8358441174030304, "epoch": 0.0916644127719495, "grad_norm": 9.281476020812988, "learning_rate": 8.356023310803953e-06, "loss": 0.5642, "mean_token_accuracy": 0.8221923336386681, "num_tokens": 35506647.0, "step": 29570 }, { "entropy": 1.8791137009859085, "epoch": 0.0916954118969992, "grad_norm": 11.18396282196045, "learning_rate": 8.354610698870407e-06, "loss": 0.6371, "mean_token_accuracy": 0.8211662247776985, "num_tokens": 35518289.0, "step": 29580 }, { "entropy": 1.8068061396479607, "epoch": 0.0917264110220489, "grad_norm": 9.689407348632812, "learning_rate": 8.353198803114144e-06, "loss": 0.5216, "mean_token_accuracy": 0.819252048432827, "num_tokens": 35531769.0, "step": 29590 }, { "entropy": 1.8276824593544005, "epoch": 0.0917574101470986, "grad_norm": 8.703977584838867, "learning_rate": 8.351787622930218e-06, "loss": 0.4981, "mean_token_accuracy": 0.8255561321973801, "num_tokens": 35544219.0, "step": 29600 }, { "entropy": 1.9323107048869133, "epoch": 0.09178840927214829, "grad_norm": 9.766465187072754, "learning_rate": 8.350377157714388e-06, "loss": 0.6171, "mean_token_accuracy": 0.8179709568619729, "num_tokens": 35555186.0, "step": 29610 }, { "entropy": 1.896198531985283, "epoch": 0.09181940839719799, "grad_norm": 9.552749633789062, "learning_rate": 8.348967406863137e-06, "loss": 0.6246, "mean_token_accuracy": 0.8194371804594993, "num_tokens": 35567088.0, "step": 29620 }, { "entropy": 1.9005744218826295, "epoch": 0.09185040752224768, "grad_norm": 8.323497772216797, "learning_rate": 8.347558369773652e-06, "loss": 0.5541, "mean_token_accuracy": 0.8280677005648613, "num_tokens": 35578712.0, "step": 29630 }, { "entropy": 1.8314684435725213, "epoch": 0.09188140664729738, "grad_norm": 3.150182008743286, "learning_rate": 8.346150045843839e-06, "loss": 0.5522, "mean_token_accuracy": 0.8305447429418564, "num_tokens": 35591457.0, "step": 29640 }, { "entropy": 1.8687354385852815, "epoch": 0.09191240577234708, "grad_norm": 9.428099632263184, "learning_rate": 8.344742434472313e-06, "loss": 0.5739, "mean_token_accuracy": 0.8308294847607612, "num_tokens": 35603419.0, "step": 29650 }, { "entropy": 1.8598203748464583, "epoch": 0.09194340489739677, "grad_norm": 7.902471542358398, "learning_rate": 8.343335535058393e-06, "loss": 0.508, "mean_token_accuracy": 0.8371548101305961, "num_tokens": 35615606.0, "step": 29660 }, { "entropy": 1.955969789624214, "epoch": 0.09197440402244647, "grad_norm": 9.46319580078125, "learning_rate": 8.341929347002115e-06, "loss": 0.6819, "mean_token_accuracy": 0.8057120770215989, "num_tokens": 35626376.0, "step": 29670 }, { "entropy": 1.9067956805229187, "epoch": 0.09200540314749617, "grad_norm": 4.286515235900879, "learning_rate": 8.340523869704218e-06, "loss": 0.5918, "mean_token_accuracy": 0.8169622346758842, "num_tokens": 35637722.0, "step": 29680 }, { "entropy": 1.824674019217491, "epoch": 0.09203640227254586, "grad_norm": 9.380338668823242, "learning_rate": 8.33911910256615e-06, "loss": 0.5286, "mean_token_accuracy": 0.836339183151722, "num_tokens": 35650536.0, "step": 29690 }, { "entropy": 1.8453435346484184, "epoch": 0.09206740139759555, "grad_norm": 4.688096523284912, "learning_rate": 8.33771504499006e-06, "loss": 0.4755, "mean_token_accuracy": 0.8418554350733757, "num_tokens": 35663421.0, "step": 29700 }, { "entropy": 1.9342348664999007, "epoch": 0.09209840052264524, "grad_norm": 8.24661922454834, "learning_rate": 8.336311696378805e-06, "loss": 0.579, "mean_token_accuracy": 0.8277911230921745, "num_tokens": 35674421.0, "step": 29710 }, { "entropy": 1.9771510928869247, "epoch": 0.09212939964769494, "grad_norm": 11.507468223571777, "learning_rate": 8.334909056135947e-06, "loss": 0.6472, "mean_token_accuracy": 0.8096749007701873, "num_tokens": 35685226.0, "step": 29720 }, { "entropy": 1.8233107894659042, "epoch": 0.09216039877274464, "grad_norm": 11.91437816619873, "learning_rate": 8.333507123665745e-06, "loss": 0.5913, "mean_token_accuracy": 0.8291966646909714, "num_tokens": 35697511.0, "step": 29730 }, { "entropy": 1.9061884060502052, "epoch": 0.09219139789779433, "grad_norm": 9.895060539245605, "learning_rate": 8.332105898373162e-06, "loss": 0.5925, "mean_token_accuracy": 0.8209904283285141, "num_tokens": 35709184.0, "step": 29740 }, { "entropy": 1.8389362052083016, "epoch": 0.09222239702284403, "grad_norm": 9.202600479125977, "learning_rate": 8.330705379663864e-06, "loss": 0.538, "mean_token_accuracy": 0.8290344223380088, "num_tokens": 35722082.0, "step": 29750 }, { "entropy": 1.836703784763813, "epoch": 0.09225339614789373, "grad_norm": 9.766961097717285, "learning_rate": 8.32930556694421e-06, "loss": 0.6326, "mean_token_accuracy": 0.8234434753656388, "num_tokens": 35735502.0, "step": 29760 }, { "entropy": 1.823054054379463, "epoch": 0.09228439527294342, "grad_norm": 10.460724830627441, "learning_rate": 8.327906459621262e-06, "loss": 0.5132, "mean_token_accuracy": 0.8367855966091156, "num_tokens": 35747655.0, "step": 29770 }, { "entropy": 1.8504664957523347, "epoch": 0.09231539439799312, "grad_norm": 10.958568572998047, "learning_rate": 8.32650805710278e-06, "loss": 0.5747, "mean_token_accuracy": 0.8216020077466964, "num_tokens": 35759453.0, "step": 29780 }, { "entropy": 1.8954664573073388, "epoch": 0.09234639352304282, "grad_norm": 9.923591613769531, "learning_rate": 8.32511035879721e-06, "loss": 0.549, "mean_token_accuracy": 0.8215544745326042, "num_tokens": 35770958.0, "step": 29790 }, { "entropy": 1.8316803082823754, "epoch": 0.09237739264809251, "grad_norm": 9.00976848602295, "learning_rate": 8.323713364113706e-06, "loss": 0.5079, "mean_token_accuracy": 0.8291282281279564, "num_tokens": 35784494.0, "step": 29800 }, { "entropy": 1.8252049744129182, "epoch": 0.09240839177314221, "grad_norm": 4.551512241363525, "learning_rate": 8.322317072462106e-06, "loss": 0.4894, "mean_token_accuracy": 0.8328192785382271, "num_tokens": 35797555.0, "step": 29810 }, { "entropy": 1.86438340395689, "epoch": 0.09243939089819189, "grad_norm": 12.345170021057129, "learning_rate": 8.320921483252948e-06, "loss": 0.5755, "mean_token_accuracy": 0.8219951093196869, "num_tokens": 35810502.0, "step": 29820 }, { "entropy": 1.8994136035442353, "epoch": 0.09247039002324159, "grad_norm": 11.431099891662598, "learning_rate": 8.319526595897457e-06, "loss": 0.591, "mean_token_accuracy": 0.8122029930353165, "num_tokens": 35822303.0, "step": 29830 }, { "entropy": 1.8895697325468064, "epoch": 0.09250138914829129, "grad_norm": 9.15605354309082, "learning_rate": 8.318132409807547e-06, "loss": 0.5452, "mean_token_accuracy": 0.8253117471933364, "num_tokens": 35834691.0, "step": 29840 }, { "entropy": 1.8310220405459403, "epoch": 0.09253238827334098, "grad_norm": 9.539107322692871, "learning_rate": 8.31673892439583e-06, "loss": 0.5282, "mean_token_accuracy": 0.8205739751458168, "num_tokens": 35847122.0, "step": 29850 }, { "entropy": 1.8387802302837373, "epoch": 0.09256338739839068, "grad_norm": 9.641124725341797, "learning_rate": 8.315346139075596e-06, "loss": 0.5816, "mean_token_accuracy": 0.8287546694278717, "num_tokens": 35860015.0, "step": 29860 }, { "entropy": 1.8979364216327668, "epoch": 0.09259438652344038, "grad_norm": 9.997504234313965, "learning_rate": 8.31395405326083e-06, "loss": 0.5852, "mean_token_accuracy": 0.8195781350135803, "num_tokens": 35872240.0, "step": 29870 }, { "entropy": 1.9377481788396835, "epoch": 0.09262538564849007, "grad_norm": 9.89439868927002, "learning_rate": 8.3125626663662e-06, "loss": 0.6192, "mean_token_accuracy": 0.8212395742535591, "num_tokens": 35883875.0, "step": 29880 }, { "entropy": 1.9368197679519654, "epoch": 0.09265638477353977, "grad_norm": 9.990235328674316, "learning_rate": 8.311171977807062e-06, "loss": 0.5592, "mean_token_accuracy": 0.8287534207105637, "num_tokens": 35896237.0, "step": 29890 }, { "entropy": 1.9469063565135003, "epoch": 0.09268738389858946, "grad_norm": 8.493316650390625, "learning_rate": 8.309781986999454e-06, "loss": 0.5916, "mean_token_accuracy": 0.813089169561863, "num_tokens": 35907924.0, "step": 29900 }, { "entropy": 1.9415881425142287, "epoch": 0.09271838302363916, "grad_norm": 12.5552978515625, "learning_rate": 8.3083926933601e-06, "loss": 0.5415, "mean_token_accuracy": 0.8337080240249634, "num_tokens": 35919711.0, "step": 29910 }, { "entropy": 1.9274977430701257, "epoch": 0.09274938214868886, "grad_norm": 3.963390588760376, "learning_rate": 8.307004096306404e-06, "loss": 0.5286, "mean_token_accuracy": 0.8345837160944939, "num_tokens": 35931724.0, "step": 29920 }, { "entropy": 1.9158756092190743, "epoch": 0.09278038127373855, "grad_norm": 10.906941413879395, "learning_rate": 8.30561619525645e-06, "loss": 0.5781, "mean_token_accuracy": 0.8259297773241997, "num_tokens": 35943022.0, "step": 29930 }, { "entropy": 1.8155071794986726, "epoch": 0.09281138039878824, "grad_norm": 8.987025260925293, "learning_rate": 8.304228989629007e-06, "loss": 0.4983, "mean_token_accuracy": 0.8335763946175575, "num_tokens": 35955373.0, "step": 29940 }, { "entropy": 1.8580231979489326, "epoch": 0.09284237952383793, "grad_norm": 7.32357120513916, "learning_rate": 8.302842478843522e-06, "loss": 0.5705, "mean_token_accuracy": 0.8255671873688698, "num_tokens": 35968204.0, "step": 29950 }, { "entropy": 1.9050882533192635, "epoch": 0.09287337864888763, "grad_norm": 8.925579071044922, "learning_rate": 8.301456662320118e-06, "loss": 0.6012, "mean_token_accuracy": 0.810367950797081, "num_tokens": 35979573.0, "step": 29960 }, { "entropy": 1.836830762028694, "epoch": 0.09290437777393733, "grad_norm": 3.3085451126098633, "learning_rate": 8.300071539479595e-06, "loss": 0.5401, "mean_token_accuracy": 0.8347239702939987, "num_tokens": 35992865.0, "step": 29970 }, { "entropy": 1.9335076332092285, "epoch": 0.09293537689898702, "grad_norm": 4.406698226928711, "learning_rate": 8.298687109743434e-06, "loss": 0.5598, "mean_token_accuracy": 0.8291699022054673, "num_tokens": 36004840.0, "step": 29980 }, { "entropy": 1.9243319526314735, "epoch": 0.09296637602403672, "grad_norm": 9.231583595275879, "learning_rate": 8.297303372533783e-06, "loss": 0.5428, "mean_token_accuracy": 0.8422666415572166, "num_tokens": 36016239.0, "step": 29990 }, { "entropy": 1.9396521881222726, "epoch": 0.09299737514908642, "grad_norm": 10.043663024902344, "learning_rate": 8.295920327273474e-06, "loss": 0.6083, "mean_token_accuracy": 0.8137664362788201, "num_tokens": 36027621.0, "step": 30000 }, { "entropy": 1.8998918011784554, "epoch": 0.09302837427413611, "grad_norm": 3.658256769180298, "learning_rate": 8.294537973386005e-06, "loss": 0.5306, "mean_token_accuracy": 0.8297856241464615, "num_tokens": 36039304.0, "step": 30010 }, { "entropy": 1.905530793964863, "epoch": 0.09305937339918581, "grad_norm": 10.163437843322754, "learning_rate": 8.29315631029555e-06, "loss": 0.5719, "mean_token_accuracy": 0.824944506585598, "num_tokens": 36050882.0, "step": 30020 }, { "entropy": 1.7664498060941696, "epoch": 0.0930903725242355, "grad_norm": 11.35702896118164, "learning_rate": 8.291775337426954e-06, "loss": 0.4992, "mean_token_accuracy": 0.8312182664871216, "num_tokens": 36065249.0, "step": 30030 }, { "entropy": 1.8767417997121811, "epoch": 0.0931213716492852, "grad_norm": 8.986226081848145, "learning_rate": 8.290395054205727e-06, "loss": 0.5498, "mean_token_accuracy": 0.8315106689929962, "num_tokens": 36078427.0, "step": 30040 }, { "entropy": 1.9429257303476333, "epoch": 0.0931523707743349, "grad_norm": 5.139298915863037, "learning_rate": 8.289015460058055e-06, "loss": 0.6161, "mean_token_accuracy": 0.8064095541834831, "num_tokens": 36090554.0, "step": 30050 }, { "entropy": 1.8949994623661042, "epoch": 0.0931833698993846, "grad_norm": 3.8736157417297363, "learning_rate": 8.28763655441079e-06, "loss": 0.5681, "mean_token_accuracy": 0.8159836381673813, "num_tokens": 36103427.0, "step": 30060 }, { "entropy": 1.9144406855106353, "epoch": 0.09321436902443428, "grad_norm": 9.992815971374512, "learning_rate": 8.286258336691447e-06, "loss": 0.5257, "mean_token_accuracy": 0.8263021633028984, "num_tokens": 36116559.0, "step": 30070 }, { "entropy": 1.8954633638262748, "epoch": 0.09324536814948398, "grad_norm": 10.11132526397705, "learning_rate": 8.284880806328216e-06, "loss": 0.4951, "mean_token_accuracy": 0.8312999933958054, "num_tokens": 36129181.0, "step": 30080 }, { "entropy": 1.9233060747385025, "epoch": 0.09327636727453367, "grad_norm": 5.417168617248535, "learning_rate": 8.283503962749944e-06, "loss": 0.5479, "mean_token_accuracy": 0.8156366124749184, "num_tokens": 36141429.0, "step": 30090 }, { "entropy": 1.9522355869412422, "epoch": 0.09330736639958337, "grad_norm": 10.192994117736816, "learning_rate": 8.282127805386145e-06, "loss": 0.5551, "mean_token_accuracy": 0.8214729785919189, "num_tokens": 36153159.0, "step": 30100 }, { "entropy": 1.9816941738128662, "epoch": 0.09333836552463307, "grad_norm": 10.674017906188965, "learning_rate": 8.280752333666999e-06, "loss": 0.6031, "mean_token_accuracy": 0.8102478966116905, "num_tokens": 36164765.0, "step": 30110 }, { "entropy": 1.9644594475626946, "epoch": 0.09336936464968276, "grad_norm": 8.998485565185547, "learning_rate": 8.279377547023342e-06, "loss": 0.5509, "mean_token_accuracy": 0.8299065142869949, "num_tokens": 36176082.0, "step": 30120 }, { "entropy": 1.888843522965908, "epoch": 0.09340036377473246, "grad_norm": 4.065151691436768, "learning_rate": 8.278003444886679e-06, "loss": 0.5253, "mean_token_accuracy": 0.8304722428321838, "num_tokens": 36189223.0, "step": 30130 }, { "entropy": 1.8697800204157828, "epoch": 0.09343136289978216, "grad_norm": 9.428620338439941, "learning_rate": 8.276630026689168e-06, "loss": 0.5502, "mean_token_accuracy": 0.8309311479330063, "num_tokens": 36202150.0, "step": 30140 }, { "entropy": 1.933559250831604, "epoch": 0.09346236202483185, "grad_norm": 9.53989028930664, "learning_rate": 8.275257291863631e-06, "loss": 0.5439, "mean_token_accuracy": 0.8235199645161628, "num_tokens": 36213528.0, "step": 30150 }, { "entropy": 1.9191157951951028, "epoch": 0.09349336114988155, "grad_norm": 11.843420028686523, "learning_rate": 8.273885239843545e-06, "loss": 0.5874, "mean_token_accuracy": 0.8172587484121323, "num_tokens": 36225594.0, "step": 30160 }, { "entropy": 1.9522407323122024, "epoch": 0.09352436027493125, "grad_norm": 10.857767105102539, "learning_rate": 8.272513870063048e-06, "loss": 0.6238, "mean_token_accuracy": 0.8080313906073571, "num_tokens": 36236730.0, "step": 30170 }, { "entropy": 1.879507339000702, "epoch": 0.09355535939998094, "grad_norm": 4.612288951873779, "learning_rate": 8.271143181956931e-06, "loss": 0.561, "mean_token_accuracy": 0.8234292283654213, "num_tokens": 36249271.0, "step": 30180 }, { "entropy": 1.904217940568924, "epoch": 0.09358635852503062, "grad_norm": 9.96085262298584, "learning_rate": 8.269773174960643e-06, "loss": 0.6201, "mean_token_accuracy": 0.816309979557991, "num_tokens": 36261216.0, "step": 30190 }, { "entropy": 1.9194680109620095, "epoch": 0.09361735765008032, "grad_norm": 4.954514503479004, "learning_rate": 8.268403848510283e-06, "loss": 0.5507, "mean_token_accuracy": 0.8259898975491524, "num_tokens": 36272893.0, "step": 30200 }, { "entropy": 1.9340100079774856, "epoch": 0.09364835677513002, "grad_norm": 11.47993278503418, "learning_rate": 8.267035202042611e-06, "loss": 0.6081, "mean_token_accuracy": 0.813405393064022, "num_tokens": 36284689.0, "step": 30210 }, { "entropy": 1.904521045088768, "epoch": 0.09367935590017971, "grad_norm": 9.563363075256348, "learning_rate": 8.265667234995031e-06, "loss": 0.595, "mean_token_accuracy": 0.8256731614470482, "num_tokens": 36295838.0, "step": 30220 }, { "entropy": 1.8696922466158867, "epoch": 0.09371035502522941, "grad_norm": 11.36976432800293, "learning_rate": 8.264299946805606e-06, "loss": 0.5404, "mean_token_accuracy": 0.8290983706712722, "num_tokens": 36307929.0, "step": 30230 }, { "entropy": 1.9623483926057816, "epoch": 0.09374135415027911, "grad_norm": 9.148504257202148, "learning_rate": 8.26293333691304e-06, "loss": 0.6233, "mean_token_accuracy": 0.8086713761091232, "num_tokens": 36319244.0, "step": 30240 }, { "entropy": 1.9141168981790542, "epoch": 0.0937723532753288, "grad_norm": 8.628009796142578, "learning_rate": 8.261567404756697e-06, "loss": 0.5855, "mean_token_accuracy": 0.8221844986081124, "num_tokens": 36330692.0, "step": 30250 }, { "entropy": 1.898746033012867, "epoch": 0.0938033524003785, "grad_norm": 10.499122619628906, "learning_rate": 8.260202149776582e-06, "loss": 0.5483, "mean_token_accuracy": 0.8232525825500489, "num_tokens": 36342656.0, "step": 30260 }, { "entropy": 1.8459609940648078, "epoch": 0.0938343515254282, "grad_norm": 4.076352596282959, "learning_rate": 8.258837571413353e-06, "loss": 0.5287, "mean_token_accuracy": 0.8242526456713677, "num_tokens": 36355602.0, "step": 30270 }, { "entropy": 1.870631681382656, "epoch": 0.0938653506504779, "grad_norm": 9.26065731048584, "learning_rate": 8.25747366910831e-06, "loss": 0.5709, "mean_token_accuracy": 0.8275643572211265, "num_tokens": 36368347.0, "step": 30280 }, { "entropy": 1.9731937110424043, "epoch": 0.09389634977552759, "grad_norm": 10.137398719787598, "learning_rate": 8.256110442303401e-06, "loss": 0.6493, "mean_token_accuracy": 0.8186086341738701, "num_tokens": 36378870.0, "step": 30290 }, { "entropy": 1.9244423538446427, "epoch": 0.09392734890057729, "grad_norm": 8.93706226348877, "learning_rate": 8.254747890441217e-06, "loss": 0.5874, "mean_token_accuracy": 0.8153250128030777, "num_tokens": 36389925.0, "step": 30300 }, { "entropy": 1.956382469832897, "epoch": 0.09395834802562697, "grad_norm": 8.536420822143555, "learning_rate": 8.253386012964996e-06, "loss": 0.6129, "mean_token_accuracy": 0.8204043418169021, "num_tokens": 36401021.0, "step": 30310 }, { "entropy": 1.947486911714077, "epoch": 0.09398934715067667, "grad_norm": 9.407525062561035, "learning_rate": 8.252024809318618e-06, "loss": 0.6199, "mean_token_accuracy": 0.8143399626016616, "num_tokens": 36412021.0, "step": 30320 }, { "entropy": 1.9774963051080703, "epoch": 0.09402034627572636, "grad_norm": 9.66044807434082, "learning_rate": 8.250664278946598e-06, "loss": 0.6574, "mean_token_accuracy": 0.8052627012133599, "num_tokens": 36422783.0, "step": 30330 }, { "entropy": 1.9370567843317985, "epoch": 0.09405134540077606, "grad_norm": 10.304049491882324, "learning_rate": 8.249304421294103e-06, "loss": 0.617, "mean_token_accuracy": 0.8105893135070801, "num_tokens": 36434720.0, "step": 30340 }, { "entropy": 1.8703087165951728, "epoch": 0.09408234452582576, "grad_norm": 9.763945579528809, "learning_rate": 8.247945235806933e-06, "loss": 0.5499, "mean_token_accuracy": 0.8189043670892715, "num_tokens": 36447678.0, "step": 30350 }, { "entropy": 1.9350792229175569, "epoch": 0.09411334365087545, "grad_norm": 8.846423149108887, "learning_rate": 8.246586721931527e-06, "loss": 0.6098, "mean_token_accuracy": 0.8202366888523102, "num_tokens": 36459700.0, "step": 30360 }, { "entropy": 1.9018134266138076, "epoch": 0.09414434277592515, "grad_norm": 8.516942977905273, "learning_rate": 8.245228879114964e-06, "loss": 0.5191, "mean_token_accuracy": 0.8454120337963105, "num_tokens": 36471172.0, "step": 30370 }, { "entropy": 1.877271145582199, "epoch": 0.09417534190097485, "grad_norm": 9.023475646972656, "learning_rate": 8.24387170680496e-06, "loss": 0.5145, "mean_token_accuracy": 0.8345570683479309, "num_tokens": 36483107.0, "step": 30380 }, { "entropy": 1.8964007556438447, "epoch": 0.09420634102602454, "grad_norm": 8.33362102508545, "learning_rate": 8.242515204449868e-06, "loss": 0.5618, "mean_token_accuracy": 0.8253947287797928, "num_tokens": 36494970.0, "step": 30390 }, { "entropy": 2.000899037718773, "epoch": 0.09423734015107424, "grad_norm": 8.705028533935547, "learning_rate": 8.241159371498669e-06, "loss": 0.669, "mean_token_accuracy": 0.8023026213049889, "num_tokens": 36505728.0, "step": 30400 }, { "entropy": 1.9049580052495003, "epoch": 0.09426833927612394, "grad_norm": 9.36442756652832, "learning_rate": 8.23980420740099e-06, "loss": 0.5302, "mean_token_accuracy": 0.8285997763276101, "num_tokens": 36517575.0, "step": 30410 }, { "entropy": 1.9479586511850357, "epoch": 0.09429933840117363, "grad_norm": 4.493463039398193, "learning_rate": 8.238449711607085e-06, "loss": 0.6312, "mean_token_accuracy": 0.8059987321496009, "num_tokens": 36529609.0, "step": 30420 }, { "entropy": 1.8796984627842903, "epoch": 0.09433033752622333, "grad_norm": 5.25566291809082, "learning_rate": 8.237095883567837e-06, "loss": 0.5061, "mean_token_accuracy": 0.8322527810931206, "num_tokens": 36541818.0, "step": 30430 }, { "entropy": 1.8748697608709335, "epoch": 0.09436133665127301, "grad_norm": 7.110718250274658, "learning_rate": 8.235742722734768e-06, "loss": 0.6065, "mean_token_accuracy": 0.820853716135025, "num_tokens": 36554237.0, "step": 30440 }, { "entropy": 1.991670235991478, "epoch": 0.09439233577632271, "grad_norm": 9.52397632598877, "learning_rate": 8.234390228560024e-06, "loss": 0.6539, "mean_token_accuracy": 0.8077353879809379, "num_tokens": 36565127.0, "step": 30450 }, { "entropy": 1.8927300944924355, "epoch": 0.0944233349013724, "grad_norm": 10.027509689331055, "learning_rate": 8.233038400496384e-06, "loss": 0.5793, "mean_token_accuracy": 0.8261799916625023, "num_tokens": 36576878.0, "step": 30460 }, { "entropy": 1.9108740240335464, "epoch": 0.0944543340264221, "grad_norm": 4.891687870025635, "learning_rate": 8.231687237997258e-06, "loss": 0.5431, "mean_token_accuracy": 0.8279662787914276, "num_tokens": 36588895.0, "step": 30470 }, { "entropy": 1.8933662503957749, "epoch": 0.0944853331514718, "grad_norm": 9.058094024658203, "learning_rate": 8.230336740516675e-06, "loss": 0.5779, "mean_token_accuracy": 0.8237976714968681, "num_tokens": 36600847.0, "step": 30480 }, { "entropy": 1.9511252835392952, "epoch": 0.0945163322765215, "grad_norm": 3.954777479171753, "learning_rate": 8.228986907509301e-06, "loss": 0.598, "mean_token_accuracy": 0.8181298822164536, "num_tokens": 36612439.0, "step": 30490 }, { "entropy": 1.9695148766040802, "epoch": 0.09454733140157119, "grad_norm": 9.158353805541992, "learning_rate": 8.227637738430418e-06, "loss": 0.6427, "mean_token_accuracy": 0.812971468269825, "num_tokens": 36623202.0, "step": 30500 }, { "entropy": 1.9591504886746407, "epoch": 0.09457833052662089, "grad_norm": 9.691153526306152, "learning_rate": 8.226289232735947e-06, "loss": 0.5988, "mean_token_accuracy": 0.8101043596863746, "num_tokens": 36635035.0, "step": 30510 }, { "entropy": 1.8547694399952888, "epoch": 0.09460932965167058, "grad_norm": 7.980339527130127, "learning_rate": 8.224941389882417e-06, "loss": 0.503, "mean_token_accuracy": 0.8369179427623749, "num_tokens": 36647914.0, "step": 30520 }, { "entropy": 1.7833900511264802, "epoch": 0.09464032877672028, "grad_norm": 9.225784301757812, "learning_rate": 8.223594209326989e-06, "loss": 0.5084, "mean_token_accuracy": 0.8371155127882958, "num_tokens": 36660801.0, "step": 30530 }, { "entropy": 1.966453444957733, "epoch": 0.09467132790176998, "grad_norm": 9.450788497924805, "learning_rate": 8.222247690527445e-06, "loss": 0.6112, "mean_token_accuracy": 0.8205273166298866, "num_tokens": 36671949.0, "step": 30540 }, { "entropy": 1.9008652031421662, "epoch": 0.09470232702681967, "grad_norm": 5.126152992248535, "learning_rate": 8.220901832942189e-06, "loss": 0.5533, "mean_token_accuracy": 0.8248556137084961, "num_tokens": 36683882.0, "step": 30550 }, { "entropy": 1.8853500545024873, "epoch": 0.09473332615186936, "grad_norm": 9.585368156433105, "learning_rate": 8.219556636030243e-06, "loss": 0.5535, "mean_token_accuracy": 0.8217202216386795, "num_tokens": 36695769.0, "step": 30560 }, { "entropy": 1.9309042051434517, "epoch": 0.09476432527691905, "grad_norm": 14.714326858520508, "learning_rate": 8.21821209925125e-06, "loss": 0.603, "mean_token_accuracy": 0.8186698570847512, "num_tokens": 36707233.0, "step": 30570 }, { "entropy": 1.7473974063992501, "epoch": 0.09479532440196875, "grad_norm": 10.051163673400879, "learning_rate": 8.21686822206547e-06, "loss": 0.4686, "mean_token_accuracy": 0.8436778649687767, "num_tokens": 36721694.0, "step": 30580 }, { "entropy": 1.8862526759505271, "epoch": 0.09482632352701845, "grad_norm": 11.326528549194336, "learning_rate": 8.215525003933785e-06, "loss": 0.5516, "mean_token_accuracy": 0.8218036189675331, "num_tokens": 36734217.0, "step": 30590 }, { "entropy": 1.9055613458156586, "epoch": 0.09485732265206814, "grad_norm": 9.264266014099121, "learning_rate": 8.214182444317686e-06, "loss": 0.5895, "mean_token_accuracy": 0.821314187347889, "num_tokens": 36746342.0, "step": 30600 }, { "entropy": 1.8917794667184353, "epoch": 0.09488832177711784, "grad_norm": 3.4011175632476807, "learning_rate": 8.21284054267929e-06, "loss": 0.5038, "mean_token_accuracy": 0.8364427119493485, "num_tokens": 36758768.0, "step": 30610 }, { "entropy": 1.9201584607362747, "epoch": 0.09491932090216754, "grad_norm": 9.317156791687012, "learning_rate": 8.211499298481317e-06, "loss": 0.5849, "mean_token_accuracy": 0.8274592280387878, "num_tokens": 36770047.0, "step": 30620 }, { "entropy": 1.872726395726204, "epoch": 0.09495032002721723, "grad_norm": 4.236084938049316, "learning_rate": 8.210158711187111e-06, "loss": 0.4964, "mean_token_accuracy": 0.8388981744647026, "num_tokens": 36782594.0, "step": 30630 }, { "entropy": 1.9075644299387933, "epoch": 0.09498131915226693, "grad_norm": 8.899432182312012, "learning_rate": 8.208818780260624e-06, "loss": 0.5833, "mean_token_accuracy": 0.8260029882192612, "num_tokens": 36793757.0, "step": 30640 }, { "entropy": 1.878479714691639, "epoch": 0.09501231827731663, "grad_norm": 10.720561027526855, "learning_rate": 8.207479505166421e-06, "loss": 0.5665, "mean_token_accuracy": 0.822782176733017, "num_tokens": 36806341.0, "step": 30650 }, { "entropy": 1.7969744451344014, "epoch": 0.09504331740236632, "grad_norm": 1.9851399660110474, "learning_rate": 8.206140885369683e-06, "loss": 0.4671, "mean_token_accuracy": 0.8433524951338768, "num_tokens": 36819861.0, "step": 30660 }, { "entropy": 1.973240813612938, "epoch": 0.09507431652741602, "grad_norm": 10.891983985900879, "learning_rate": 8.20480292033619e-06, "loss": 0.6501, "mean_token_accuracy": 0.8093028724193573, "num_tokens": 36830803.0, "step": 30670 }, { "entropy": 1.9500004380941391, "epoch": 0.0951053156524657, "grad_norm": 8.56579303741455, "learning_rate": 8.203465609532345e-06, "loss": 0.641, "mean_token_accuracy": 0.817388865351677, "num_tokens": 36842015.0, "step": 30680 }, { "entropy": 1.945586933195591, "epoch": 0.0951363147775154, "grad_norm": 10.45765495300293, "learning_rate": 8.20212895242515e-06, "loss": 0.6136, "mean_token_accuracy": 0.804017736017704, "num_tokens": 36853267.0, "step": 30690 }, { "entropy": 1.921870057284832, "epoch": 0.0951673139025651, "grad_norm": 9.248764038085938, "learning_rate": 8.20079294848222e-06, "loss": 0.5519, "mean_token_accuracy": 0.8254543885588645, "num_tokens": 36864795.0, "step": 30700 }, { "entropy": 1.945303277671337, "epoch": 0.09519831302761479, "grad_norm": 8.114592552185059, "learning_rate": 8.199457597171774e-06, "loss": 0.5779, "mean_token_accuracy": 0.825435708463192, "num_tokens": 36875864.0, "step": 30710 }, { "entropy": 1.9745984852313996, "epoch": 0.09522931215266449, "grad_norm": 8.283961296081543, "learning_rate": 8.198122897962637e-06, "loss": 0.6276, "mean_token_accuracy": 0.813007952272892, "num_tokens": 36887565.0, "step": 30720 }, { "entropy": 1.971392062306404, "epoch": 0.09526031127771419, "grad_norm": 9.020630836486816, "learning_rate": 8.19678885032424e-06, "loss": 0.6255, "mean_token_accuracy": 0.8155330777168274, "num_tokens": 36899194.0, "step": 30730 }, { "entropy": 1.9447349101305007, "epoch": 0.09529131040276388, "grad_norm": 9.019769668579102, "learning_rate": 8.195455453726619e-06, "loss": 0.605, "mean_token_accuracy": 0.8189174398779869, "num_tokens": 36910579.0, "step": 30740 }, { "entropy": 1.9661620736122132, "epoch": 0.09532230952781358, "grad_norm": 5.993375778198242, "learning_rate": 8.194122707640413e-06, "loss": 0.6069, "mean_token_accuracy": 0.8191442266106606, "num_tokens": 36921589.0, "step": 30750 }, { "entropy": 1.8841305732727052, "epoch": 0.09535330865286328, "grad_norm": 3.033937692642212, "learning_rate": 8.19279061153686e-06, "loss": 0.5971, "mean_token_accuracy": 0.8168802231550216, "num_tokens": 36934205.0, "step": 30760 }, { "entropy": 1.8415063828229905, "epoch": 0.09538430777791297, "grad_norm": 4.099487781524658, "learning_rate": 8.191459164887803e-06, "loss": 0.5092, "mean_token_accuracy": 0.830090343952179, "num_tokens": 36946800.0, "step": 30770 }, { "entropy": 1.9041469410061835, "epoch": 0.09541530690296267, "grad_norm": 9.430980682373047, "learning_rate": 8.190128367165687e-06, "loss": 0.6402, "mean_token_accuracy": 0.8095077216625214, "num_tokens": 36958615.0, "step": 30780 }, { "entropy": 1.8373140662908554, "epoch": 0.09544630602801236, "grad_norm": 9.46648120880127, "learning_rate": 8.188798217843552e-06, "loss": 0.5476, "mean_token_accuracy": 0.8348923206329346, "num_tokens": 36971891.0, "step": 30790 }, { "entropy": 1.8742023393511773, "epoch": 0.09547730515306206, "grad_norm": 3.767117738723755, "learning_rate": 8.187468716395042e-06, "loss": 0.522, "mean_token_accuracy": 0.8293885499238968, "num_tokens": 36984478.0, "step": 30800 }, { "entropy": 1.8809196785092355, "epoch": 0.09550830427811174, "grad_norm": 4.8539276123046875, "learning_rate": 8.186139862294395e-06, "loss": 0.576, "mean_token_accuracy": 0.818726259469986, "num_tokens": 36996951.0, "step": 30810 }, { "entropy": 1.835391464829445, "epoch": 0.09553930340316144, "grad_norm": 9.504169464111328, "learning_rate": 8.184811655016448e-06, "loss": 0.52, "mean_token_accuracy": 0.8270650163292885, "num_tokens": 37010328.0, "step": 30820 }, { "entropy": 1.9224170967936516, "epoch": 0.09557030252821114, "grad_norm": 8.346292495727539, "learning_rate": 8.183484094036632e-06, "loss": 0.5438, "mean_token_accuracy": 0.8288520753383637, "num_tokens": 37021973.0, "step": 30830 }, { "entropy": 1.9015969276428222, "epoch": 0.09560130165326083, "grad_norm": 11.159916877746582, "learning_rate": 8.182157178830978e-06, "loss": 0.5541, "mean_token_accuracy": 0.8173329204320907, "num_tokens": 37034374.0, "step": 30840 }, { "entropy": 1.914992319047451, "epoch": 0.09563230077831053, "grad_norm": 11.362371444702148, "learning_rate": 8.180830908876107e-06, "loss": 0.598, "mean_token_accuracy": 0.8112172558903694, "num_tokens": 37046586.0, "step": 30850 }, { "entropy": 1.869801890850067, "epoch": 0.09566329990336023, "grad_norm": 9.398642539978027, "learning_rate": 8.179505283649239e-06, "loss": 0.4976, "mean_token_accuracy": 0.8326647847890853, "num_tokens": 37059363.0, "step": 30860 }, { "entropy": 1.8593883782625198, "epoch": 0.09569429902840992, "grad_norm": 8.429418563842773, "learning_rate": 8.178180302628178e-06, "loss": 0.5187, "mean_token_accuracy": 0.833593225479126, "num_tokens": 37071727.0, "step": 30870 }, { "entropy": 1.8928765952587128, "epoch": 0.09572529815345962, "grad_norm": 8.581494331359863, "learning_rate": 8.176855965291328e-06, "loss": 0.499, "mean_token_accuracy": 0.836057162284851, "num_tokens": 37083853.0, "step": 30880 }, { "entropy": 1.854266819357872, "epoch": 0.09575629727850932, "grad_norm": 4.337390422821045, "learning_rate": 8.175532271117681e-06, "loss": 0.5466, "mean_token_accuracy": 0.8306478515267373, "num_tokens": 37096495.0, "step": 30890 }, { "entropy": 1.7374467805027962, "epoch": 0.09578729640355901, "grad_norm": 8.350436210632324, "learning_rate": 8.17420921958682e-06, "loss": 0.4234, "mean_token_accuracy": 0.843256051838398, "num_tokens": 37110413.0, "step": 30900 }, { "entropy": 1.956564149260521, "epoch": 0.09581829552860871, "grad_norm": 9.846826553344727, "learning_rate": 8.172886810178917e-06, "loss": 0.6155, "mean_token_accuracy": 0.8104242667555809, "num_tokens": 37121734.0, "step": 30910 }, { "entropy": 1.8714107498526573, "epoch": 0.0958492946536584, "grad_norm": 9.786186218261719, "learning_rate": 8.171565042374731e-06, "loss": 0.5514, "mean_token_accuracy": 0.8231972604990005, "num_tokens": 37133847.0, "step": 30920 }, { "entropy": 1.92191222012043, "epoch": 0.09588029377870809, "grad_norm": 9.510786056518555, "learning_rate": 8.17024391565561e-06, "loss": 0.5702, "mean_token_accuracy": 0.825206808745861, "num_tokens": 37145862.0, "step": 30930 }, { "entropy": 1.9616800487041473, "epoch": 0.09591129290375779, "grad_norm": 10.04150676727295, "learning_rate": 8.168923429503489e-06, "loss": 0.5974, "mean_token_accuracy": 0.8210392817854881, "num_tokens": 37156951.0, "step": 30940 }, { "entropy": 1.920756246894598, "epoch": 0.09594229202880748, "grad_norm": 3.579078435897827, "learning_rate": 8.167603583400891e-06, "loss": 0.5547, "mean_token_accuracy": 0.8260960400104522, "num_tokens": 37169346.0, "step": 30950 }, { "entropy": 1.9267018765211106, "epoch": 0.09597329115385718, "grad_norm": 9.2460355758667, "learning_rate": 8.166284376830917e-06, "loss": 0.5963, "mean_token_accuracy": 0.8267333999276161, "num_tokens": 37181057.0, "step": 30960 }, { "entropy": 1.9822622686624527, "epoch": 0.09600429027890688, "grad_norm": 12.592954635620117, "learning_rate": 8.164965809277262e-06, "loss": 0.6087, "mean_token_accuracy": 0.8141214087605476, "num_tokens": 37192122.0, "step": 30970 }, { "entropy": 1.9343804091215133, "epoch": 0.09603528940395657, "grad_norm": 3.9561359882354736, "learning_rate": 8.163647880224195e-06, "loss": 0.5621, "mean_token_accuracy": 0.8185720920562745, "num_tokens": 37203777.0, "step": 30980 }, { "entropy": 1.9694799184799194, "epoch": 0.09606628852900627, "grad_norm": 9.182045936584473, "learning_rate": 8.162330589156574e-06, "loss": 0.5635, "mean_token_accuracy": 0.8282784789800643, "num_tokens": 37215743.0, "step": 30990 }, { "entropy": 1.8368911847472191, "epoch": 0.09609728765405597, "grad_norm": 4.183026313781738, "learning_rate": 8.161013935559836e-06, "loss": 0.4993, "mean_token_accuracy": 0.8347128361463547, "num_tokens": 37229304.0, "step": 31000 }, { "entropy": 1.8615291342139244, "epoch": 0.09612828677910566, "grad_norm": 10.485788345336914, "learning_rate": 8.15969791892e-06, "loss": 0.4681, "mean_token_accuracy": 0.8415818125009537, "num_tokens": 37241842.0, "step": 31010 }, { "entropy": 1.8837064534425736, "epoch": 0.09615928590415536, "grad_norm": 9.813204765319824, "learning_rate": 8.158382538723663e-06, "loss": 0.5599, "mean_token_accuracy": 0.8298415571451188, "num_tokens": 37253894.0, "step": 31020 }, { "entropy": 1.9101071432232857, "epoch": 0.09619028502920506, "grad_norm": 9.026832580566406, "learning_rate": 8.157067794458002e-06, "loss": 0.5352, "mean_token_accuracy": 0.8250282123684883, "num_tokens": 37266281.0, "step": 31030 }, { "entropy": 1.9916753947734833, "epoch": 0.09622128415425475, "grad_norm": 8.812485694885254, "learning_rate": 8.155753685610777e-06, "loss": 0.6106, "mean_token_accuracy": 0.8189787149429322, "num_tokens": 37277277.0, "step": 31040 }, { "entropy": 1.8934721648693085, "epoch": 0.09625228327930443, "grad_norm": 4.702004909515381, "learning_rate": 8.154440211670315e-06, "loss": 0.5081, "mean_token_accuracy": 0.8383319437503814, "num_tokens": 37288790.0, "step": 31050 }, { "entropy": 1.8943874284625053, "epoch": 0.09628328240435413, "grad_norm": 5.008129119873047, "learning_rate": 8.153127372125532e-06, "loss": 0.5132, "mean_token_accuracy": 0.8305976986885071, "num_tokens": 37301541.0, "step": 31060 }, { "entropy": 1.9754459619522096, "epoch": 0.09631428152940383, "grad_norm": 8.563139915466309, "learning_rate": 8.151815166465911e-06, "loss": 0.6077, "mean_token_accuracy": 0.82650166451931, "num_tokens": 37313145.0, "step": 31070 }, { "entropy": 1.9236046463251113, "epoch": 0.09634528065445352, "grad_norm": 9.933487892150879, "learning_rate": 8.150503594181513e-06, "loss": 0.5484, "mean_token_accuracy": 0.8255937352776528, "num_tokens": 37325171.0, "step": 31080 }, { "entropy": 1.8460835695266724, "epoch": 0.09637627977950322, "grad_norm": 4.579317569732666, "learning_rate": 8.149192654762971e-06, "loss": 0.5314, "mean_token_accuracy": 0.828292778134346, "num_tokens": 37338677.0, "step": 31090 }, { "entropy": 1.9498460546135903, "epoch": 0.09640727890455292, "grad_norm": 10.442451477050781, "learning_rate": 8.147882347701493e-06, "loss": 0.6204, "mean_token_accuracy": 0.818555298447609, "num_tokens": 37350287.0, "step": 31100 }, { "entropy": 1.890355022251606, "epoch": 0.09643827802960261, "grad_norm": 9.938260078430176, "learning_rate": 8.146572672488863e-06, "loss": 0.535, "mean_token_accuracy": 0.828145657479763, "num_tokens": 37361789.0, "step": 31110 }, { "entropy": 1.9762079477310182, "epoch": 0.09646927715465231, "grad_norm": 10.554993629455566, "learning_rate": 8.145263628617433e-06, "loss": 0.6357, "mean_token_accuracy": 0.8193916127085685, "num_tokens": 37372191.0, "step": 31120 }, { "entropy": 1.93240787088871, "epoch": 0.09650027627970201, "grad_norm": 8.852706909179688, "learning_rate": 8.143955215580123e-06, "loss": 0.6071, "mean_token_accuracy": 0.8182915225625038, "num_tokens": 37384812.0, "step": 31130 }, { "entropy": 2.0304384171962737, "epoch": 0.0965312754047517, "grad_norm": 10.083441734313965, "learning_rate": 8.142647432870427e-06, "loss": 0.6445, "mean_token_accuracy": 0.8097123354673386, "num_tokens": 37395995.0, "step": 31140 }, { "entropy": 1.9195516496896743, "epoch": 0.0965622745298014, "grad_norm": 8.50387954711914, "learning_rate": 8.141340279982408e-06, "loss": 0.5442, "mean_token_accuracy": 0.8346219673752785, "num_tokens": 37407286.0, "step": 31150 }, { "entropy": 1.9149638175964356, "epoch": 0.0965932736548511, "grad_norm": 8.920450210571289, "learning_rate": 8.140033756410697e-06, "loss": 0.5605, "mean_token_accuracy": 0.8223188519477844, "num_tokens": 37419478.0, "step": 31160 }, { "entropy": 1.8281333968043327, "epoch": 0.0966242727799008, "grad_norm": 9.24412727355957, "learning_rate": 8.138727861650492e-06, "loss": 0.4968, "mean_token_accuracy": 0.8335327923297882, "num_tokens": 37433099.0, "step": 31170 }, { "entropy": 1.8802730202674867, "epoch": 0.09665527190495048, "grad_norm": 6.880424976348877, "learning_rate": 8.137422595197554e-06, "loss": 0.588, "mean_token_accuracy": 0.8161317735910416, "num_tokens": 37445489.0, "step": 31180 }, { "entropy": 1.8878979504108429, "epoch": 0.09668627103000017, "grad_norm": 5.010068893432617, "learning_rate": 8.136117956548222e-06, "loss": 0.5278, "mean_token_accuracy": 0.8283527597784996, "num_tokens": 37458005.0, "step": 31190 }, { "entropy": 1.88038859218359, "epoch": 0.09671727015504987, "grad_norm": 3.999337673187256, "learning_rate": 8.134813945199384e-06, "loss": 0.5158, "mean_token_accuracy": 0.8222674325108528, "num_tokens": 37470320.0, "step": 31200 }, { "entropy": 1.926040168106556, "epoch": 0.09674826928009957, "grad_norm": 9.107236862182617, "learning_rate": 8.133510560648504e-06, "loss": 0.592, "mean_token_accuracy": 0.8095137163996696, "num_tokens": 37482370.0, "step": 31210 }, { "entropy": 1.8264732763171196, "epoch": 0.09677926840514926, "grad_norm": 8.244362831115723, "learning_rate": 8.132207802393603e-06, "loss": 0.5172, "mean_token_accuracy": 0.8364429995417595, "num_tokens": 37495518.0, "step": 31220 }, { "entropy": 1.9302683904767037, "epoch": 0.09681026753019896, "grad_norm": 7.822775363922119, "learning_rate": 8.13090566993327e-06, "loss": 0.565, "mean_token_accuracy": 0.8201239988207817, "num_tokens": 37507302.0, "step": 31230 }, { "entropy": 1.8702651888132096, "epoch": 0.09684126665524866, "grad_norm": 4.487738132476807, "learning_rate": 8.12960416276665e-06, "loss": 0.5, "mean_token_accuracy": 0.8344218656420708, "num_tokens": 37520315.0, "step": 31240 }, { "entropy": 1.9409600526094437, "epoch": 0.09687226578029835, "grad_norm": 9.113760948181152, "learning_rate": 8.128303280393453e-06, "loss": 0.5982, "mean_token_accuracy": 0.815177421271801, "num_tokens": 37532697.0, "step": 31250 }, { "entropy": 1.8489416658878326, "epoch": 0.09690326490534805, "grad_norm": 10.157899856567383, "learning_rate": 8.12700302231395e-06, "loss": 0.4969, "mean_token_accuracy": 0.8288843706250191, "num_tokens": 37545422.0, "step": 31260 }, { "entropy": 1.9021289080381394, "epoch": 0.09693426403039775, "grad_norm": 9.482295036315918, "learning_rate": 8.125703388028969e-06, "loss": 0.6059, "mean_token_accuracy": 0.8187751770019531, "num_tokens": 37557355.0, "step": 31270 }, { "entropy": 1.8654734954237937, "epoch": 0.09696526315544744, "grad_norm": 10.133862495422363, "learning_rate": 8.124404377039897e-06, "loss": 0.5901, "mean_token_accuracy": 0.8247566968202591, "num_tokens": 37570124.0, "step": 31280 }, { "entropy": 1.9067189067602157, "epoch": 0.09699626228049714, "grad_norm": 8.202488899230957, "learning_rate": 8.123105988848677e-06, "loss": 0.5681, "mean_token_accuracy": 0.8185949176549911, "num_tokens": 37582141.0, "step": 31290 }, { "entropy": 1.9307459384202956, "epoch": 0.09702726140554682, "grad_norm": 11.123476028442383, "learning_rate": 8.121808222957812e-06, "loss": 0.6175, "mean_token_accuracy": 0.8119331106543541, "num_tokens": 37592965.0, "step": 31300 }, { "entropy": 1.8705940045416356, "epoch": 0.09705826053059652, "grad_norm": 2.6256701946258545, "learning_rate": 8.120511078870361e-06, "loss": 0.5387, "mean_token_accuracy": 0.8242152616381645, "num_tokens": 37605456.0, "step": 31310 }, { "entropy": 1.9379503265023232, "epoch": 0.09708925965564622, "grad_norm": 3.4866576194763184, "learning_rate": 8.119214556089939e-06, "loss": 0.5883, "mean_token_accuracy": 0.819600661098957, "num_tokens": 37616988.0, "step": 31320 }, { "entropy": 1.8673909679055214, "epoch": 0.09712025878069591, "grad_norm": 8.800174713134766, "learning_rate": 8.11791865412071e-06, "loss": 0.489, "mean_token_accuracy": 0.8349416226148605, "num_tokens": 37630013.0, "step": 31330 }, { "entropy": 1.9335881680250169, "epoch": 0.09715125790574561, "grad_norm": 9.899873733520508, "learning_rate": 8.1166233724674e-06, "loss": 0.6003, "mean_token_accuracy": 0.8110765367746353, "num_tokens": 37641637.0, "step": 31340 }, { "entropy": 1.8817772254347802, "epoch": 0.0971822570307953, "grad_norm": 5.787439346313477, "learning_rate": 8.115328710635283e-06, "loss": 0.5509, "mean_token_accuracy": 0.8280811190605164, "num_tokens": 37653917.0, "step": 31350 }, { "entropy": 1.8765314996242524, "epoch": 0.097213256155845, "grad_norm": 8.655739784240723, "learning_rate": 8.114034668130184e-06, "loss": 0.5687, "mean_token_accuracy": 0.8303320676088333, "num_tokens": 37665711.0, "step": 31360 }, { "entropy": 1.9583671048283577, "epoch": 0.0972442552808947, "grad_norm": 8.754752159118652, "learning_rate": 8.112741244458482e-06, "loss": 0.614, "mean_token_accuracy": 0.8211699604988099, "num_tokens": 37677018.0, "step": 31370 }, { "entropy": 1.8967312201857567, "epoch": 0.0972752544059444, "grad_norm": 4.7562575340271, "learning_rate": 8.11144843912711e-06, "loss": 0.569, "mean_token_accuracy": 0.8239043042063713, "num_tokens": 37689120.0, "step": 31380 }, { "entropy": 1.952524009346962, "epoch": 0.09730625353099409, "grad_norm": 9.356344223022461, "learning_rate": 8.110156251643543e-06, "loss": 0.6207, "mean_token_accuracy": 0.8118321433663368, "num_tokens": 37700583.0, "step": 31390 }, { "entropy": 1.912900096178055, "epoch": 0.09733725265604379, "grad_norm": 9.547168731689453, "learning_rate": 8.10886468151581e-06, "loss": 0.5595, "mean_token_accuracy": 0.8243383869528771, "num_tokens": 37712109.0, "step": 31400 }, { "entropy": 1.9011776894330978, "epoch": 0.09736825178109348, "grad_norm": 9.910781860351562, "learning_rate": 8.107573728252488e-06, "loss": 0.5361, "mean_token_accuracy": 0.8336651280522347, "num_tokens": 37724208.0, "step": 31410 }, { "entropy": 1.8022936284542084, "epoch": 0.09739925090614318, "grad_norm": 10.076736450195312, "learning_rate": 8.106283391362702e-06, "loss": 0.4303, "mean_token_accuracy": 0.8385085701942444, "num_tokens": 37738243.0, "step": 31420 }, { "entropy": 1.8668938994407653, "epoch": 0.09743025003119286, "grad_norm": 3.9053683280944824, "learning_rate": 8.10499367035612e-06, "loss": 0.5387, "mean_token_accuracy": 0.8268329188227653, "num_tokens": 37750652.0, "step": 31430 }, { "entropy": 1.989102879166603, "epoch": 0.09746124915624256, "grad_norm": 8.179647445678711, "learning_rate": 8.10370456474296e-06, "loss": 0.6414, "mean_token_accuracy": 0.8093044266104699, "num_tokens": 37761475.0, "step": 31440 }, { "entropy": 1.9298017874360085, "epoch": 0.09749224828129226, "grad_norm": 9.899726867675781, "learning_rate": 8.102416074033986e-06, "loss": 0.5832, "mean_token_accuracy": 0.82451683729887, "num_tokens": 37773158.0, "step": 31450 }, { "entropy": 1.926207932829857, "epoch": 0.09752324740634195, "grad_norm": 10.14471435546875, "learning_rate": 8.101128197740498e-06, "loss": 0.6074, "mean_token_accuracy": 0.8211355611681939, "num_tokens": 37784675.0, "step": 31460 }, { "entropy": 1.9524447560310363, "epoch": 0.09755424653139165, "grad_norm": 10.060158729553223, "learning_rate": 8.09984093537435e-06, "loss": 0.547, "mean_token_accuracy": 0.8268745079636574, "num_tokens": 37795692.0, "step": 31470 }, { "entropy": 1.900469544529915, "epoch": 0.09758524565644135, "grad_norm": 8.439093589782715, "learning_rate": 8.098554286447932e-06, "loss": 0.5246, "mean_token_accuracy": 0.8321187406778335, "num_tokens": 37808059.0, "step": 31480 }, { "entropy": 1.931046536564827, "epoch": 0.09761624478149104, "grad_norm": 9.36986255645752, "learning_rate": 8.09726825047418e-06, "loss": 0.5602, "mean_token_accuracy": 0.8171745136380195, "num_tokens": 37819692.0, "step": 31490 }, { "entropy": 1.9444507226347922, "epoch": 0.09764724390654074, "grad_norm": 10.288396835327148, "learning_rate": 8.095982826966572e-06, "loss": 0.5723, "mean_token_accuracy": 0.835323378443718, "num_tokens": 37831392.0, "step": 31500 }, { "entropy": 1.892457826435566, "epoch": 0.09767824303159044, "grad_norm": 9.328455924987793, "learning_rate": 8.094698015439117e-06, "loss": 0.5247, "mean_token_accuracy": 0.8328243359923363, "num_tokens": 37842993.0, "step": 31510 }, { "entropy": 1.9004325211048125, "epoch": 0.09770924215664013, "grad_norm": 10.092707633972168, "learning_rate": 8.093413815406375e-06, "loss": 0.5346, "mean_token_accuracy": 0.8365309268236161, "num_tokens": 37854723.0, "step": 31520 }, { "entropy": 1.8958807915449143, "epoch": 0.09774024128168983, "grad_norm": 10.082904815673828, "learning_rate": 8.092130226383442e-06, "loss": 0.5567, "mean_token_accuracy": 0.8289808630943298, "num_tokens": 37866020.0, "step": 31530 }, { "entropy": 1.8704974979162217, "epoch": 0.09777124040673953, "grad_norm": 9.165472984313965, "learning_rate": 8.090847247885948e-06, "loss": 0.5996, "mean_token_accuracy": 0.8228276550769806, "num_tokens": 37877097.0, "step": 31540 }, { "entropy": 1.9054939955472947, "epoch": 0.09780223953178921, "grad_norm": 7.8041205406188965, "learning_rate": 8.089564879430065e-06, "loss": 0.5535, "mean_token_accuracy": 0.8295403525233269, "num_tokens": 37887835.0, "step": 31550 }, { "entropy": 1.886386200785637, "epoch": 0.0978332386568389, "grad_norm": 8.552273750305176, "learning_rate": 8.088283120532499e-06, "loss": 0.5835, "mean_token_accuracy": 0.830622710287571, "num_tokens": 37898855.0, "step": 31560 }, { "entropy": 1.867349809408188, "epoch": 0.0978642377818886, "grad_norm": 4.293386936187744, "learning_rate": 8.087001970710495e-06, "loss": 0.6032, "mean_token_accuracy": 0.8121476486325264, "num_tokens": 37910879.0, "step": 31570 }, { "entropy": 1.9287435024976731, "epoch": 0.0978952369069383, "grad_norm": 10.854731559753418, "learning_rate": 8.085721429481825e-06, "loss": 0.6146, "mean_token_accuracy": 0.8168462857604026, "num_tokens": 37921745.0, "step": 31580 }, { "entropy": 1.9377621293067933, "epoch": 0.097926236031988, "grad_norm": 11.060909271240234, "learning_rate": 8.084441496364808e-06, "loss": 0.6164, "mean_token_accuracy": 0.8070833861827851, "num_tokens": 37933377.0, "step": 31590 }, { "entropy": 1.854214581847191, "epoch": 0.09795723515703769, "grad_norm": 8.883403778076172, "learning_rate": 8.083162170878286e-06, "loss": 0.5058, "mean_token_accuracy": 0.8322727754712105, "num_tokens": 37945413.0, "step": 31600 }, { "entropy": 1.8605719447135924, "epoch": 0.09798823428208739, "grad_norm": 8.319462776184082, "learning_rate": 8.081883452541636e-06, "loss": 0.5203, "mean_token_accuracy": 0.8308003038167954, "num_tokens": 37956712.0, "step": 31610 }, { "entropy": 1.8450894683599472, "epoch": 0.09801923340713709, "grad_norm": 4.5528388023376465, "learning_rate": 8.080605340874773e-06, "loss": 0.6098, "mean_token_accuracy": 0.8137845560908318, "num_tokens": 37968907.0, "step": 31620 }, { "entropy": 1.864371307194233, "epoch": 0.09805023253218678, "grad_norm": 10.319393157958984, "learning_rate": 8.079327835398136e-06, "loss": 0.6086, "mean_token_accuracy": 0.8172882586717606, "num_tokens": 37980451.0, "step": 31630 }, { "entropy": 1.8554281413555145, "epoch": 0.09808123165723648, "grad_norm": 8.902338981628418, "learning_rate": 8.078050935632698e-06, "loss": 0.5693, "mean_token_accuracy": 0.831707838177681, "num_tokens": 37992222.0, "step": 31640 }, { "entropy": 1.8842545390129088, "epoch": 0.09811223078228618, "grad_norm": 8.492579460144043, "learning_rate": 8.076774641099962e-06, "loss": 0.5778, "mean_token_accuracy": 0.8209319055080414, "num_tokens": 38003647.0, "step": 31650 }, { "entropy": 1.8265250965952873, "epoch": 0.09814322990733587, "grad_norm": 10.900943756103516, "learning_rate": 8.075498951321958e-06, "loss": 0.5486, "mean_token_accuracy": 0.8293390095233917, "num_tokens": 38016248.0, "step": 31660 }, { "entropy": 1.9135718867182732, "epoch": 0.09817422903238555, "grad_norm": 9.381340026855469, "learning_rate": 8.074223865821245e-06, "loss": 0.5587, "mean_token_accuracy": 0.8256983861327172, "num_tokens": 38027814.0, "step": 31670 }, { "entropy": 1.851310819387436, "epoch": 0.09820522815743525, "grad_norm": 11.365713119506836, "learning_rate": 8.072949384120915e-06, "loss": 0.5346, "mean_token_accuracy": 0.8150163918733597, "num_tokens": 38040446.0, "step": 31680 }, { "entropy": 1.9479857064783572, "epoch": 0.09823622728248495, "grad_norm": 10.72806167602539, "learning_rate": 8.071675505744575e-06, "loss": 0.6235, "mean_token_accuracy": 0.8115170300006866, "num_tokens": 38051965.0, "step": 31690 }, { "entropy": 1.9195650681853293, "epoch": 0.09826722640753464, "grad_norm": 8.423272132873535, "learning_rate": 8.070402230216367e-06, "loss": 0.5614, "mean_token_accuracy": 0.8288802221417427, "num_tokens": 38063124.0, "step": 31700 }, { "entropy": 1.8054373532533645, "epoch": 0.09829822553258434, "grad_norm": 9.05594253540039, "learning_rate": 8.06912955706096e-06, "loss": 0.4789, "mean_token_accuracy": 0.8311830461025238, "num_tokens": 38076115.0, "step": 31710 }, { "entropy": 1.80671064555645, "epoch": 0.09832922465763404, "grad_norm": 7.974825382232666, "learning_rate": 8.067857485803538e-06, "loss": 0.4884, "mean_token_accuracy": 0.8289128035306931, "num_tokens": 38089278.0, "step": 31720 }, { "entropy": 1.782078829407692, "epoch": 0.09836022378268373, "grad_norm": 11.095165252685547, "learning_rate": 8.066586015969819e-06, "loss": 0.4897, "mean_token_accuracy": 0.8267631024122238, "num_tokens": 38102671.0, "step": 31730 }, { "entropy": 1.9678689301013947, "epoch": 0.09839122290773343, "grad_norm": 11.549463272094727, "learning_rate": 8.065315147086036e-06, "loss": 0.666, "mean_token_accuracy": 0.8048723593354226, "num_tokens": 38113699.0, "step": 31740 }, { "entropy": 1.836279061436653, "epoch": 0.09842222203278313, "grad_norm": 9.380285263061523, "learning_rate": 8.06404487867895e-06, "loss": 0.5539, "mean_token_accuracy": 0.8251174956560134, "num_tokens": 38127036.0, "step": 31750 }, { "entropy": 1.887456201016903, "epoch": 0.09845322115783282, "grad_norm": 9.466583251953125, "learning_rate": 8.062775210275841e-06, "loss": 0.6013, "mean_token_accuracy": 0.8264592796564102, "num_tokens": 38137940.0, "step": 31760 }, { "entropy": 1.870304611325264, "epoch": 0.09848422028288252, "grad_norm": 9.614102363586426, "learning_rate": 8.061506141404512e-06, "loss": 0.5776, "mean_token_accuracy": 0.813577763736248, "num_tokens": 38150705.0, "step": 31770 }, { "entropy": 1.8906529873609543, "epoch": 0.09851521940793222, "grad_norm": 4.766526222229004, "learning_rate": 8.060237671593283e-06, "loss": 0.5561, "mean_token_accuracy": 0.8281260386109353, "num_tokens": 38162785.0, "step": 31780 }, { "entropy": 1.817509751021862, "epoch": 0.09854621853298191, "grad_norm": 8.861818313598633, "learning_rate": 8.058969800370995e-06, "loss": 0.4718, "mean_token_accuracy": 0.8299407482147216, "num_tokens": 38176287.0, "step": 31790 }, { "entropy": 1.9326122373342514, "epoch": 0.0985772176580316, "grad_norm": 8.393477439880371, "learning_rate": 8.057702527267008e-06, "loss": 0.586, "mean_token_accuracy": 0.8213211745023727, "num_tokens": 38187326.0, "step": 31800 }, { "entropy": 1.9347236022353171, "epoch": 0.09860821678308129, "grad_norm": 9.152666091918945, "learning_rate": 8.056435851811194e-06, "loss": 0.5519, "mean_token_accuracy": 0.8280371204018593, "num_tokens": 38198420.0, "step": 31810 }, { "entropy": 1.9405176371335984, "epoch": 0.09863921590813099, "grad_norm": 10.188054084777832, "learning_rate": 8.055169773533956e-06, "loss": 0.6435, "mean_token_accuracy": 0.8134154111146927, "num_tokens": 38209643.0, "step": 31820 }, { "entropy": 1.8945807382464408, "epoch": 0.09867021503318069, "grad_norm": 8.68061637878418, "learning_rate": 8.053904291966199e-06, "loss": 0.5581, "mean_token_accuracy": 0.826158694922924, "num_tokens": 38221808.0, "step": 31830 }, { "entropy": 1.8253241881728173, "epoch": 0.09870121415823038, "grad_norm": 10.65565299987793, "learning_rate": 8.052639406639352e-06, "loss": 0.5494, "mean_token_accuracy": 0.8176590099930763, "num_tokens": 38235228.0, "step": 31840 }, { "entropy": 1.9344543784856796, "epoch": 0.09873221328328008, "grad_norm": 8.646658897399902, "learning_rate": 8.051375117085356e-06, "loss": 0.6081, "mean_token_accuracy": 0.8185088708996773, "num_tokens": 38246052.0, "step": 31850 }, { "entropy": 1.8455456778407098, "epoch": 0.09876321240832978, "grad_norm": 8.64867115020752, "learning_rate": 8.050111422836666e-06, "loss": 0.5153, "mean_token_accuracy": 0.8226561039686203, "num_tokens": 38258783.0, "step": 31860 }, { "entropy": 1.942963680624962, "epoch": 0.09879421153337947, "grad_norm": 9.261329650878906, "learning_rate": 8.048848323426254e-06, "loss": 0.5971, "mean_token_accuracy": 0.8249636858701705, "num_tokens": 38269528.0, "step": 31870 }, { "entropy": 1.9168901443481445, "epoch": 0.09882521065842917, "grad_norm": 9.246033668518066, "learning_rate": 8.047585818387599e-06, "loss": 0.5692, "mean_token_accuracy": 0.8281188175082207, "num_tokens": 38280554.0, "step": 31880 }, { "entropy": 1.8628426641225815, "epoch": 0.09885620978347887, "grad_norm": 8.739521026611328, "learning_rate": 8.046323907254695e-06, "loss": 0.524, "mean_token_accuracy": 0.8306362301111221, "num_tokens": 38292518.0, "step": 31890 }, { "entropy": 1.8454672649502755, "epoch": 0.09888720890852856, "grad_norm": 12.10193920135498, "learning_rate": 8.045062589562051e-06, "loss": 0.5709, "mean_token_accuracy": 0.827955187857151, "num_tokens": 38304577.0, "step": 31900 }, { "entropy": 1.9152088075876237, "epoch": 0.09891820803357826, "grad_norm": 9.613424301147461, "learning_rate": 8.04380186484468e-06, "loss": 0.6403, "mean_token_accuracy": 0.8144320785999298, "num_tokens": 38315941.0, "step": 31910 }, { "entropy": 1.9276456892490388, "epoch": 0.09894920715862794, "grad_norm": 9.09563159942627, "learning_rate": 8.04254173263811e-06, "loss": 0.557, "mean_token_accuracy": 0.8385657519102097, "num_tokens": 38326358.0, "step": 31920 }, { "entropy": 1.8148802295327187, "epoch": 0.09898020628367764, "grad_norm": 8.845330238342285, "learning_rate": 8.041282192478376e-06, "loss": 0.5189, "mean_token_accuracy": 0.8373483180999756, "num_tokens": 38339092.0, "step": 31930 }, { "entropy": 1.8389425054192543, "epoch": 0.09901120540872733, "grad_norm": 8.750908851623535, "learning_rate": 8.040023243902018e-06, "loss": 0.5495, "mean_token_accuracy": 0.8151543363928795, "num_tokens": 38351190.0, "step": 31940 }, { "entropy": 1.8402899622917175, "epoch": 0.09904220453377703, "grad_norm": 4.322279930114746, "learning_rate": 8.038764886446095e-06, "loss": 0.5531, "mean_token_accuracy": 0.8239884585142135, "num_tokens": 38363817.0, "step": 31950 }, { "entropy": 1.8671101585030556, "epoch": 0.09907320365882673, "grad_norm": 4.702272415161133, "learning_rate": 8.037507119648157e-06, "loss": 0.5213, "mean_token_accuracy": 0.8226954385638237, "num_tokens": 38376802.0, "step": 31960 }, { "entropy": 1.9011655792593956, "epoch": 0.09910420278387642, "grad_norm": 9.228616714477539, "learning_rate": 8.036249943046277e-06, "loss": 0.5436, "mean_token_accuracy": 0.8301041334867477, "num_tokens": 38388556.0, "step": 31970 }, { "entropy": 1.854082614183426, "epoch": 0.09913520190892612, "grad_norm": 4.278061389923096, "learning_rate": 8.034993356179019e-06, "loss": 0.5468, "mean_token_accuracy": 0.8298392608761788, "num_tokens": 38400576.0, "step": 31980 }, { "entropy": 1.8935617864131928, "epoch": 0.09916620103397582, "grad_norm": 8.677916526794434, "learning_rate": 8.03373735858546e-06, "loss": 0.5457, "mean_token_accuracy": 0.8281985089182854, "num_tokens": 38412502.0, "step": 31990 }, { "entropy": 1.878693199157715, "epoch": 0.09919720015902551, "grad_norm": 9.149133682250977, "learning_rate": 8.032481949805182e-06, "loss": 0.5288, "mean_token_accuracy": 0.8354146972298622, "num_tokens": 38423360.0, "step": 32000 }, { "entropy": 1.9500204533338548, "epoch": 0.09922819928407521, "grad_norm": 10.257466316223145, "learning_rate": 8.031227129378268e-06, "loss": 0.6755, "mean_token_accuracy": 0.8080421656370163, "num_tokens": 38433951.0, "step": 32010 }, { "entropy": 1.9539231777191162, "epoch": 0.09925919840912491, "grad_norm": 9.122477531433105, "learning_rate": 8.029972896845298e-06, "loss": 0.5941, "mean_token_accuracy": 0.8169040486216546, "num_tokens": 38445236.0, "step": 32020 }, { "entropy": 1.846830153465271, "epoch": 0.0992901975341746, "grad_norm": 9.854817390441895, "learning_rate": 8.028719251747369e-06, "loss": 0.5172, "mean_token_accuracy": 0.8273584708571434, "num_tokens": 38457779.0, "step": 32030 }, { "entropy": 1.9177356496453286, "epoch": 0.09932119665922429, "grad_norm": 8.14139461517334, "learning_rate": 8.027466193626063e-06, "loss": 0.563, "mean_token_accuracy": 0.8339667573571206, "num_tokens": 38469104.0, "step": 32040 }, { "entropy": 1.9175456002354623, "epoch": 0.09935219578427398, "grad_norm": 4.319037914276123, "learning_rate": 8.026213722023473e-06, "loss": 0.5978, "mean_token_accuracy": 0.8246249735355378, "num_tokens": 38480992.0, "step": 32050 }, { "entropy": 1.9631080061197281, "epoch": 0.09938319490932368, "grad_norm": 9.7236328125, "learning_rate": 8.024961836482187e-06, "loss": 0.6166, "mean_token_accuracy": 0.8274016365408897, "num_tokens": 38492002.0, "step": 32060 }, { "entropy": 1.943944238126278, "epoch": 0.09941419403437338, "grad_norm": 8.304064750671387, "learning_rate": 8.023710536545295e-06, "loss": 0.6108, "mean_token_accuracy": 0.8115967348217964, "num_tokens": 38502940.0, "step": 32070 }, { "entropy": 1.9384367182850837, "epoch": 0.09944519315942307, "grad_norm": 11.586662292480469, "learning_rate": 8.022459821756386e-06, "loss": 0.6286, "mean_token_accuracy": 0.820226113498211, "num_tokens": 38514238.0, "step": 32080 }, { "entropy": 1.9123076066374778, "epoch": 0.09947619228447277, "grad_norm": 9.793305397033691, "learning_rate": 8.021209691659546e-06, "loss": 0.5969, "mean_token_accuracy": 0.8114640265703201, "num_tokens": 38525572.0, "step": 32090 }, { "entropy": 1.8171996608376504, "epoch": 0.09950719140952247, "grad_norm": 4.4752020835876465, "learning_rate": 8.019960145799353e-06, "loss": 0.5131, "mean_token_accuracy": 0.8394015192985534, "num_tokens": 38538612.0, "step": 32100 }, { "entropy": 1.8743107169866562, "epoch": 0.09953819053457216, "grad_norm": 9.059576034545898, "learning_rate": 8.01871118372089e-06, "loss": 0.5375, "mean_token_accuracy": 0.8266194269061089, "num_tokens": 38550013.0, "step": 32110 }, { "entropy": 1.8382051154971122, "epoch": 0.09956918965962186, "grad_norm": 2.9260752201080322, "learning_rate": 8.017462804969733e-06, "loss": 0.566, "mean_token_accuracy": 0.837664969265461, "num_tokens": 38562520.0, "step": 32120 }, { "entropy": 1.946621085703373, "epoch": 0.09960018878467156, "grad_norm": 8.969979286193848, "learning_rate": 8.01621500909195e-06, "loss": 0.5906, "mean_token_accuracy": 0.8193635001778603, "num_tokens": 38573702.0, "step": 32130 }, { "entropy": 1.9753321021795274, "epoch": 0.09963118790972125, "grad_norm": 8.497583389282227, "learning_rate": 8.014967795634104e-06, "loss": 0.6373, "mean_token_accuracy": 0.8141985774040222, "num_tokens": 38584725.0, "step": 32140 }, { "entropy": 1.9402645155787468, "epoch": 0.09966218703477095, "grad_norm": 10.377409934997559, "learning_rate": 8.013721164143257e-06, "loss": 0.5876, "mean_token_accuracy": 0.8230377018451691, "num_tokens": 38596403.0, "step": 32150 }, { "entropy": 1.7853017404675484, "epoch": 0.09969318615982065, "grad_norm": 8.521496772766113, "learning_rate": 8.012475114166955e-06, "loss": 0.4442, "mean_token_accuracy": 0.8455995440483093, "num_tokens": 38610309.0, "step": 32160 }, { "entropy": 1.846723584830761, "epoch": 0.09972418528487033, "grad_norm": 5.919735908508301, "learning_rate": 8.011229645253245e-06, "loss": 0.5358, "mean_token_accuracy": 0.8269700452685356, "num_tokens": 38623255.0, "step": 32170 }, { "entropy": 1.8817561469972133, "epoch": 0.09975518440992003, "grad_norm": 10.0753755569458, "learning_rate": 8.009984756950662e-06, "loss": 0.5353, "mean_token_accuracy": 0.8250563368201256, "num_tokens": 38636156.0, "step": 32180 }, { "entropy": 1.8628202617168426, "epoch": 0.09978618353496972, "grad_norm": 9.534749031066895, "learning_rate": 8.008740448808228e-06, "loss": 0.5378, "mean_token_accuracy": 0.8297192022204399, "num_tokens": 38648825.0, "step": 32190 }, { "entropy": 1.8859781965613365, "epoch": 0.09981718266001942, "grad_norm": 9.754186630249023, "learning_rate": 8.007496720375465e-06, "loss": 0.554, "mean_token_accuracy": 0.8234538331627845, "num_tokens": 38660622.0, "step": 32200 }, { "entropy": 1.8697243973612785, "epoch": 0.09984818178506912, "grad_norm": 5.172030448913574, "learning_rate": 8.006253571202375e-06, "loss": 0.6183, "mean_token_accuracy": 0.8093932062387467, "num_tokens": 38672976.0, "step": 32210 }, { "entropy": 1.7175804510712624, "epoch": 0.09987918091011881, "grad_norm": 8.345678329467773, "learning_rate": 8.005011000839453e-06, "loss": 0.4797, "mean_token_accuracy": 0.8368314996361732, "num_tokens": 38687008.0, "step": 32220 }, { "entropy": 1.8780830636620522, "epoch": 0.09991018003516851, "grad_norm": 4.0767951011657715, "learning_rate": 8.003769008837679e-06, "loss": 0.5691, "mean_token_accuracy": 0.8209434077143669, "num_tokens": 38698550.0, "step": 32230 }, { "entropy": 1.9147436633706092, "epoch": 0.0999411791602182, "grad_norm": 10.640585899353027, "learning_rate": 8.00252759474853e-06, "loss": 0.6051, "mean_token_accuracy": 0.8163613364100456, "num_tokens": 38709869.0, "step": 32240 }, { "entropy": 1.8193964034318924, "epoch": 0.0999721782852679, "grad_norm": 10.641173362731934, "learning_rate": 8.001286758123959e-06, "loss": 0.5251, "mean_token_accuracy": 0.8270770683884621, "num_tokens": 38721887.0, "step": 32250 }, { "entropy": 1.7959134474396705, "epoch": 0.1000031774103176, "grad_norm": 7.74287748336792, "learning_rate": 8.000046498516408e-06, "loss": 0.4992, "mean_token_accuracy": 0.8405986934900284, "num_tokens": 38733462.0, "step": 32260 }, { "entropy": 1.851724424213171, "epoch": 0.1000341765353673, "grad_norm": 10.603741645812988, "learning_rate": 7.998806815478807e-06, "loss": 0.6026, "mean_token_accuracy": 0.8136375203728676, "num_tokens": 38744694.0, "step": 32270 }, { "entropy": 1.88350567817688, "epoch": 0.10006517566041699, "grad_norm": 10.134400367736816, "learning_rate": 7.99756770856457e-06, "loss": 0.5918, "mean_token_accuracy": 0.8215403065085412, "num_tokens": 38756425.0, "step": 32280 }, { "entropy": 1.9489238500595092, "epoch": 0.10009617478546667, "grad_norm": 9.280869483947754, "learning_rate": 7.996329177327595e-06, "loss": 0.6208, "mean_token_accuracy": 0.8188161373138427, "num_tokens": 38767890.0, "step": 32290 }, { "entropy": 1.9058944791555406, "epoch": 0.10012717391051637, "grad_norm": 4.553225994110107, "learning_rate": 7.995091221322265e-06, "loss": 0.5334, "mean_token_accuracy": 0.8262131318449975, "num_tokens": 38779858.0, "step": 32300 }, { "entropy": 1.7887984693050385, "epoch": 0.10015817303556607, "grad_norm": 10.373005867004395, "learning_rate": 7.993853840103436e-06, "loss": 0.4868, "mean_token_accuracy": 0.8316322803497315, "num_tokens": 38792827.0, "step": 32310 }, { "entropy": 1.7936713725328446, "epoch": 0.10018917216061576, "grad_norm": 9.501617431640625, "learning_rate": 7.992617033226463e-06, "loss": 0.4981, "mean_token_accuracy": 0.8299453228712081, "num_tokens": 38805462.0, "step": 32320 }, { "entropy": 1.9167312130331993, "epoch": 0.10022017128566546, "grad_norm": 9.644965171813965, "learning_rate": 7.991380800247169e-06, "loss": 0.5801, "mean_token_accuracy": 0.8267593100667, "num_tokens": 38816933.0, "step": 32330 }, { "entropy": 1.8285753324627876, "epoch": 0.10025117041071516, "grad_norm": 8.455294609069824, "learning_rate": 7.990145140721862e-06, "loss": 0.5222, "mean_token_accuracy": 0.8261840432882309, "num_tokens": 38829859.0, "step": 32340 }, { "entropy": 1.89251828789711, "epoch": 0.10028216953576485, "grad_norm": 8.720006942749023, "learning_rate": 7.98891005420733e-06, "loss": 0.6323, "mean_token_accuracy": 0.8221445247530937, "num_tokens": 38841339.0, "step": 32350 }, { "entropy": 1.845874121785164, "epoch": 0.10031316866081455, "grad_norm": 8.771798133850098, "learning_rate": 7.987675540260844e-06, "loss": 0.5616, "mean_token_accuracy": 0.8211918070912361, "num_tokens": 38853479.0, "step": 32360 }, { "entropy": 1.8901546359062196, "epoch": 0.10034416778586425, "grad_norm": 6.715768337249756, "learning_rate": 7.986441598440147e-06, "loss": 0.5698, "mean_token_accuracy": 0.8205464497208595, "num_tokens": 38865059.0, "step": 32370 }, { "entropy": 1.879352556169033, "epoch": 0.10037516691091394, "grad_norm": 9.13968563079834, "learning_rate": 7.985208228303463e-06, "loss": 0.6066, "mean_token_accuracy": 0.8105901688337326, "num_tokens": 38876867.0, "step": 32380 }, { "entropy": 1.84751605540514, "epoch": 0.10040616603596364, "grad_norm": 8.604146003723145, "learning_rate": 7.983975429409497e-06, "loss": 0.5442, "mean_token_accuracy": 0.8336275666952133, "num_tokens": 38889307.0, "step": 32390 }, { "entropy": 1.7912178918719293, "epoch": 0.10043716516101334, "grad_norm": 9.056184768676758, "learning_rate": 7.982743201317426e-06, "loss": 0.4912, "mean_token_accuracy": 0.8329012975096702, "num_tokens": 38902802.0, "step": 32400 }, { "entropy": 1.8672286108136178, "epoch": 0.10046816428606302, "grad_norm": 8.324599266052246, "learning_rate": 7.981511543586906e-06, "loss": 0.5679, "mean_token_accuracy": 0.8331062749028206, "num_tokens": 38914048.0, "step": 32410 }, { "entropy": 1.8162163376808167, "epoch": 0.10049916341111272, "grad_norm": 2.5847465991973877, "learning_rate": 7.980280455778062e-06, "loss": 0.524, "mean_token_accuracy": 0.832746496796608, "num_tokens": 38926231.0, "step": 32420 }, { "entropy": 1.8801830425858497, "epoch": 0.10053016253616241, "grad_norm": 8.904119491577148, "learning_rate": 7.979049937451507e-06, "loss": 0.5467, "mean_token_accuracy": 0.8272658884525299, "num_tokens": 38938316.0, "step": 32430 }, { "entropy": 1.9005606561899184, "epoch": 0.10056116166121211, "grad_norm": 5.724231719970703, "learning_rate": 7.977819988168313e-06, "loss": 0.5275, "mean_token_accuracy": 0.829011881351471, "num_tokens": 38950624.0, "step": 32440 }, { "entropy": 1.9463760763406754, "epoch": 0.1005921607862618, "grad_norm": 8.63336181640625, "learning_rate": 7.97659060749004e-06, "loss": 0.5908, "mean_token_accuracy": 0.82104711830616, "num_tokens": 38962016.0, "step": 32450 }, { "entropy": 1.8716831341385842, "epoch": 0.1006231599113115, "grad_norm": 10.11276912689209, "learning_rate": 7.975361794978705e-06, "loss": 0.5483, "mean_token_accuracy": 0.8303984850645065, "num_tokens": 38973912.0, "step": 32460 }, { "entropy": 1.8566598653793336, "epoch": 0.1006541590363612, "grad_norm": 7.346640586853027, "learning_rate": 7.974133550196811e-06, "loss": 0.5429, "mean_token_accuracy": 0.8243806138634682, "num_tokens": 38985487.0, "step": 32470 }, { "entropy": 1.9269026920199395, "epoch": 0.1006851581614109, "grad_norm": 8.665224075317383, "learning_rate": 7.972905872707326e-06, "loss": 0.6133, "mean_token_accuracy": 0.8166972935199738, "num_tokens": 38997045.0, "step": 32480 }, { "entropy": 1.7712698966264724, "epoch": 0.10071615728646059, "grad_norm": 4.503955364227295, "learning_rate": 7.97167876207369e-06, "loss": 0.4208, "mean_token_accuracy": 0.8519754007458686, "num_tokens": 39010592.0, "step": 32490 }, { "entropy": 1.8833463504910468, "epoch": 0.10074715641151029, "grad_norm": 10.112072944641113, "learning_rate": 7.970452217859811e-06, "loss": 0.5322, "mean_token_accuracy": 0.8291536673903466, "num_tokens": 39021828.0, "step": 32500 }, { "entropy": 1.9463322639465332, "epoch": 0.10077815553655999, "grad_norm": 11.4695405960083, "learning_rate": 7.96922623963007e-06, "loss": 0.6672, "mean_token_accuracy": 0.8071766123175621, "num_tokens": 39032954.0, "step": 32510 }, { "entropy": 1.830717845261097, "epoch": 0.10080915466160968, "grad_norm": 5.045051574707031, "learning_rate": 7.968000826949319e-06, "loss": 0.4842, "mean_token_accuracy": 0.8264394268393517, "num_tokens": 39046531.0, "step": 32520 }, { "entropy": 1.8757388591766357, "epoch": 0.10084015378665938, "grad_norm": 5.511293888092041, "learning_rate": 7.96677597938287e-06, "loss": 0.5013, "mean_token_accuracy": 0.8393940135836602, "num_tokens": 39059751.0, "step": 32530 }, { "entropy": 1.9044818967580794, "epoch": 0.10087115291170906, "grad_norm": 9.242785453796387, "learning_rate": 7.965551696496507e-06, "loss": 0.5406, "mean_token_accuracy": 0.8220992356538772, "num_tokens": 39072505.0, "step": 32540 }, { "entropy": 1.9557638376951219, "epoch": 0.10090215203675876, "grad_norm": 9.517579078674316, "learning_rate": 7.964327977856484e-06, "loss": 0.6215, "mean_token_accuracy": 0.8174139723181725, "num_tokens": 39083179.0, "step": 32550 }, { "entropy": 2.0160479575395582, "epoch": 0.10093315116180845, "grad_norm": 9.149009704589844, "learning_rate": 7.963104823029519e-06, "loss": 0.644, "mean_token_accuracy": 0.8098774880170823, "num_tokens": 39093653.0, "step": 32560 }, { "entropy": 1.9425939425826073, "epoch": 0.10096415028685815, "grad_norm": 8.069141387939453, "learning_rate": 7.961882231582794e-06, "loss": 0.5416, "mean_token_accuracy": 0.8205993890762329, "num_tokens": 39105306.0, "step": 32570 }, { "entropy": 1.9585781380534173, "epoch": 0.10099514941190785, "grad_norm": 8.68270206451416, "learning_rate": 7.960660203083954e-06, "loss": 0.6058, "mean_token_accuracy": 0.8138043344020843, "num_tokens": 39117926.0, "step": 32580 }, { "entropy": 1.952934955060482, "epoch": 0.10102614853695754, "grad_norm": 9.11218547821045, "learning_rate": 7.959438737101118e-06, "loss": 0.5391, "mean_token_accuracy": 0.8283255323767662, "num_tokens": 39130134.0, "step": 32590 }, { "entropy": 1.9153422683477401, "epoch": 0.10105714766200724, "grad_norm": 5.233044147491455, "learning_rate": 7.958217833202859e-06, "loss": 0.5562, "mean_token_accuracy": 0.8188350781798363, "num_tokens": 39142127.0, "step": 32600 }, { "entropy": 1.8911504298448563, "epoch": 0.10108814678705694, "grad_norm": 4.27568244934082, "learning_rate": 7.956997490958216e-06, "loss": 0.533, "mean_token_accuracy": 0.8261435091495514, "num_tokens": 39154491.0, "step": 32610 }, { "entropy": 1.7859768718481064, "epoch": 0.10111914591210663, "grad_norm": 4.534674644470215, "learning_rate": 7.955777709936692e-06, "loss": 0.4525, "mean_token_accuracy": 0.8465276822447777, "num_tokens": 39168135.0, "step": 32620 }, { "entropy": 1.90359725356102, "epoch": 0.10115014503715633, "grad_norm": 7.7813496589660645, "learning_rate": 7.95455848970825e-06, "loss": 0.6129, "mean_token_accuracy": 0.8201944395899773, "num_tokens": 39180412.0, "step": 32630 }, { "entropy": 1.924836564064026, "epoch": 0.10118114416220603, "grad_norm": 9.442864418029785, "learning_rate": 7.953339829843315e-06, "loss": 0.5363, "mean_token_accuracy": 0.8264301627874374, "num_tokens": 39192257.0, "step": 32640 }, { "entropy": 1.9304316490888596, "epoch": 0.10121214328725572, "grad_norm": 3.983366012573242, "learning_rate": 7.952121729912772e-06, "loss": 0.612, "mean_token_accuracy": 0.8225965097546577, "num_tokens": 39203910.0, "step": 32650 }, { "entropy": 1.9028907686471939, "epoch": 0.1012431424123054, "grad_norm": 9.803033828735352, "learning_rate": 7.95090418948797e-06, "loss": 0.6047, "mean_token_accuracy": 0.8184835597872734, "num_tokens": 39216797.0, "step": 32660 }, { "entropy": 1.8274445042014122, "epoch": 0.1012741415373551, "grad_norm": 10.706036567687988, "learning_rate": 7.949687208140709e-06, "loss": 0.5313, "mean_token_accuracy": 0.8392621099948883, "num_tokens": 39229208.0, "step": 32670 }, { "entropy": 1.983510084450245, "epoch": 0.1013051406624048, "grad_norm": 9.402242660522461, "learning_rate": 7.948470785443254e-06, "loss": 0.6156, "mean_token_accuracy": 0.8155052870512008, "num_tokens": 39240609.0, "step": 32680 }, { "entropy": 1.8997413352131844, "epoch": 0.1013361397874545, "grad_norm": 11.164144515991211, "learning_rate": 7.947254920968327e-06, "loss": 0.5142, "mean_token_accuracy": 0.823790366947651, "num_tokens": 39252794.0, "step": 32690 }, { "entropy": 1.843451727926731, "epoch": 0.10136713891250419, "grad_norm": 9.909012794494629, "learning_rate": 7.946039614289105e-06, "loss": 0.546, "mean_token_accuracy": 0.8248791456222534, "num_tokens": 39265204.0, "step": 32700 }, { "entropy": 1.9523500233888627, "epoch": 0.10139813803755389, "grad_norm": 8.847643852233887, "learning_rate": 7.944824864979225e-06, "loss": 0.6042, "mean_token_accuracy": 0.8163961425423623, "num_tokens": 39276926.0, "step": 32710 }, { "entropy": 1.8452000051736832, "epoch": 0.10142913716260359, "grad_norm": 8.985699653625488, "learning_rate": 7.943610672612779e-06, "loss": 0.4633, "mean_token_accuracy": 0.8368108674883843, "num_tokens": 39288593.0, "step": 32720 }, { "entropy": 1.8509555876255035, "epoch": 0.10146013628765328, "grad_norm": 8.876476287841797, "learning_rate": 7.94239703676431e-06, "loss": 0.5477, "mean_token_accuracy": 0.8287522226572037, "num_tokens": 39301355.0, "step": 32730 }, { "entropy": 1.982967707514763, "epoch": 0.10149113541270298, "grad_norm": 11.262484550476074, "learning_rate": 7.941183957008825e-06, "loss": 0.5909, "mean_token_accuracy": 0.8227292239665985, "num_tokens": 39312750.0, "step": 32740 }, { "entropy": 1.9494792476296425, "epoch": 0.10152213453775268, "grad_norm": 12.197062492370605, "learning_rate": 7.939971432921778e-06, "loss": 0.5947, "mean_token_accuracy": 0.8156418994069099, "num_tokens": 39323923.0, "step": 32750 }, { "entropy": 1.9586309775710107, "epoch": 0.10155313366280237, "grad_norm": 8.947646141052246, "learning_rate": 7.93875946407908e-06, "loss": 0.5582, "mean_token_accuracy": 0.8243048340082169, "num_tokens": 39335909.0, "step": 32760 }, { "entropy": 1.9357258334755898, "epoch": 0.10158413278785207, "grad_norm": 8.137892723083496, "learning_rate": 7.937548050057092e-06, "loss": 0.5899, "mean_token_accuracy": 0.8267382949590683, "num_tokens": 39347350.0, "step": 32770 }, { "entropy": 1.961161696910858, "epoch": 0.10161513191290175, "grad_norm": 9.436999320983887, "learning_rate": 7.936337190432627e-06, "loss": 0.5788, "mean_token_accuracy": 0.8202362656593323, "num_tokens": 39359043.0, "step": 32780 }, { "entropy": 1.9914384290575982, "epoch": 0.10164613103795145, "grad_norm": 8.315853118896484, "learning_rate": 7.935126884782958e-06, "loss": 0.5859, "mean_token_accuracy": 0.8224820002913475, "num_tokens": 39370438.0, "step": 32790 }, { "entropy": 1.9369804859161377, "epoch": 0.10167713016300114, "grad_norm": 9.922943115234375, "learning_rate": 7.9339171326858e-06, "loss": 0.5687, "mean_token_accuracy": 0.8243139579892158, "num_tokens": 39380976.0, "step": 32800 }, { "entropy": 1.9085180804133415, "epoch": 0.10170812928805084, "grad_norm": 9.743084907531738, "learning_rate": 7.932707933719321e-06, "loss": 0.614, "mean_token_accuracy": 0.8235570520162583, "num_tokens": 39392951.0, "step": 32810 }, { "entropy": 1.9496404856443406, "epoch": 0.10173912841310054, "grad_norm": 8.715404510498047, "learning_rate": 7.931499287462138e-06, "loss": 0.6393, "mean_token_accuracy": 0.811781495809555, "num_tokens": 39404058.0, "step": 32820 }, { "entropy": 1.9330283522605896, "epoch": 0.10177012753815023, "grad_norm": 9.045530319213867, "learning_rate": 7.930291193493323e-06, "loss": 0.5864, "mean_token_accuracy": 0.8231693536043168, "num_tokens": 39414589.0, "step": 32830 }, { "entropy": 2.0210474640130998, "epoch": 0.10180112666319993, "grad_norm": 7.992177486419678, "learning_rate": 7.929083651392389e-06, "loss": 0.6262, "mean_token_accuracy": 0.8037309676408768, "num_tokens": 39425755.0, "step": 32840 }, { "entropy": 1.9645818829536439, "epoch": 0.10183212578824963, "grad_norm": 9.055630683898926, "learning_rate": 7.9278766607393e-06, "loss": 0.5811, "mean_token_accuracy": 0.8236428961157799, "num_tokens": 39437178.0, "step": 32850 }, { "entropy": 1.9607673034071922, "epoch": 0.10186312491329932, "grad_norm": 10.347228050231934, "learning_rate": 7.92667022111447e-06, "loss": 0.5655, "mean_token_accuracy": 0.8204041153192521, "num_tokens": 39449415.0, "step": 32860 }, { "entropy": 1.913771566748619, "epoch": 0.10189412403834902, "grad_norm": 10.000711441040039, "learning_rate": 7.92546433209876e-06, "loss": 0.5313, "mean_token_accuracy": 0.8246194154024125, "num_tokens": 39461394.0, "step": 32870 }, { "entropy": 1.9231099665164948, "epoch": 0.10192512316339872, "grad_norm": 5.153701305389404, "learning_rate": 7.92425899327347e-06, "loss": 0.5586, "mean_token_accuracy": 0.8274278253316879, "num_tokens": 39473357.0, "step": 32880 }, { "entropy": 1.9601033926010132, "epoch": 0.10195612228844841, "grad_norm": 9.463569641113281, "learning_rate": 7.923054204220351e-06, "loss": 0.6141, "mean_token_accuracy": 0.8186487257480621, "num_tokens": 39484263.0, "step": 32890 }, { "entropy": 1.989539209008217, "epoch": 0.10198712141349811, "grad_norm": 11.15012264251709, "learning_rate": 7.921849964521603e-06, "loss": 0.5973, "mean_token_accuracy": 0.8308931365609169, "num_tokens": 39495754.0, "step": 32900 }, { "entropy": 1.8660317227244376, "epoch": 0.1020181205385478, "grad_norm": 9.449760437011719, "learning_rate": 7.92064627375986e-06, "loss": 0.5162, "mean_token_accuracy": 0.8349911078810692, "num_tokens": 39508683.0, "step": 32910 }, { "entropy": 1.956910152733326, "epoch": 0.10204911966359749, "grad_norm": 8.791659355163574, "learning_rate": 7.919443131518211e-06, "loss": 0.546, "mean_token_accuracy": 0.8249326780438423, "num_tokens": 39520549.0, "step": 32920 }, { "entropy": 1.9195820301771165, "epoch": 0.10208011878864719, "grad_norm": 4.308915615081787, "learning_rate": 7.91824053738018e-06, "loss": 0.5599, "mean_token_accuracy": 0.8237533152103425, "num_tokens": 39533438.0, "step": 32930 }, { "entropy": 2.001399652659893, "epoch": 0.10211111791369688, "grad_norm": 11.334473609924316, "learning_rate": 7.917038490929737e-06, "loss": 0.5865, "mean_token_accuracy": 0.8264929071068764, "num_tokens": 39544650.0, "step": 32940 }, { "entropy": 1.9738114327192307, "epoch": 0.10214211703874658, "grad_norm": 4.6155595779418945, "learning_rate": 7.915836991751293e-06, "loss": 0.6039, "mean_token_accuracy": 0.8131748422980308, "num_tokens": 39556308.0, "step": 32950 }, { "entropy": 1.884914068877697, "epoch": 0.10217311616379628, "grad_norm": 9.995161056518555, "learning_rate": 7.914636039429701e-06, "loss": 0.5548, "mean_token_accuracy": 0.8334923848509789, "num_tokens": 39568702.0, "step": 32960 }, { "entropy": 1.7988762125372886, "epoch": 0.10220411528884597, "grad_norm": 9.979853630065918, "learning_rate": 7.913435633550255e-06, "loss": 0.4959, "mean_token_accuracy": 0.83943440169096, "num_tokens": 39581966.0, "step": 32970 }, { "entropy": 1.8382903322577477, "epoch": 0.10223511441389567, "grad_norm": 8.669686317443848, "learning_rate": 7.912235773698689e-06, "loss": 0.4643, "mean_token_accuracy": 0.838770891726017, "num_tokens": 39595357.0, "step": 32980 }, { "entropy": 1.9224609807133675, "epoch": 0.10226611353894537, "grad_norm": 8.906074523925781, "learning_rate": 7.911036459461177e-06, "loss": 0.5724, "mean_token_accuracy": 0.8308119490742684, "num_tokens": 39606657.0, "step": 32990 }, { "entropy": 1.8199217766523361, "epoch": 0.10229711266399506, "grad_norm": 9.60714340209961, "learning_rate": 7.909837690424327e-06, "loss": 0.5371, "mean_token_accuracy": 0.8385667935013771, "num_tokens": 39619596.0, "step": 33000 }, { "entropy": 1.9392257377505302, "epoch": 0.10232811178904476, "grad_norm": 9.89013671875, "learning_rate": 7.908639466175193e-06, "loss": 0.5837, "mean_token_accuracy": 0.8208078041672706, "num_tokens": 39631627.0, "step": 33010 }, { "entropy": 1.9637094184756279, "epoch": 0.10235911091409446, "grad_norm": 9.878365516662598, "learning_rate": 7.907441786301261e-06, "loss": 0.5518, "mean_token_accuracy": 0.8377602383494377, "num_tokens": 39642887.0, "step": 33020 }, { "entropy": 1.8965819612145425, "epoch": 0.10239011003914414, "grad_norm": 9.686004638671875, "learning_rate": 7.906244650390462e-06, "loss": 0.5211, "mean_token_accuracy": 0.8283202305436135, "num_tokens": 39654706.0, "step": 33030 }, { "entropy": 1.8306727975606918, "epoch": 0.10242110916419384, "grad_norm": 10.983489990234375, "learning_rate": 7.905048058031153e-06, "loss": 0.4686, "mean_token_accuracy": 0.8358713150024414, "num_tokens": 39668226.0, "step": 33040 }, { "entropy": 1.898146103322506, "epoch": 0.10245210828924353, "grad_norm": 8.633929252624512, "learning_rate": 7.903852008812132e-06, "loss": 0.5579, "mean_token_accuracy": 0.8244705319404602, "num_tokens": 39680301.0, "step": 33050 }, { "entropy": 1.8473190173506737, "epoch": 0.10248310741429323, "grad_norm": 4.211243629455566, "learning_rate": 7.902656502322633e-06, "loss": 0.5038, "mean_token_accuracy": 0.8376186162233352, "num_tokens": 39693311.0, "step": 33060 }, { "entropy": 1.8591630086302757, "epoch": 0.10251410653934293, "grad_norm": 4.487063884735107, "learning_rate": 7.901461538152326e-06, "loss": 0.46, "mean_token_accuracy": 0.8340800628066063, "num_tokens": 39706806.0, "step": 33070 }, { "entropy": 1.9278041929006577, "epoch": 0.10254510566439262, "grad_norm": 9.240703582763672, "learning_rate": 7.90026711589131e-06, "loss": 0.5148, "mean_token_accuracy": 0.8337907418608665, "num_tokens": 39719457.0, "step": 33080 }, { "entropy": 1.9978653281927108, "epoch": 0.10257610478944232, "grad_norm": 10.427108764648438, "learning_rate": 7.899073235130122e-06, "loss": 0.6744, "mean_token_accuracy": 0.8095801830291748, "num_tokens": 39730503.0, "step": 33090 }, { "entropy": 1.9866398319602012, "epoch": 0.10260710391449201, "grad_norm": 8.23255729675293, "learning_rate": 7.897879895459734e-06, "loss": 0.5904, "mean_token_accuracy": 0.8214378640055656, "num_tokens": 39741771.0, "step": 33100 }, { "entropy": 1.8491350293159485, "epoch": 0.10263810303954171, "grad_norm": 3.994368076324463, "learning_rate": 7.896687096471543e-06, "loss": 0.5219, "mean_token_accuracy": 0.8272738143801689, "num_tokens": 39754475.0, "step": 33110 }, { "entropy": 1.9193340376019479, "epoch": 0.10266910216459141, "grad_norm": 8.316622734069824, "learning_rate": 7.895494837757387e-06, "loss": 0.5405, "mean_token_accuracy": 0.82731723934412, "num_tokens": 39766516.0, "step": 33120 }, { "entropy": 1.775647784769535, "epoch": 0.1027001012896411, "grad_norm": 4.390643119812012, "learning_rate": 7.894303118909526e-06, "loss": 0.4468, "mean_token_accuracy": 0.8442358478903771, "num_tokens": 39780754.0, "step": 33130 }, { "entropy": 1.9259379595518111, "epoch": 0.1027311004146908, "grad_norm": 3.8750622272491455, "learning_rate": 7.893111939520659e-06, "loss": 0.5695, "mean_token_accuracy": 0.8207632303237915, "num_tokens": 39792005.0, "step": 33140 }, { "entropy": 2.0047280192375183, "epoch": 0.10276209953974048, "grad_norm": 8.025463104248047, "learning_rate": 7.891921299183906e-06, "loss": 0.5898, "mean_token_accuracy": 0.8211274608969689, "num_tokens": 39803614.0, "step": 33150 }, { "entropy": 2.0167581260204317, "epoch": 0.10279309866479018, "grad_norm": 10.989029884338379, "learning_rate": 7.890731197492827e-06, "loss": 0.6009, "mean_token_accuracy": 0.8194682911038399, "num_tokens": 39814772.0, "step": 33160 }, { "entropy": 2.003499576449394, "epoch": 0.10282409778983988, "grad_norm": 8.563013076782227, "learning_rate": 7.889541634041405e-06, "loss": 0.5893, "mean_token_accuracy": 0.8160374283790588, "num_tokens": 39826015.0, "step": 33170 }, { "entropy": 1.9005308762192725, "epoch": 0.10285509691488957, "grad_norm": 10.718070030212402, "learning_rate": 7.888352608424046e-06, "loss": 0.4896, "mean_token_accuracy": 0.8393619701266288, "num_tokens": 39838620.0, "step": 33180 }, { "entropy": 1.9308624148368836, "epoch": 0.10288609603993927, "grad_norm": 9.094322204589844, "learning_rate": 7.887164120235598e-06, "loss": 0.5783, "mean_token_accuracy": 0.8276225015521049, "num_tokens": 39850572.0, "step": 33190 }, { "entropy": 1.9245057985186578, "epoch": 0.10291709516498897, "grad_norm": 9.67980670928955, "learning_rate": 7.885976169071323e-06, "loss": 0.5649, "mean_token_accuracy": 0.8204725295305252, "num_tokens": 39863035.0, "step": 33200 }, { "entropy": 1.8938257083296777, "epoch": 0.10294809429003866, "grad_norm": 11.41298770904541, "learning_rate": 7.884788754526915e-06, "loss": 0.5521, "mean_token_accuracy": 0.8219778835773468, "num_tokens": 39875895.0, "step": 33210 }, { "entropy": 1.9652521327137946, "epoch": 0.10297909341508836, "grad_norm": 4.608140468597412, "learning_rate": 7.883601876198497e-06, "loss": 0.5639, "mean_token_accuracy": 0.8251558512449264, "num_tokens": 39888247.0, "step": 33220 }, { "entropy": 1.9580962508916855, "epoch": 0.10301009254013806, "grad_norm": 7.537057399749756, "learning_rate": 7.882415533682607e-06, "loss": 0.5439, "mean_token_accuracy": 0.8344349786639214, "num_tokens": 39900720.0, "step": 33230 }, { "entropy": 1.9044880703091622, "epoch": 0.10304109166518775, "grad_norm": 8.43292236328125, "learning_rate": 7.88122972657622e-06, "loss": 0.5042, "mean_token_accuracy": 0.8305587694048882, "num_tokens": 39913681.0, "step": 33240 }, { "entropy": 1.8795413970947266, "epoch": 0.10307209079023745, "grad_norm": 3.7221243381500244, "learning_rate": 7.88004445447673e-06, "loss": 0.5004, "mean_token_accuracy": 0.8384215503931045, "num_tokens": 39926055.0, "step": 33250 }, { "entropy": 1.9621031075716018, "epoch": 0.10310308991528715, "grad_norm": 10.001662254333496, "learning_rate": 7.878859716981954e-06, "loss": 0.6182, "mean_token_accuracy": 0.8217739418148995, "num_tokens": 39936665.0, "step": 33260 }, { "entropy": 1.935880383849144, "epoch": 0.10313408904033684, "grad_norm": 9.233072280883789, "learning_rate": 7.87767551369013e-06, "loss": 0.5461, "mean_token_accuracy": 0.8238107547163963, "num_tokens": 39948808.0, "step": 33270 }, { "entropy": 1.9450037762522698, "epoch": 0.10316508816538653, "grad_norm": 4.693704128265381, "learning_rate": 7.876491844199926e-06, "loss": 0.6156, "mean_token_accuracy": 0.8216011270880699, "num_tokens": 39960695.0, "step": 33280 }, { "entropy": 1.910569779574871, "epoch": 0.10319608729043622, "grad_norm": 10.65746021270752, "learning_rate": 7.875308708110426e-06, "loss": 0.5713, "mean_token_accuracy": 0.8182385757565498, "num_tokens": 39973311.0, "step": 33290 }, { "entropy": 1.8974032238125802, "epoch": 0.10322708641548592, "grad_norm": 8.886612892150879, "learning_rate": 7.874126105021134e-06, "loss": 0.5677, "mean_token_accuracy": 0.8244770526885986, "num_tokens": 39985794.0, "step": 33300 }, { "entropy": 1.9022233217954636, "epoch": 0.10325808554053562, "grad_norm": 4.597454071044922, "learning_rate": 7.872944034531982e-06, "loss": 0.4985, "mean_token_accuracy": 0.8278315708041191, "num_tokens": 39998753.0, "step": 33310 }, { "entropy": 1.8010682314634323, "epoch": 0.10328908466558531, "grad_norm": 2.244396924972534, "learning_rate": 7.871762496243318e-06, "loss": 0.4603, "mean_token_accuracy": 0.84285439401865, "num_tokens": 40013133.0, "step": 33320 }, { "entropy": 1.868261407315731, "epoch": 0.10332008379063501, "grad_norm": 10.890486717224121, "learning_rate": 7.870581489755905e-06, "loss": 0.4996, "mean_token_accuracy": 0.8372749105095864, "num_tokens": 40025402.0, "step": 33330 }, { "entropy": 1.8567996740341186, "epoch": 0.1033510829156847, "grad_norm": 8.594077110290527, "learning_rate": 7.869401014670937e-06, "loss": 0.4789, "mean_token_accuracy": 0.8362266525626183, "num_tokens": 40038004.0, "step": 33340 }, { "entropy": 1.8234371304512025, "epoch": 0.1033820820407344, "grad_norm": 8.579446792602539, "learning_rate": 7.868221070590013e-06, "loss": 0.4925, "mean_token_accuracy": 0.8359899684786797, "num_tokens": 40051320.0, "step": 33350 }, { "entropy": 1.9455517828464508, "epoch": 0.1034130811657841, "grad_norm": 10.794142723083496, "learning_rate": 7.86704165711516e-06, "loss": 0.5869, "mean_token_accuracy": 0.8300697863101959, "num_tokens": 40063005.0, "step": 33360 }, { "entropy": 1.902688279747963, "epoch": 0.1034440802908338, "grad_norm": 2.90916109085083, "learning_rate": 7.865862773848816e-06, "loss": 0.5366, "mean_token_accuracy": 0.8291781514883041, "num_tokens": 40074960.0, "step": 33370 }, { "entropy": 1.8599769324064255, "epoch": 0.10347507941588349, "grad_norm": 4.288544178009033, "learning_rate": 7.864684420393842e-06, "loss": 0.4894, "mean_token_accuracy": 0.8394434854388237, "num_tokens": 40087590.0, "step": 33380 }, { "entropy": 1.8361037239432334, "epoch": 0.10350607854093319, "grad_norm": 4.094844818115234, "learning_rate": 7.863506596353514e-06, "loss": 0.4848, "mean_token_accuracy": 0.8373026907444, "num_tokens": 40101356.0, "step": 33390 }, { "entropy": 1.8626354187726974, "epoch": 0.10353707766598287, "grad_norm": 9.581649780273438, "learning_rate": 7.862329301331517e-06, "loss": 0.5136, "mean_token_accuracy": 0.8337352395057678, "num_tokens": 40113440.0, "step": 33400 }, { "entropy": 1.9167112335562706, "epoch": 0.10356807679103257, "grad_norm": 12.560930252075195, "learning_rate": 7.86115253493196e-06, "loss": 0.559, "mean_token_accuracy": 0.8210726290941238, "num_tokens": 40125063.0, "step": 33410 }, { "entropy": 1.917654138803482, "epoch": 0.10359907591608226, "grad_norm": 9.083182334899902, "learning_rate": 7.859976296759359e-06, "loss": 0.5644, "mean_token_accuracy": 0.8320072874426842, "num_tokens": 40136790.0, "step": 33420 }, { "entropy": 1.884972706437111, "epoch": 0.10363007504113196, "grad_norm": 9.505833625793457, "learning_rate": 7.858800586418653e-06, "loss": 0.5672, "mean_token_accuracy": 0.8238067865371704, "num_tokens": 40148423.0, "step": 33430 }, { "entropy": 1.8900686636567117, "epoch": 0.10366107416618166, "grad_norm": 9.053071022033691, "learning_rate": 7.857625403515186e-06, "loss": 0.6058, "mean_token_accuracy": 0.8166046008467674, "num_tokens": 40160812.0, "step": 33440 }, { "entropy": 1.8167245775461196, "epoch": 0.10369207329123135, "grad_norm": 10.170108795166016, "learning_rate": 7.856450747654719e-06, "loss": 0.4918, "mean_token_accuracy": 0.8358997702598572, "num_tokens": 40174474.0, "step": 33450 }, { "entropy": 1.8785471424460412, "epoch": 0.10372307241628105, "grad_norm": 10.019652366638184, "learning_rate": 7.855276618443426e-06, "loss": 0.5148, "mean_token_accuracy": 0.8401072070002555, "num_tokens": 40186694.0, "step": 33460 }, { "entropy": 1.8747892037034035, "epoch": 0.10375407154133075, "grad_norm": 9.287095069885254, "learning_rate": 7.854103015487889e-06, "loss": 0.5128, "mean_token_accuracy": 0.8236067086458206, "num_tokens": 40199002.0, "step": 33470 }, { "entropy": 1.8964219346642495, "epoch": 0.10378507066638044, "grad_norm": 8.723387718200684, "learning_rate": 7.852929938395108e-06, "loss": 0.5132, "mean_token_accuracy": 0.8313620582222938, "num_tokens": 40211446.0, "step": 33480 }, { "entropy": 1.8906508162617683, "epoch": 0.10381606979143014, "grad_norm": 9.373579025268555, "learning_rate": 7.85175738677249e-06, "loss": 0.5188, "mean_token_accuracy": 0.8374797239899635, "num_tokens": 40222830.0, "step": 33490 }, { "entropy": 1.9234597817063332, "epoch": 0.10384706891647984, "grad_norm": 7.460415363311768, "learning_rate": 7.85058536022785e-06, "loss": 0.5451, "mean_token_accuracy": 0.8325879499316216, "num_tokens": 40234416.0, "step": 33500 }, { "entropy": 1.9918899089097977, "epoch": 0.10387806804152953, "grad_norm": 8.504613876342773, "learning_rate": 7.849413858369415e-06, "loss": 0.6361, "mean_token_accuracy": 0.8134088665246964, "num_tokens": 40245140.0, "step": 33510 }, { "entropy": 1.9241836652159692, "epoch": 0.10390906716657922, "grad_norm": 8.564291000366211, "learning_rate": 7.848242880805818e-06, "loss": 0.5701, "mean_token_accuracy": 0.8313742846250534, "num_tokens": 40257078.0, "step": 33520 }, { "entropy": 1.8901431530714035, "epoch": 0.10394006629162891, "grad_norm": 4.144465923309326, "learning_rate": 7.847072427146111e-06, "loss": 0.5654, "mean_token_accuracy": 0.8276931583881378, "num_tokens": 40269349.0, "step": 33530 }, { "entropy": 1.887324059009552, "epoch": 0.10397106541667861, "grad_norm": 14.356313705444336, "learning_rate": 7.845902496999739e-06, "loss": 0.5668, "mean_token_accuracy": 0.8178930029273033, "num_tokens": 40281901.0, "step": 33540 }, { "entropy": 1.9168416380882263, "epoch": 0.1040020645417283, "grad_norm": 11.27478313446045, "learning_rate": 7.844733089976564e-06, "loss": 0.5204, "mean_token_accuracy": 0.8334863767027855, "num_tokens": 40293978.0, "step": 33550 }, { "entropy": 1.813339551538229, "epoch": 0.104033063666778, "grad_norm": 8.250176429748535, "learning_rate": 7.843564205686856e-06, "loss": 0.4894, "mean_token_accuracy": 0.838591480255127, "num_tokens": 40306930.0, "step": 33560 }, { "entropy": 1.9164869621396066, "epoch": 0.1040640627918277, "grad_norm": 8.932500839233398, "learning_rate": 7.842395843741287e-06, "loss": 0.5966, "mean_token_accuracy": 0.8232863306999206, "num_tokens": 40318760.0, "step": 33570 }, { "entropy": 1.9515147507190704, "epoch": 0.1040950619168774, "grad_norm": 8.837969779968262, "learning_rate": 7.841228003750933e-06, "loss": 0.6046, "mean_token_accuracy": 0.8223684683442116, "num_tokens": 40330749.0, "step": 33580 }, { "entropy": 1.9510893940925598, "epoch": 0.10412606104192709, "grad_norm": 10.218021392822266, "learning_rate": 7.84006068532728e-06, "loss": 0.5715, "mean_token_accuracy": 0.8213855206966401, "num_tokens": 40342128.0, "step": 33590 }, { "entropy": 1.9364799603819847, "epoch": 0.10415706016697679, "grad_norm": 9.228703498840332, "learning_rate": 7.838893888082218e-06, "loss": 0.5729, "mean_token_accuracy": 0.8253035977482795, "num_tokens": 40353315.0, "step": 33600 }, { "entropy": 1.9496245756745338, "epoch": 0.10418805929202649, "grad_norm": 9.735555648803711, "learning_rate": 7.837727611628043e-06, "loss": 0.5944, "mean_token_accuracy": 0.8161328807473183, "num_tokens": 40365395.0, "step": 33610 }, { "entropy": 1.952901628613472, "epoch": 0.10421905841707618, "grad_norm": 10.927205085754395, "learning_rate": 7.836561855577443e-06, "loss": 0.5448, "mean_token_accuracy": 0.8294717028737069, "num_tokens": 40376818.0, "step": 33620 }, { "entropy": 1.8274588227272033, "epoch": 0.10425005754212588, "grad_norm": 9.007339477539062, "learning_rate": 7.835396619543528e-06, "loss": 0.5046, "mean_token_accuracy": 0.8340933352708817, "num_tokens": 40390071.0, "step": 33630 }, { "entropy": 1.9603443384170531, "epoch": 0.10428105666717558, "grad_norm": 9.797590255737305, "learning_rate": 7.834231903139795e-06, "loss": 0.6205, "mean_token_accuracy": 0.8226620614528656, "num_tokens": 40401203.0, "step": 33640 }, { "entropy": 1.8578498139977455, "epoch": 0.10431205579222526, "grad_norm": 8.355208396911621, "learning_rate": 7.833067705980151e-06, "loss": 0.5405, "mean_token_accuracy": 0.8306270629167557, "num_tokens": 40413367.0, "step": 33650 }, { "entropy": 1.8901911661028863, "epoch": 0.10434305491727495, "grad_norm": 9.124183654785156, "learning_rate": 7.831904027678902e-06, "loss": 0.5805, "mean_token_accuracy": 0.8174741074442864, "num_tokens": 40425631.0, "step": 33660 }, { "entropy": 2.0042662411928176, "epoch": 0.10437405404232465, "grad_norm": 8.978107452392578, "learning_rate": 7.830740867850753e-06, "loss": 0.5935, "mean_token_accuracy": 0.8254029154777527, "num_tokens": 40435603.0, "step": 33670 }, { "entropy": 1.8774489134550094, "epoch": 0.10440505316737435, "grad_norm": 10.732882499694824, "learning_rate": 7.829578226110816e-06, "loss": 0.6055, "mean_token_accuracy": 0.8271615326404571, "num_tokens": 40447665.0, "step": 33680 }, { "entropy": 1.981630663573742, "epoch": 0.10443605229242404, "grad_norm": 11.340205192565918, "learning_rate": 7.828416102074594e-06, "loss": 0.5885, "mean_token_accuracy": 0.8235415056347847, "num_tokens": 40458773.0, "step": 33690 }, { "entropy": 1.8451252445578574, "epoch": 0.10446705141747374, "grad_norm": 11.530259132385254, "learning_rate": 7.827254495357994e-06, "loss": 0.4994, "mean_token_accuracy": 0.8376246988773346, "num_tokens": 40471667.0, "step": 33700 }, { "entropy": 1.950427158176899, "epoch": 0.10449805054252344, "grad_norm": 10.422981262207031, "learning_rate": 7.826093405577326e-06, "loss": 0.5776, "mean_token_accuracy": 0.8280929684638977, "num_tokens": 40483536.0, "step": 33710 }, { "entropy": 1.9090193212032318, "epoch": 0.10452904966757313, "grad_norm": 8.866836547851562, "learning_rate": 7.824932832349292e-06, "loss": 0.5093, "mean_token_accuracy": 0.8405329346656799, "num_tokens": 40494937.0, "step": 33720 }, { "entropy": 1.8627185627818108, "epoch": 0.10456004879262283, "grad_norm": 9.671921730041504, "learning_rate": 7.823772775290993e-06, "loss": 0.5157, "mean_token_accuracy": 0.8325039520859718, "num_tokens": 40507661.0, "step": 33730 }, { "entropy": 1.8910223886370658, "epoch": 0.10459104791767253, "grad_norm": 4.5752458572387695, "learning_rate": 7.822613234019927e-06, "loss": 0.5511, "mean_token_accuracy": 0.8261340275406838, "num_tokens": 40520066.0, "step": 33740 }, { "entropy": 1.9918366223573685, "epoch": 0.10462204704272222, "grad_norm": 9.358546257019043, "learning_rate": 7.821454208153992e-06, "loss": 0.6184, "mean_token_accuracy": 0.8228957876563072, "num_tokens": 40531109.0, "step": 33750 }, { "entropy": 1.9470916539430618, "epoch": 0.10465304616777192, "grad_norm": 10.189216613769531, "learning_rate": 7.82029569731148e-06, "loss": 0.6156, "mean_token_accuracy": 0.8150023117661476, "num_tokens": 40542537.0, "step": 33760 }, { "entropy": 1.9262451082468033, "epoch": 0.1046840452928216, "grad_norm": 8.381092071533203, "learning_rate": 7.819137701111077e-06, "loss": 0.5313, "mean_token_accuracy": 0.8301904648542404, "num_tokens": 40554442.0, "step": 33770 }, { "entropy": 1.910345396399498, "epoch": 0.1047150444178713, "grad_norm": 8.140871047973633, "learning_rate": 7.817980219171866e-06, "loss": 0.5684, "mean_token_accuracy": 0.8225700587034226, "num_tokens": 40566939.0, "step": 33780 }, { "entropy": 1.974452766776085, "epoch": 0.104746043542921, "grad_norm": 7.459367275238037, "learning_rate": 7.816823251113325e-06, "loss": 0.5918, "mean_token_accuracy": 0.8310072511434555, "num_tokens": 40578413.0, "step": 33790 }, { "entropy": 1.9643989235162735, "epoch": 0.1047770426679707, "grad_norm": 9.469351768493652, "learning_rate": 7.815666796555324e-06, "loss": 0.5912, "mean_token_accuracy": 0.8164726868271828, "num_tokens": 40589015.0, "step": 33800 }, { "entropy": 1.7843807175755502, "epoch": 0.10480804179302039, "grad_norm": 9.5103178024292, "learning_rate": 7.814510855118131e-06, "loss": 0.4146, "mean_token_accuracy": 0.8469722703099251, "num_tokens": 40602753.0, "step": 33810 }, { "entropy": 1.786408032476902, "epoch": 0.10483904091807009, "grad_norm": 8.267829895019531, "learning_rate": 7.8133554264224e-06, "loss": 0.4993, "mean_token_accuracy": 0.8310681402683258, "num_tokens": 40616558.0, "step": 33820 }, { "entropy": 1.9000122025609016, "epoch": 0.10487004004311978, "grad_norm": 8.900775909423828, "learning_rate": 7.812200510089185e-06, "loss": 0.5472, "mean_token_accuracy": 0.8183383151888848, "num_tokens": 40629539.0, "step": 33830 }, { "entropy": 1.86521013379097, "epoch": 0.10490103916816948, "grad_norm": 9.411357879638672, "learning_rate": 7.811046105739927e-06, "loss": 0.5011, "mean_token_accuracy": 0.8357746317982674, "num_tokens": 40641726.0, "step": 33840 }, { "entropy": 1.8550844490528107, "epoch": 0.10493203829321918, "grad_norm": 7.637185096740723, "learning_rate": 7.809892212996458e-06, "loss": 0.507, "mean_token_accuracy": 0.836801141500473, "num_tokens": 40654094.0, "step": 33850 }, { "entropy": 1.8382833272218704, "epoch": 0.10496303741826887, "grad_norm": 9.953614234924316, "learning_rate": 7.808738831481007e-06, "loss": 0.5009, "mean_token_accuracy": 0.8312429085373878, "num_tokens": 40667469.0, "step": 33860 }, { "entropy": 1.9534501269459725, "epoch": 0.10499403654331857, "grad_norm": 11.463193893432617, "learning_rate": 7.807585960816184e-06, "loss": 0.567, "mean_token_accuracy": 0.8186778038740158, "num_tokens": 40679414.0, "step": 33870 }, { "entropy": 1.8588482439517975, "epoch": 0.10502503566836827, "grad_norm": 8.321738243103027, "learning_rate": 7.806433600624999e-06, "loss": 0.501, "mean_token_accuracy": 0.8387341931462288, "num_tokens": 40692355.0, "step": 33880 }, { "entropy": 1.949586683511734, "epoch": 0.10505603479341795, "grad_norm": 8.959894180297852, "learning_rate": 7.805281750530844e-06, "loss": 0.5933, "mean_token_accuracy": 0.8129965797066688, "num_tokens": 40703355.0, "step": 33890 }, { "entropy": 1.9277196362614633, "epoch": 0.10508703391846765, "grad_norm": 4.52024507522583, "learning_rate": 7.804130410157503e-06, "loss": 0.536, "mean_token_accuracy": 0.8277270719408989, "num_tokens": 40715053.0, "step": 33900 }, { "entropy": 1.901253941655159, "epoch": 0.10511803304351734, "grad_norm": 9.572426795959473, "learning_rate": 7.802979579129147e-06, "loss": 0.5616, "mean_token_accuracy": 0.8186801239848137, "num_tokens": 40726747.0, "step": 33910 }, { "entropy": 1.9022594541311264, "epoch": 0.10514903216856704, "grad_norm": 9.995318412780762, "learning_rate": 7.801829257070337e-06, "loss": 0.5805, "mean_token_accuracy": 0.8272980287671089, "num_tokens": 40738081.0, "step": 33920 }, { "entropy": 1.8792105168104172, "epoch": 0.10518003129361674, "grad_norm": 5.292135715484619, "learning_rate": 7.800679443606019e-06, "loss": 0.5246, "mean_token_accuracy": 0.8302574872970581, "num_tokens": 40750419.0, "step": 33930 }, { "entropy": 1.8759152442216873, "epoch": 0.10521103041866643, "grad_norm": 4.318228721618652, "learning_rate": 7.799530138361527e-06, "loss": 0.455, "mean_token_accuracy": 0.8474466159939766, "num_tokens": 40762714.0, "step": 33940 }, { "entropy": 1.871192954480648, "epoch": 0.10524202954371613, "grad_norm": 4.8864336013793945, "learning_rate": 7.79838134096258e-06, "loss": 0.4999, "mean_token_accuracy": 0.8384427651762962, "num_tokens": 40775536.0, "step": 33950 }, { "entropy": 1.8584145426750183, "epoch": 0.10527302866876583, "grad_norm": 7.488039493560791, "learning_rate": 7.797233051035284e-06, "loss": 0.5697, "mean_token_accuracy": 0.8298766538500786, "num_tokens": 40789635.0, "step": 33960 }, { "entropy": 1.9506609112024307, "epoch": 0.10530402779381552, "grad_norm": 9.773181915283203, "learning_rate": 7.796085268206132e-06, "loss": 0.5754, "mean_token_accuracy": 0.8298773288726806, "num_tokens": 40800485.0, "step": 33970 }, { "entropy": 1.8901255756616593, "epoch": 0.10533502691886522, "grad_norm": 3.6833689212799072, "learning_rate": 7.794937992102e-06, "loss": 0.5683, "mean_token_accuracy": 0.8308203518390656, "num_tokens": 40812631.0, "step": 33980 }, { "entropy": 1.8272457644343376, "epoch": 0.10536602604391491, "grad_norm": 5.967985153198242, "learning_rate": 7.793791222350145e-06, "loss": 0.5022, "mean_token_accuracy": 0.8280576094985008, "num_tokens": 40825791.0, "step": 33990 }, { "entropy": 1.9373245880007743, "epoch": 0.10539702516896461, "grad_norm": 8.980064392089844, "learning_rate": 7.792644958578212e-06, "loss": 0.585, "mean_token_accuracy": 0.8223650082945824, "num_tokens": 40837496.0, "step": 34000 }, { "entropy": 1.9898645401000976, "epoch": 0.10542802429401431, "grad_norm": 9.691417694091797, "learning_rate": 7.79149920041423e-06, "loss": 0.5725, "mean_token_accuracy": 0.8341428533196449, "num_tokens": 40848939.0, "step": 34010 }, { "entropy": 1.9258566856384278, "epoch": 0.10545902341906399, "grad_norm": 9.314157485961914, "learning_rate": 7.790353947486607e-06, "loss": 0.5529, "mean_token_accuracy": 0.83617302775383, "num_tokens": 40861200.0, "step": 34020 }, { "entropy": 1.9404375448822975, "epoch": 0.10549002254411369, "grad_norm": 9.493596076965332, "learning_rate": 7.789209199424134e-06, "loss": 0.5846, "mean_token_accuracy": 0.8195881083607673, "num_tokens": 40872706.0, "step": 34030 }, { "entropy": 1.8327529534697533, "epoch": 0.10552102166916338, "grad_norm": 9.407135963439941, "learning_rate": 7.788064955855987e-06, "loss": 0.4977, "mean_token_accuracy": 0.8332900017499923, "num_tokens": 40885468.0, "step": 34040 }, { "entropy": 1.8383815258741378, "epoch": 0.10555202079421308, "grad_norm": 9.750924110412598, "learning_rate": 7.78692121641172e-06, "loss": 0.4872, "mean_token_accuracy": 0.830281549692154, "num_tokens": 40898646.0, "step": 34050 }, { "entropy": 1.9488092795014382, "epoch": 0.10558301991926278, "grad_norm": 8.798650741577148, "learning_rate": 7.785777980721267e-06, "loss": 0.5789, "mean_token_accuracy": 0.8310556679964065, "num_tokens": 40909910.0, "step": 34060 }, { "entropy": 1.8724893182516098, "epoch": 0.10561401904431247, "grad_norm": 8.707452774047852, "learning_rate": 7.784635248414945e-06, "loss": 0.5413, "mean_token_accuracy": 0.825086310505867, "num_tokens": 40921391.0, "step": 34070 }, { "entropy": 1.8841823562979698, "epoch": 0.10564501816936217, "grad_norm": 9.368388175964355, "learning_rate": 7.783493019123451e-06, "loss": 0.5391, "mean_token_accuracy": 0.8284827679395675, "num_tokens": 40933106.0, "step": 34080 }, { "entropy": 1.893832103908062, "epoch": 0.10567601729441187, "grad_norm": 9.662880897521973, "learning_rate": 7.78235129247786e-06, "loss": 0.5353, "mean_token_accuracy": 0.8264197260141373, "num_tokens": 40945947.0, "step": 34090 }, { "entropy": 1.959150141477585, "epoch": 0.10570701641946156, "grad_norm": 11.217707633972168, "learning_rate": 7.781210068109623e-06, "loss": 0.6322, "mean_token_accuracy": 0.8149213716387749, "num_tokens": 40957886.0, "step": 34100 }, { "entropy": 1.8911303892731666, "epoch": 0.10573801554451126, "grad_norm": 9.337475776672363, "learning_rate": 7.780069345650573e-06, "loss": 0.5306, "mean_token_accuracy": 0.8183432757854462, "num_tokens": 40969448.0, "step": 34110 }, { "entropy": 1.8776145607233048, "epoch": 0.10576901466956096, "grad_norm": 9.5222749710083, "learning_rate": 7.778929124732918e-06, "loss": 0.5401, "mean_token_accuracy": 0.8276842266321183, "num_tokens": 40981711.0, "step": 34120 }, { "entropy": 1.94348586499691, "epoch": 0.10580001379461065, "grad_norm": 8.874763488769531, "learning_rate": 7.777789404989248e-06, "loss": 0.5776, "mean_token_accuracy": 0.8309635683894158, "num_tokens": 40992978.0, "step": 34130 }, { "entropy": 1.8115389600396157, "epoch": 0.10583101291966034, "grad_norm": 8.382539749145508, "learning_rate": 7.776650186052521e-06, "loss": 0.4918, "mean_token_accuracy": 0.8349054649472236, "num_tokens": 41005680.0, "step": 34140 }, { "entropy": 1.8448505356907845, "epoch": 0.10586201204471003, "grad_norm": 4.7137451171875, "learning_rate": 7.77551146755608e-06, "loss": 0.4434, "mean_token_accuracy": 0.8491442829370499, "num_tokens": 41017977.0, "step": 34150 }, { "entropy": 1.8678378522396089, "epoch": 0.10589301116975973, "grad_norm": 8.694931983947754, "learning_rate": 7.774373249133641e-06, "loss": 0.5733, "mean_token_accuracy": 0.8227035477757454, "num_tokens": 41029814.0, "step": 34160 }, { "entropy": 1.9137464344501496, "epoch": 0.10592401029480943, "grad_norm": 8.952034950256348, "learning_rate": 7.773235530419292e-06, "loss": 0.558, "mean_token_accuracy": 0.8287974938750267, "num_tokens": 41041332.0, "step": 34170 }, { "entropy": 1.872650384902954, "epoch": 0.10595500941985912, "grad_norm": 9.672776222229004, "learning_rate": 7.7720983110475e-06, "loss": 0.5314, "mean_token_accuracy": 0.8209738954901695, "num_tokens": 41054678.0, "step": 34180 }, { "entropy": 1.8565445743501185, "epoch": 0.10598600854490882, "grad_norm": 2.5345826148986816, "learning_rate": 7.770961590653102e-06, "loss": 0.5304, "mean_token_accuracy": 0.8245491355657577, "num_tokens": 41067439.0, "step": 34190 }, { "entropy": 1.997004970908165, "epoch": 0.10601700766995852, "grad_norm": 10.320369720458984, "learning_rate": 7.769825368871312e-06, "loss": 0.5853, "mean_token_accuracy": 0.8295348644256592, "num_tokens": 41077968.0, "step": 34200 }, { "entropy": 1.916537807881832, "epoch": 0.10604800679500821, "grad_norm": 9.567767143249512, "learning_rate": 7.76868964533772e-06, "loss": 0.559, "mean_token_accuracy": 0.8223750725388527, "num_tokens": 41089515.0, "step": 34210 }, { "entropy": 1.8700389847159387, "epoch": 0.10607900592005791, "grad_norm": 8.427227020263672, "learning_rate": 7.767554419688279e-06, "loss": 0.5372, "mean_token_accuracy": 0.8274742797017097, "num_tokens": 41103388.0, "step": 34220 }, { "entropy": 1.920013178884983, "epoch": 0.1061100050451076, "grad_norm": 9.945453643798828, "learning_rate": 7.766419691559324e-06, "loss": 0.5577, "mean_token_accuracy": 0.8253174960613251, "num_tokens": 41116163.0, "step": 34230 }, { "entropy": 1.8836813405156136, "epoch": 0.1061410041701573, "grad_norm": 7.964671611785889, "learning_rate": 7.765285460587557e-06, "loss": 0.4761, "mean_token_accuracy": 0.8437875971198082, "num_tokens": 41128197.0, "step": 34240 }, { "entropy": 1.8750670045614242, "epoch": 0.106172003295207, "grad_norm": 8.137727737426758, "learning_rate": 7.764151726410055e-06, "loss": 0.5366, "mean_token_accuracy": 0.8270835474133491, "num_tokens": 41140601.0, "step": 34250 }, { "entropy": 1.9073902159929275, "epoch": 0.10620300242025668, "grad_norm": 8.645315170288086, "learning_rate": 7.76301848866426e-06, "loss": 0.5461, "mean_token_accuracy": 0.8317181885242462, "num_tokens": 41152349.0, "step": 34260 }, { "entropy": 1.9043108850717545, "epoch": 0.10623400154530638, "grad_norm": 10.44301700592041, "learning_rate": 7.761885746987988e-06, "loss": 0.5454, "mean_token_accuracy": 0.8260853886604309, "num_tokens": 41164311.0, "step": 34270 }, { "entropy": 1.791078907251358, "epoch": 0.10626500067035607, "grad_norm": 8.425952911376953, "learning_rate": 7.760753501019428e-06, "loss": 0.46, "mean_token_accuracy": 0.8489831522107124, "num_tokens": 41178238.0, "step": 34280 }, { "entropy": 1.8554448202252387, "epoch": 0.10629599979540577, "grad_norm": 8.66896915435791, "learning_rate": 7.759621750397129e-06, "loss": 0.5169, "mean_token_accuracy": 0.8302797332406044, "num_tokens": 41191021.0, "step": 34290 }, { "entropy": 1.9642982304096221, "epoch": 0.10632699892045547, "grad_norm": 9.48714828491211, "learning_rate": 7.758490494760018e-06, "loss": 0.5871, "mean_token_accuracy": 0.8185561686754227, "num_tokens": 41202431.0, "step": 34300 }, { "entropy": 1.8738651275634766, "epoch": 0.10635799804550516, "grad_norm": 4.17008113861084, "learning_rate": 7.757359733747389e-06, "loss": 0.5411, "mean_token_accuracy": 0.830881142616272, "num_tokens": 41215618.0, "step": 34310 }, { "entropy": 1.993169930577278, "epoch": 0.10638899717055486, "grad_norm": 8.395801544189453, "learning_rate": 7.756229466998896e-06, "loss": 0.6001, "mean_token_accuracy": 0.8117834225296974, "num_tokens": 41226702.0, "step": 34320 }, { "entropy": 1.8517363399267197, "epoch": 0.10641999629560456, "grad_norm": 8.656838417053223, "learning_rate": 7.755099694154571e-06, "loss": 0.5783, "mean_token_accuracy": 0.8280891165137291, "num_tokens": 41239449.0, "step": 34330 }, { "entropy": 1.9847760811448096, "epoch": 0.10645099542065425, "grad_norm": 8.594053268432617, "learning_rate": 7.753970414854808e-06, "loss": 0.6134, "mean_token_accuracy": 0.806470163166523, "num_tokens": 41250602.0, "step": 34340 }, { "entropy": 1.9564678460359572, "epoch": 0.10648199454570395, "grad_norm": 12.032402992248535, "learning_rate": 7.752841628740366e-06, "loss": 0.6145, "mean_token_accuracy": 0.8159340843558311, "num_tokens": 41262285.0, "step": 34350 }, { "entropy": 1.9261070042848587, "epoch": 0.10651299367075365, "grad_norm": 4.049191951751709, "learning_rate": 7.751713335452372e-06, "loss": 0.5818, "mean_token_accuracy": 0.8276063561439514, "num_tokens": 41273764.0, "step": 34360 }, { "entropy": 1.879560787975788, "epoch": 0.10654399279580334, "grad_norm": 8.463150024414062, "learning_rate": 7.750585534632318e-06, "loss": 0.525, "mean_token_accuracy": 0.8304709896445275, "num_tokens": 41285963.0, "step": 34370 }, { "entropy": 1.8946872353553772, "epoch": 0.10657499192085304, "grad_norm": 3.9842400550842285, "learning_rate": 7.74945822592206e-06, "loss": 0.5734, "mean_token_accuracy": 0.8228972956538201, "num_tokens": 41297942.0, "step": 34380 }, { "entropy": 1.9021208494901658, "epoch": 0.10660599104590272, "grad_norm": 8.44839096069336, "learning_rate": 7.748331408963822e-06, "loss": 0.5639, "mean_token_accuracy": 0.8181784346699714, "num_tokens": 41309011.0, "step": 34390 }, { "entropy": 1.9210452124476434, "epoch": 0.10663699017095242, "grad_norm": 7.22066593170166, "learning_rate": 7.747205083400192e-06, "loss": 0.5812, "mean_token_accuracy": 0.8235123857855797, "num_tokens": 41320533.0, "step": 34400 }, { "entropy": 1.8867201492190362, "epoch": 0.10666798929600212, "grad_norm": 5.259980201721191, "learning_rate": 7.746079248874114e-06, "loss": 0.577, "mean_token_accuracy": 0.8186955004930496, "num_tokens": 41333184.0, "step": 34410 }, { "entropy": 1.8663445100188256, "epoch": 0.10669898842105181, "grad_norm": 10.16250228881836, "learning_rate": 7.7449539050289e-06, "loss": 0.5772, "mean_token_accuracy": 0.8328115671873093, "num_tokens": 41344781.0, "step": 34420 }, { "entropy": 1.8742147445678712, "epoch": 0.10672998754610151, "grad_norm": 8.090653419494629, "learning_rate": 7.743829051508229e-06, "loss": 0.5541, "mean_token_accuracy": 0.8289492219686508, "num_tokens": 41356329.0, "step": 34430 }, { "entropy": 1.8659478917717933, "epoch": 0.1067609866711512, "grad_norm": 11.673904418945312, "learning_rate": 7.742704687956137e-06, "loss": 0.5138, "mean_token_accuracy": 0.8437393367290497, "num_tokens": 41368494.0, "step": 34440 }, { "entropy": 1.884896233677864, "epoch": 0.1067919857962009, "grad_norm": 8.397565841674805, "learning_rate": 7.741580814017023e-06, "loss": 0.5282, "mean_token_accuracy": 0.830049929022789, "num_tokens": 41380843.0, "step": 34450 }, { "entropy": 1.8628281027078628, "epoch": 0.1068229849212506, "grad_norm": 9.932318687438965, "learning_rate": 7.740457429335646e-06, "loss": 0.5343, "mean_token_accuracy": 0.8300712257623672, "num_tokens": 41393082.0, "step": 34460 }, { "entropy": 1.8951723709702493, "epoch": 0.1068539840463003, "grad_norm": 9.382803916931152, "learning_rate": 7.739334533557126e-06, "loss": 0.5893, "mean_token_accuracy": 0.8235040470957756, "num_tokens": 41404878.0, "step": 34470 }, { "entropy": 1.8519971266388893, "epoch": 0.10688498317134999, "grad_norm": 9.738779067993164, "learning_rate": 7.738212126326949e-06, "loss": 0.5648, "mean_token_accuracy": 0.8380891785025597, "num_tokens": 41416588.0, "step": 34480 }, { "entropy": 1.9151021018624306, "epoch": 0.10691598229639969, "grad_norm": 9.413227081298828, "learning_rate": 7.73709020729095e-06, "loss": 0.655, "mean_token_accuracy": 0.8125704079866409, "num_tokens": 41428429.0, "step": 34490 }, { "entropy": 1.9181360185146332, "epoch": 0.10694698142144939, "grad_norm": 8.843558311462402, "learning_rate": 7.735968776095331e-06, "loss": 0.5688, "mean_token_accuracy": 0.8319054901599884, "num_tokens": 41440103.0, "step": 34500 }, { "entropy": 1.8450947090983392, "epoch": 0.10697798054649907, "grad_norm": 9.410344123840332, "learning_rate": 7.734847832386653e-06, "loss": 0.5725, "mean_token_accuracy": 0.8249197214841842, "num_tokens": 41452461.0, "step": 34510 }, { "entropy": 1.8387939289212227, "epoch": 0.10700897967154877, "grad_norm": 3.964416980743408, "learning_rate": 7.73372737581183e-06, "loss": 0.5444, "mean_token_accuracy": 0.8225889384746552, "num_tokens": 41464924.0, "step": 34520 }, { "entropy": 1.929051786661148, "epoch": 0.10703997879659846, "grad_norm": 12.12761402130127, "learning_rate": 7.73260740601814e-06, "loss": 0.5192, "mean_token_accuracy": 0.8324173107743263, "num_tokens": 41475753.0, "step": 34530 }, { "entropy": 1.9498266860842706, "epoch": 0.10707097792164816, "grad_norm": 3.5817205905914307, "learning_rate": 7.731487922653216e-06, "loss": 0.6098, "mean_token_accuracy": 0.8192368969321251, "num_tokens": 41486719.0, "step": 34540 }, { "entropy": 1.7467003166675568, "epoch": 0.10710197704669785, "grad_norm": 4.695430278778076, "learning_rate": 7.730368925365049e-06, "loss": 0.4131, "mean_token_accuracy": 0.8428031265735626, "num_tokens": 41500593.0, "step": 34550 }, { "entropy": 1.9088023439049722, "epoch": 0.10713297617174755, "grad_norm": 9.617088317871094, "learning_rate": 7.72925041380198e-06, "loss": 0.5735, "mean_token_accuracy": 0.8277084872126579, "num_tokens": 41512514.0, "step": 34560 }, { "entropy": 1.9499268174171447, "epoch": 0.10716397529679725, "grad_norm": 9.283459663391113, "learning_rate": 7.728132387612718e-06, "loss": 0.6333, "mean_token_accuracy": 0.8034616574645043, "num_tokens": 41523613.0, "step": 34570 }, { "entropy": 1.857503816485405, "epoch": 0.10719497442184694, "grad_norm": 9.62498950958252, "learning_rate": 7.727014846446315e-06, "loss": 0.5637, "mean_token_accuracy": 0.8330330818891525, "num_tokens": 41536222.0, "step": 34580 }, { "entropy": 1.927737507224083, "epoch": 0.10722597354689664, "grad_norm": 7.827761173248291, "learning_rate": 7.72589778995219e-06, "loss": 0.6017, "mean_token_accuracy": 0.8250738069415092, "num_tokens": 41547116.0, "step": 34590 }, { "entropy": 1.894035741686821, "epoch": 0.10725697267194634, "grad_norm": 7.857428073883057, "learning_rate": 7.724781217780106e-06, "loss": 0.546, "mean_token_accuracy": 0.8303033024072647, "num_tokens": 41558322.0, "step": 34600 }, { "entropy": 1.8952913254499435, "epoch": 0.10728797179699603, "grad_norm": 7.348844051361084, "learning_rate": 7.723665129580187e-06, "loss": 0.6053, "mean_token_accuracy": 0.8175556018948555, "num_tokens": 41569188.0, "step": 34610 }, { "entropy": 1.8319330915808678, "epoch": 0.10731897092204573, "grad_norm": 4.2293901443481445, "learning_rate": 7.72254952500291e-06, "loss": 0.5056, "mean_token_accuracy": 0.8246565207839012, "num_tokens": 41582070.0, "step": 34620 }, { "entropy": 1.9387992650270462, "epoch": 0.10734997004709541, "grad_norm": 9.970220565795898, "learning_rate": 7.721434403699101e-06, "loss": 0.6521, "mean_token_accuracy": 0.8120075181126595, "num_tokens": 41593014.0, "step": 34630 }, { "entropy": 1.781299701333046, "epoch": 0.10738096917214511, "grad_norm": 4.431691646575928, "learning_rate": 7.720319765319946e-06, "loss": 0.4936, "mean_token_accuracy": 0.8438036203384399, "num_tokens": 41606201.0, "step": 34640 }, { "entropy": 1.902137640118599, "epoch": 0.10741196829719481, "grad_norm": 8.883857727050781, "learning_rate": 7.719205609516975e-06, "loss": 0.6318, "mean_token_accuracy": 0.8142830148339272, "num_tokens": 41616952.0, "step": 34650 }, { "entropy": 1.7906480133533478, "epoch": 0.1074429674222445, "grad_norm": 8.499350547790527, "learning_rate": 7.718091935942078e-06, "loss": 0.5041, "mean_token_accuracy": 0.8440255209803581, "num_tokens": 41629686.0, "step": 34660 }, { "entropy": 1.8703452154994011, "epoch": 0.1074739665472942, "grad_norm": 9.963065147399902, "learning_rate": 7.71697874424749e-06, "loss": 0.5693, "mean_token_accuracy": 0.8165108144283295, "num_tokens": 41641553.0, "step": 34670 }, { "entropy": 1.842675694823265, "epoch": 0.1075049656723439, "grad_norm": 7.852229118347168, "learning_rate": 7.7158660340858e-06, "loss": 0.5359, "mean_token_accuracy": 0.8368835672736168, "num_tokens": 41653222.0, "step": 34680 }, { "entropy": 1.8542825773358345, "epoch": 0.1075359647973936, "grad_norm": 8.768325805664062, "learning_rate": 7.71475380510995e-06, "loss": 0.5247, "mean_token_accuracy": 0.8340888366103172, "num_tokens": 41665255.0, "step": 34690 }, { "entropy": 1.796558152139187, "epoch": 0.10756696392244329, "grad_norm": 9.535799980163574, "learning_rate": 7.713642056973227e-06, "loss": 0.5226, "mean_token_accuracy": 0.8312591359019279, "num_tokens": 41677579.0, "step": 34700 }, { "entropy": 1.8598329350352287, "epoch": 0.10759796304749299, "grad_norm": 9.506328582763672, "learning_rate": 7.71253078932927e-06, "loss": 0.5596, "mean_token_accuracy": 0.8224905788898468, "num_tokens": 41688761.0, "step": 34710 }, { "entropy": 1.8570543482899666, "epoch": 0.10762896217254268, "grad_norm": 8.487010955810547, "learning_rate": 7.711420001832066e-06, "loss": 0.5343, "mean_token_accuracy": 0.8308738321065903, "num_tokens": 41700400.0, "step": 34720 }, { "entropy": 1.8241771847009658, "epoch": 0.10765996129759238, "grad_norm": 8.443219184875488, "learning_rate": 7.710309694135956e-06, "loss": 0.4889, "mean_token_accuracy": 0.8372260481119156, "num_tokens": 41712818.0, "step": 34730 }, { "entropy": 1.8751182019710542, "epoch": 0.10769096042264208, "grad_norm": 10.146086692810059, "learning_rate": 7.709199865895622e-06, "loss": 0.5334, "mean_token_accuracy": 0.8366193175315857, "num_tokens": 41724064.0, "step": 34740 }, { "entropy": 1.8233239591121673, "epoch": 0.10772195954769177, "grad_norm": 10.28363037109375, "learning_rate": 7.708090516766096e-06, "loss": 0.5517, "mean_token_accuracy": 0.8335344001650811, "num_tokens": 41736368.0, "step": 34750 }, { "entropy": 1.8685315743088722, "epoch": 0.10775295867274146, "grad_norm": 4.088825702667236, "learning_rate": 7.706981646402762e-06, "loss": 0.5963, "mean_token_accuracy": 0.8252598002552987, "num_tokens": 41747379.0, "step": 34760 }, { "entropy": 1.8391776710748673, "epoch": 0.10778395779779115, "grad_norm": 9.287007331848145, "learning_rate": 7.705873254461345e-06, "loss": 0.5292, "mean_token_accuracy": 0.8283318698406219, "num_tokens": 41759137.0, "step": 34770 }, { "entropy": 1.825559838116169, "epoch": 0.10781495692284085, "grad_norm": 8.258102416992188, "learning_rate": 7.704765340597917e-06, "loss": 0.5032, "mean_token_accuracy": 0.8404336258769035, "num_tokens": 41771923.0, "step": 34780 }, { "entropy": 1.876462672650814, "epoch": 0.10784595604789055, "grad_norm": 9.638961791992188, "learning_rate": 7.703657904468902e-06, "loss": 0.5253, "mean_token_accuracy": 0.8340917885303497, "num_tokens": 41784775.0, "step": 34790 }, { "entropy": 1.7784705072641374, "epoch": 0.10787695517294024, "grad_norm": 8.940130233764648, "learning_rate": 7.702550945731066e-06, "loss": 0.5115, "mean_token_accuracy": 0.831823606789112, "num_tokens": 41797943.0, "step": 34800 }, { "entropy": 1.882892769575119, "epoch": 0.10790795429798994, "grad_norm": 10.979787826538086, "learning_rate": 7.701444464041514e-06, "loss": 0.5337, "mean_token_accuracy": 0.827871498465538, "num_tokens": 41810184.0, "step": 34810 }, { "entropy": 1.912169161438942, "epoch": 0.10793895342303964, "grad_norm": 8.458621978759766, "learning_rate": 7.700338459057705e-06, "loss": 0.5786, "mean_token_accuracy": 0.8240523219108582, "num_tokens": 41821501.0, "step": 34820 }, { "entropy": 1.8121132165193559, "epoch": 0.10796995254808933, "grad_norm": 4.544675350189209, "learning_rate": 7.699232930437439e-06, "loss": 0.5187, "mean_token_accuracy": 0.8329327836632728, "num_tokens": 41834757.0, "step": 34830 }, { "entropy": 1.811763161420822, "epoch": 0.10800095167313903, "grad_norm": 5.41200590133667, "learning_rate": 7.698127877838858e-06, "loss": 0.4817, "mean_token_accuracy": 0.8376516059041024, "num_tokens": 41847967.0, "step": 34840 }, { "entropy": 1.9670744597911836, "epoch": 0.10803195079818873, "grad_norm": 8.704374313354492, "learning_rate": 7.69702330092045e-06, "loss": 0.5992, "mean_token_accuracy": 0.8268567532300949, "num_tokens": 41859345.0, "step": 34850 }, { "entropy": 1.9194496154785157, "epoch": 0.10806294992323842, "grad_norm": 4.117202281951904, "learning_rate": 7.695919199341043e-06, "loss": 0.5799, "mean_token_accuracy": 0.8342431426048279, "num_tokens": 41870624.0, "step": 34860 }, { "entropy": 1.8487198695540428, "epoch": 0.10809394904828812, "grad_norm": 4.074057102203369, "learning_rate": 7.694815572759812e-06, "loss": 0.4905, "mean_token_accuracy": 0.8297583907842636, "num_tokens": 41882966.0, "step": 34870 }, { "entropy": 1.8381000190973282, "epoch": 0.1081249481733378, "grad_norm": 7.816097736358643, "learning_rate": 7.693712420836265e-06, "loss": 0.4824, "mean_token_accuracy": 0.8306377053260803, "num_tokens": 41895795.0, "step": 34880 }, { "entropy": 1.9027506560087204, "epoch": 0.1081559472983875, "grad_norm": 8.619202613830566, "learning_rate": 7.692609743230265e-06, "loss": 0.5376, "mean_token_accuracy": 0.8351558074355125, "num_tokens": 41907196.0, "step": 34890 }, { "entropy": 1.937445905804634, "epoch": 0.1081869464234372, "grad_norm": 8.807971000671387, "learning_rate": 7.691507539602005e-06, "loss": 0.6096, "mean_token_accuracy": 0.8202361524105072, "num_tokens": 41918599.0, "step": 34900 }, { "entropy": 1.8574762254953385, "epoch": 0.10821794554848689, "grad_norm": 8.557307243347168, "learning_rate": 7.690405809612025e-06, "loss": 0.4873, "mean_token_accuracy": 0.8375605642795563, "num_tokens": 41930689.0, "step": 34910 }, { "entropy": 1.9088428899645806, "epoch": 0.10824894467353659, "grad_norm": 8.38049602508545, "learning_rate": 7.689304552921199e-06, "loss": 0.5719, "mean_token_accuracy": 0.8217715606093406, "num_tokens": 41941970.0, "step": 34920 }, { "entropy": 1.8612014800310135, "epoch": 0.10827994379858628, "grad_norm": 10.078642845153809, "learning_rate": 7.688203769190748e-06, "loss": 0.5714, "mean_token_accuracy": 0.8198879033327102, "num_tokens": 41954804.0, "step": 34930 }, { "entropy": 1.8768370226025581, "epoch": 0.10831094292363598, "grad_norm": 10.8411865234375, "learning_rate": 7.687103458082228e-06, "loss": 0.534, "mean_token_accuracy": 0.8370490476489068, "num_tokens": 41965644.0, "step": 34940 }, { "entropy": 1.8967427670955659, "epoch": 0.10834194204868568, "grad_norm": 9.28530502319336, "learning_rate": 7.686003619257535e-06, "loss": 0.6235, "mean_token_accuracy": 0.8072300642728806, "num_tokens": 41977403.0, "step": 34950 }, { "entropy": 1.8946552872657776, "epoch": 0.10837294117373537, "grad_norm": 4.223282337188721, "learning_rate": 7.684904252378904e-06, "loss": 0.6063, "mean_token_accuracy": 0.8308607801795006, "num_tokens": 41989197.0, "step": 34960 }, { "entropy": 1.9117924958467483, "epoch": 0.10840394029878507, "grad_norm": 12.681775093078613, "learning_rate": 7.683805357108907e-06, "loss": 0.5557, "mean_token_accuracy": 0.823358316719532, "num_tokens": 42000840.0, "step": 34970 }, { "entropy": 1.8810236111283303, "epoch": 0.10843493942383477, "grad_norm": 8.788151741027832, "learning_rate": 7.682706933110456e-06, "loss": 0.5739, "mean_token_accuracy": 0.8217835262417793, "num_tokens": 42013194.0, "step": 34980 }, { "entropy": 1.8799246475100517, "epoch": 0.10846593854888446, "grad_norm": 8.282735824584961, "learning_rate": 7.681608980046798e-06, "loss": 0.5044, "mean_token_accuracy": 0.8285661846399307, "num_tokens": 42025091.0, "step": 34990 }, { "entropy": 1.8529492557048797, "epoch": 0.10849693767393416, "grad_norm": 8.723884582519531, "learning_rate": 7.680511497581516e-06, "loss": 0.4907, "mean_token_accuracy": 0.8336036711931228, "num_tokens": 42038406.0, "step": 35000 }, { "entropy": 1.8494236335158347, "epoch": 0.10852793679898384, "grad_norm": 4.324466705322266, "learning_rate": 7.67941448537853e-06, "loss": 0.5534, "mean_token_accuracy": 0.8200669184327125, "num_tokens": 42051116.0, "step": 35010 }, { "entropy": 1.9139706775546075, "epoch": 0.10855893592403354, "grad_norm": 8.67935562133789, "learning_rate": 7.6783179431021e-06, "loss": 0.5685, "mean_token_accuracy": 0.8277176558971405, "num_tokens": 42063002.0, "step": 35020 }, { "entropy": 1.8269087105989457, "epoch": 0.10858993504908324, "grad_norm": 9.06470775604248, "learning_rate": 7.677221870416817e-06, "loss": 0.4758, "mean_token_accuracy": 0.8408965677022934, "num_tokens": 42075421.0, "step": 35030 }, { "entropy": 1.862933087348938, "epoch": 0.10862093417413293, "grad_norm": 9.062261581420898, "learning_rate": 7.676126266987606e-06, "loss": 0.5405, "mean_token_accuracy": 0.8264860972762108, "num_tokens": 42087233.0, "step": 35040 }, { "entropy": 1.874533024430275, "epoch": 0.10865193329918263, "grad_norm": 8.091785430908203, "learning_rate": 7.67503113247973e-06, "loss": 0.4997, "mean_token_accuracy": 0.8257019862532615, "num_tokens": 42099681.0, "step": 35050 }, { "entropy": 1.956665936112404, "epoch": 0.10868293242423233, "grad_norm": 8.70102310180664, "learning_rate": 7.673936466558786e-06, "loss": 0.6122, "mean_token_accuracy": 0.8198647305369378, "num_tokens": 42110874.0, "step": 35060 }, { "entropy": 1.912957863509655, "epoch": 0.10871393154928202, "grad_norm": 9.428136825561523, "learning_rate": 7.672842268890703e-06, "loss": 0.6027, "mean_token_accuracy": 0.8237558603286743, "num_tokens": 42122704.0, "step": 35070 }, { "entropy": 1.9081886559724808, "epoch": 0.10874493067433172, "grad_norm": 9.511113166809082, "learning_rate": 7.671748539141744e-06, "loss": 0.6123, "mean_token_accuracy": 0.8153191924095153, "num_tokens": 42135165.0, "step": 35080 }, { "entropy": 1.8714108362793922, "epoch": 0.10877592979938142, "grad_norm": 9.29901123046875, "learning_rate": 7.670655276978506e-06, "loss": 0.5503, "mean_token_accuracy": 0.8311486825346946, "num_tokens": 42147431.0, "step": 35090 }, { "entropy": 1.8901761874556542, "epoch": 0.10880692892443111, "grad_norm": 8.597390174865723, "learning_rate": 7.669562482067915e-06, "loss": 0.5282, "mean_token_accuracy": 0.8393839746713638, "num_tokens": 42159935.0, "step": 35100 }, { "entropy": 1.8835955768823625, "epoch": 0.10883792804948081, "grad_norm": 8.558197021484375, "learning_rate": 7.668470154077237e-06, "loss": 0.594, "mean_token_accuracy": 0.8140904232859612, "num_tokens": 42172790.0, "step": 35110 }, { "entropy": 1.9337810754776001, "epoch": 0.1088689271745305, "grad_norm": 8.871868133544922, "learning_rate": 7.667378292674056e-06, "loss": 0.5931, "mean_token_accuracy": 0.8145381569862366, "num_tokens": 42184215.0, "step": 35120 }, { "entropy": 1.8214157864451408, "epoch": 0.10889992629958019, "grad_norm": 4.693521499633789, "learning_rate": 7.666286897526304e-06, "loss": 0.4763, "mean_token_accuracy": 0.8404714584350585, "num_tokens": 42197411.0, "step": 35130 }, { "entropy": 1.8844437003135681, "epoch": 0.10893092542462988, "grad_norm": 9.958372116088867, "learning_rate": 7.66519596830223e-06, "loss": 0.5385, "mean_token_accuracy": 0.822202742099762, "num_tokens": 42209406.0, "step": 35140 }, { "entropy": 1.9353954821825028, "epoch": 0.10896192454967958, "grad_norm": 9.83383560180664, "learning_rate": 7.664105504670418e-06, "loss": 0.5952, "mean_token_accuracy": 0.8054858520627022, "num_tokens": 42221328.0, "step": 35150 }, { "entropy": 1.7801556140184402, "epoch": 0.10899292367472928, "grad_norm": 5.5207061767578125, "learning_rate": 7.663015506299786e-06, "loss": 0.4345, "mean_token_accuracy": 0.844315542280674, "num_tokens": 42234769.0, "step": 35160 }, { "entropy": 1.862704548239708, "epoch": 0.10902392279977897, "grad_norm": 7.391566276550293, "learning_rate": 7.661925972859575e-06, "loss": 0.4737, "mean_token_accuracy": 0.8368149012327194, "num_tokens": 42247060.0, "step": 35170 }, { "entropy": 1.9155489355325699, "epoch": 0.10905492192482867, "grad_norm": 4.612364768981934, "learning_rate": 7.66083690401936e-06, "loss": 0.5633, "mean_token_accuracy": 0.8262100204825401, "num_tokens": 42258503.0, "step": 35180 }, { "entropy": 1.9643469721078872, "epoch": 0.10908592104987837, "grad_norm": 9.24181079864502, "learning_rate": 7.659748299449044e-06, "loss": 0.6368, "mean_token_accuracy": 0.8126005738973617, "num_tokens": 42269513.0, "step": 35190 }, { "entropy": 1.8432982012629509, "epoch": 0.10911692017492806, "grad_norm": 9.919602394104004, "learning_rate": 7.658660158818853e-06, "loss": 0.5025, "mean_token_accuracy": 0.8412719219923019, "num_tokens": 42281589.0, "step": 35200 }, { "entropy": 1.8541820272803307, "epoch": 0.10914791929997776, "grad_norm": 4.436375617980957, "learning_rate": 7.657572481799348e-06, "loss": 0.5486, "mean_token_accuracy": 0.8266458585858345, "num_tokens": 42293563.0, "step": 35210 }, { "entropy": 1.8942555904388427, "epoch": 0.10917891842502746, "grad_norm": 10.810229301452637, "learning_rate": 7.656485268061414e-06, "loss": 0.5745, "mean_token_accuracy": 0.8252988263964653, "num_tokens": 42305355.0, "step": 35220 }, { "entropy": 1.8540264323353768, "epoch": 0.10920991755007715, "grad_norm": 8.089149475097656, "learning_rate": 7.655398517276262e-06, "loss": 0.5059, "mean_token_accuracy": 0.8406173124909401, "num_tokens": 42317042.0, "step": 35230 }, { "entropy": 1.831780730187893, "epoch": 0.10924091667512685, "grad_norm": 10.136151313781738, "learning_rate": 7.654312229115433e-06, "loss": 0.4951, "mean_token_accuracy": 0.8417727887630463, "num_tokens": 42329730.0, "step": 35240 }, { "entropy": 1.8968460232019424, "epoch": 0.10927191580017653, "grad_norm": 10.715241432189941, "learning_rate": 7.65322640325079e-06, "loss": 0.6043, "mean_token_accuracy": 0.8156336963176727, "num_tokens": 42341271.0, "step": 35250 }, { "entropy": 1.8117874696850778, "epoch": 0.10930291492522623, "grad_norm": 8.665305137634277, "learning_rate": 7.652141039354524e-06, "loss": 0.4908, "mean_token_accuracy": 0.8276491865515709, "num_tokens": 42354850.0, "step": 35260 }, { "entropy": 1.85145965218544, "epoch": 0.10933391405027593, "grad_norm": 10.74636173248291, "learning_rate": 7.651056137099154e-06, "loss": 0.512, "mean_token_accuracy": 0.8369231000542641, "num_tokens": 42367039.0, "step": 35270 }, { "entropy": 1.818227480351925, "epoch": 0.10936491317532562, "grad_norm": 10.717218399047852, "learning_rate": 7.649971696157518e-06, "loss": 0.4993, "mean_token_accuracy": 0.8339720323681832, "num_tokens": 42380069.0, "step": 35280 }, { "entropy": 1.8332565039396287, "epoch": 0.10939591230037532, "grad_norm": 8.914962768554688, "learning_rate": 7.648887716202781e-06, "loss": 0.503, "mean_token_accuracy": 0.8378045499324799, "num_tokens": 42392532.0, "step": 35290 }, { "entropy": 1.8315134570002556, "epoch": 0.10942691142542502, "grad_norm": 8.588201522827148, "learning_rate": 7.647804196908435e-06, "loss": 0.4782, "mean_token_accuracy": 0.8481560617685318, "num_tokens": 42404545.0, "step": 35300 }, { "entropy": 1.9187032520771026, "epoch": 0.10945791055047471, "grad_norm": 7.530204772949219, "learning_rate": 7.646721137948292e-06, "loss": 0.6584, "mean_token_accuracy": 0.8151246398687363, "num_tokens": 42416517.0, "step": 35310 }, { "entropy": 1.8617155611515046, "epoch": 0.10948890967552441, "grad_norm": 9.000489234924316, "learning_rate": 7.64563853899649e-06, "loss": 0.5521, "mean_token_accuracy": 0.837635052204132, "num_tokens": 42429370.0, "step": 35320 }, { "entropy": 1.9121372044086455, "epoch": 0.1095199088005741, "grad_norm": 11.23980712890625, "learning_rate": 7.644556399727486e-06, "loss": 0.5493, "mean_token_accuracy": 0.8292974248528481, "num_tokens": 42440341.0, "step": 35330 }, { "entropy": 1.9182278335094451, "epoch": 0.1095509079256238, "grad_norm": 10.349918365478516, "learning_rate": 7.643474719816064e-06, "loss": 0.5964, "mean_token_accuracy": 0.8224563241004944, "num_tokens": 42451755.0, "step": 35340 }, { "entropy": 1.9242574140429496, "epoch": 0.1095819070506735, "grad_norm": 8.09676742553711, "learning_rate": 7.642393498937326e-06, "loss": 0.5653, "mean_token_accuracy": 0.8311538890004158, "num_tokens": 42463039.0, "step": 35350 }, { "entropy": 1.9706586122512817, "epoch": 0.1096129061757232, "grad_norm": 10.670557022094727, "learning_rate": 7.6413127367667e-06, "loss": 0.6166, "mean_token_accuracy": 0.817748686671257, "num_tokens": 42473647.0, "step": 35360 }, { "entropy": 1.8795671597123147, "epoch": 0.10964390530077289, "grad_norm": 4.104644775390625, "learning_rate": 7.640232432979932e-06, "loss": 0.5604, "mean_token_accuracy": 0.8274060249328613, "num_tokens": 42485582.0, "step": 35370 }, { "entropy": 1.9466561555862427, "epoch": 0.10967490442582258, "grad_norm": 11.086281776428223, "learning_rate": 7.639152587253087e-06, "loss": 0.6171, "mean_token_accuracy": 0.8145519152283669, "num_tokens": 42496181.0, "step": 35380 }, { "entropy": 1.8248122036457062, "epoch": 0.10970590355087227, "grad_norm": 5.816926956176758, "learning_rate": 7.638073199262556e-06, "loss": 0.5611, "mean_token_accuracy": 0.8266668051481247, "num_tokens": 42509157.0, "step": 35390 }, { "entropy": 1.8483053863048553, "epoch": 0.10973690267592197, "grad_norm": 9.291295051574707, "learning_rate": 7.636994268685048e-06, "loss": 0.5601, "mean_token_accuracy": 0.8281493291258812, "num_tokens": 42520936.0, "step": 35400 }, { "entropy": 1.906061513721943, "epoch": 0.10976790180097167, "grad_norm": 9.213872909545898, "learning_rate": 7.635915795197586e-06, "loss": 0.577, "mean_token_accuracy": 0.8235763564705849, "num_tokens": 42531678.0, "step": 35410 }, { "entropy": 1.9281126588582993, "epoch": 0.10979890092602136, "grad_norm": 9.515471458435059, "learning_rate": 7.634837778477519e-06, "loss": 0.5632, "mean_token_accuracy": 0.8304232120513916, "num_tokens": 42542312.0, "step": 35420 }, { "entropy": 1.8663022175431252, "epoch": 0.10982990005107106, "grad_norm": 4.878605842590332, "learning_rate": 7.633760218202513e-06, "loss": 0.5287, "mean_token_accuracy": 0.8294124737381935, "num_tokens": 42554279.0, "step": 35430 }, { "entropy": 1.774543423950672, "epoch": 0.10986089917612075, "grad_norm": 5.2295002937316895, "learning_rate": 7.632683114050551e-06, "loss": 0.4696, "mean_token_accuracy": 0.8427405998110771, "num_tokens": 42567273.0, "step": 35440 }, { "entropy": 1.8794532790780067, "epoch": 0.10989189830117045, "grad_norm": 9.379169464111328, "learning_rate": 7.631606465699934e-06, "loss": 0.5689, "mean_token_accuracy": 0.8301954373717308, "num_tokens": 42579668.0, "step": 35450 }, { "entropy": 1.878871650993824, "epoch": 0.10992289742622015, "grad_norm": 4.546285152435303, "learning_rate": 7.630530272829285e-06, "loss": 0.5423, "mean_token_accuracy": 0.8306925192475318, "num_tokens": 42591107.0, "step": 35460 }, { "entropy": 1.7232867300510406, "epoch": 0.10995389655126984, "grad_norm": 4.568423748016357, "learning_rate": 7.629454535117535e-06, "loss": 0.3853, "mean_token_accuracy": 0.8541397914290428, "num_tokens": 42604850.0, "step": 35470 }, { "entropy": 1.8509867563843727, "epoch": 0.10998489567631954, "grad_norm": 9.155996322631836, "learning_rate": 7.6283792522439415e-06, "loss": 0.5216, "mean_token_accuracy": 0.8339457020163537, "num_tokens": 42616914.0, "step": 35480 }, { "entropy": 1.9788624405860902, "epoch": 0.11001589480136924, "grad_norm": 9.914673805236816, "learning_rate": 7.6273044238880745e-06, "loss": 0.6481, "mean_token_accuracy": 0.8120816603302956, "num_tokens": 42628068.0, "step": 35490 }, { "entropy": 1.8431833282113075, "epoch": 0.11004689392641892, "grad_norm": 8.730440139770508, "learning_rate": 7.626230049729815e-06, "loss": 0.5176, "mean_token_accuracy": 0.8376294538378716, "num_tokens": 42640526.0, "step": 35500 }, { "entropy": 1.8701015904545784, "epoch": 0.11007789305146862, "grad_norm": 9.417694091796875, "learning_rate": 7.625156129449368e-06, "loss": 0.5799, "mean_token_accuracy": 0.8195733115077019, "num_tokens": 42652641.0, "step": 35510 }, { "entropy": 1.9821488350629806, "epoch": 0.11010889217651831, "grad_norm": 7.322350025177002, "learning_rate": 7.624082662727249e-06, "loss": 0.6099, "mean_token_accuracy": 0.8140304252505303, "num_tokens": 42664155.0, "step": 35520 }, { "entropy": 1.8935043781995773, "epoch": 0.11013989130156801, "grad_norm": 9.178773880004883, "learning_rate": 7.623009649244287e-06, "loss": 0.513, "mean_token_accuracy": 0.8334458202123642, "num_tokens": 42676188.0, "step": 35530 }, { "entropy": 1.891274669766426, "epoch": 0.1101708904266177, "grad_norm": 10.043092727661133, "learning_rate": 7.62193708868163e-06, "loss": 0.5871, "mean_token_accuracy": 0.816723170876503, "num_tokens": 42688395.0, "step": 35540 }, { "entropy": 1.8267503723502159, "epoch": 0.1102018895516674, "grad_norm": 4.575459957122803, "learning_rate": 7.620864980720736e-06, "loss": 0.5009, "mean_token_accuracy": 0.8234075903892517, "num_tokens": 42701643.0, "step": 35550 }, { "entropy": 1.956119680404663, "epoch": 0.1102328886767171, "grad_norm": 7.073038101196289, "learning_rate": 7.619793325043378e-06, "loss": 0.6057, "mean_token_accuracy": 0.8232446178793907, "num_tokens": 42713319.0, "step": 35560 }, { "entropy": 1.9920398950576783, "epoch": 0.1102638878017668, "grad_norm": 8.376012802124023, "learning_rate": 7.618722121331642e-06, "loss": 0.598, "mean_token_accuracy": 0.8307427033782006, "num_tokens": 42724006.0, "step": 35570 }, { "entropy": 1.906696754693985, "epoch": 0.1102948869268165, "grad_norm": 4.264830589294434, "learning_rate": 7.617651369267926e-06, "loss": 0.591, "mean_token_accuracy": 0.8193379983305931, "num_tokens": 42735888.0, "step": 35580 }, { "entropy": 1.8427229791879653, "epoch": 0.11032588605186619, "grad_norm": 9.698928833007812, "learning_rate": 7.6165810685349415e-06, "loss": 0.5671, "mean_token_accuracy": 0.8317845165729523, "num_tokens": 42747806.0, "step": 35590 }, { "entropy": 1.8134034514427184, "epoch": 0.11035688517691589, "grad_norm": 3.2644388675689697, "learning_rate": 7.615511218815713e-06, "loss": 0.4577, "mean_token_accuracy": 0.8382085859775543, "num_tokens": 42761313.0, "step": 35600 }, { "entropy": 1.8178164064884186, "epoch": 0.11038788430196558, "grad_norm": 8.92131233215332, "learning_rate": 7.614441819793575e-06, "loss": 0.5052, "mean_token_accuracy": 0.8241424784064293, "num_tokens": 42774872.0, "step": 35610 }, { "entropy": 1.953722095489502, "epoch": 0.11041888342701527, "grad_norm": 10.026144981384277, "learning_rate": 7.61337287115217e-06, "loss": 0.5721, "mean_token_accuracy": 0.8248757779598236, "num_tokens": 42787444.0, "step": 35620 }, { "entropy": 1.8925233513116837, "epoch": 0.11044988255206496, "grad_norm": 9.82945728302002, "learning_rate": 7.612304372575457e-06, "loss": 0.5397, "mean_token_accuracy": 0.8335311338305473, "num_tokens": 42800219.0, "step": 35630 }, { "entropy": 1.9320378288626672, "epoch": 0.11048088167711466, "grad_norm": 8.172740936279297, "learning_rate": 7.611236323747706e-06, "loss": 0.567, "mean_token_accuracy": 0.8279871761798858, "num_tokens": 42811737.0, "step": 35640 }, { "entropy": 1.8768427297472954, "epoch": 0.11051188080216436, "grad_norm": 4.6713972091674805, "learning_rate": 7.610168724353488e-06, "loss": 0.5321, "mean_token_accuracy": 0.8223695576190948, "num_tokens": 42823969.0, "step": 35650 }, { "entropy": 1.8219685062766076, "epoch": 0.11054287992721405, "grad_norm": 8.601825714111328, "learning_rate": 7.6091015740776955e-06, "loss": 0.515, "mean_token_accuracy": 0.8326170966029167, "num_tokens": 42835925.0, "step": 35660 }, { "entropy": 1.9021424800157547, "epoch": 0.11057387905226375, "grad_norm": 8.71493148803711, "learning_rate": 7.608034872605521e-06, "loss": 0.5521, "mean_token_accuracy": 0.8241175979375839, "num_tokens": 42847533.0, "step": 35670 }, { "entropy": 1.8227483302354812, "epoch": 0.11060487817731345, "grad_norm": 9.012811660766602, "learning_rate": 7.606968619622469e-06, "loss": 0.4728, "mean_token_accuracy": 0.8344050481915474, "num_tokens": 42860072.0, "step": 35680 }, { "entropy": 1.9262940838932991, "epoch": 0.11063587730236314, "grad_norm": 8.894078254699707, "learning_rate": 7.605902814814354e-06, "loss": 0.586, "mean_token_accuracy": 0.8217165872454644, "num_tokens": 42871958.0, "step": 35690 }, { "entropy": 1.9645220905542373, "epoch": 0.11066687642741284, "grad_norm": 6.617010116577148, "learning_rate": 7.604837457867298e-06, "loss": 0.5833, "mean_token_accuracy": 0.8164415255188942, "num_tokens": 42882974.0, "step": 35700 }, { "entropy": 1.8369232729077338, "epoch": 0.11069787555246254, "grad_norm": 4.062877178192139, "learning_rate": 7.603772548467727e-06, "loss": 0.4606, "mean_token_accuracy": 0.8306289702653885, "num_tokens": 42895928.0, "step": 35710 }, { "entropy": 1.9046246379613876, "epoch": 0.11072887467751223, "grad_norm": 9.858223915100098, "learning_rate": 7.6027080863023806e-06, "loss": 0.5456, "mean_token_accuracy": 0.8280208811163903, "num_tokens": 42908519.0, "step": 35720 }, { "entropy": 1.861171054840088, "epoch": 0.11075987380256193, "grad_norm": 4.55070686340332, "learning_rate": 7.601644071058299e-06, "loss": 0.5389, "mean_token_accuracy": 0.8228657454252243, "num_tokens": 42921353.0, "step": 35730 }, { "entropy": 1.8522720783948898, "epoch": 0.11079087292761162, "grad_norm": 9.031878471374512, "learning_rate": 7.600580502422833e-06, "loss": 0.4988, "mean_token_accuracy": 0.8284451842308045, "num_tokens": 42934541.0, "step": 35740 }, { "entropy": 1.965764506161213, "epoch": 0.11082187205266131, "grad_norm": 9.1340970993042, "learning_rate": 7.59951738008364e-06, "loss": 0.5677, "mean_token_accuracy": 0.8256940588355064, "num_tokens": 42946350.0, "step": 35750 }, { "entropy": 1.9360377460718154, "epoch": 0.110852871177711, "grad_norm": 9.9710693359375, "learning_rate": 7.598454703728679e-06, "loss": 0.5423, "mean_token_accuracy": 0.8291813790798187, "num_tokens": 42957478.0, "step": 35760 }, { "entropy": 1.898517556488514, "epoch": 0.1108838703027607, "grad_norm": 11.122152328491211, "learning_rate": 7.597392473046215e-06, "loss": 0.5485, "mean_token_accuracy": 0.8223585531115531, "num_tokens": 42969607.0, "step": 35770 }, { "entropy": 1.9774446800351142, "epoch": 0.1109148694278104, "grad_norm": 8.738960266113281, "learning_rate": 7.596330687724825e-06, "loss": 0.5825, "mean_token_accuracy": 0.8237465664744377, "num_tokens": 42981808.0, "step": 35780 }, { "entropy": 2.0006524354219435, "epoch": 0.1109458685528601, "grad_norm": 10.508487701416016, "learning_rate": 7.595269347453383e-06, "loss": 0.5879, "mean_token_accuracy": 0.8199486985802651, "num_tokens": 42993002.0, "step": 35790 }, { "entropy": 1.9162476733326912, "epoch": 0.11097686767790979, "grad_norm": 8.760506629943848, "learning_rate": 7.5942084519210665e-06, "loss": 0.5071, "mean_token_accuracy": 0.8336918845772743, "num_tokens": 43005023.0, "step": 35800 }, { "entropy": 1.9015693604946136, "epoch": 0.11100786680295949, "grad_norm": 3.9503958225250244, "learning_rate": 7.5931480008173654e-06, "loss": 0.5675, "mean_token_accuracy": 0.8282639935612679, "num_tokens": 43017177.0, "step": 35810 }, { "entropy": 1.8436180412769319, "epoch": 0.11103886592800918, "grad_norm": 5.058619022369385, "learning_rate": 7.592087993832064e-06, "loss": 0.556, "mean_token_accuracy": 0.8285689920186996, "num_tokens": 43029713.0, "step": 35820 }, { "entropy": 1.934212613105774, "epoch": 0.11106986505305888, "grad_norm": 9.86316967010498, "learning_rate": 7.591028430655252e-06, "loss": 0.587, "mean_token_accuracy": 0.8228744760155677, "num_tokens": 43041006.0, "step": 35830 }, { "entropy": 1.9125287413597107, "epoch": 0.11110086417810858, "grad_norm": 11.178047180175781, "learning_rate": 7.589969310977325e-06, "loss": 0.5717, "mean_token_accuracy": 0.8275589749217034, "num_tokens": 43053262.0, "step": 35840 }, { "entropy": 1.890585133433342, "epoch": 0.11113186330315827, "grad_norm": 8.734784126281738, "learning_rate": 7.588910634488981e-06, "loss": 0.5319, "mean_token_accuracy": 0.8315401718020439, "num_tokens": 43064804.0, "step": 35850 }, { "entropy": 1.9079217493534089, "epoch": 0.11116286242820797, "grad_norm": 8.155106544494629, "learning_rate": 7.587852400881212e-06, "loss": 0.5076, "mean_token_accuracy": 0.8348406001925468, "num_tokens": 43076576.0, "step": 35860 }, { "entropy": 1.9703772380948066, "epoch": 0.11119386155325765, "grad_norm": 9.1405611038208, "learning_rate": 7.586794609845321e-06, "loss": 0.5764, "mean_token_accuracy": 0.8214395239949226, "num_tokens": 43087904.0, "step": 35870 }, { "entropy": 2.0060620248317718, "epoch": 0.11122486067830735, "grad_norm": 8.357717514038086, "learning_rate": 7.58573726107291e-06, "loss": 0.6403, "mean_token_accuracy": 0.8090036302804947, "num_tokens": 43098746.0, "step": 35880 }, { "entropy": 1.9552163541316987, "epoch": 0.11125585980335705, "grad_norm": 12.635717391967773, "learning_rate": 7.584680354255878e-06, "loss": 0.6179, "mean_token_accuracy": 0.809755727648735, "num_tokens": 43109605.0, "step": 35890 }, { "entropy": 1.9294922351837158, "epoch": 0.11128685892840674, "grad_norm": 7.567071437835693, "learning_rate": 7.583623889086426e-06, "loss": 0.5897, "mean_token_accuracy": 0.8201198145747185, "num_tokens": 43121734.0, "step": 35900 }, { "entropy": 1.8198191657662393, "epoch": 0.11131785805345644, "grad_norm": 10.164730072021484, "learning_rate": 7.5825678652570565e-06, "loss": 0.4756, "mean_token_accuracy": 0.8393372297286987, "num_tokens": 43135018.0, "step": 35910 }, { "entropy": 1.9463165909051896, "epoch": 0.11134885717850614, "grad_norm": 10.353556632995605, "learning_rate": 7.58151228246057e-06, "loss": 0.5536, "mean_token_accuracy": 0.8151269420981407, "num_tokens": 43146922.0, "step": 35920 }, { "entropy": 1.8669220060110092, "epoch": 0.11137985630355583, "grad_norm": 7.747506618499756, "learning_rate": 7.5804571403900685e-06, "loss": 0.4764, "mean_token_accuracy": 0.8381304860115051, "num_tokens": 43159179.0, "step": 35930 }, { "entropy": 1.975856500864029, "epoch": 0.11141085542860553, "grad_norm": 8.58327579498291, "learning_rate": 7.579402438738951e-06, "loss": 0.6303, "mean_token_accuracy": 0.8211771547794342, "num_tokens": 43170565.0, "step": 35940 }, { "entropy": 1.8953873470425606, "epoch": 0.11144185455365523, "grad_norm": 3.883213520050049, "learning_rate": 7.578348177200915e-06, "loss": 0.548, "mean_token_accuracy": 0.8355704694986343, "num_tokens": 43182660.0, "step": 35950 }, { "entropy": 1.9930355623364449, "epoch": 0.11147285367870492, "grad_norm": 7.887125015258789, "learning_rate": 7.577294355469956e-06, "loss": 0.5559, "mean_token_accuracy": 0.8245319411158561, "num_tokens": 43194697.0, "step": 35960 }, { "entropy": 1.9415101364254952, "epoch": 0.11150385280375462, "grad_norm": 9.421870231628418, "learning_rate": 7.576240973240371e-06, "loss": 0.5932, "mean_token_accuracy": 0.8234505504369736, "num_tokens": 43206210.0, "step": 35970 }, { "entropy": 1.9392582058906556, "epoch": 0.11153485192880432, "grad_norm": 8.467351913452148, "learning_rate": 7.575188030206747e-06, "loss": 0.5192, "mean_token_accuracy": 0.8313940614461899, "num_tokens": 43218118.0, "step": 35980 }, { "entropy": 1.9223637744784354, "epoch": 0.111565851053854, "grad_norm": 9.47497272491455, "learning_rate": 7.574135526063976e-06, "loss": 0.5115, "mean_token_accuracy": 0.840501819550991, "num_tokens": 43229669.0, "step": 35990 }, { "entropy": 2.0074471473693847, "epoch": 0.1115968501789037, "grad_norm": 10.357573509216309, "learning_rate": 7.5730834605072416e-06, "loss": 0.6127, "mean_token_accuracy": 0.8227631166577339, "num_tokens": 43240593.0, "step": 36000 }, { "entropy": 1.859610728919506, "epoch": 0.11162784930395339, "grad_norm": 9.257423400878906, "learning_rate": 7.5720318332320255e-06, "loss": 0.5297, "mean_token_accuracy": 0.8329069197177887, "num_tokens": 43253611.0, "step": 36010 }, { "entropy": 1.9028906911611556, "epoch": 0.11165884842900309, "grad_norm": 8.12290096282959, "learning_rate": 7.570980643934104e-06, "loss": 0.5245, "mean_token_accuracy": 0.8339066132903099, "num_tokens": 43265093.0, "step": 36020 }, { "entropy": 1.8995389580726623, "epoch": 0.11168984755405278, "grad_norm": 9.205475807189941, "learning_rate": 7.56992989230955e-06, "loss": 0.4958, "mean_token_accuracy": 0.8334958225488662, "num_tokens": 43277585.0, "step": 36030 }, { "entropy": 1.9100613698363305, "epoch": 0.11172084667910248, "grad_norm": 7.445013046264648, "learning_rate": 7.5688795780547335e-06, "loss": 0.4955, "mean_token_accuracy": 0.842551352083683, "num_tokens": 43289300.0, "step": 36040 }, { "entropy": 1.9307687520980834, "epoch": 0.11175184580415218, "grad_norm": 9.54047966003418, "learning_rate": 7.5678297008663135e-06, "loss": 0.5823, "mean_token_accuracy": 0.8196006685495376, "num_tokens": 43300995.0, "step": 36050 }, { "entropy": 1.9431654140353203, "epoch": 0.11178284492920187, "grad_norm": 8.615473747253418, "learning_rate": 7.566780260441252e-06, "loss": 0.6086, "mean_token_accuracy": 0.8174616977572441, "num_tokens": 43312218.0, "step": 36060 }, { "entropy": 1.8542766660451888, "epoch": 0.11181384405425157, "grad_norm": 8.529784202575684, "learning_rate": 7.565731256476797e-06, "loss": 0.5373, "mean_token_accuracy": 0.8297783643007278, "num_tokens": 43325617.0, "step": 36070 }, { "entropy": 1.9352862015366554, "epoch": 0.11184484317930127, "grad_norm": 8.217283248901367, "learning_rate": 7.564682688670496e-06, "loss": 0.5751, "mean_token_accuracy": 0.8212688997387886, "num_tokens": 43337245.0, "step": 36080 }, { "entropy": 1.972224122285843, "epoch": 0.11187584230435096, "grad_norm": 10.360859870910645, "learning_rate": 7.563634556720185e-06, "loss": 0.567, "mean_token_accuracy": 0.8349591135978699, "num_tokens": 43348686.0, "step": 36090 }, { "entropy": 1.9579823553562163, "epoch": 0.11190684142940066, "grad_norm": 9.387046813964844, "learning_rate": 7.562586860323996e-06, "loss": 0.5886, "mean_token_accuracy": 0.8227823153138161, "num_tokens": 43359689.0, "step": 36100 }, { "entropy": 1.9274040132761001, "epoch": 0.11193784055445036, "grad_norm": 4.451748371124268, "learning_rate": 7.561539599180354e-06, "loss": 0.5332, "mean_token_accuracy": 0.8205605849623681, "num_tokens": 43372242.0, "step": 36110 }, { "entropy": 1.8405972003936768, "epoch": 0.11196883967950004, "grad_norm": 4.114645481109619, "learning_rate": 7.560492772987975e-06, "loss": 0.4202, "mean_token_accuracy": 0.8403662115335464, "num_tokens": 43385797.0, "step": 36120 }, { "entropy": 1.9486818492412568, "epoch": 0.11199983880454974, "grad_norm": 4.341696739196777, "learning_rate": 7.5594463814458676e-06, "loss": 0.5247, "mean_token_accuracy": 0.839286656677723, "num_tokens": 43396906.0, "step": 36130 }, { "entropy": 1.9541378676891328, "epoch": 0.11203083792959943, "grad_norm": 9.982046127319336, "learning_rate": 7.558400424253328e-06, "loss": 0.5785, "mean_token_accuracy": 0.8157087191939354, "num_tokens": 43409073.0, "step": 36140 }, { "entropy": 1.9935852527618407, "epoch": 0.11206183705464913, "grad_norm": 8.649558067321777, "learning_rate": 7.557354901109952e-06, "loss": 0.5575, "mean_token_accuracy": 0.8184923201799392, "num_tokens": 43420300.0, "step": 36150 }, { "entropy": 1.9860820427536965, "epoch": 0.11209283617969883, "grad_norm": 9.3584566116333, "learning_rate": 7.556309811715618e-06, "loss": 0.5715, "mean_token_accuracy": 0.8259601920843125, "num_tokens": 43431655.0, "step": 36160 }, { "entropy": 1.9509647816419602, "epoch": 0.11212383530474852, "grad_norm": 10.400931358337402, "learning_rate": 7.5552651557705e-06, "loss": 0.598, "mean_token_accuracy": 0.8289371073246002, "num_tokens": 43443051.0, "step": 36170 }, { "entropy": 1.8806715980172157, "epoch": 0.11215483442979822, "grad_norm": 11.046425819396973, "learning_rate": 7.55422093297506e-06, "loss": 0.547, "mean_token_accuracy": 0.8374142736196518, "num_tokens": 43455604.0, "step": 36180 }, { "entropy": 1.9038266450166703, "epoch": 0.11218583355484792, "grad_norm": 4.295387268066406, "learning_rate": 7.553177143030047e-06, "loss": 0.5133, "mean_token_accuracy": 0.8337746739387513, "num_tokens": 43468108.0, "step": 36190 }, { "entropy": 1.8910601362586021, "epoch": 0.11221683267989761, "grad_norm": 8.46925163269043, "learning_rate": 7.5521337856365064e-06, "loss": 0.4906, "mean_token_accuracy": 0.8458377867937088, "num_tokens": 43480308.0, "step": 36200 }, { "entropy": 1.9406689956784249, "epoch": 0.11224783180494731, "grad_norm": 8.312734603881836, "learning_rate": 7.551090860495766e-06, "loss": 0.5991, "mean_token_accuracy": 0.8199841871857643, "num_tokens": 43492821.0, "step": 36210 }, { "entropy": 1.9295011416077614, "epoch": 0.112278830929997, "grad_norm": 4.657339096069336, "learning_rate": 7.550048367309445e-06, "loss": 0.5671, "mean_token_accuracy": 0.8267869040369987, "num_tokens": 43504911.0, "step": 36220 }, { "entropy": 1.8883660644292832, "epoch": 0.1123098300550467, "grad_norm": 12.443599700927734, "learning_rate": 7.54900630577945e-06, "loss": 0.5342, "mean_token_accuracy": 0.8197856619954109, "num_tokens": 43517464.0, "step": 36230 }, { "entropy": 1.9811781361699103, "epoch": 0.11234082918009639, "grad_norm": 11.399888038635254, "learning_rate": 7.547964675607977e-06, "loss": 0.6267, "mean_token_accuracy": 0.8182986229658127, "num_tokens": 43528891.0, "step": 36240 }, { "entropy": 1.847057183086872, "epoch": 0.11237182830514608, "grad_norm": 7.622710704803467, "learning_rate": 7.546923476497509e-06, "loss": 0.5199, "mean_token_accuracy": 0.8390681058168411, "num_tokens": 43541805.0, "step": 36250 }, { "entropy": 1.9104607298970222, "epoch": 0.11240282743019578, "grad_norm": 9.49293041229248, "learning_rate": 7.545882708150815e-06, "loss": 0.5786, "mean_token_accuracy": 0.8211572468280792, "num_tokens": 43552709.0, "step": 36260 }, { "entropy": 1.7649886459112167, "epoch": 0.11243382655524548, "grad_norm": 4.7445807456970215, "learning_rate": 7.544842370270952e-06, "loss": 0.4435, "mean_token_accuracy": 0.841453991830349, "num_tokens": 43566226.0, "step": 36270 }, { "entropy": 1.9194318518042564, "epoch": 0.11246482568029517, "grad_norm": 4.042590618133545, "learning_rate": 7.543802462561263e-06, "loss": 0.5773, "mean_token_accuracy": 0.8197874516248703, "num_tokens": 43578124.0, "step": 36280 }, { "entropy": 1.8747497126460075, "epoch": 0.11249582480534487, "grad_norm": 8.346511840820312, "learning_rate": 7.5427629847253766e-06, "loss": 0.52, "mean_token_accuracy": 0.8315855219960213, "num_tokens": 43590273.0, "step": 36290 }, { "entropy": 1.962376557290554, "epoch": 0.11252682393039456, "grad_norm": 7.562712669372559, "learning_rate": 7.541723936467211e-06, "loss": 0.541, "mean_token_accuracy": 0.8308800607919693, "num_tokens": 43601348.0, "step": 36300 }, { "entropy": 1.9383685559034347, "epoch": 0.11255782305544426, "grad_norm": 4.1252875328063965, "learning_rate": 7.540685317490964e-06, "loss": 0.6044, "mean_token_accuracy": 0.8207226291298866, "num_tokens": 43613086.0, "step": 36310 }, { "entropy": 1.9785435765981674, "epoch": 0.11258882218049396, "grad_norm": 9.106582641601562, "learning_rate": 7.539647127501121e-06, "loss": 0.6021, "mean_token_accuracy": 0.8204389974474907, "num_tokens": 43623726.0, "step": 36320 }, { "entropy": 1.9137952119112014, "epoch": 0.11261982130554365, "grad_norm": 8.635375022888184, "learning_rate": 7.5386093662024515e-06, "loss": 0.5612, "mean_token_accuracy": 0.8232575729489326, "num_tokens": 43635777.0, "step": 36330 }, { "entropy": 2.0330847591161727, "epoch": 0.11265082043059335, "grad_norm": 10.423239707946777, "learning_rate": 7.537572033300013e-06, "loss": 0.6766, "mean_token_accuracy": 0.8112093076109886, "num_tokens": 43646664.0, "step": 36340 }, { "entropy": 2.00158928334713, "epoch": 0.11268181955564305, "grad_norm": 8.458577156066895, "learning_rate": 7.536535128499144e-06, "loss": 0.5829, "mean_token_accuracy": 0.8329016759991645, "num_tokens": 43657222.0, "step": 36350 }, { "entropy": 1.9344168439507485, "epoch": 0.11271281868069273, "grad_norm": 5.806880950927734, "learning_rate": 7.535498651505465e-06, "loss": 0.5342, "mean_token_accuracy": 0.8308494806289672, "num_tokens": 43669133.0, "step": 36360 }, { "entropy": 1.9564279466867447, "epoch": 0.11274381780574243, "grad_norm": 7.9867329597473145, "learning_rate": 7.5344626020248825e-06, "loss": 0.5584, "mean_token_accuracy": 0.8347235858440399, "num_tokens": 43680521.0, "step": 36370 }, { "entropy": 1.9638302087783814, "epoch": 0.11277481693079212, "grad_norm": 9.185903549194336, "learning_rate": 7.533426979763585e-06, "loss": 0.6052, "mean_token_accuracy": 0.8131253823637963, "num_tokens": 43691642.0, "step": 36380 }, { "entropy": 1.910866443812847, "epoch": 0.11280581605584182, "grad_norm": 8.041609764099121, "learning_rate": 7.532391784428045e-06, "loss": 0.5606, "mean_token_accuracy": 0.8212843969464302, "num_tokens": 43703753.0, "step": 36390 }, { "entropy": 1.7988116875290872, "epoch": 0.11283681518089152, "grad_norm": 8.769583702087402, "learning_rate": 7.531357015725014e-06, "loss": 0.4508, "mean_token_accuracy": 0.8505406931042672, "num_tokens": 43716887.0, "step": 36400 }, { "entropy": 1.8762123107910156, "epoch": 0.11286781430594121, "grad_norm": 7.339711666107178, "learning_rate": 7.53032267336153e-06, "loss": 0.5858, "mean_token_accuracy": 0.8283303081989288, "num_tokens": 43728905.0, "step": 36410 }, { "entropy": 2.0162509113550184, "epoch": 0.11289881343099091, "grad_norm": 8.925753593444824, "learning_rate": 7.529288757044908e-06, "loss": 0.6202, "mean_token_accuracy": 0.8204985901713371, "num_tokens": 43739992.0, "step": 36420 }, { "entropy": 1.8500737398862839, "epoch": 0.1129298125560406, "grad_norm": 4.230498790740967, "learning_rate": 7.528255266482748e-06, "loss": 0.4843, "mean_token_accuracy": 0.8447869628667831, "num_tokens": 43752806.0, "step": 36430 }, { "entropy": 1.9196542382240296, "epoch": 0.1129608116810903, "grad_norm": 10.557394981384277, "learning_rate": 7.527222201382927e-06, "loss": 0.56, "mean_token_accuracy": 0.8287433817982673, "num_tokens": 43764530.0, "step": 36440 }, { "entropy": 1.937133614718914, "epoch": 0.11299181080614, "grad_norm": 9.187378883361816, "learning_rate": 7.526189561453605e-06, "loss": 0.5236, "mean_token_accuracy": 0.8356496065855026, "num_tokens": 43775958.0, "step": 36450 }, { "entropy": 1.8305316418409348, "epoch": 0.1130228099311897, "grad_norm": 10.253772735595703, "learning_rate": 7.525157346403224e-06, "loss": 0.4758, "mean_token_accuracy": 0.846756660938263, "num_tokens": 43788156.0, "step": 36460 }, { "entropy": 1.8645455896854402, "epoch": 0.1130538090562394, "grad_norm": 2.7411298751831055, "learning_rate": 7.5241255559405015e-06, "loss": 0.5632, "mean_token_accuracy": 0.8278805449604988, "num_tokens": 43800701.0, "step": 36470 }, { "entropy": 1.974195511639118, "epoch": 0.11308480818128909, "grad_norm": 4.0369110107421875, "learning_rate": 7.523094189774437e-06, "loss": 0.5814, "mean_token_accuracy": 0.8239115789532662, "num_tokens": 43811490.0, "step": 36480 }, { "entropy": 1.8882897064089774, "epoch": 0.11311580730633877, "grad_norm": 9.004681587219238, "learning_rate": 7.5220632476143095e-06, "loss": 0.5609, "mean_token_accuracy": 0.8223536923527718, "num_tokens": 43823579.0, "step": 36490 }, { "entropy": 1.8579727187752724, "epoch": 0.11314680643138847, "grad_norm": 4.422814846038818, "learning_rate": 7.521032729169676e-06, "loss": 0.5213, "mean_token_accuracy": 0.8264896929264068, "num_tokens": 43835758.0, "step": 36500 }, { "entropy": 1.8324081644415855, "epoch": 0.11317780555643817, "grad_norm": 4.702271461486816, "learning_rate": 7.520002634150373e-06, "loss": 0.4898, "mean_token_accuracy": 0.8371421471238136, "num_tokens": 43848630.0, "step": 36510 }, { "entropy": 1.8566824480891229, "epoch": 0.11320880468148786, "grad_norm": 4.19157075881958, "learning_rate": 7.518972962266511e-06, "loss": 0.6098, "mean_token_accuracy": 0.8109669283032417, "num_tokens": 43861241.0, "step": 36520 }, { "entropy": 1.8918891534209252, "epoch": 0.11323980380653756, "grad_norm": 4.325606346130371, "learning_rate": 7.517943713228485e-06, "loss": 0.5829, "mean_token_accuracy": 0.8150859504938126, "num_tokens": 43872873.0, "step": 36530 }, { "entropy": 1.8818762391805648, "epoch": 0.11327080293158726, "grad_norm": 8.26830768585205, "learning_rate": 7.516914886746964e-06, "loss": 0.5098, "mean_token_accuracy": 0.8375229358673095, "num_tokens": 43885092.0, "step": 36540 }, { "entropy": 1.8862257033586503, "epoch": 0.11330180205663695, "grad_norm": 4.080634117126465, "learning_rate": 7.515886482532891e-06, "loss": 0.5554, "mean_token_accuracy": 0.8279654592275619, "num_tokens": 43897930.0, "step": 36550 }, { "entropy": 1.9121008217334747, "epoch": 0.11333280118168665, "grad_norm": 7.579226970672607, "learning_rate": 7.5148585002974895e-06, "loss": 0.5392, "mean_token_accuracy": 0.8319680899381637, "num_tokens": 43910576.0, "step": 36560 }, { "entropy": 1.8942013755440712, "epoch": 0.11336380030673635, "grad_norm": 10.820945739746094, "learning_rate": 7.513830939752261e-06, "loss": 0.5398, "mean_token_accuracy": 0.8288922995328903, "num_tokens": 43922364.0, "step": 36570 }, { "entropy": 1.8880238860845566, "epoch": 0.11339479943178604, "grad_norm": 4.12148904800415, "learning_rate": 7.512803800608977e-06, "loss": 0.5554, "mean_token_accuracy": 0.8186680763959885, "num_tokens": 43934444.0, "step": 36580 }, { "entropy": 1.8842089757323266, "epoch": 0.11342579855683574, "grad_norm": 7.816810131072998, "learning_rate": 7.511777082579692e-06, "loss": 0.5072, "mean_token_accuracy": 0.8386390700936317, "num_tokens": 43946333.0, "step": 36590 }, { "entropy": 1.905991567671299, "epoch": 0.11345679768188544, "grad_norm": 9.459228515625, "learning_rate": 7.5107507853767304e-06, "loss": 0.556, "mean_token_accuracy": 0.8341964855790138, "num_tokens": 43958369.0, "step": 36600 }, { "entropy": 1.9278974682092667, "epoch": 0.11348779680693512, "grad_norm": 9.090117454528809, "learning_rate": 7.509724908712693e-06, "loss": 0.531, "mean_token_accuracy": 0.8365562587976456, "num_tokens": 43970116.0, "step": 36610 }, { "entropy": 1.95464196652174, "epoch": 0.11351879593198481, "grad_norm": 8.458623886108398, "learning_rate": 7.508699452300459e-06, "loss": 0.5787, "mean_token_accuracy": 0.830603589117527, "num_tokens": 43981286.0, "step": 36620 }, { "entropy": 1.839498682320118, "epoch": 0.11354979505703451, "grad_norm": 8.516286849975586, "learning_rate": 7.507674415853176e-06, "loss": 0.4453, "mean_token_accuracy": 0.8414850428700447, "num_tokens": 43993909.0, "step": 36630 }, { "entropy": 1.936448486149311, "epoch": 0.11358079418208421, "grad_norm": 4.347235202789307, "learning_rate": 7.506649799084268e-06, "loss": 0.5356, "mean_token_accuracy": 0.8339558869600296, "num_tokens": 44005833.0, "step": 36640 }, { "entropy": 1.884600205719471, "epoch": 0.1136117933071339, "grad_norm": 7.8523712158203125, "learning_rate": 7.505625601707435e-06, "loss": 0.517, "mean_token_accuracy": 0.8333307653665543, "num_tokens": 44018334.0, "step": 36650 }, { "entropy": 1.8594166815280915, "epoch": 0.1136427924321836, "grad_norm": 8.999347686767578, "learning_rate": 7.504601823436648e-06, "loss": 0.5549, "mean_token_accuracy": 0.8310374036431313, "num_tokens": 44030326.0, "step": 36660 }, { "entropy": 1.9014796167612076, "epoch": 0.1136737915572333, "grad_norm": 7.825422763824463, "learning_rate": 7.503578463986152e-06, "loss": 0.579, "mean_token_accuracy": 0.816353191435337, "num_tokens": 44042199.0, "step": 36670 }, { "entropy": 1.7826032146811486, "epoch": 0.113704790682283, "grad_norm": 9.484929084777832, "learning_rate": 7.502555523070463e-06, "loss": 0.4162, "mean_token_accuracy": 0.851135803759098, "num_tokens": 44055210.0, "step": 36680 }, { "entropy": 1.8730618610978127, "epoch": 0.11373578980733269, "grad_norm": 10.11883544921875, "learning_rate": 7.5015330004043705e-06, "loss": 0.534, "mean_token_accuracy": 0.8271143138408661, "num_tokens": 44066956.0, "step": 36690 }, { "entropy": 1.9547926247119904, "epoch": 0.11376678893238239, "grad_norm": 9.232172012329102, "learning_rate": 7.500510895702939e-06, "loss": 0.6011, "mean_token_accuracy": 0.8220591425895691, "num_tokens": 44078247.0, "step": 36700 }, { "entropy": 1.8781561613082887, "epoch": 0.11379778805743208, "grad_norm": 7.583634376525879, "learning_rate": 7.499489208681497e-06, "loss": 0.5071, "mean_token_accuracy": 0.8380724370479584, "num_tokens": 44089827.0, "step": 36710 }, { "entropy": 1.822493526339531, "epoch": 0.11382878718248178, "grad_norm": 4.266534328460693, "learning_rate": 7.498467939055656e-06, "loss": 0.4656, "mean_token_accuracy": 0.8341230794787406, "num_tokens": 44102605.0, "step": 36720 }, { "entropy": 1.8786309957504272, "epoch": 0.11385978630753146, "grad_norm": 9.360684394836426, "learning_rate": 7.497447086541285e-06, "loss": 0.5763, "mean_token_accuracy": 0.8306545913219452, "num_tokens": 44114636.0, "step": 36730 }, { "entropy": 1.904089206457138, "epoch": 0.11389078543258116, "grad_norm": 5.562136650085449, "learning_rate": 7.496426650854535e-06, "loss": 0.6014, "mean_token_accuracy": 0.8137870237231255, "num_tokens": 44127313.0, "step": 36740 }, { "entropy": 1.9380560591816902, "epoch": 0.11392178455763086, "grad_norm": 9.176344871520996, "learning_rate": 7.4954066317118205e-06, "loss": 0.5927, "mean_token_accuracy": 0.8196625456213951, "num_tokens": 44138997.0, "step": 36750 }, { "entropy": 1.881463260948658, "epoch": 0.11395278368268055, "grad_norm": 8.139053344726562, "learning_rate": 7.494387028829828e-06, "loss": 0.5145, "mean_token_accuracy": 0.8382967382669448, "num_tokens": 44151672.0, "step": 36760 }, { "entropy": 1.8621976956725121, "epoch": 0.11398378280773025, "grad_norm": 4.298003673553467, "learning_rate": 7.493367841925514e-06, "loss": 0.5409, "mean_token_accuracy": 0.8231415241956711, "num_tokens": 44164318.0, "step": 36770 }, { "entropy": 1.9737728282809257, "epoch": 0.11401478193277995, "grad_norm": 10.217061996459961, "learning_rate": 7.492349070716108e-06, "loss": 0.5793, "mean_token_accuracy": 0.8322141289710998, "num_tokens": 44175114.0, "step": 36780 }, { "entropy": 1.9328984022140503, "epoch": 0.11404578105782964, "grad_norm": 4.168199062347412, "learning_rate": 7.4913307149191e-06, "loss": 0.5295, "mean_token_accuracy": 0.8284803554415703, "num_tokens": 44187241.0, "step": 36790 }, { "entropy": 1.9318128556013108, "epoch": 0.11407678018287934, "grad_norm": 9.854540824890137, "learning_rate": 7.490312774252257e-06, "loss": 0.5912, "mean_token_accuracy": 0.8278173923492431, "num_tokens": 44197838.0, "step": 36800 }, { "entropy": 1.8671427622437478, "epoch": 0.11410777930792904, "grad_norm": 8.820489883422852, "learning_rate": 7.489295248433609e-06, "loss": 0.6118, "mean_token_accuracy": 0.8205173119902611, "num_tokens": 44209989.0, "step": 36810 }, { "entropy": 1.9128638491034509, "epoch": 0.11413877843297873, "grad_norm": 10.370189666748047, "learning_rate": 7.488278137181456e-06, "loss": 0.593, "mean_token_accuracy": 0.8210515394806862, "num_tokens": 44221247.0, "step": 36820 }, { "entropy": 1.8790946841239928, "epoch": 0.11416977755802843, "grad_norm": 8.685145378112793, "learning_rate": 7.48726144021437e-06, "loss": 0.512, "mean_token_accuracy": 0.8406370490789413, "num_tokens": 44233155.0, "step": 36830 }, { "entropy": 1.973842205107212, "epoch": 0.11420077668307813, "grad_norm": 9.793169021606445, "learning_rate": 7.48624515725118e-06, "loss": 0.5913, "mean_token_accuracy": 0.8233802810311317, "num_tokens": 44243970.0, "step": 36840 }, { "entropy": 1.800303579866886, "epoch": 0.11423177580812782, "grad_norm": 9.683552742004395, "learning_rate": 7.485229288010991e-06, "loss": 0.4214, "mean_token_accuracy": 0.8449695229530334, "num_tokens": 44257962.0, "step": 36850 }, { "entropy": 1.8852539584040642, "epoch": 0.1142627749331775, "grad_norm": 8.617185592651367, "learning_rate": 7.484213832213174e-06, "loss": 0.531, "mean_token_accuracy": 0.8190362945199012, "num_tokens": 44270821.0, "step": 36860 }, { "entropy": 1.8611651435494423, "epoch": 0.1142937740582272, "grad_norm": 3.9128634929656982, "learning_rate": 7.483198789577362e-06, "loss": 0.5256, "mean_token_accuracy": 0.825941427052021, "num_tokens": 44283425.0, "step": 36870 }, { "entropy": 1.863780789077282, "epoch": 0.1143247731832769, "grad_norm": 9.122457504272461, "learning_rate": 7.482184159823459e-06, "loss": 0.5205, "mean_token_accuracy": 0.8322040572762489, "num_tokens": 44295820.0, "step": 36880 }, { "entropy": 1.8578864350914954, "epoch": 0.1143557723083266, "grad_norm": 4.551950931549072, "learning_rate": 7.481169942671628e-06, "loss": 0.5337, "mean_token_accuracy": 0.8270832076668739, "num_tokens": 44308102.0, "step": 36890 }, { "entropy": 1.9780358895659447, "epoch": 0.11438677143337629, "grad_norm": 9.209576606750488, "learning_rate": 7.480156137842306e-06, "loss": 0.5974, "mean_token_accuracy": 0.8193370506167412, "num_tokens": 44319023.0, "step": 36900 }, { "entropy": 1.9365268610417843, "epoch": 0.11441777055842599, "grad_norm": 9.44648551940918, "learning_rate": 7.479142745056188e-06, "loss": 0.5477, "mean_token_accuracy": 0.8305159211158752, "num_tokens": 44330798.0, "step": 36910 }, { "entropy": 1.9427412554621697, "epoch": 0.11444876968347568, "grad_norm": 9.822259902954102, "learning_rate": 7.478129764034238e-06, "loss": 0.5697, "mean_token_accuracy": 0.8291716650128365, "num_tokens": 44341817.0, "step": 36920 }, { "entropy": 1.884063209593296, "epoch": 0.11447976880852538, "grad_norm": 9.40356731414795, "learning_rate": 7.477117194497685e-06, "loss": 0.5212, "mean_token_accuracy": 0.8296123817563057, "num_tokens": 44354019.0, "step": 36930 }, { "entropy": 1.8861080020666123, "epoch": 0.11451076793357508, "grad_norm": 10.153528213500977, "learning_rate": 7.476105036168018e-06, "loss": 0.548, "mean_token_accuracy": 0.8213342532515526, "num_tokens": 44366926.0, "step": 36940 }, { "entropy": 1.8933350324630738, "epoch": 0.11454176705862477, "grad_norm": 9.459976196289062, "learning_rate": 7.475093288766992e-06, "loss": 0.5634, "mean_token_accuracy": 0.828109310567379, "num_tokens": 44378877.0, "step": 36950 }, { "entropy": 1.8734344974160195, "epoch": 0.11457276618367447, "grad_norm": 8.441810607910156, "learning_rate": 7.474081952016626e-06, "loss": 0.5369, "mean_token_accuracy": 0.8315728649497032, "num_tokens": 44392038.0, "step": 36960 }, { "entropy": 1.874902254343033, "epoch": 0.11460376530872417, "grad_norm": 4.568936347961426, "learning_rate": 7.473071025639202e-06, "loss": 0.5017, "mean_token_accuracy": 0.8361835688352585, "num_tokens": 44404425.0, "step": 36970 }, { "entropy": 1.9008426293730736, "epoch": 0.11463476443377385, "grad_norm": 9.730356216430664, "learning_rate": 7.4720605093572664e-06, "loss": 0.5532, "mean_token_accuracy": 0.8256875947117805, "num_tokens": 44416109.0, "step": 36980 }, { "entropy": 1.925182616710663, "epoch": 0.11466576355882355, "grad_norm": 3.8875186443328857, "learning_rate": 7.471050402893625e-06, "loss": 0.611, "mean_token_accuracy": 0.8156601503491402, "num_tokens": 44427762.0, "step": 36990 }, { "entropy": 1.8984755888581275, "epoch": 0.11469676268387324, "grad_norm": 8.992522239685059, "learning_rate": 7.470040705971346e-06, "loss": 0.5251, "mean_token_accuracy": 0.8337834537029266, "num_tokens": 44439999.0, "step": 37000 }, { "entropy": 1.9663574412465095, "epoch": 0.11472776180892294, "grad_norm": 8.80916690826416, "learning_rate": 7.469031418313763e-06, "loss": 0.571, "mean_token_accuracy": 0.8257024556398391, "num_tokens": 44451506.0, "step": 37010 }, { "entropy": 1.8491823315620421, "epoch": 0.11475876093397264, "grad_norm": 4.779745101928711, "learning_rate": 7.46802253964447e-06, "loss": 0.5263, "mean_token_accuracy": 0.8266303971409797, "num_tokens": 44464222.0, "step": 37020 }, { "entropy": 1.9856320530176164, "epoch": 0.11478976005902233, "grad_norm": 7.643355846405029, "learning_rate": 7.46701406968732e-06, "loss": 0.6223, "mean_token_accuracy": 0.8207634121179581, "num_tokens": 44475276.0, "step": 37030 }, { "entropy": 1.940791991353035, "epoch": 0.11482075918407203, "grad_norm": 5.0710954666137695, "learning_rate": 7.466006008166425e-06, "loss": 0.5636, "mean_token_accuracy": 0.8188979029655457, "num_tokens": 44487103.0, "step": 37040 }, { "entropy": 1.9016083896160125, "epoch": 0.11485175830912173, "grad_norm": 4.698379039764404, "learning_rate": 7.464998354806166e-06, "loss": 0.5691, "mean_token_accuracy": 0.8199516862630845, "num_tokens": 44498694.0, "step": 37050 }, { "entropy": 1.9881937965750693, "epoch": 0.11488275743417142, "grad_norm": 9.964612007141113, "learning_rate": 7.463991109331177e-06, "loss": 0.618, "mean_token_accuracy": 0.8096556261181831, "num_tokens": 44509990.0, "step": 37060 }, { "entropy": 1.884983916580677, "epoch": 0.11491375655922112, "grad_norm": 9.16238021850586, "learning_rate": 7.462984271466356e-06, "loss": 0.4769, "mean_token_accuracy": 0.8483946248888969, "num_tokens": 44521793.0, "step": 37070 }, { "entropy": 1.9347106859087944, "epoch": 0.11494475568427082, "grad_norm": 8.995933532714844, "learning_rate": 7.461977840936856e-06, "loss": 0.5537, "mean_token_accuracy": 0.8224361136555671, "num_tokens": 44533297.0, "step": 37080 }, { "entropy": 1.9060018703341484, "epoch": 0.11497575480932051, "grad_norm": 8.693288803100586, "learning_rate": 7.460971817468093e-06, "loss": 0.5297, "mean_token_accuracy": 0.8281079009175301, "num_tokens": 44545609.0, "step": 37090 }, { "entropy": 1.8859413847327233, "epoch": 0.1150067539343702, "grad_norm": 7.988814353942871, "learning_rate": 7.459966200785744e-06, "loss": 0.4902, "mean_token_accuracy": 0.8470205634832382, "num_tokens": 44557728.0, "step": 37100 }, { "entropy": 1.8287798702716827, "epoch": 0.11503775305941989, "grad_norm": 7.7615437507629395, "learning_rate": 7.45896099061574e-06, "loss": 0.5346, "mean_token_accuracy": 0.8320494025945664, "num_tokens": 44571340.0, "step": 37110 }, { "entropy": 1.91844123005867, "epoch": 0.11506875218446959, "grad_norm": 11.6615629196167, "learning_rate": 7.457956186684274e-06, "loss": 0.5549, "mean_token_accuracy": 0.8266859829425812, "num_tokens": 44582865.0, "step": 37120 }, { "entropy": 1.7767401605844497, "epoch": 0.11509975130951929, "grad_norm": 7.414669513702393, "learning_rate": 7.4569517887177935e-06, "loss": 0.4635, "mean_token_accuracy": 0.8381605073809624, "num_tokens": 44596679.0, "step": 37130 }, { "entropy": 1.9388007283210755, "epoch": 0.11513075043456898, "grad_norm": 11.721390724182129, "learning_rate": 7.455947796443009e-06, "loss": 0.6072, "mean_token_accuracy": 0.8257848516106605, "num_tokens": 44608325.0, "step": 37140 }, { "entropy": 1.9356591030955315, "epoch": 0.11516174955961868, "grad_norm": 7.786122798919678, "learning_rate": 7.454944209586882e-06, "loss": 0.5724, "mean_token_accuracy": 0.8272521525621415, "num_tokens": 44619842.0, "step": 37150 }, { "entropy": 1.8918299242854117, "epoch": 0.11519274868466838, "grad_norm": 10.668729782104492, "learning_rate": 7.453941027876637e-06, "loss": 0.5737, "mean_token_accuracy": 0.8250093191862107, "num_tokens": 44631902.0, "step": 37160 }, { "entropy": 1.7994364276528358, "epoch": 0.11522374780971807, "grad_norm": 6.118994235992432, "learning_rate": 7.452938251039754e-06, "loss": 0.4845, "mean_token_accuracy": 0.8353979960083961, "num_tokens": 44645016.0, "step": 37170 }, { "entropy": 1.9420592486858368, "epoch": 0.11525474693476777, "grad_norm": 9.136016845703125, "learning_rate": 7.451935878803968e-06, "loss": 0.5985, "mean_token_accuracy": 0.8215920001268386, "num_tokens": 44656784.0, "step": 37180 }, { "entropy": 1.9028725042939185, "epoch": 0.11528574605981746, "grad_norm": 7.933960914611816, "learning_rate": 7.45093391089727e-06, "loss": 0.5122, "mean_token_accuracy": 0.8299235314130783, "num_tokens": 44669372.0, "step": 37190 }, { "entropy": 1.8886778056621552, "epoch": 0.11531674518486716, "grad_norm": 8.652473449707031, "learning_rate": 7.4499323470479075e-06, "loss": 0.4984, "mean_token_accuracy": 0.835668683052063, "num_tokens": 44681290.0, "step": 37200 }, { "entropy": 1.902733063697815, "epoch": 0.11534774430991686, "grad_norm": 9.28559684753418, "learning_rate": 7.448931186984387e-06, "loss": 0.5584, "mean_token_accuracy": 0.8212042555212975, "num_tokens": 44692941.0, "step": 37210 }, { "entropy": 1.900286576151848, "epoch": 0.11537874343496655, "grad_norm": 8.697643280029297, "learning_rate": 7.447930430435463e-06, "loss": 0.5436, "mean_token_accuracy": 0.8290207460522652, "num_tokens": 44704646.0, "step": 37220 }, { "entropy": 1.8271199837327003, "epoch": 0.11540974256001624, "grad_norm": 10.892705917358398, "learning_rate": 7.446930077130154e-06, "loss": 0.4804, "mean_token_accuracy": 0.8422973319888115, "num_tokens": 44717316.0, "step": 37230 }, { "entropy": 1.875792995095253, "epoch": 0.11544074168506593, "grad_norm": 8.272143363952637, "learning_rate": 7.445930126797723e-06, "loss": 0.5986, "mean_token_accuracy": 0.8243017882108689, "num_tokens": 44729226.0, "step": 37240 }, { "entropy": 1.8794658452272415, "epoch": 0.11547174081011563, "grad_norm": 8.320490837097168, "learning_rate": 7.444930579167699e-06, "loss": 0.5515, "mean_token_accuracy": 0.8193329513072968, "num_tokens": 44740879.0, "step": 37250 }, { "entropy": 1.8461437433958054, "epoch": 0.11550273993516533, "grad_norm": 7.777599334716797, "learning_rate": 7.443931433969854e-06, "loss": 0.505, "mean_token_accuracy": 0.8361176624894142, "num_tokens": 44753425.0, "step": 37260 }, { "entropy": 1.9898195713758469, "epoch": 0.11553373906021502, "grad_norm": 12.342960357666016, "learning_rate": 7.442932690934222e-06, "loss": 0.6227, "mean_token_accuracy": 0.8236858904361725, "num_tokens": 44764029.0, "step": 37270 }, { "entropy": 1.850753267109394, "epoch": 0.11556473818526472, "grad_norm": 10.538897514343262, "learning_rate": 7.441934349791088e-06, "loss": 0.533, "mean_token_accuracy": 0.8256380185484886, "num_tokens": 44775779.0, "step": 37280 }, { "entropy": 1.7188558861613275, "epoch": 0.11559573731031442, "grad_norm": 4.442847728729248, "learning_rate": 7.440936410270987e-06, "loss": 0.3958, "mean_token_accuracy": 0.8477207824587822, "num_tokens": 44789900.0, "step": 37290 }, { "entropy": 1.854294976592064, "epoch": 0.11562673643536411, "grad_norm": 9.337111473083496, "learning_rate": 7.439938872104712e-06, "loss": 0.5511, "mean_token_accuracy": 0.8315454825758934, "num_tokens": 44801588.0, "step": 37300 }, { "entropy": 1.7844763696193695, "epoch": 0.11565773556041381, "grad_norm": 8.090426445007324, "learning_rate": 7.438941735023301e-06, "loss": 0.4811, "mean_token_accuracy": 0.8366570591926574, "num_tokens": 44813896.0, "step": 37310 }, { "entropy": 1.808639107644558, "epoch": 0.1156887346854635, "grad_norm": 4.602088451385498, "learning_rate": 7.437944998758055e-06, "loss": 0.5219, "mean_token_accuracy": 0.8374237030744552, "num_tokens": 44826692.0, "step": 37320 }, { "entropy": 1.9111674383282662, "epoch": 0.1157197338105132, "grad_norm": 9.890421867370605, "learning_rate": 7.436948663040519e-06, "loss": 0.5756, "mean_token_accuracy": 0.8278648778796196, "num_tokens": 44838639.0, "step": 37330 }, { "entropy": 1.9008503317832948, "epoch": 0.1157507329355629, "grad_norm": 10.691512107849121, "learning_rate": 7.435952727602491e-06, "loss": 0.5903, "mean_token_accuracy": 0.8256294175982475, "num_tokens": 44850123.0, "step": 37340 }, { "entropy": 1.9054113239049912, "epoch": 0.11578173206061258, "grad_norm": 9.177732467651367, "learning_rate": 7.434957192176021e-06, "loss": 0.5876, "mean_token_accuracy": 0.8166389659047126, "num_tokens": 44862112.0, "step": 37350 }, { "entropy": 1.7511047348380089, "epoch": 0.11581273118566228, "grad_norm": 9.275190353393555, "learning_rate": 7.4339620564934135e-06, "loss": 0.4522, "mean_token_accuracy": 0.8530907735228539, "num_tokens": 44875516.0, "step": 37360 }, { "entropy": 1.7619803249835968, "epoch": 0.11584373031071198, "grad_norm": 5.685603618621826, "learning_rate": 7.432967320287217e-06, "loss": 0.4322, "mean_token_accuracy": 0.834882402420044, "num_tokens": 44889047.0, "step": 37370 }, { "entropy": 1.8839653521776198, "epoch": 0.11587472943576167, "grad_norm": 9.889922142028809, "learning_rate": 7.431972983290233e-06, "loss": 0.5696, "mean_token_accuracy": 0.820878466963768, "num_tokens": 44901808.0, "step": 37380 }, { "entropy": 1.9028914406895638, "epoch": 0.11590572856081137, "grad_norm": 9.42833423614502, "learning_rate": 7.430979045235518e-06, "loss": 0.5525, "mean_token_accuracy": 0.8282787010073662, "num_tokens": 44912993.0, "step": 37390 }, { "entropy": 1.9422337591648102, "epoch": 0.11593672768586107, "grad_norm": 10.66053295135498, "learning_rate": 7.429985505856372e-06, "loss": 0.6107, "mean_token_accuracy": 0.813813716173172, "num_tokens": 44924256.0, "step": 37400 }, { "entropy": 1.8587273508310318, "epoch": 0.11596772681091076, "grad_norm": 7.039865493774414, "learning_rate": 7.428992364886347e-06, "loss": 0.5092, "mean_token_accuracy": 0.8369274318218232, "num_tokens": 44935852.0, "step": 37410 }, { "entropy": 1.8579163044691085, "epoch": 0.11599872593596046, "grad_norm": 3.244483470916748, "learning_rate": 7.427999622059245e-06, "loss": 0.5921, "mean_token_accuracy": 0.8229958653450012, "num_tokens": 44948437.0, "step": 37420 }, { "entropy": 1.8869281217455864, "epoch": 0.11602972506101016, "grad_norm": 8.914671897888184, "learning_rate": 7.427007277109115e-06, "loss": 0.5231, "mean_token_accuracy": 0.8428304120898247, "num_tokens": 44959542.0, "step": 37430 }, { "entropy": 1.88766710460186, "epoch": 0.11606072418605985, "grad_norm": 8.622590065002441, "learning_rate": 7.426015329770258e-06, "loss": 0.5914, "mean_token_accuracy": 0.8276036903262138, "num_tokens": 44970414.0, "step": 37440 }, { "entropy": 1.8154839545488357, "epoch": 0.11609172331110955, "grad_norm": 9.341155052185059, "learning_rate": 7.425023779777217e-06, "loss": 0.5234, "mean_token_accuracy": 0.8210044398903846, "num_tokens": 44983628.0, "step": 37450 }, { "entropy": 1.8794740170240403, "epoch": 0.11612272243615925, "grad_norm": 8.086356163024902, "learning_rate": 7.424032626864791e-06, "loss": 0.6234, "mean_token_accuracy": 0.8183172300457955, "num_tokens": 44994394.0, "step": 37460 }, { "entropy": 1.8632339894771577, "epoch": 0.11615372156120893, "grad_norm": 7.764867782592773, "learning_rate": 7.423041870768022e-06, "loss": 0.5312, "mean_token_accuracy": 0.8289612337946892, "num_tokens": 45005298.0, "step": 37470 }, { "entropy": 1.8341542795300483, "epoch": 0.11618472068625862, "grad_norm": 8.701674461364746, "learning_rate": 7.422051511222199e-06, "loss": 0.506, "mean_token_accuracy": 0.8334753260016441, "num_tokens": 45018299.0, "step": 37480 }, { "entropy": 1.915319959819317, "epoch": 0.11621571981130832, "grad_norm": 9.27292251586914, "learning_rate": 7.4210615479628625e-06, "loss": 0.5573, "mean_token_accuracy": 0.8271935701370239, "num_tokens": 45029787.0, "step": 37490 }, { "entropy": 1.8192658558487893, "epoch": 0.11624671893635802, "grad_norm": 5.670734882354736, "learning_rate": 7.420071980725793e-06, "loss": 0.4966, "mean_token_accuracy": 0.8343614637851715, "num_tokens": 45042647.0, "step": 37500 }, { "entropy": 1.9440273195505142, "epoch": 0.11627771806140771, "grad_norm": 10.800583839416504, "learning_rate": 7.419082809247022e-06, "loss": 0.6296, "mean_token_accuracy": 0.8177393227815628, "num_tokens": 45053626.0, "step": 37510 }, { "entropy": 1.8665451392531396, "epoch": 0.11630871718645741, "grad_norm": 11.033953666687012, "learning_rate": 7.41809403326283e-06, "loss": 0.5682, "mean_token_accuracy": 0.822401012480259, "num_tokens": 45064923.0, "step": 37520 }, { "entropy": 1.899769589304924, "epoch": 0.11633971631150711, "grad_norm": 8.622315406799316, "learning_rate": 7.417105652509737e-06, "loss": 0.5622, "mean_token_accuracy": 0.8258567243814469, "num_tokens": 45076477.0, "step": 37530 }, { "entropy": 1.951041680574417, "epoch": 0.1163707154365568, "grad_norm": 12.631427764892578, "learning_rate": 7.4161176667245125e-06, "loss": 0.6739, "mean_token_accuracy": 0.8053397119045258, "num_tokens": 45087117.0, "step": 37540 }, { "entropy": 1.740734612941742, "epoch": 0.1164017145616065, "grad_norm": 4.5630879402160645, "learning_rate": 7.415130075644172e-06, "loss": 0.4542, "mean_token_accuracy": 0.8315686360001564, "num_tokens": 45101656.0, "step": 37550 }, { "entropy": 1.8748736679553986, "epoch": 0.1164327136866562, "grad_norm": 8.166940689086914, "learning_rate": 7.414142879005973e-06, "loss": 0.5907, "mean_token_accuracy": 0.8158686429262161, "num_tokens": 45113754.0, "step": 37560 }, { "entropy": 1.8461486741900444, "epoch": 0.1164637128117059, "grad_norm": 10.343168258666992, "learning_rate": 7.4131560765474195e-06, "loss": 0.5501, "mean_token_accuracy": 0.8233461812138557, "num_tokens": 45126280.0, "step": 37570 }, { "entropy": 1.8095831125974655, "epoch": 0.11649471193675559, "grad_norm": 9.836362838745117, "learning_rate": 7.4121696680062626e-06, "loss": 0.531, "mean_token_accuracy": 0.8389960706233979, "num_tokens": 45138548.0, "step": 37580 }, { "entropy": 1.8525902017951013, "epoch": 0.11652571106180529, "grad_norm": 2.9441442489624023, "learning_rate": 7.411183653120493e-06, "loss": 0.5153, "mean_token_accuracy": 0.8308546274900437, "num_tokens": 45150311.0, "step": 37590 }, { "entropy": 1.8115593805909156, "epoch": 0.11655671018685497, "grad_norm": 10.99833869934082, "learning_rate": 7.410198031628346e-06, "loss": 0.5517, "mean_token_accuracy": 0.8222774997353554, "num_tokens": 45161956.0, "step": 37600 }, { "entropy": 1.7148562341928482, "epoch": 0.11658770931190467, "grad_norm": 9.413897514343262, "learning_rate": 7.409212803268305e-06, "loss": 0.5039, "mean_token_accuracy": 0.8297837525606155, "num_tokens": 45175439.0, "step": 37610 }, { "entropy": 1.8933440566062927, "epoch": 0.11661870843695436, "grad_norm": 9.255085945129395, "learning_rate": 7.408227967779092e-06, "loss": 0.6599, "mean_token_accuracy": 0.8186682641506196, "num_tokens": 45187297.0, "step": 37620 }, { "entropy": 1.8789986670017242, "epoch": 0.11664970756200406, "grad_norm": 7.9891180992126465, "learning_rate": 7.407243524899674e-06, "loss": 0.5768, "mean_token_accuracy": 0.82134408056736, "num_tokens": 45198638.0, "step": 37630 }, { "entropy": 1.9211965501308441, "epoch": 0.11668070668705376, "grad_norm": 7.830356597900391, "learning_rate": 7.40625947436926e-06, "loss": 0.6321, "mean_token_accuracy": 0.813792322576046, "num_tokens": 45209458.0, "step": 37640 }, { "entropy": 1.853388948738575, "epoch": 0.11671170581210345, "grad_norm": 9.149991989135742, "learning_rate": 7.405275815927302e-06, "loss": 0.5775, "mean_token_accuracy": 0.8271188646554947, "num_tokens": 45220490.0, "step": 37650 }, { "entropy": 1.8560687810182572, "epoch": 0.11674270493715315, "grad_norm": 10.992624282836914, "learning_rate": 7.404292549313496e-06, "loss": 0.5735, "mean_token_accuracy": 0.8299055308103561, "num_tokens": 45231538.0, "step": 37660 }, { "entropy": 1.8339884772896766, "epoch": 0.11677370406220285, "grad_norm": 9.377613067626953, "learning_rate": 7.403309674267774e-06, "loss": 0.5771, "mean_token_accuracy": 0.8145007371902466, "num_tokens": 45243273.0, "step": 37670 }, { "entropy": 1.8108773604035378, "epoch": 0.11680470318725254, "grad_norm": 8.911115646362305, "learning_rate": 7.402327190530316e-06, "loss": 0.5862, "mean_token_accuracy": 0.8245181262493133, "num_tokens": 45255185.0, "step": 37680 }, { "entropy": 1.7524414852261543, "epoch": 0.11683570231230224, "grad_norm": 8.754755973815918, "learning_rate": 7.40134509784154e-06, "loss": 0.442, "mean_token_accuracy": 0.8447420582175255, "num_tokens": 45268461.0, "step": 37690 }, { "entropy": 1.853088989853859, "epoch": 0.11686670143735194, "grad_norm": 8.089972496032715, "learning_rate": 7.400363395942107e-06, "loss": 0.5872, "mean_token_accuracy": 0.8264397710561753, "num_tokens": 45279403.0, "step": 37700 }, { "entropy": 1.8071050971746445, "epoch": 0.11689770056240163, "grad_norm": 4.270401954650879, "learning_rate": 7.399382084572917e-06, "loss": 0.5712, "mean_token_accuracy": 0.829101575911045, "num_tokens": 45292068.0, "step": 37710 }, { "entropy": 1.878752313554287, "epoch": 0.11692869968745132, "grad_norm": 5.220165252685547, "learning_rate": 7.39840116347511e-06, "loss": 0.5356, "mean_token_accuracy": 0.828040985763073, "num_tokens": 45303427.0, "step": 37720 }, { "entropy": 1.860033529996872, "epoch": 0.11695969881250101, "grad_norm": 8.818233489990234, "learning_rate": 7.397420632390068e-06, "loss": 0.5614, "mean_token_accuracy": 0.8269825220108032, "num_tokens": 45314769.0, "step": 37730 }, { "entropy": 1.88105977922678, "epoch": 0.11699069793755071, "grad_norm": 8.774882316589355, "learning_rate": 7.396440491059412e-06, "loss": 0.5873, "mean_token_accuracy": 0.82732093334198, "num_tokens": 45326083.0, "step": 37740 }, { "entropy": 1.9004520326852798, "epoch": 0.1170216970626004, "grad_norm": 7.999516487121582, "learning_rate": 7.395460739225003e-06, "loss": 0.5686, "mean_token_accuracy": 0.8362064957618713, "num_tokens": 45337413.0, "step": 37750 }, { "entropy": 1.858451707661152, "epoch": 0.1170526961876501, "grad_norm": 10.978395462036133, "learning_rate": 7.39448137662894e-06, "loss": 0.583, "mean_token_accuracy": 0.8314261004328728, "num_tokens": 45349411.0, "step": 37760 }, { "entropy": 1.7877505451440812, "epoch": 0.1170836953126998, "grad_norm": 8.580302238464355, "learning_rate": 7.393502403013563e-06, "loss": 0.539, "mean_token_accuracy": 0.8316345691680909, "num_tokens": 45362113.0, "step": 37770 }, { "entropy": 1.7756628662347793, "epoch": 0.1171146944377495, "grad_norm": 2.5867841243743896, "learning_rate": 7.3925238181214465e-06, "loss": 0.4867, "mean_token_accuracy": 0.8359642580151558, "num_tokens": 45375493.0, "step": 37780 }, { "entropy": 1.9025995954871178, "epoch": 0.11714569356279919, "grad_norm": 8.61524486541748, "learning_rate": 7.39154562169541e-06, "loss": 0.6269, "mean_token_accuracy": 0.8146417796611786, "num_tokens": 45386898.0, "step": 37790 }, { "entropy": 1.8143214404582977, "epoch": 0.11717669268784889, "grad_norm": 4.412146091461182, "learning_rate": 7.390567813478508e-06, "loss": 0.5244, "mean_token_accuracy": 0.8260921567678452, "num_tokens": 45398636.0, "step": 37800 }, { "entropy": 1.8221836000680924, "epoch": 0.11720769181289858, "grad_norm": 9.057048797607422, "learning_rate": 7.38959039321403e-06, "loss": 0.4918, "mean_token_accuracy": 0.8306093484163284, "num_tokens": 45411214.0, "step": 37810 }, { "entropy": 1.7029403105378151, "epoch": 0.11723869093794828, "grad_norm": 3.986116886138916, "learning_rate": 7.388613360645508e-06, "loss": 0.4131, "mean_token_accuracy": 0.8498943701386452, "num_tokens": 45424321.0, "step": 37820 }, { "entropy": 1.8506017461419106, "epoch": 0.11726969006299798, "grad_norm": 11.799568176269531, "learning_rate": 7.387636715516706e-06, "loss": 0.5875, "mean_token_accuracy": 0.8202611520886421, "num_tokens": 45435300.0, "step": 37830 }, { "entropy": 1.7843811869621278, "epoch": 0.11730068918804766, "grad_norm": 9.326491355895996, "learning_rate": 7.38666045757163e-06, "loss": 0.5139, "mean_token_accuracy": 0.8335476636886596, "num_tokens": 45448436.0, "step": 37840 }, { "entropy": 1.7802442833781242, "epoch": 0.11733168831309736, "grad_norm": 9.365975379943848, "learning_rate": 7.38568458655452e-06, "loss": 0.4799, "mean_token_accuracy": 0.8346278235316277, "num_tokens": 45461982.0, "step": 37850 }, { "entropy": 1.774220162630081, "epoch": 0.11736268743814705, "grad_norm": 5.482513427734375, "learning_rate": 7.384709102209855e-06, "loss": 0.5163, "mean_token_accuracy": 0.8305883064866066, "num_tokens": 45474506.0, "step": 37860 }, { "entropy": 1.7526094153523446, "epoch": 0.11739368656319675, "grad_norm": 4.42122220993042, "learning_rate": 7.383734004282347e-06, "loss": 0.5358, "mean_token_accuracy": 0.8322111248970032, "num_tokens": 45487584.0, "step": 37870 }, { "entropy": 1.7954672902822495, "epoch": 0.11742468568824645, "grad_norm": 9.287115097045898, "learning_rate": 7.382759292516944e-06, "loss": 0.4978, "mean_token_accuracy": 0.8465468645095825, "num_tokens": 45499890.0, "step": 37880 }, { "entropy": 1.8831429094076158, "epoch": 0.11745568481329614, "grad_norm": 8.810885429382324, "learning_rate": 7.3817849666588325e-06, "loss": 0.5603, "mean_token_accuracy": 0.8357957974076271, "num_tokens": 45511132.0, "step": 37890 }, { "entropy": 1.8124101281166076, "epoch": 0.11748668393834584, "grad_norm": 8.837002754211426, "learning_rate": 7.3808110264534325e-06, "loss": 0.5354, "mean_token_accuracy": 0.8318711072206497, "num_tokens": 45523253.0, "step": 37900 }, { "entropy": 1.8693040862679482, "epoch": 0.11751768306339554, "grad_norm": 7.558449745178223, "learning_rate": 7.379837471646401e-06, "loss": 0.6194, "mean_token_accuracy": 0.817514568567276, "num_tokens": 45534537.0, "step": 37910 }, { "entropy": 1.8211430594325067, "epoch": 0.11754868218844523, "grad_norm": 9.2406005859375, "learning_rate": 7.378864301983624e-06, "loss": 0.521, "mean_token_accuracy": 0.8348601713776589, "num_tokens": 45547222.0, "step": 37920 }, { "entropy": 1.788476151227951, "epoch": 0.11757968131349493, "grad_norm": 8.003228187561035, "learning_rate": 7.3778915172112294e-06, "loss": 0.5148, "mean_token_accuracy": 0.8349843844771385, "num_tokens": 45558966.0, "step": 37930 }, { "entropy": 1.8203987568616866, "epoch": 0.11761068043854463, "grad_norm": 8.81309700012207, "learning_rate": 7.376919117075574e-06, "loss": 0.5361, "mean_token_accuracy": 0.8295062810182572, "num_tokens": 45571715.0, "step": 37940 }, { "entropy": 1.8281734257936477, "epoch": 0.11764167956359432, "grad_norm": 9.268831253051758, "learning_rate": 7.375947101323252e-06, "loss": 0.5534, "mean_token_accuracy": 0.8323431298136711, "num_tokens": 45583585.0, "step": 37950 }, { "entropy": 1.8576543658971787, "epoch": 0.11767267868864402, "grad_norm": 9.705360412597656, "learning_rate": 7.37497546970109e-06, "loss": 0.5438, "mean_token_accuracy": 0.8329675674438477, "num_tokens": 45595651.0, "step": 37960 }, { "entropy": 1.7942541763186455, "epoch": 0.1177036778136937, "grad_norm": 8.475988388061523, "learning_rate": 7.374004221956146e-06, "loss": 0.5249, "mean_token_accuracy": 0.8307255417108536, "num_tokens": 45608005.0, "step": 37970 }, { "entropy": 1.7157593086361884, "epoch": 0.1177346769387434, "grad_norm": 10.300793647766113, "learning_rate": 7.373033357835715e-06, "loss": 0.4363, "mean_token_accuracy": 0.8378100886940956, "num_tokens": 45621783.0, "step": 37980 }, { "entropy": 1.8599382385611534, "epoch": 0.1177656760637931, "grad_norm": 9.142414093017578, "learning_rate": 7.372062877087321e-06, "loss": 0.5541, "mean_token_accuracy": 0.8253407448530197, "num_tokens": 45632804.0, "step": 37990 }, { "entropy": 1.8708630874752998, "epoch": 0.11779667518884279, "grad_norm": 9.671748161315918, "learning_rate": 7.371092779458723e-06, "loss": 0.5699, "mean_token_accuracy": 0.8188276901841164, "num_tokens": 45644415.0, "step": 38000 }, { "entropy": 1.8491891831159593, "epoch": 0.11782767431389249, "grad_norm": 8.546015739440918, "learning_rate": 7.3701230646979126e-06, "loss": 0.551, "mean_token_accuracy": 0.8339832827448845, "num_tokens": 45655755.0, "step": 38010 }, { "entropy": 1.8837746277451515, "epoch": 0.11785867343894219, "grad_norm": 10.270685195922852, "learning_rate": 7.369153732553109e-06, "loss": 0.575, "mean_token_accuracy": 0.8240560159087181, "num_tokens": 45666750.0, "step": 38020 }, { "entropy": 1.8285693258047104, "epoch": 0.11788967256399188, "grad_norm": 8.460458755493164, "learning_rate": 7.368184782772773e-06, "loss": 0.5454, "mean_token_accuracy": 0.8289884492754936, "num_tokens": 45678928.0, "step": 38030 }, { "entropy": 1.903639057278633, "epoch": 0.11792067168904158, "grad_norm": 9.557387351989746, "learning_rate": 7.367216215105582e-06, "loss": 0.6053, "mean_token_accuracy": 0.8232555508613586, "num_tokens": 45689650.0, "step": 38040 }, { "entropy": 1.840640588104725, "epoch": 0.11795167081409128, "grad_norm": 10.47343635559082, "learning_rate": 7.366248029300462e-06, "loss": 0.551, "mean_token_accuracy": 0.8262113586068154, "num_tokens": 45701186.0, "step": 38050 }, { "entropy": 1.6772912368178368, "epoch": 0.11798266993914097, "grad_norm": 5.621872425079346, "learning_rate": 7.365280225106553e-06, "loss": 0.4109, "mean_token_accuracy": 0.8406716391444207, "num_tokens": 45715025.0, "step": 38060 }, { "entropy": 1.674746471643448, "epoch": 0.11801366906419067, "grad_norm": 8.922220230102539, "learning_rate": 7.364312802273238e-06, "loss": 0.4541, "mean_token_accuracy": 0.8508239656686782, "num_tokens": 45729652.0, "step": 38070 }, { "entropy": 1.8886992782354355, "epoch": 0.11804466818924036, "grad_norm": 9.37268352508545, "learning_rate": 7.3633457605501245e-06, "loss": 0.5581, "mean_token_accuracy": 0.8332405045628548, "num_tokens": 45740799.0, "step": 38080 }, { "entropy": 1.7671217247843742, "epoch": 0.11807566731429005, "grad_norm": 3.940173387527466, "learning_rate": 7.362379099687053e-06, "loss": 0.4674, "mean_token_accuracy": 0.8395871177315712, "num_tokens": 45753706.0, "step": 38090 }, { "entropy": 1.83549507856369, "epoch": 0.11810666643933974, "grad_norm": 10.712289810180664, "learning_rate": 7.3614128194340895e-06, "loss": 0.5369, "mean_token_accuracy": 0.8255318030714989, "num_tokens": 45765958.0, "step": 38100 }, { "entropy": 1.775768581032753, "epoch": 0.11813766556438944, "grad_norm": 9.007753372192383, "learning_rate": 7.360446919541537e-06, "loss": 0.5142, "mean_token_accuracy": 0.8286249712109566, "num_tokens": 45778973.0, "step": 38110 }, { "entropy": 1.8200758472085, "epoch": 0.11816866468943914, "grad_norm": 9.196110725402832, "learning_rate": 7.359481399759919e-06, "loss": 0.5358, "mean_token_accuracy": 0.8254531383514404, "num_tokens": 45791157.0, "step": 38120 }, { "entropy": 1.8400531589984894, "epoch": 0.11819966381448883, "grad_norm": 8.93841552734375, "learning_rate": 7.358516259839993e-06, "loss": 0.5626, "mean_token_accuracy": 0.8274215057492256, "num_tokens": 45802910.0, "step": 38130 }, { "entropy": 1.8769369035959245, "epoch": 0.11823066293953853, "grad_norm": 8.658340454101562, "learning_rate": 7.3575514995327465e-06, "loss": 0.59, "mean_token_accuracy": 0.8198636502027512, "num_tokens": 45814938.0, "step": 38140 }, { "entropy": 1.8544082716107368, "epoch": 0.11826166206458823, "grad_norm": 10.661956787109375, "learning_rate": 7.3565871185893936e-06, "loss": 0.5401, "mean_token_accuracy": 0.833048839867115, "num_tokens": 45826509.0, "step": 38150 }, { "entropy": 1.8468938201665879, "epoch": 0.11829266118963792, "grad_norm": 8.67770004272461, "learning_rate": 7.3556231167613724e-06, "loss": 0.5409, "mean_token_accuracy": 0.829650467634201, "num_tokens": 45838046.0, "step": 38160 }, { "entropy": 1.8621126562356949, "epoch": 0.11832366031468762, "grad_norm": 6.933115005493164, "learning_rate": 7.354659493800356e-06, "loss": 0.5648, "mean_token_accuracy": 0.8264819413423539, "num_tokens": 45850028.0, "step": 38170 }, { "entropy": 1.8032435089349748, "epoch": 0.11835465943973732, "grad_norm": 7.068675994873047, "learning_rate": 7.353696249458242e-06, "loss": 0.5598, "mean_token_accuracy": 0.8219858318567276, "num_tokens": 45862980.0, "step": 38180 }, { "entropy": 1.8914562717080117, "epoch": 0.11838565856478701, "grad_norm": 8.82200813293457, "learning_rate": 7.352733383487156e-06, "loss": 0.6047, "mean_token_accuracy": 0.8125669702887535, "num_tokens": 45875364.0, "step": 38190 }, { "entropy": 1.77557475566864, "epoch": 0.11841665768983671, "grad_norm": 4.266529083251953, "learning_rate": 7.3517708956394485e-06, "loss": 0.4902, "mean_token_accuracy": 0.843745194375515, "num_tokens": 45887863.0, "step": 38200 }, { "entropy": 1.8870860502123832, "epoch": 0.1184476568148864, "grad_norm": 9.665167808532715, "learning_rate": 7.3508087856677e-06, "loss": 0.5614, "mean_token_accuracy": 0.8245103910565377, "num_tokens": 45899658.0, "step": 38210 }, { "entropy": 1.7931729927659035, "epoch": 0.11847865593993609, "grad_norm": 10.355236053466797, "learning_rate": 7.3498470533247175e-06, "loss": 0.5392, "mean_token_accuracy": 0.8335598543286323, "num_tokens": 45912392.0, "step": 38220 }, { "entropy": 1.739191673696041, "epoch": 0.11850965506498579, "grad_norm": 4.354625225067139, "learning_rate": 7.34888569836353e-06, "loss": 0.4508, "mean_token_accuracy": 0.8427866518497467, "num_tokens": 45925460.0, "step": 38230 }, { "entropy": 1.832131990790367, "epoch": 0.11854065419003548, "grad_norm": 11.056679725646973, "learning_rate": 7.347924720537399e-06, "loss": 0.5573, "mean_token_accuracy": 0.8355686992406846, "num_tokens": 45937998.0, "step": 38240 }, { "entropy": 1.8358628578484057, "epoch": 0.11857165331508518, "grad_norm": 2.7926924228668213, "learning_rate": 7.346964119599805e-06, "loss": 0.5361, "mean_token_accuracy": 0.8267253056168556, "num_tokens": 45950095.0, "step": 38250 }, { "entropy": 1.8598995104432106, "epoch": 0.11860265244013488, "grad_norm": 9.030755996704102, "learning_rate": 7.346003895304459e-06, "loss": 0.5634, "mean_token_accuracy": 0.8271134316921234, "num_tokens": 45962187.0, "step": 38260 }, { "entropy": 1.8761465281248093, "epoch": 0.11863365156518457, "grad_norm": 4.900582313537598, "learning_rate": 7.345044047405296e-06, "loss": 0.5422, "mean_token_accuracy": 0.8255822539329529, "num_tokens": 45974742.0, "step": 38270 }, { "entropy": 1.8882175594568253, "epoch": 0.11866465069023427, "grad_norm": 9.869363784790039, "learning_rate": 7.344084575656477e-06, "loss": 0.578, "mean_token_accuracy": 0.818389254808426, "num_tokens": 45986513.0, "step": 38280 }, { "entropy": 1.9118270069360732, "epoch": 0.11869564981528397, "grad_norm": 10.015856742858887, "learning_rate": 7.343125479812384e-06, "loss": 0.626, "mean_token_accuracy": 0.8135067865252494, "num_tokens": 45998695.0, "step": 38290 }, { "entropy": 1.7809939235448837, "epoch": 0.11872664894033366, "grad_norm": 2.5885379314422607, "learning_rate": 7.342166759627627e-06, "loss": 0.4918, "mean_token_accuracy": 0.8359186798334122, "num_tokens": 46011982.0, "step": 38300 }, { "entropy": 1.90637346804142, "epoch": 0.11875764806538336, "grad_norm": 9.184483528137207, "learning_rate": 7.341208414857039e-06, "loss": 0.5245, "mean_token_accuracy": 0.8383510798215866, "num_tokens": 46023222.0, "step": 38310 }, { "entropy": 1.9249227464199066, "epoch": 0.11878864719043306, "grad_norm": 8.180423736572266, "learning_rate": 7.340250445255678e-06, "loss": 0.591, "mean_token_accuracy": 0.822523207962513, "num_tokens": 46033818.0, "step": 38320 }, { "entropy": 1.8500513106584549, "epoch": 0.11881964631548275, "grad_norm": 8.05856990814209, "learning_rate": 7.3392928505788245e-06, "loss": 0.5562, "mean_token_accuracy": 0.8258534908294678, "num_tokens": 46045985.0, "step": 38330 }, { "entropy": 1.891994585096836, "epoch": 0.11885064544053243, "grad_norm": 10.122121810913086, "learning_rate": 7.338335630581982e-06, "loss": 0.5748, "mean_token_accuracy": 0.8260172605514526, "num_tokens": 46057552.0, "step": 38340 }, { "entropy": 1.9085305988788606, "epoch": 0.11888164456558213, "grad_norm": 8.372586250305176, "learning_rate": 7.3373787850208775e-06, "loss": 0.5706, "mean_token_accuracy": 0.8344064444303513, "num_tokens": 46068444.0, "step": 38350 }, { "entropy": 1.8459767132997513, "epoch": 0.11891264369063183, "grad_norm": 9.715620040893555, "learning_rate": 7.336422313651464e-06, "loss": 0.5169, "mean_token_accuracy": 0.83453588783741, "num_tokens": 46080267.0, "step": 38360 }, { "entropy": 1.966032502055168, "epoch": 0.11894364281568152, "grad_norm": 8.732338905334473, "learning_rate": 7.33546621622991e-06, "loss": 0.6502, "mean_token_accuracy": 0.8121734812855721, "num_tokens": 46091339.0, "step": 38370 }, { "entropy": 1.8556820943951606, "epoch": 0.11897464194073122, "grad_norm": 10.176673889160156, "learning_rate": 7.334510492512614e-06, "loss": 0.5158, "mean_token_accuracy": 0.837830537557602, "num_tokens": 46102953.0, "step": 38380 }, { "entropy": 1.883063942193985, "epoch": 0.11900564106578092, "grad_norm": 7.522932052612305, "learning_rate": 7.3335551422561916e-06, "loss": 0.5412, "mean_token_accuracy": 0.8345891013741493, "num_tokens": 46114625.0, "step": 38390 }, { "entropy": 1.8097529754042625, "epoch": 0.11903664019083061, "grad_norm": 3.6872365474700928, "learning_rate": 7.3326001652174846e-06, "loss": 0.515, "mean_token_accuracy": 0.8357144206762314, "num_tokens": 46126642.0, "step": 38400 }, { "entropy": 1.8187772005796432, "epoch": 0.11906763931588031, "grad_norm": 11.781440734863281, "learning_rate": 7.331645561153551e-06, "loss": 0.5275, "mean_token_accuracy": 0.8383165881037712, "num_tokens": 46138142.0, "step": 38410 }, { "entropy": 1.8450339302420615, "epoch": 0.11909863844093001, "grad_norm": 12.367788314819336, "learning_rate": 7.330691329821676e-06, "loss": 0.5472, "mean_token_accuracy": 0.8337818294763565, "num_tokens": 46149774.0, "step": 38420 }, { "entropy": 1.8153291314840316, "epoch": 0.1191296375659797, "grad_norm": 8.434037208557129, "learning_rate": 7.329737470979359e-06, "loss": 0.5374, "mean_token_accuracy": 0.8328280210494995, "num_tokens": 46162425.0, "step": 38430 }, { "entropy": 1.8206452041864396, "epoch": 0.1191606366910294, "grad_norm": 8.897926330566406, "learning_rate": 7.328783984384326e-06, "loss": 0.4986, "mean_token_accuracy": 0.8331076070666313, "num_tokens": 46175025.0, "step": 38440 }, { "entropy": 1.8406914666295051, "epoch": 0.1191916358160791, "grad_norm": 9.005738258361816, "learning_rate": 7.327830869794524e-06, "loss": 0.5414, "mean_token_accuracy": 0.8288857981562614, "num_tokens": 46187094.0, "step": 38450 }, { "entropy": 1.8507864832878114, "epoch": 0.11922263494112878, "grad_norm": 5.744321823120117, "learning_rate": 7.326878126968114e-06, "loss": 0.5862, "mean_token_accuracy": 0.815381346642971, "num_tokens": 46199554.0, "step": 38460 }, { "entropy": 1.8325456768274306, "epoch": 0.11925363406617848, "grad_norm": 10.185477256774902, "learning_rate": 7.325925755663483e-06, "loss": 0.5335, "mean_token_accuracy": 0.8304981887340546, "num_tokens": 46211540.0, "step": 38470 }, { "entropy": 1.8854450479149818, "epoch": 0.11928463319122817, "grad_norm": 8.560661315917969, "learning_rate": 7.324973755639235e-06, "loss": 0.5288, "mean_token_accuracy": 0.823890820145607, "num_tokens": 46223017.0, "step": 38480 }, { "entropy": 1.892309932410717, "epoch": 0.11931563231627787, "grad_norm": 9.574992179870605, "learning_rate": 7.324022126654195e-06, "loss": 0.5751, "mean_token_accuracy": 0.823742826282978, "num_tokens": 46235073.0, "step": 38490 }, { "entropy": 1.814597336947918, "epoch": 0.11934663144132757, "grad_norm": 8.49984359741211, "learning_rate": 7.323070868467407e-06, "loss": 0.5, "mean_token_accuracy": 0.8360254690051079, "num_tokens": 46247594.0, "step": 38500 }, { "entropy": 1.8429415673017502, "epoch": 0.11937763056637726, "grad_norm": 8.082172393798828, "learning_rate": 7.322119980838131e-06, "loss": 0.5642, "mean_token_accuracy": 0.8176757171750069, "num_tokens": 46259339.0, "step": 38510 }, { "entropy": 1.8391290426254272, "epoch": 0.11940862969142696, "grad_norm": 8.323386192321777, "learning_rate": 7.32116946352585e-06, "loss": 0.5536, "mean_token_accuracy": 0.8202460646629334, "num_tokens": 46271303.0, "step": 38520 }, { "entropy": 1.8703968793153762, "epoch": 0.11943962881647666, "grad_norm": 4.209524631500244, "learning_rate": 7.3202193162902655e-06, "loss": 0.5362, "mean_token_accuracy": 0.8387380838394165, "num_tokens": 46282368.0, "step": 38530 }, { "entropy": 1.8230261772871017, "epoch": 0.11947062794152635, "grad_norm": 9.72610855102539, "learning_rate": 7.319269538891293e-06, "loss": 0.5044, "mean_token_accuracy": 0.8248476162552834, "num_tokens": 46294921.0, "step": 38540 }, { "entropy": 1.827505610883236, "epoch": 0.11950162706657605, "grad_norm": 8.594587326049805, "learning_rate": 7.318320131089069e-06, "loss": 0.5674, "mean_token_accuracy": 0.826340651512146, "num_tokens": 46307519.0, "step": 38550 }, { "entropy": 1.8293501362204552, "epoch": 0.11953262619162575, "grad_norm": 7.893844127655029, "learning_rate": 7.31737109264395e-06, "loss": 0.4868, "mean_token_accuracy": 0.8477142170071602, "num_tokens": 46319880.0, "step": 38560 }, { "entropy": 1.967301172018051, "epoch": 0.11956362531667544, "grad_norm": 10.387765884399414, "learning_rate": 7.316422423316503e-06, "loss": 0.6318, "mean_token_accuracy": 0.8172219157218933, "num_tokens": 46331125.0, "step": 38570 }, { "entropy": 1.9015961229801177, "epoch": 0.11959462444172514, "grad_norm": 8.525230407714844, "learning_rate": 7.315474122867519e-06, "loss": 0.5207, "mean_token_accuracy": 0.8258678615093231, "num_tokens": 46343547.0, "step": 38580 }, { "entropy": 1.877706202864647, "epoch": 0.11962562356677482, "grad_norm": 8.298994064331055, "learning_rate": 7.314526191058002e-06, "loss": 0.5629, "mean_token_accuracy": 0.8354897379875184, "num_tokens": 46354401.0, "step": 38590 }, { "entropy": 1.7901566669344902, "epoch": 0.11965662269182452, "grad_norm": 8.73811149597168, "learning_rate": 7.313578627649177e-06, "loss": 0.4969, "mean_token_accuracy": 0.8303673461079597, "num_tokens": 46367136.0, "step": 38600 }, { "entropy": 1.855716420710087, "epoch": 0.11968762181687422, "grad_norm": 8.627557754516602, "learning_rate": 7.31263143240248e-06, "loss": 0.552, "mean_token_accuracy": 0.8318748503923417, "num_tokens": 46378695.0, "step": 38610 }, { "entropy": 1.8308327570557594, "epoch": 0.11971862094192391, "grad_norm": 4.684299468994141, "learning_rate": 7.3116846050795675e-06, "loss": 0.5274, "mean_token_accuracy": 0.8276481464505195, "num_tokens": 46390539.0, "step": 38620 }, { "entropy": 1.7808781787753105, "epoch": 0.11974962006697361, "grad_norm": 9.359618186950684, "learning_rate": 7.3107381454423095e-06, "loss": 0.5116, "mean_token_accuracy": 0.8304883912205696, "num_tokens": 46403361.0, "step": 38630 }, { "entropy": 1.832536792755127, "epoch": 0.1197806191920233, "grad_norm": 9.308186531066895, "learning_rate": 7.3097920532527925e-06, "loss": 0.5477, "mean_token_accuracy": 0.8254012614488602, "num_tokens": 46415320.0, "step": 38640 }, { "entropy": 1.875816921889782, "epoch": 0.119811618317073, "grad_norm": 6.979348182678223, "learning_rate": 7.3088463282733195e-06, "loss": 0.6144, "mean_token_accuracy": 0.8188940063118935, "num_tokens": 46426785.0, "step": 38650 }, { "entropy": 1.896639221906662, "epoch": 0.1198426174421227, "grad_norm": 8.80232048034668, "learning_rate": 7.307900970266406e-06, "loss": 0.5687, "mean_token_accuracy": 0.831016306579113, "num_tokens": 46437545.0, "step": 38660 }, { "entropy": 1.8298520892858505, "epoch": 0.1198736165671724, "grad_norm": 6.179274559020996, "learning_rate": 7.3069559789947875e-06, "loss": 0.6193, "mean_token_accuracy": 0.8189263239502906, "num_tokens": 46449253.0, "step": 38670 }, { "entropy": 1.8082061618566514, "epoch": 0.11990461569222209, "grad_norm": 7.518819808959961, "learning_rate": 7.306011354221407e-06, "loss": 0.5346, "mean_token_accuracy": 0.8381061047315598, "num_tokens": 46461547.0, "step": 38680 }, { "entropy": 1.8741240479052066, "epoch": 0.11993561481727179, "grad_norm": 8.276564598083496, "learning_rate": 7.305067095709427e-06, "loss": 0.5714, "mean_token_accuracy": 0.8233344674110412, "num_tokens": 46473612.0, "step": 38690 }, { "entropy": 1.82129565179348, "epoch": 0.11996661394232148, "grad_norm": 3.840449571609497, "learning_rate": 7.3041232032222255e-06, "loss": 0.5298, "mean_token_accuracy": 0.8374928280711174, "num_tokens": 46485577.0, "step": 38700 }, { "entropy": 1.7987656205892564, "epoch": 0.11999761306737117, "grad_norm": 9.687265396118164, "learning_rate": 7.303179676523391e-06, "loss": 0.5269, "mean_token_accuracy": 0.8323082059621811, "num_tokens": 46497845.0, "step": 38710 }, { "entropy": 1.8387595310807228, "epoch": 0.12002861219242086, "grad_norm": 7.756543159484863, "learning_rate": 7.302236515376725e-06, "loss": 0.5594, "mean_token_accuracy": 0.8210761576890946, "num_tokens": 46510088.0, "step": 38720 }, { "entropy": 1.7564123407006265, "epoch": 0.12005961131747056, "grad_norm": 8.537627220153809, "learning_rate": 7.301293719546245e-06, "loss": 0.5524, "mean_token_accuracy": 0.831873744726181, "num_tokens": 46522634.0, "step": 38730 }, { "entropy": 1.8440282315015792, "epoch": 0.12009061044252026, "grad_norm": 7.22578763961792, "learning_rate": 7.300351288796182e-06, "loss": 0.575, "mean_token_accuracy": 0.8209284529089927, "num_tokens": 46534334.0, "step": 38740 }, { "entropy": 1.8955597847700119, "epoch": 0.12012160956756995, "grad_norm": 8.206884384155273, "learning_rate": 7.299409222890979e-06, "loss": 0.6, "mean_token_accuracy": 0.8230985984206199, "num_tokens": 46546015.0, "step": 38750 }, { "entropy": 1.831144355237484, "epoch": 0.12015260869261965, "grad_norm": 9.138228416442871, "learning_rate": 7.298467521595293e-06, "loss": 0.4659, "mean_token_accuracy": 0.8403099969029426, "num_tokens": 46558715.0, "step": 38760 }, { "entropy": 1.7825349926948548, "epoch": 0.12018360781766935, "grad_norm": 8.191915512084961, "learning_rate": 7.297526184673988e-06, "loss": 0.5195, "mean_token_accuracy": 0.8280392870306968, "num_tokens": 46572867.0, "step": 38770 }, { "entropy": 1.8217300280928612, "epoch": 0.12021460694271904, "grad_norm": 8.535613059997559, "learning_rate": 7.296585211892146e-06, "loss": 0.483, "mean_token_accuracy": 0.8408209979534149, "num_tokens": 46585902.0, "step": 38780 }, { "entropy": 1.8220679610967636, "epoch": 0.12024560606776874, "grad_norm": 4.572232246398926, "learning_rate": 7.295644603015063e-06, "loss": 0.4769, "mean_token_accuracy": 0.8344119265675545, "num_tokens": 46599076.0, "step": 38790 }, { "entropy": 1.8718759939074516, "epoch": 0.12027660519281844, "grad_norm": 9.355148315429688, "learning_rate": 7.294704357808237e-06, "loss": 0.5121, "mean_token_accuracy": 0.8285117372870445, "num_tokens": 46611131.0, "step": 38800 }, { "entropy": 1.901232734322548, "epoch": 0.12030760431786813, "grad_norm": 10.37480354309082, "learning_rate": 7.2937644760373896e-06, "loss": 0.5814, "mean_token_accuracy": 0.8172040551900863, "num_tokens": 46623342.0, "step": 38810 }, { "entropy": 1.8803485922515393, "epoch": 0.12033860344291783, "grad_norm": 9.32126235961914, "learning_rate": 7.292824957468444e-06, "loss": 0.5681, "mean_token_accuracy": 0.8315271034836769, "num_tokens": 46634924.0, "step": 38820 }, { "entropy": 1.9120069742202759, "epoch": 0.12036960256796751, "grad_norm": 10.770748138427734, "learning_rate": 7.29188580186754e-06, "loss": 0.5592, "mean_token_accuracy": 0.8313893929123879, "num_tokens": 46647007.0, "step": 38830 }, { "entropy": 1.7526630774140357, "epoch": 0.12040060169301721, "grad_norm": 4.222184181213379, "learning_rate": 7.290947009001024e-06, "loss": 0.4601, "mean_token_accuracy": 0.8405937299132347, "num_tokens": 46660402.0, "step": 38840 }, { "entropy": 1.8250176429748535, "epoch": 0.1204316008180669, "grad_norm": 8.69330883026123, "learning_rate": 7.290008578635457e-06, "loss": 0.5383, "mean_token_accuracy": 0.82737468034029, "num_tokens": 46673041.0, "step": 38850 }, { "entropy": 1.8118784487247468, "epoch": 0.1204625999431166, "grad_norm": 9.943561553955078, "learning_rate": 7.289070510537608e-06, "loss": 0.5215, "mean_token_accuracy": 0.8418055370450019, "num_tokens": 46686648.0, "step": 38860 }, { "entropy": 1.9078863650560378, "epoch": 0.1204935990681663, "grad_norm": 9.226347923278809, "learning_rate": 7.288132804474457e-06, "loss": 0.5599, "mean_token_accuracy": 0.8267140224575996, "num_tokens": 46698591.0, "step": 38870 }, { "entropy": 1.8343891605734826, "epoch": 0.120524598193216, "grad_norm": 5.343838691711426, "learning_rate": 7.28719546021319e-06, "loss": 0.5039, "mean_token_accuracy": 0.8361982107162476, "num_tokens": 46711162.0, "step": 38880 }, { "entropy": 1.8952967941761016, "epoch": 0.12055559731826569, "grad_norm": 7.90724515914917, "learning_rate": 7.286258477521211e-06, "loss": 0.5974, "mean_token_accuracy": 0.8240769699215889, "num_tokens": 46723517.0, "step": 38890 }, { "entropy": 1.8919255122542382, "epoch": 0.12058659644331539, "grad_norm": 8.763474464416504, "learning_rate": 7.285321856166125e-06, "loss": 0.5488, "mean_token_accuracy": 0.8284026741981506, "num_tokens": 46734522.0, "step": 38900 }, { "entropy": 1.9270798504352569, "epoch": 0.12061759556836509, "grad_norm": 8.891312599182129, "learning_rate": 7.284385595915748e-06, "loss": 0.5376, "mean_token_accuracy": 0.8333567604422569, "num_tokens": 46745723.0, "step": 38910 }, { "entropy": 1.904827606678009, "epoch": 0.12064859469341478, "grad_norm": 9.994085311889648, "learning_rate": 7.283449696538109e-06, "loss": 0.5673, "mean_token_accuracy": 0.8297219768166542, "num_tokens": 46757162.0, "step": 38920 }, { "entropy": 1.9497202724218368, "epoch": 0.12067959381846448, "grad_norm": 10.245466232299805, "learning_rate": 7.282514157801443e-06, "loss": 0.6198, "mean_token_accuracy": 0.8121256932616234, "num_tokens": 46768342.0, "step": 38930 }, { "entropy": 1.9423091277480125, "epoch": 0.12071059294351418, "grad_norm": 8.347733497619629, "learning_rate": 7.2815789794741885e-06, "loss": 0.5971, "mean_token_accuracy": 0.8201627746224404, "num_tokens": 46779722.0, "step": 38940 }, { "entropy": 1.9126493602991104, "epoch": 0.12074159206856387, "grad_norm": 8.225603103637695, "learning_rate": 7.280644161324999e-06, "loss": 0.5515, "mean_token_accuracy": 0.8218429237604141, "num_tokens": 46791464.0, "step": 38950 }, { "entropy": 1.8790960252285003, "epoch": 0.12077259119361355, "grad_norm": 8.749955177307129, "learning_rate": 7.2797097031227335e-06, "loss": 0.5397, "mean_token_accuracy": 0.8350474625825882, "num_tokens": 46804115.0, "step": 38960 }, { "entropy": 1.8715863823890686, "epoch": 0.12080359031866325, "grad_norm": 11.107318878173828, "learning_rate": 7.278775604636458e-06, "loss": 0.5789, "mean_token_accuracy": 0.8246800765395165, "num_tokens": 46815723.0, "step": 38970 }, { "entropy": 1.8465174466371537, "epoch": 0.12083458944371295, "grad_norm": 6.929959774017334, "learning_rate": 7.277841865635446e-06, "loss": 0.5764, "mean_token_accuracy": 0.8237011566758156, "num_tokens": 46828326.0, "step": 38980 }, { "entropy": 1.9157025456428527, "epoch": 0.12086558856876264, "grad_norm": 7.555674076080322, "learning_rate": 7.27690848588918e-06, "loss": 0.5341, "mean_token_accuracy": 0.8201544851064682, "num_tokens": 46840929.0, "step": 38990 }, { "entropy": 1.855295367538929, "epoch": 0.12089658769381234, "grad_norm": 9.253195762634277, "learning_rate": 7.275975465167346e-06, "loss": 0.4612, "mean_token_accuracy": 0.8400891706347465, "num_tokens": 46853363.0, "step": 39000 }, { "entropy": 1.9134544879198074, "epoch": 0.12092758681886204, "grad_norm": 9.56458568572998, "learning_rate": 7.27504280323984e-06, "loss": 0.5498, "mean_token_accuracy": 0.8285094171762466, "num_tokens": 46865204.0, "step": 39010 }, { "entropy": 1.8941469311714172, "epoch": 0.12095858594391173, "grad_norm": 8.927152633666992, "learning_rate": 7.274110499876761e-06, "loss": 0.5084, "mean_token_accuracy": 0.8449266985058784, "num_tokens": 46876368.0, "step": 39020 }, { "entropy": 1.8193779736757278, "epoch": 0.12098958506896143, "grad_norm": 3.3484020233154297, "learning_rate": 7.273178554848418e-06, "loss": 0.5043, "mean_token_accuracy": 0.8299607783555984, "num_tokens": 46889571.0, "step": 39030 }, { "entropy": 1.8515197828412056, "epoch": 0.12102058419401113, "grad_norm": 11.17811393737793, "learning_rate": 7.272246967925323e-06, "loss": 0.5419, "mean_token_accuracy": 0.8318181455135345, "num_tokens": 46901011.0, "step": 39040 }, { "entropy": 1.9295257300138473, "epoch": 0.12105158331906082, "grad_norm": 9.795385360717773, "learning_rate": 7.271315738878194e-06, "loss": 0.5301, "mean_token_accuracy": 0.8381862804293633, "num_tokens": 46911801.0, "step": 39050 }, { "entropy": 1.955726158618927, "epoch": 0.12108258244411052, "grad_norm": 11.714072227478027, "learning_rate": 7.270384867477956e-06, "loss": 0.5593, "mean_token_accuracy": 0.8203748896718025, "num_tokens": 46923433.0, "step": 39060 }, { "entropy": 1.9114267766475677, "epoch": 0.12111358156916022, "grad_norm": 9.219902992248535, "learning_rate": 7.269454353495741e-06, "loss": 0.5248, "mean_token_accuracy": 0.8343056261539459, "num_tokens": 46934777.0, "step": 39070 }, { "entropy": 1.9058862313628198, "epoch": 0.1211445806942099, "grad_norm": 9.145793914794922, "learning_rate": 7.26852419670288e-06, "loss": 0.5637, "mean_token_accuracy": 0.8218263626098633, "num_tokens": 46947035.0, "step": 39080 }, { "entropy": 1.8588959291577338, "epoch": 0.1211755798192596, "grad_norm": 9.202445983886719, "learning_rate": 7.267594396870911e-06, "loss": 0.5226, "mean_token_accuracy": 0.8362350627779961, "num_tokens": 46959557.0, "step": 39090 }, { "entropy": 1.928936019539833, "epoch": 0.12120657894430929, "grad_norm": 11.063957214355469, "learning_rate": 7.2666649537715814e-06, "loss": 0.5909, "mean_token_accuracy": 0.8108921408653259, "num_tokens": 46971699.0, "step": 39100 }, { "entropy": 1.9692992329597474, "epoch": 0.12123757806935899, "grad_norm": 11.699572563171387, "learning_rate": 7.265735867176837e-06, "loss": 0.609, "mean_token_accuracy": 0.8138938039541245, "num_tokens": 46983377.0, "step": 39110 }, { "entropy": 1.942272038757801, "epoch": 0.12126857719440869, "grad_norm": 8.434643745422363, "learning_rate": 7.264807136858832e-06, "loss": 0.6463, "mean_token_accuracy": 0.8089325189590454, "num_tokens": 46996376.0, "step": 39120 }, { "entropy": 1.8001896619796753, "epoch": 0.12129957631945838, "grad_norm": 3.925816059112549, "learning_rate": 7.2638787625899185e-06, "loss": 0.4485, "mean_token_accuracy": 0.8355321779847145, "num_tokens": 47010314.0, "step": 39130 }, { "entropy": 1.9362492710351944, "epoch": 0.12133057544450808, "grad_norm": 7.411516189575195, "learning_rate": 7.26295074414266e-06, "loss": 0.5909, "mean_token_accuracy": 0.8266261234879494, "num_tokens": 47022279.0, "step": 39140 }, { "entropy": 1.892134927213192, "epoch": 0.12136157456955778, "grad_norm": 7.244060039520264, "learning_rate": 7.262023081289816e-06, "loss": 0.4625, "mean_token_accuracy": 0.8465871244668961, "num_tokens": 47034156.0, "step": 39150 }, { "entropy": 1.9123440355062484, "epoch": 0.12139257369460747, "grad_norm": 8.467802047729492, "learning_rate": 7.261095773804354e-06, "loss": 0.5257, "mean_token_accuracy": 0.8312965899705886, "num_tokens": 47045539.0, "step": 39160 }, { "entropy": 1.909021759033203, "epoch": 0.12142357281965717, "grad_norm": 8.491313934326172, "learning_rate": 7.260168821459445e-06, "loss": 0.5967, "mean_token_accuracy": 0.809444610774517, "num_tokens": 47057991.0, "step": 39170 }, { "entropy": 1.8988661482930183, "epoch": 0.12145457194470687, "grad_norm": 8.673091888427734, "learning_rate": 7.259242224028456e-06, "loss": 0.5633, "mean_token_accuracy": 0.824192288517952, "num_tokens": 47069487.0, "step": 39180 }, { "entropy": 1.8936348468065263, "epoch": 0.12148557106975656, "grad_norm": 7.909213066101074, "learning_rate": 7.258315981284962e-06, "loss": 0.5449, "mean_token_accuracy": 0.8279350861907006, "num_tokens": 47081241.0, "step": 39190 }, { "entropy": 1.887278787791729, "epoch": 0.12151657019480624, "grad_norm": 9.825845718383789, "learning_rate": 7.2573900930027416e-06, "loss": 0.5607, "mean_token_accuracy": 0.82880117893219, "num_tokens": 47093264.0, "step": 39200 }, { "entropy": 1.8978556409478187, "epoch": 0.12154756931985594, "grad_norm": 9.518089294433594, "learning_rate": 7.25646455895577e-06, "loss": 0.5479, "mean_token_accuracy": 0.8237947776913643, "num_tokens": 47105376.0, "step": 39210 }, { "entropy": 1.8898191466927527, "epoch": 0.12157856844490564, "grad_norm": 3.807635545730591, "learning_rate": 7.255539378918229e-06, "loss": 0.5214, "mean_token_accuracy": 0.8275874778628349, "num_tokens": 47117645.0, "step": 39220 }, { "entropy": 1.8876211807131766, "epoch": 0.12160956756995533, "grad_norm": 5.313508033752441, "learning_rate": 7.254614552664499e-06, "loss": 0.5445, "mean_token_accuracy": 0.815770597755909, "num_tokens": 47129805.0, "step": 39230 }, { "entropy": 1.89053615629673, "epoch": 0.12164056669500503, "grad_norm": 9.875513076782227, "learning_rate": 7.253690079969162e-06, "loss": 0.5415, "mean_token_accuracy": 0.8238019242882728, "num_tokens": 47142055.0, "step": 39240 }, { "entropy": 1.900741669535637, "epoch": 0.12167156582005473, "grad_norm": 4.167596340179443, "learning_rate": 7.252765960607002e-06, "loss": 0.4978, "mean_token_accuracy": 0.8360105812549591, "num_tokens": 47154609.0, "step": 39250 }, { "entropy": 1.8273233488202094, "epoch": 0.12170256494510442, "grad_norm": 9.690666198730469, "learning_rate": 7.251842194353004e-06, "loss": 0.5093, "mean_token_accuracy": 0.8299174129962921, "num_tokens": 47167381.0, "step": 39260 }, { "entropy": 1.9333147034049034, "epoch": 0.12173356407015412, "grad_norm": 3.9693877696990967, "learning_rate": 7.25091878098235e-06, "loss": 0.5415, "mean_token_accuracy": 0.8288700088858605, "num_tokens": 47179138.0, "step": 39270 }, { "entropy": 1.8680218786001206, "epoch": 0.12176456319520382, "grad_norm": 3.812345504760742, "learning_rate": 7.249995720270428e-06, "loss": 0.476, "mean_token_accuracy": 0.8331614285707474, "num_tokens": 47192026.0, "step": 39280 }, { "entropy": 1.8831014022231103, "epoch": 0.12179556232025351, "grad_norm": 9.065089225769043, "learning_rate": 7.249073011992822e-06, "loss": 0.5455, "mean_token_accuracy": 0.8253825202584266, "num_tokens": 47204601.0, "step": 39290 }, { "entropy": 1.878520594537258, "epoch": 0.12182656144530321, "grad_norm": 8.888299942016602, "learning_rate": 7.248150655925318e-06, "loss": 0.4776, "mean_token_accuracy": 0.8410759374499321, "num_tokens": 47217084.0, "step": 39300 }, { "entropy": 1.9088293090462685, "epoch": 0.12185756057035291, "grad_norm": 8.316096305847168, "learning_rate": 7.247228651843902e-06, "loss": 0.5312, "mean_token_accuracy": 0.8348750114440918, "num_tokens": 47229268.0, "step": 39310 }, { "entropy": 1.9051673859357834, "epoch": 0.1218885596954026, "grad_norm": 8.587472915649414, "learning_rate": 7.246306999524752e-06, "loss": 0.5794, "mean_token_accuracy": 0.8288885727524757, "num_tokens": 47241225.0, "step": 39320 }, { "entropy": 1.970414823293686, "epoch": 0.12191955882045229, "grad_norm": 8.662064552307129, "learning_rate": 7.2453856987442604e-06, "loss": 0.5843, "mean_token_accuracy": 0.8233033016324043, "num_tokens": 47251991.0, "step": 39330 }, { "entropy": 1.8904896154999733, "epoch": 0.12195055794550198, "grad_norm": 8.939061164855957, "learning_rate": 7.244464749279004e-06, "loss": 0.547, "mean_token_accuracy": 0.8356327429413796, "num_tokens": 47264187.0, "step": 39340 }, { "entropy": 2.0048918604850767, "epoch": 0.12198155707055168, "grad_norm": 7.971733570098877, "learning_rate": 7.243544150905766e-06, "loss": 0.6107, "mean_token_accuracy": 0.8271526664495468, "num_tokens": 47275694.0, "step": 39350 }, { "entropy": 2.0341754078865053, "epoch": 0.12201255619560138, "grad_norm": 8.931192398071289, "learning_rate": 7.242623903401524e-06, "loss": 0.5592, "mean_token_accuracy": 0.8209575936198235, "num_tokens": 47286270.0, "step": 39360 }, { "entropy": 1.960363219678402, "epoch": 0.12204355532065107, "grad_norm": 8.31447696685791, "learning_rate": 7.241704006543459e-06, "loss": 0.5269, "mean_token_accuracy": 0.8278711885213852, "num_tokens": 47297957.0, "step": 39370 }, { "entropy": 1.9565995991230012, "epoch": 0.12207455444570077, "grad_norm": 10.019770622253418, "learning_rate": 7.240784460108944e-06, "loss": 0.533, "mean_token_accuracy": 0.830487422645092, "num_tokens": 47310043.0, "step": 39380 }, { "entropy": 1.8893634766340255, "epoch": 0.12210555357075047, "grad_norm": 5.0957865715026855, "learning_rate": 7.239865263875553e-06, "loss": 0.4684, "mean_token_accuracy": 0.8399478271603584, "num_tokens": 47322846.0, "step": 39390 }, { "entropy": 1.9584024906158448, "epoch": 0.12213655269580016, "grad_norm": 9.545947074890137, "learning_rate": 7.2389464176210585e-06, "loss": 0.5554, "mean_token_accuracy": 0.8273420795798302, "num_tokens": 47333837.0, "step": 39400 }, { "entropy": 1.9519995272159576, "epoch": 0.12216755182084986, "grad_norm": 6.452544212341309, "learning_rate": 7.2380279211234304e-06, "loss": 0.5955, "mean_token_accuracy": 0.8233728304505348, "num_tokens": 47345921.0, "step": 39410 }, { "entropy": 1.9542613357305527, "epoch": 0.12219855094589956, "grad_norm": 9.183480262756348, "learning_rate": 7.237109774160834e-06, "loss": 0.5779, "mean_token_accuracy": 0.8244384691119194, "num_tokens": 47357133.0, "step": 39420 }, { "entropy": 2.000173199176788, "epoch": 0.12222955007094925, "grad_norm": 10.723604202270508, "learning_rate": 7.236191976511631e-06, "loss": 0.5979, "mean_token_accuracy": 0.823377488553524, "num_tokens": 47367446.0, "step": 39430 }, { "entropy": 2.0126789212226868, "epoch": 0.12226054919599895, "grad_norm": 9.821727752685547, "learning_rate": 7.235274527954382e-06, "loss": 0.636, "mean_token_accuracy": 0.8074411496520042, "num_tokens": 47378541.0, "step": 39440 }, { "entropy": 1.9093073785305024, "epoch": 0.12229154832104863, "grad_norm": 9.164787292480469, "learning_rate": 7.234357428267842e-06, "loss": 0.5121, "mean_token_accuracy": 0.8361682832241059, "num_tokens": 47390114.0, "step": 39450 }, { "entropy": 1.8918178245425223, "epoch": 0.12232254744609833, "grad_norm": 9.560025215148926, "learning_rate": 7.233440677230964e-06, "loss": 0.5231, "mean_token_accuracy": 0.8367931827902794, "num_tokens": 47402972.0, "step": 39460 }, { "entropy": 1.9258590131998061, "epoch": 0.12235354657114803, "grad_norm": 8.87900161743164, "learning_rate": 7.232524274622897e-06, "loss": 0.5538, "mean_token_accuracy": 0.8345812693238258, "num_tokens": 47414317.0, "step": 39470 }, { "entropy": 1.8653698325157166, "epoch": 0.12238454569619772, "grad_norm": 9.456033706665039, "learning_rate": 7.231608220222983e-06, "loss": 0.5338, "mean_token_accuracy": 0.8317572355270386, "num_tokens": 47426321.0, "step": 39480 }, { "entropy": 1.9535607546567917, "epoch": 0.12241554482124742, "grad_norm": 4.313974857330322, "learning_rate": 7.230692513810767e-06, "loss": 0.5645, "mean_token_accuracy": 0.8326726511120797, "num_tokens": 47437625.0, "step": 39490 }, { "entropy": 1.8951896831393242, "epoch": 0.12244654394629712, "grad_norm": 8.987655639648438, "learning_rate": 7.229777155165975e-06, "loss": 0.4771, "mean_token_accuracy": 0.8377861216664314, "num_tokens": 47450365.0, "step": 39500 }, { "entropy": 1.9469685688614846, "epoch": 0.12247754307134681, "grad_norm": 8.353129386901855, "learning_rate": 7.228862144068547e-06, "loss": 0.5508, "mean_token_accuracy": 0.828496278822422, "num_tokens": 47462011.0, "step": 39510 }, { "entropy": 1.8816884592175485, "epoch": 0.12250854219639651, "grad_norm": 8.542236328125, "learning_rate": 7.227947480298603e-06, "loss": 0.5017, "mean_token_accuracy": 0.8291819423437119, "num_tokens": 47474823.0, "step": 39520 }, { "entropy": 1.8621670812368394, "epoch": 0.1225395413214462, "grad_norm": 8.68166446685791, "learning_rate": 7.227033163636462e-06, "loss": 0.5437, "mean_token_accuracy": 0.8337508887052536, "num_tokens": 47487106.0, "step": 39530 }, { "entropy": 1.9243188560009004, "epoch": 0.1225705404464959, "grad_norm": 9.343770027160645, "learning_rate": 7.226119193862641e-06, "loss": 0.5637, "mean_token_accuracy": 0.8281685680150985, "num_tokens": 47498743.0, "step": 39540 }, { "entropy": 1.9087723910808563, "epoch": 0.1226015395715456, "grad_norm": 10.595560073852539, "learning_rate": 7.225205570757848e-06, "loss": 0.5472, "mean_token_accuracy": 0.8282433733344078, "num_tokens": 47511087.0, "step": 39550 }, { "entropy": 1.9597302109003067, "epoch": 0.1226325386965953, "grad_norm": 9.186569213867188, "learning_rate": 7.224292294102985e-06, "loss": 0.6176, "mean_token_accuracy": 0.8227703988552093, "num_tokens": 47522853.0, "step": 39560 }, { "entropy": 1.8456875741481782, "epoch": 0.12266353782164498, "grad_norm": 4.475244998931885, "learning_rate": 7.223379363679148e-06, "loss": 0.5637, "mean_token_accuracy": 0.8218344271183013, "num_tokens": 47535932.0, "step": 39570 }, { "entropy": 1.8878474682569504, "epoch": 0.12269453694669467, "grad_norm": 8.224106788635254, "learning_rate": 7.222466779267628e-06, "loss": 0.6166, "mean_token_accuracy": 0.8177327990531922, "num_tokens": 47547596.0, "step": 39580 }, { "entropy": 1.9054535388946534, "epoch": 0.12272553607174437, "grad_norm": 9.366242408752441, "learning_rate": 7.221554540649909e-06, "loss": 0.5031, "mean_token_accuracy": 0.8298028498888016, "num_tokens": 47560679.0, "step": 39590 }, { "entropy": 1.9364402756094932, "epoch": 0.12275653519679407, "grad_norm": 9.393874168395996, "learning_rate": 7.220642647607665e-06, "loss": 0.5802, "mean_token_accuracy": 0.8262747645378112, "num_tokens": 47572344.0, "step": 39600 }, { "entropy": 1.904061770439148, "epoch": 0.12278753432184376, "grad_norm": 11.156982421875, "learning_rate": 7.219731099922768e-06, "loss": 0.5131, "mean_token_accuracy": 0.8359477326273919, "num_tokens": 47583272.0, "step": 39610 }, { "entropy": 1.8150178447365761, "epoch": 0.12281853344689346, "grad_norm": 8.800067901611328, "learning_rate": 7.218819897377277e-06, "loss": 0.4655, "mean_token_accuracy": 0.84474868029356, "num_tokens": 47596939.0, "step": 39620 }, { "entropy": 1.7370637387037278, "epoch": 0.12284953257194316, "grad_norm": 8.489538192749023, "learning_rate": 7.217909039753451e-06, "loss": 0.4292, "mean_token_accuracy": 0.8503855854272843, "num_tokens": 47610478.0, "step": 39630 }, { "entropy": 1.8361937582492829, "epoch": 0.12288053169699285, "grad_norm": 10.117167472839355, "learning_rate": 7.216998526833735e-06, "loss": 0.5115, "mean_token_accuracy": 0.8379788339138031, "num_tokens": 47623032.0, "step": 39640 }, { "entropy": 1.880786618590355, "epoch": 0.12291153082204255, "grad_norm": 8.611593246459961, "learning_rate": 7.216088358400767e-06, "loss": 0.5068, "mean_token_accuracy": 0.841580656170845, "num_tokens": 47634664.0, "step": 39650 }, { "entropy": 1.8434013739228248, "epoch": 0.12294252994709225, "grad_norm": 11.259652137756348, "learning_rate": 7.2151785342373795e-06, "loss": 0.544, "mean_token_accuracy": 0.8298772439360619, "num_tokens": 47646858.0, "step": 39660 }, { "entropy": 1.882999302446842, "epoch": 0.12297352907214194, "grad_norm": 9.449716567993164, "learning_rate": 7.214269054126593e-06, "loss": 0.6034, "mean_token_accuracy": 0.8209249958395958, "num_tokens": 47658958.0, "step": 39670 }, { "entropy": 1.9060813054442405, "epoch": 0.12300452819719164, "grad_norm": 4.450017929077148, "learning_rate": 7.2133599178516235e-06, "loss": 0.5039, "mean_token_accuracy": 0.8366493061184883, "num_tokens": 47670981.0, "step": 39680 }, { "entropy": 2.007936453819275, "epoch": 0.12303552732224134, "grad_norm": 10.02921199798584, "learning_rate": 7.212451125195874e-06, "loss": 0.6157, "mean_token_accuracy": 0.8246709734201432, "num_tokens": 47681702.0, "step": 39690 }, { "entropy": 2.0430832996964456, "epoch": 0.12306652644729102, "grad_norm": 9.025290489196777, "learning_rate": 7.211542675942941e-06, "loss": 0.6219, "mean_token_accuracy": 0.816537082195282, "num_tokens": 47692748.0, "step": 39700 }, { "entropy": 1.9663964182138443, "epoch": 0.12309752557234072, "grad_norm": 10.015572547912598, "learning_rate": 7.2106345698766134e-06, "loss": 0.5715, "mean_token_accuracy": 0.814094303548336, "num_tokens": 47703817.0, "step": 39710 }, { "entropy": 1.793771331012249, "epoch": 0.12312852469739041, "grad_norm": 3.0519356727600098, "learning_rate": 7.209726806780866e-06, "loss": 0.4738, "mean_token_accuracy": 0.8429444998502731, "num_tokens": 47717762.0, "step": 39720 }, { "entropy": 1.7859131276607514, "epoch": 0.12315952382244011, "grad_norm": 3.6266133785247803, "learning_rate": 7.208819386439868e-06, "loss": 0.4706, "mean_token_accuracy": 0.8404285296797752, "num_tokens": 47731173.0, "step": 39730 }, { "entropy": 1.8794568166136743, "epoch": 0.1231905229474898, "grad_norm": 4.375041484832764, "learning_rate": 7.207912308637975e-06, "loss": 0.5168, "mean_token_accuracy": 0.8297057136893272, "num_tokens": 47743900.0, "step": 39740 }, { "entropy": 1.9013297393918038, "epoch": 0.1232215220725395, "grad_norm": 9.238868713378906, "learning_rate": 7.207005573159738e-06, "loss": 0.5438, "mean_token_accuracy": 0.8304916322231293, "num_tokens": 47755784.0, "step": 39750 }, { "entropy": 1.9574954420328141, "epoch": 0.1232525211975892, "grad_norm": 10.237153053283691, "learning_rate": 7.2060991797898904e-06, "loss": 0.5814, "mean_token_accuracy": 0.8244188159704209, "num_tokens": 47766731.0, "step": 39760 }, { "entropy": 1.8875176712870598, "epoch": 0.1232835203226389, "grad_norm": 10.606158256530762, "learning_rate": 7.205193128313362e-06, "loss": 0.5121, "mean_token_accuracy": 0.8495481833815575, "num_tokens": 47778152.0, "step": 39770 }, { "entropy": 1.9175768822431565, "epoch": 0.12331451944768859, "grad_norm": 8.000387191772461, "learning_rate": 7.204287418515269e-06, "loss": 0.5059, "mean_token_accuracy": 0.8478017941117286, "num_tokens": 47788937.0, "step": 39780 }, { "entropy": 1.8297215834259988, "epoch": 0.12334551857273829, "grad_norm": 10.035210609436035, "learning_rate": 7.203382050180914e-06, "loss": 0.4475, "mean_token_accuracy": 0.8538377195596695, "num_tokens": 47801439.0, "step": 39790 }, { "entropy": 1.9494767993688584, "epoch": 0.12337651769778799, "grad_norm": 9.916494369506836, "learning_rate": 7.202477023095793e-06, "loss": 0.5851, "mean_token_accuracy": 0.8265805870294571, "num_tokens": 47811893.0, "step": 39800 }, { "entropy": 1.8632332697510718, "epoch": 0.12340751682283768, "grad_norm": 8.924822807312012, "learning_rate": 7.201572337045587e-06, "loss": 0.5439, "mean_token_accuracy": 0.835116508603096, "num_tokens": 47823892.0, "step": 39810 }, { "entropy": 1.8363080993294716, "epoch": 0.12343851594788736, "grad_norm": 4.758782863616943, "learning_rate": 7.200667991816167e-06, "loss": 0.5063, "mean_token_accuracy": 0.8292215019464493, "num_tokens": 47835997.0, "step": 39820 }, { "entropy": 1.9205212816596031, "epoch": 0.12346951507293706, "grad_norm": 8.591593742370605, "learning_rate": 7.199763987193592e-06, "loss": 0.5566, "mean_token_accuracy": 0.8230644062161445, "num_tokens": 47847772.0, "step": 39830 }, { "entropy": 1.9247150912880897, "epoch": 0.12350051419798676, "grad_norm": 10.074462890625, "learning_rate": 7.19886032296411e-06, "loss": 0.57, "mean_token_accuracy": 0.8206574454903602, "num_tokens": 47859033.0, "step": 39840 }, { "entropy": 1.9125849097967147, "epoch": 0.12353151332303645, "grad_norm": 9.70457649230957, "learning_rate": 7.197956998914155e-06, "loss": 0.574, "mean_token_accuracy": 0.82592853307724, "num_tokens": 47870577.0, "step": 39850 }, { "entropy": 1.92913771122694, "epoch": 0.12356251244808615, "grad_norm": 8.795923233032227, "learning_rate": 7.197054014830351e-06, "loss": 0.5198, "mean_token_accuracy": 0.8308982938528061, "num_tokens": 47883093.0, "step": 39860 }, { "entropy": 1.8687468573451043, "epoch": 0.12359351157313585, "grad_norm": 9.503414154052734, "learning_rate": 7.196151370499505e-06, "loss": 0.5147, "mean_token_accuracy": 0.8317637592554092, "num_tokens": 47895551.0, "step": 39870 }, { "entropy": 1.969892618060112, "epoch": 0.12362451069818554, "grad_norm": 9.0645112991333, "learning_rate": 7.195249065708615e-06, "loss": 0.6059, "mean_token_accuracy": 0.818208034336567, "num_tokens": 47906405.0, "step": 39880 }, { "entropy": 1.8821232318878174, "epoch": 0.12365550982323524, "grad_norm": 8.767228126525879, "learning_rate": 7.194347100244863e-06, "loss": 0.5053, "mean_token_accuracy": 0.8397012487053871, "num_tokens": 47918131.0, "step": 39890 }, { "entropy": 1.8662964954972268, "epoch": 0.12368650894828494, "grad_norm": 9.380695343017578, "learning_rate": 7.1934454738956235e-06, "loss": 0.5109, "mean_token_accuracy": 0.8346205174922943, "num_tokens": 47929900.0, "step": 39900 }, { "entropy": 1.9048209875822066, "epoch": 0.12371750807333463, "grad_norm": 7.6859354972839355, "learning_rate": 7.19254418644845e-06, "loss": 0.5487, "mean_token_accuracy": 0.8317638859152794, "num_tokens": 47941788.0, "step": 39910 }, { "entropy": 1.9060806199908256, "epoch": 0.12374850719838433, "grad_norm": 14.849568367004395, "learning_rate": 7.1916432376910865e-06, "loss": 0.5648, "mean_token_accuracy": 0.8300574287772179, "num_tokens": 47953554.0, "step": 39920 }, { "entropy": 1.8868515014648437, "epoch": 0.12377950632343403, "grad_norm": 8.418484687805176, "learning_rate": 7.19074262741146e-06, "loss": 0.5313, "mean_token_accuracy": 0.8337209805846214, "num_tokens": 47965620.0, "step": 39930 }, { "entropy": 1.9150093987584114, "epoch": 0.12381050544848371, "grad_norm": 8.23653793334961, "learning_rate": 7.18984235539769e-06, "loss": 0.5473, "mean_token_accuracy": 0.8326784566044807, "num_tokens": 47977186.0, "step": 39940 }, { "entropy": 1.8675342485308648, "epoch": 0.1238415045735334, "grad_norm": 8.780365943908691, "learning_rate": 7.188942421438074e-06, "loss": 0.5194, "mean_token_accuracy": 0.8340345472097397, "num_tokens": 47988962.0, "step": 39950 }, { "entropy": 1.8479683220386505, "epoch": 0.1238725036985831, "grad_norm": 9.722196578979492, "learning_rate": 7.188042825321099e-06, "loss": 0.4927, "mean_token_accuracy": 0.8346738666296005, "num_tokens": 48001542.0, "step": 39960 }, { "entropy": 1.9192697882652283, "epoch": 0.1239035028236328, "grad_norm": 9.703927993774414, "learning_rate": 7.187143566835436e-06, "loss": 0.5948, "mean_token_accuracy": 0.8207048952579499, "num_tokens": 48013716.0, "step": 39970 }, { "entropy": 1.9120296582579612, "epoch": 0.1239345019486825, "grad_norm": 8.731795310974121, "learning_rate": 7.186244645769942e-06, "loss": 0.5596, "mean_token_accuracy": 0.8310200199484825, "num_tokens": 48026045.0, "step": 39980 }, { "entropy": 1.9499492287635802, "epoch": 0.12396550107373219, "grad_norm": 7.662987232208252, "learning_rate": 7.185346061913657e-06, "loss": 0.5668, "mean_token_accuracy": 0.8196876659989357, "num_tokens": 48037456.0, "step": 39990 }, { "entropy": 1.832308356463909, "epoch": 0.12399650019878189, "grad_norm": 3.518549919128418, "learning_rate": 7.18444781505581e-06, "loss": 0.4718, "mean_token_accuracy": 0.8477145195007324, "num_tokens": 48049861.0, "step": 40000 }, { "entropy": 1.86460652500391, "epoch": 0.12402749932383159, "grad_norm": 8.867399215698242, "learning_rate": 7.183549904985806e-06, "loss": 0.5642, "mean_token_accuracy": 0.82719586789608, "num_tokens": 48062011.0, "step": 40010 }, { "entropy": 1.880849677324295, "epoch": 0.12405849844888128, "grad_norm": 4.325946807861328, "learning_rate": 7.182652331493244e-06, "loss": 0.5531, "mean_token_accuracy": 0.8259584322571755, "num_tokens": 48074386.0, "step": 40020 }, { "entropy": 1.9078103736042977, "epoch": 0.12408949757393098, "grad_norm": 10.76919937133789, "learning_rate": 7.181755094367901e-06, "loss": 0.507, "mean_token_accuracy": 0.8391536310315132, "num_tokens": 48085967.0, "step": 40030 }, { "entropy": 1.8335755363106727, "epoch": 0.12412049669898068, "grad_norm": 12.115399360656738, "learning_rate": 7.1808581933997365e-06, "loss": 0.4988, "mean_token_accuracy": 0.833656270802021, "num_tokens": 48098343.0, "step": 40040 }, { "entropy": 1.8345079183578492, "epoch": 0.12415149582403037, "grad_norm": 9.696667671203613, "learning_rate": 7.1799616283789e-06, "loss": 0.5002, "mean_token_accuracy": 0.8341936364769935, "num_tokens": 48111160.0, "step": 40050 }, { "entropy": 1.7580282986164093, "epoch": 0.12418249494908007, "grad_norm": 8.78324031829834, "learning_rate": 7.179065399095719e-06, "loss": 0.5413, "mean_token_accuracy": 0.8408230692148209, "num_tokens": 48125547.0, "step": 40060 }, { "entropy": 1.8306955844163895, "epoch": 0.12421349407412975, "grad_norm": 9.833135604858398, "learning_rate": 7.178169505340706e-06, "loss": 0.5345, "mean_token_accuracy": 0.8345167860388756, "num_tokens": 48137633.0, "step": 40070 }, { "entropy": 1.819261023402214, "epoch": 0.12424449319917945, "grad_norm": 13.903244972229004, "learning_rate": 7.177273946904556e-06, "loss": 0.4783, "mean_token_accuracy": 0.8336721554398536, "num_tokens": 48151380.0, "step": 40080 }, { "entropy": 1.8060119107365609, "epoch": 0.12427549232422914, "grad_norm": 3.684872627258301, "learning_rate": 7.176378723578145e-06, "loss": 0.4902, "mean_token_accuracy": 0.8292297214269638, "num_tokens": 48164473.0, "step": 40090 }, { "entropy": 1.8856921419501305, "epoch": 0.12430649144927884, "grad_norm": 10.007135391235352, "learning_rate": 7.175483835152539e-06, "loss": 0.5491, "mean_token_accuracy": 0.8323625057935715, "num_tokens": 48176696.0, "step": 40100 }, { "entropy": 1.7466407373547554, "epoch": 0.12433749057432854, "grad_norm": 4.418228626251221, "learning_rate": 7.174589281418974e-06, "loss": 0.4057, "mean_token_accuracy": 0.8500324800610543, "num_tokens": 48190217.0, "step": 40110 }, { "entropy": 1.9157506242394446, "epoch": 0.12436848969937823, "grad_norm": 8.385701179504395, "learning_rate": 7.17369506216888e-06, "loss": 0.5272, "mean_token_accuracy": 0.825266569852829, "num_tokens": 48201921.0, "step": 40120 }, { "entropy": 1.858496817946434, "epoch": 0.12439948882442793, "grad_norm": 4.637474060058594, "learning_rate": 7.172801177193862e-06, "loss": 0.5327, "mean_token_accuracy": 0.8193719312548637, "num_tokens": 48214518.0, "step": 40130 }, { "entropy": 1.8998822063207625, "epoch": 0.12443048794947763, "grad_norm": 9.965042114257812, "learning_rate": 7.171907626285708e-06, "loss": 0.505, "mean_token_accuracy": 0.838045471906662, "num_tokens": 48225714.0, "step": 40140 }, { "entropy": 1.8577251955866814, "epoch": 0.12446148707452732, "grad_norm": 9.38553237915039, "learning_rate": 7.171014409236389e-06, "loss": 0.5167, "mean_token_accuracy": 0.8318695619702339, "num_tokens": 48238441.0, "step": 40150 }, { "entropy": 1.9177394479513168, "epoch": 0.12449248619957702, "grad_norm": 8.650186538696289, "learning_rate": 7.1701215258380555e-06, "loss": 0.5711, "mean_token_accuracy": 0.8154436364769936, "num_tokens": 48250481.0, "step": 40160 }, { "entropy": 1.9266600027680396, "epoch": 0.12452348532462672, "grad_norm": 8.298869132995605, "learning_rate": 7.169228975883042e-06, "loss": 0.5794, "mean_token_accuracy": 0.8180478677153588, "num_tokens": 48262220.0, "step": 40170 }, { "entropy": 1.9378298938274383, "epoch": 0.12455448444967641, "grad_norm": 4.73800802230835, "learning_rate": 7.16833675916386e-06, "loss": 0.5535, "mean_token_accuracy": 0.8223996505141258, "num_tokens": 48274072.0, "step": 40180 }, { "entropy": 1.8640633895993233, "epoch": 0.1245854835747261, "grad_norm": 10.636198043823242, "learning_rate": 7.167444875473203e-06, "loss": 0.5658, "mean_token_accuracy": 0.8277820602059365, "num_tokens": 48286360.0, "step": 40190 }, { "entropy": 1.9393206879496574, "epoch": 0.1246164826997758, "grad_norm": 4.20045804977417, "learning_rate": 7.166553324603949e-06, "loss": 0.578, "mean_token_accuracy": 0.8166181564331054, "num_tokens": 48297250.0, "step": 40200 }, { "entropy": 1.8666782602667809, "epoch": 0.12464748182482549, "grad_norm": 9.612086296081543, "learning_rate": 7.165662106349151e-06, "loss": 0.5188, "mean_token_accuracy": 0.8358882114291191, "num_tokens": 48309504.0, "step": 40210 }, { "entropy": 1.8223158940672874, "epoch": 0.12467848094987519, "grad_norm": 8.702556610107422, "learning_rate": 7.164771220502042e-06, "loss": 0.5055, "mean_token_accuracy": 0.8401806324720382, "num_tokens": 48321758.0, "step": 40220 }, { "entropy": 1.8826921790838242, "epoch": 0.12470948007492488, "grad_norm": 8.89100456237793, "learning_rate": 7.16388066685604e-06, "loss": 0.5317, "mean_token_accuracy": 0.823623314499855, "num_tokens": 48334423.0, "step": 40230 }, { "entropy": 1.935545490682125, "epoch": 0.12474047919997458, "grad_norm": 8.564839363098145, "learning_rate": 7.16299044520474e-06, "loss": 0.552, "mean_token_accuracy": 0.8268688634037972, "num_tokens": 48345619.0, "step": 40240 }, { "entropy": 1.835379946231842, "epoch": 0.12477147832502428, "grad_norm": 8.765735626220703, "learning_rate": 7.162100555341913e-06, "loss": 0.4967, "mean_token_accuracy": 0.8396931126713753, "num_tokens": 48358114.0, "step": 40250 }, { "entropy": 1.8680355235934258, "epoch": 0.12480247745007397, "grad_norm": 10.267287254333496, "learning_rate": 7.161210997061516e-06, "loss": 0.6458, "mean_token_accuracy": 0.8112869113683701, "num_tokens": 48370421.0, "step": 40260 }, { "entropy": 1.8904762849211694, "epoch": 0.12483347657512367, "grad_norm": 10.460432052612305, "learning_rate": 7.1603217701576784e-06, "loss": 0.5603, "mean_token_accuracy": 0.8190462201833725, "num_tokens": 48382227.0, "step": 40270 }, { "entropy": 1.8971378430724144, "epoch": 0.12486447570017337, "grad_norm": 7.277894020080566, "learning_rate": 7.159432874424715e-06, "loss": 0.5434, "mean_token_accuracy": 0.8331350639462471, "num_tokens": 48393629.0, "step": 40280 }, { "entropy": 1.941260239481926, "epoch": 0.12489547482522306, "grad_norm": 9.197176933288574, "learning_rate": 7.158544309657114e-06, "loss": 0.6038, "mean_token_accuracy": 0.8198348104953765, "num_tokens": 48404836.0, "step": 40290 }, { "entropy": 1.834352783858776, "epoch": 0.12492647395027276, "grad_norm": 4.567273139953613, "learning_rate": 7.157656075649543e-06, "loss": 0.556, "mean_token_accuracy": 0.8348585799336433, "num_tokens": 48416706.0, "step": 40300 }, { "entropy": 1.7823708355426788, "epoch": 0.12495747307532244, "grad_norm": 4.625446796417236, "learning_rate": 7.1567681721968504e-06, "loss": 0.4906, "mean_token_accuracy": 0.8396563857793808, "num_tokens": 48430018.0, "step": 40310 }, { "entropy": 1.8015142977237701, "epoch": 0.12498847220037214, "grad_norm": 10.068672180175781, "learning_rate": 7.155880599094063e-06, "loss": 0.5809, "mean_token_accuracy": 0.8157304137945175, "num_tokens": 48442115.0, "step": 40320 }, { "entropy": 1.8101561158895492, "epoch": 0.12501947132542185, "grad_norm": 9.83664321899414, "learning_rate": 7.154993356136379e-06, "loss": 0.5194, "mean_token_accuracy": 0.8348063215613365, "num_tokens": 48454395.0, "step": 40330 }, { "entropy": 1.8592476963996887, "epoch": 0.12505047045047155, "grad_norm": 8.112977027893066, "learning_rate": 7.154106443119184e-06, "loss": 0.525, "mean_token_accuracy": 0.8234350681304932, "num_tokens": 48466396.0, "step": 40340 }, { "entropy": 1.8576691791415214, "epoch": 0.12508146957552124, "grad_norm": 4.848905086517334, "learning_rate": 7.153219859838033e-06, "loss": 0.5681, "mean_token_accuracy": 0.8281613394618035, "num_tokens": 48479521.0, "step": 40350 }, { "entropy": 1.831693847477436, "epoch": 0.12511246870057094, "grad_norm": 7.906863689422607, "learning_rate": 7.152333606088664e-06, "loss": 0.5156, "mean_token_accuracy": 0.8364334732294083, "num_tokens": 48492267.0, "step": 40360 }, { "entropy": 1.805116631090641, "epoch": 0.12514346782562064, "grad_norm": 7.8519415855407715, "learning_rate": 7.151447681666986e-06, "loss": 0.5006, "mean_token_accuracy": 0.8286394074559211, "num_tokens": 48505894.0, "step": 40370 }, { "entropy": 1.8447122775018214, "epoch": 0.1251744669506703, "grad_norm": 7.955524444580078, "learning_rate": 7.150562086369092e-06, "loss": 0.4819, "mean_token_accuracy": 0.8388951927423477, "num_tokens": 48518153.0, "step": 40380 }, { "entropy": 1.8237828686833382, "epoch": 0.12520546607572, "grad_norm": 10.909595489501953, "learning_rate": 7.149676819991247e-06, "loss": 0.5013, "mean_token_accuracy": 0.835619455575943, "num_tokens": 48530887.0, "step": 40390 }, { "entropy": 1.9085283294320106, "epoch": 0.1252364652007697, "grad_norm": 9.184427261352539, "learning_rate": 7.148791882329893e-06, "loss": 0.5645, "mean_token_accuracy": 0.8193691223859787, "num_tokens": 48542762.0, "step": 40400 }, { "entropy": 1.8631555259227752, "epoch": 0.1252674643258194, "grad_norm": 9.436893463134766, "learning_rate": 7.147907273181649e-06, "loss": 0.5495, "mean_token_accuracy": 0.8169412419199944, "num_tokens": 48555091.0, "step": 40410 }, { "entropy": 1.836050059646368, "epoch": 0.1252984634508691, "grad_norm": 10.38294792175293, "learning_rate": 7.1470229923433125e-06, "loss": 0.5127, "mean_token_accuracy": 0.8353980749845504, "num_tokens": 48567717.0, "step": 40420 }, { "entropy": 1.9533065795898437, "epoch": 0.1253294625759188, "grad_norm": 3.965822696685791, "learning_rate": 7.146139039611852e-06, "loss": 0.5775, "mean_token_accuracy": 0.8248067319393158, "num_tokens": 48579059.0, "step": 40430 }, { "entropy": 1.8541604191064835, "epoch": 0.12536046170096848, "grad_norm": 8.920310020446777, "learning_rate": 7.1452554147844155e-06, "loss": 0.5358, "mean_token_accuracy": 0.8245360970497131, "num_tokens": 48591120.0, "step": 40440 }, { "entropy": 1.8779646888375283, "epoch": 0.12539146082601818, "grad_norm": 8.486520767211914, "learning_rate": 7.144372117658325e-06, "loss": 0.4997, "mean_token_accuracy": 0.8279622659087181, "num_tokens": 48603019.0, "step": 40450 }, { "entropy": 1.9320873647928238, "epoch": 0.12542245995106788, "grad_norm": 11.511739730834961, "learning_rate": 7.143489148031079e-06, "loss": 0.5977, "mean_token_accuracy": 0.8194258600473404, "num_tokens": 48615247.0, "step": 40460 }, { "entropy": 1.9659615710377694, "epoch": 0.12545345907611757, "grad_norm": 9.763450622558594, "learning_rate": 7.142606505700348e-06, "loss": 0.6165, "mean_token_accuracy": 0.8184546142816543, "num_tokens": 48626590.0, "step": 40470 }, { "entropy": 1.9306480765342713, "epoch": 0.12548445820116727, "grad_norm": 8.713022232055664, "learning_rate": 7.141724190463982e-06, "loss": 0.5698, "mean_token_accuracy": 0.8337675094604492, "num_tokens": 48637473.0, "step": 40480 }, { "entropy": 1.798674686253071, "epoch": 0.12551545732621697, "grad_norm": 11.227383613586426, "learning_rate": 7.140842202120004e-06, "loss": 0.4768, "mean_token_accuracy": 0.8351314097642899, "num_tokens": 48651671.0, "step": 40490 }, { "entropy": 1.9205691695213318, "epoch": 0.12554645645126666, "grad_norm": 8.593749046325684, "learning_rate": 7.139960540466611e-06, "loss": 0.5708, "mean_token_accuracy": 0.8320924565196037, "num_tokens": 48662844.0, "step": 40500 }, { "entropy": 1.9018804863095284, "epoch": 0.12557745557631636, "grad_norm": 9.196910858154297, "learning_rate": 7.13907920530217e-06, "loss": 0.5599, "mean_token_accuracy": 0.8320263445377349, "num_tokens": 48674513.0, "step": 40510 }, { "entropy": 1.8978378668427467, "epoch": 0.12560845470136606, "grad_norm": 7.797028064727783, "learning_rate": 7.138198196425235e-06, "loss": 0.541, "mean_token_accuracy": 0.826276271045208, "num_tokens": 48686033.0, "step": 40520 }, { "entropy": 1.8717727780342102, "epoch": 0.12563945382641575, "grad_norm": 8.808572769165039, "learning_rate": 7.137317513634519e-06, "loss": 0.5139, "mean_token_accuracy": 0.827865032851696, "num_tokens": 48698508.0, "step": 40530 }, { "entropy": 1.8457770988345146, "epoch": 0.12567045295146545, "grad_norm": 8.519002914428711, "learning_rate": 7.136437156728917e-06, "loss": 0.5143, "mean_token_accuracy": 0.8214613869786263, "num_tokens": 48711134.0, "step": 40540 }, { "entropy": 1.935419850051403, "epoch": 0.12570145207651515, "grad_norm": 8.7819185256958, "learning_rate": 7.135557125507497e-06, "loss": 0.52, "mean_token_accuracy": 0.8435436561703682, "num_tokens": 48722163.0, "step": 40550 }, { "entropy": 1.8241950839757919, "epoch": 0.12573245120156484, "grad_norm": 3.9725289344787598, "learning_rate": 7.134677419769499e-06, "loss": 0.451, "mean_token_accuracy": 0.8425084307789803, "num_tokens": 48735211.0, "step": 40560 }, { "entropy": 1.8317349657416344, "epoch": 0.12576345032661454, "grad_norm": 3.390087842941284, "learning_rate": 7.133798039314337e-06, "loss": 0.4573, "mean_token_accuracy": 0.8422825932502747, "num_tokens": 48748014.0, "step": 40570 }, { "entropy": 1.7657338783144951, "epoch": 0.12579444945166424, "grad_norm": 8.908466339111328, "learning_rate": 7.1329189839415956e-06, "loss": 0.4756, "mean_token_accuracy": 0.8356280282139779, "num_tokens": 48761505.0, "step": 40580 }, { "entropy": 1.8681147009134293, "epoch": 0.12582544857671393, "grad_norm": 9.203214645385742, "learning_rate": 7.132040253451038e-06, "loss": 0.5362, "mean_token_accuracy": 0.8317851752042771, "num_tokens": 48773868.0, "step": 40590 }, { "entropy": 1.8376054048538208, "epoch": 0.12585644770176363, "grad_norm": 4.301458835601807, "learning_rate": 7.131161847642594e-06, "loss": 0.4825, "mean_token_accuracy": 0.8354909658432007, "num_tokens": 48786722.0, "step": 40600 }, { "entropy": 1.968809324502945, "epoch": 0.12588744682681333, "grad_norm": 9.997700691223145, "learning_rate": 7.130283766316368e-06, "loss": 0.6149, "mean_token_accuracy": 0.8223230019211769, "num_tokens": 48797626.0, "step": 40610 }, { "entropy": 1.9517834931612015, "epoch": 0.125918445951863, "grad_norm": 8.660726547241211, "learning_rate": 7.1294060092726395e-06, "loss": 0.6263, "mean_token_accuracy": 0.8167603313922882, "num_tokens": 48809941.0, "step": 40620 }, { "entropy": 1.854266294836998, "epoch": 0.1259494450769127, "grad_norm": 8.519408226013184, "learning_rate": 7.128528576311854e-06, "loss": 0.5019, "mean_token_accuracy": 0.8381375819444656, "num_tokens": 48822363.0, "step": 40630 }, { "entropy": 1.8118067890405656, "epoch": 0.1259804442019624, "grad_norm": 10.48330020904541, "learning_rate": 7.127651467234633e-06, "loss": 0.5479, "mean_token_accuracy": 0.8347623705863952, "num_tokens": 48835835.0, "step": 40640 }, { "entropy": 1.8399090513586998, "epoch": 0.12601144332701208, "grad_norm": 9.834306716918945, "learning_rate": 7.12677468184177e-06, "loss": 0.4583, "mean_token_accuracy": 0.8451863750815392, "num_tokens": 48848639.0, "step": 40650 }, { "entropy": 1.8340865150094032, "epoch": 0.12604244245206178, "grad_norm": 10.710108757019043, "learning_rate": 7.125898219934229e-06, "loss": 0.4604, "mean_token_accuracy": 0.8310162261128425, "num_tokens": 48861498.0, "step": 40660 }, { "entropy": 1.9059862732887267, "epoch": 0.12607344157711148, "grad_norm": 8.74216365814209, "learning_rate": 7.125022081313144e-06, "loss": 0.5565, "mean_token_accuracy": 0.8258930623531342, "num_tokens": 48873676.0, "step": 40670 }, { "entropy": 1.8104983791708946, "epoch": 0.12610444070216117, "grad_norm": 3.50832462310791, "learning_rate": 7.124146265779823e-06, "loss": 0.5084, "mean_token_accuracy": 0.8388922065496445, "num_tokens": 48887359.0, "step": 40680 }, { "entropy": 1.886466035246849, "epoch": 0.12613543982721087, "grad_norm": 9.473098754882812, "learning_rate": 7.123270773135742e-06, "loss": 0.5395, "mean_token_accuracy": 0.8378918841481209, "num_tokens": 48898642.0, "step": 40690 }, { "entropy": 1.9465019404888153, "epoch": 0.12616643895226057, "grad_norm": 8.107137680053711, "learning_rate": 7.12239560318255e-06, "loss": 0.5828, "mean_token_accuracy": 0.8316583067178727, "num_tokens": 48909355.0, "step": 40700 }, { "entropy": 1.864406055212021, "epoch": 0.12619743807731026, "grad_norm": 7.923045635223389, "learning_rate": 7.121520755722065e-06, "loss": 0.5385, "mean_token_accuracy": 0.8234068945050239, "num_tokens": 48921713.0, "step": 40710 }, { "entropy": 1.917890764772892, "epoch": 0.12622843720235996, "grad_norm": 8.965694427490234, "learning_rate": 7.120646230556275e-06, "loss": 0.5322, "mean_token_accuracy": 0.8340031638741493, "num_tokens": 48933362.0, "step": 40720 }, { "entropy": 1.9002999886870384, "epoch": 0.12625943632740966, "grad_norm": 9.010135650634766, "learning_rate": 7.119772027487341e-06, "loss": 0.5219, "mean_token_accuracy": 0.8289919942617416, "num_tokens": 48945208.0, "step": 40730 }, { "entropy": 1.862901757657528, "epoch": 0.12629043545245935, "grad_norm": 9.43112564086914, "learning_rate": 7.118898146317591e-06, "loss": 0.5057, "mean_token_accuracy": 0.8338583365082741, "num_tokens": 48957105.0, "step": 40740 }, { "entropy": 1.8847228810191154, "epoch": 0.12632143457750905, "grad_norm": 4.666851043701172, "learning_rate": 7.118024586849524e-06, "loss": 0.5383, "mean_token_accuracy": 0.8274060249328613, "num_tokens": 48970207.0, "step": 40750 }, { "entropy": 1.8614213794469834, "epoch": 0.12635243370255875, "grad_norm": 8.402881622314453, "learning_rate": 7.117151348885809e-06, "loss": 0.5099, "mean_token_accuracy": 0.8313090309500695, "num_tokens": 48983272.0, "step": 40760 }, { "entropy": 1.8629606261849403, "epoch": 0.12638343282760844, "grad_norm": 3.4327898025512695, "learning_rate": 7.116278432229283e-06, "loss": 0.4715, "mean_token_accuracy": 0.8382740557193756, "num_tokens": 48995280.0, "step": 40770 }, { "entropy": 1.866362100839615, "epoch": 0.12641443195265814, "grad_norm": 6.627174377441406, "learning_rate": 7.1154058366829534e-06, "loss": 0.4933, "mean_token_accuracy": 0.8405808374285698, "num_tokens": 49007496.0, "step": 40780 }, { "entropy": 1.9819986671209335, "epoch": 0.12644543107770784, "grad_norm": 9.322339057922363, "learning_rate": 7.114533562049997e-06, "loss": 0.6276, "mean_token_accuracy": 0.8162664338946343, "num_tokens": 49018388.0, "step": 40790 }, { "entropy": 1.849839760363102, "epoch": 0.12647643020275753, "grad_norm": 3.9357082843780518, "learning_rate": 7.113661608133757e-06, "loss": 0.5208, "mean_token_accuracy": 0.8274443417787551, "num_tokens": 49030285.0, "step": 40800 }, { "entropy": 1.8829088985919953, "epoch": 0.12650742932780723, "grad_norm": 7.758213996887207, "learning_rate": 7.112789974737751e-06, "loss": 0.5549, "mean_token_accuracy": 0.831216461956501, "num_tokens": 49042179.0, "step": 40810 }, { "entropy": 1.9304823964834212, "epoch": 0.12653842845285693, "grad_norm": 8.798954963684082, "learning_rate": 7.1119186616656555e-06, "loss": 0.5907, "mean_token_accuracy": 0.8195964187383652, "num_tokens": 49054008.0, "step": 40820 }, { "entropy": 1.9207462221384048, "epoch": 0.12656942757790662, "grad_norm": 9.050512313842773, "learning_rate": 7.111047668721327e-06, "loss": 0.575, "mean_token_accuracy": 0.8204113721847535, "num_tokens": 49066034.0, "step": 40830 }, { "entropy": 1.908941000699997, "epoch": 0.12660042670295632, "grad_norm": 4.945900917053223, "learning_rate": 7.11017699570878e-06, "loss": 0.559, "mean_token_accuracy": 0.8230469167232514, "num_tokens": 49077639.0, "step": 40840 }, { "entropy": 1.8663305729627608, "epoch": 0.12663142582800602, "grad_norm": 8.779427528381348, "learning_rate": 7.109306642432202e-06, "loss": 0.4799, "mean_token_accuracy": 0.8439178988337517, "num_tokens": 49090125.0, "step": 40850 }, { "entropy": 1.9162701606750487, "epoch": 0.1266624249530557, "grad_norm": 9.195842742919922, "learning_rate": 7.108436608695949e-06, "loss": 0.5807, "mean_token_accuracy": 0.8183003440499306, "num_tokens": 49101421.0, "step": 40860 }, { "entropy": 1.9350932016968727, "epoch": 0.12669342407810538, "grad_norm": 9.8731689453125, "learning_rate": 7.10756689430454e-06, "loss": 0.6017, "mean_token_accuracy": 0.8179444178938866, "num_tokens": 49113375.0, "step": 40870 }, { "entropy": 1.9218543514609336, "epoch": 0.12672442320315508, "grad_norm": 8.230029106140137, "learning_rate": 7.106697499062666e-06, "loss": 0.5615, "mean_token_accuracy": 0.8303100064396858, "num_tokens": 49124773.0, "step": 40880 }, { "entropy": 1.8881905749440193, "epoch": 0.12675542232820478, "grad_norm": 5.785170555114746, "learning_rate": 7.105828422775184e-06, "loss": 0.5658, "mean_token_accuracy": 0.8263862758874894, "num_tokens": 49137292.0, "step": 40890 }, { "entropy": 1.8283898428082466, "epoch": 0.12678642145325447, "grad_norm": 10.07806396484375, "learning_rate": 7.1049596652471145e-06, "loss": 0.4984, "mean_token_accuracy": 0.8381018027663231, "num_tokens": 49149805.0, "step": 40900 }, { "entropy": 1.8706416577100753, "epoch": 0.12681742057830417, "grad_norm": 8.058819770812988, "learning_rate": 7.104091226283651e-06, "loss": 0.5054, "mean_token_accuracy": 0.84037394374609, "num_tokens": 49160775.0, "step": 40910 }, { "entropy": 1.8797523587942124, "epoch": 0.12684841970335387, "grad_norm": 8.583271026611328, "learning_rate": 7.103223105690148e-06, "loss": 0.5492, "mean_token_accuracy": 0.8319562718272209, "num_tokens": 49172562.0, "step": 40920 }, { "entropy": 1.81065763682127, "epoch": 0.12687941882840356, "grad_norm": 9.554974555969238, "learning_rate": 7.1023553032721315e-06, "loss": 0.5111, "mean_token_accuracy": 0.839790141582489, "num_tokens": 49185408.0, "step": 40930 }, { "entropy": 1.9385173588991165, "epoch": 0.12691041795345326, "grad_norm": 9.629557609558105, "learning_rate": 7.101487818835289e-06, "loss": 0.5379, "mean_token_accuracy": 0.8281932666897773, "num_tokens": 49197278.0, "step": 40940 }, { "entropy": 1.8193309232592583, "epoch": 0.12694141707850295, "grad_norm": 8.458683013916016, "learning_rate": 7.100620652185476e-06, "loss": 0.451, "mean_token_accuracy": 0.8422279581427574, "num_tokens": 49210376.0, "step": 40950 }, { "entropy": 1.8436155632138251, "epoch": 0.12697241620355265, "grad_norm": 10.37833023071289, "learning_rate": 7.099753803128716e-06, "loss": 0.5406, "mean_token_accuracy": 0.8349693238735199, "num_tokens": 49222925.0, "step": 40960 }, { "entropy": 1.836123764514923, "epoch": 0.12700341532860235, "grad_norm": 9.300790786743164, "learning_rate": 7.0988872714711934e-06, "loss": 0.4906, "mean_token_accuracy": 0.8327738150954247, "num_tokens": 49235480.0, "step": 40970 }, { "entropy": 1.8801463842391968, "epoch": 0.12703441445365204, "grad_norm": 5.394232749938965, "learning_rate": 7.098021057019264e-06, "loss": 0.541, "mean_token_accuracy": 0.8303487420082092, "num_tokens": 49247573.0, "step": 40980 }, { "entropy": 1.8794305130839348, "epoch": 0.12706541357870174, "grad_norm": 9.10645580291748, "learning_rate": 7.097155159579446e-06, "loss": 0.52, "mean_token_accuracy": 0.8420027121901512, "num_tokens": 49258928.0, "step": 40990 }, { "entropy": 1.9318907380104064, "epoch": 0.12709641270375144, "grad_norm": 9.83584976196289, "learning_rate": 7.09628957895842e-06, "loss": 0.5834, "mean_token_accuracy": 0.8214644491672516, "num_tokens": 49270094.0, "step": 41000 }, { "entropy": 1.8540407776832581, "epoch": 0.12712741182880113, "grad_norm": 9.077452659606934, "learning_rate": 7.095424314963037e-06, "loss": 0.4985, "mean_token_accuracy": 0.8308973863720894, "num_tokens": 49282965.0, "step": 41010 }, { "entropy": 1.8202777698636055, "epoch": 0.12715841095385083, "grad_norm": 4.704195976257324, "learning_rate": 7.094559367400309e-06, "loss": 0.5531, "mean_token_accuracy": 0.8255807936191559, "num_tokens": 49296075.0, "step": 41020 }, { "entropy": 1.9383866339921951, "epoch": 0.12718941007890053, "grad_norm": 9.143877983093262, "learning_rate": 7.093694736077415e-06, "loss": 0.5269, "mean_token_accuracy": 0.8385669678449631, "num_tokens": 49306680.0, "step": 41030 }, { "entropy": 1.742047442495823, "epoch": 0.12722040920395022, "grad_norm": 8.927555084228516, "learning_rate": 7.092830420801696e-06, "loss": 0.4322, "mean_token_accuracy": 0.8529659613966942, "num_tokens": 49320860.0, "step": 41040 }, { "entropy": 1.8793509498238563, "epoch": 0.12725140832899992, "grad_norm": 8.80550765991211, "learning_rate": 7.09196642138066e-06, "loss": 0.5804, "mean_token_accuracy": 0.819065073132515, "num_tokens": 49332538.0, "step": 41050 }, { "entropy": 1.8639131098985673, "epoch": 0.12728240745404962, "grad_norm": 8.924798965454102, "learning_rate": 7.091102737621975e-06, "loss": 0.5412, "mean_token_accuracy": 0.829890587925911, "num_tokens": 49345210.0, "step": 41060 }, { "entropy": 1.8677813604474067, "epoch": 0.12731340657909931, "grad_norm": 8.423454284667969, "learning_rate": 7.0902393693334806e-06, "loss": 0.4858, "mean_token_accuracy": 0.8379834398627282, "num_tokens": 49357016.0, "step": 41070 }, { "entropy": 1.9531373485922814, "epoch": 0.127344405704149, "grad_norm": 10.435996055603027, "learning_rate": 7.089376316323171e-06, "loss": 0.5963, "mean_token_accuracy": 0.8160718679428101, "num_tokens": 49368452.0, "step": 41080 }, { "entropy": 1.9246175095438958, "epoch": 0.1273754048291987, "grad_norm": 8.71939468383789, "learning_rate": 7.088513578399207e-06, "loss": 0.562, "mean_token_accuracy": 0.8220181420445443, "num_tokens": 49380204.0, "step": 41090 }, { "entropy": 1.8995565429329873, "epoch": 0.1274064039542484, "grad_norm": 9.864094734191895, "learning_rate": 7.08765115536992e-06, "loss": 0.5459, "mean_token_accuracy": 0.8301453128457069, "num_tokens": 49392226.0, "step": 41100 }, { "entropy": 1.900149242579937, "epoch": 0.1274374030792981, "grad_norm": 8.183351516723633, "learning_rate": 7.086789047043793e-06, "loss": 0.5465, "mean_token_accuracy": 0.8305352702736855, "num_tokens": 49404110.0, "step": 41110 }, { "entropy": 1.9237231731414794, "epoch": 0.12746840220434777, "grad_norm": 8.36330509185791, "learning_rate": 7.08592725322948e-06, "loss": 0.5263, "mean_token_accuracy": 0.8335365414619446, "num_tokens": 49416490.0, "step": 41120 }, { "entropy": 1.9189850777387618, "epoch": 0.12749940132939747, "grad_norm": 9.289202690124512, "learning_rate": 7.085065773735793e-06, "loss": 0.5032, "mean_token_accuracy": 0.8426647335290909, "num_tokens": 49427534.0, "step": 41130 }, { "entropy": 1.8757270485162736, "epoch": 0.12753040045444716, "grad_norm": 9.52793025970459, "learning_rate": 7.084204608371712e-06, "loss": 0.5224, "mean_token_accuracy": 0.8288914114236832, "num_tokens": 49439317.0, "step": 41140 }, { "entropy": 1.8509258836507798, "epoch": 0.12756139957949686, "grad_norm": 5.029160976409912, "learning_rate": 7.083343756946375e-06, "loss": 0.5584, "mean_token_accuracy": 0.8297637715935707, "num_tokens": 49451446.0, "step": 41150 }, { "entropy": 1.9340157762169838, "epoch": 0.12759239870454656, "grad_norm": 11.50483226776123, "learning_rate": 7.082483219269084e-06, "loss": 0.5791, "mean_token_accuracy": 0.8197537034749984, "num_tokens": 49463109.0, "step": 41160 }, { "entropy": 1.8594611391425133, "epoch": 0.12762339782959625, "grad_norm": 7.6252899169921875, "learning_rate": 7.081622995149303e-06, "loss": 0.5265, "mean_token_accuracy": 0.8369016021490097, "num_tokens": 49474727.0, "step": 41170 }, { "entropy": 1.800793182849884, "epoch": 0.12765439695464595, "grad_norm": 1.9972714185714722, "learning_rate": 7.080763084396659e-06, "loss": 0.5289, "mean_token_accuracy": 0.8380374610424042, "num_tokens": 49488788.0, "step": 41180 }, { "entropy": 1.9101984828710556, "epoch": 0.12768539607969565, "grad_norm": 10.091278076171875, "learning_rate": 7.0799034868209375e-06, "loss": 0.5464, "mean_token_accuracy": 0.8259019926190376, "num_tokens": 49501586.0, "step": 41190 }, { "entropy": 1.8499779477715492, "epoch": 0.12771639520474534, "grad_norm": 2.6960954666137695, "learning_rate": 7.079044202232089e-06, "loss": 0.5091, "mean_token_accuracy": 0.833436419069767, "num_tokens": 49514195.0, "step": 41200 }, { "entropy": 1.825632943212986, "epoch": 0.12774739432979504, "grad_norm": 9.283682823181152, "learning_rate": 7.078185230440225e-06, "loss": 0.4755, "mean_token_accuracy": 0.8395808920264244, "num_tokens": 49525989.0, "step": 41210 }, { "entropy": 1.9196676731109619, "epoch": 0.12777839345484474, "grad_norm": 8.81406021118164, "learning_rate": 7.0773265712556175e-06, "loss": 0.6053, "mean_token_accuracy": 0.8251327946782112, "num_tokens": 49536614.0, "step": 41220 }, { "entropy": 1.8985184118151666, "epoch": 0.12780939257989443, "grad_norm": 9.597168922424316, "learning_rate": 7.076468224488697e-06, "loss": 0.5692, "mean_token_accuracy": 0.821132518351078, "num_tokens": 49548013.0, "step": 41230 }, { "entropy": 1.8932278618216514, "epoch": 0.12784039170494413, "grad_norm": 8.59653091430664, "learning_rate": 7.075610189950059e-06, "loss": 0.5927, "mean_token_accuracy": 0.817491953074932, "num_tokens": 49559982.0, "step": 41240 }, { "entropy": 1.9004715710878373, "epoch": 0.12787139082999383, "grad_norm": 9.525247573852539, "learning_rate": 7.074752467450462e-06, "loss": 0.5614, "mean_token_accuracy": 0.8346248850226402, "num_tokens": 49571257.0, "step": 41250 }, { "entropy": 1.9199392378330231, "epoch": 0.12790238995504352, "grad_norm": 9.400187492370605, "learning_rate": 7.073895056800815e-06, "loss": 0.5788, "mean_token_accuracy": 0.8198289066553116, "num_tokens": 49582933.0, "step": 41260 }, { "entropy": 1.9200360536575318, "epoch": 0.12793338908009322, "grad_norm": 8.804792404174805, "learning_rate": 7.0730379578121956e-06, "loss": 0.5771, "mean_token_accuracy": 0.8220379948616028, "num_tokens": 49594138.0, "step": 41270 }, { "entropy": 1.9512885123491288, "epoch": 0.12796438820514291, "grad_norm": 10.201251983642578, "learning_rate": 7.0721811702958406e-06, "loss": 0.5598, "mean_token_accuracy": 0.8327943772077561, "num_tokens": 49604773.0, "step": 41280 }, { "entropy": 1.9171435490250588, "epoch": 0.1279953873301926, "grad_norm": 10.569693565368652, "learning_rate": 7.071324694063147e-06, "loss": 0.5814, "mean_token_accuracy": 0.8239866450428963, "num_tokens": 49615737.0, "step": 41290 }, { "entropy": 1.8590099811553955, "epoch": 0.1280263864552423, "grad_norm": 11.221964836120605, "learning_rate": 7.070468528925668e-06, "loss": 0.5038, "mean_token_accuracy": 0.8409821361303329, "num_tokens": 49627714.0, "step": 41300 }, { "entropy": 1.842246988415718, "epoch": 0.128057385580292, "grad_norm": 3.050896406173706, "learning_rate": 7.06961267469512e-06, "loss": 0.5573, "mean_token_accuracy": 0.8346633806824684, "num_tokens": 49640153.0, "step": 41310 }, { "entropy": 1.8636904895305633, "epoch": 0.1280883847053417, "grad_norm": 8.947188377380371, "learning_rate": 7.068757131183378e-06, "loss": 0.5331, "mean_token_accuracy": 0.8275300249457359, "num_tokens": 49652387.0, "step": 41320 }, { "entropy": 1.8928270608186721, "epoch": 0.1281193838303914, "grad_norm": 9.10622501373291, "learning_rate": 7.067901898202475e-06, "loss": 0.5765, "mean_token_accuracy": 0.8179589688777924, "num_tokens": 49664065.0, "step": 41330 }, { "entropy": 1.9435312688350677, "epoch": 0.1281503829554411, "grad_norm": 8.725778579711914, "learning_rate": 7.067046975564605e-06, "loss": 0.6324, "mean_token_accuracy": 0.8186397060751915, "num_tokens": 49674905.0, "step": 41340 }, { "entropy": 1.9112671226263047, "epoch": 0.1281813820804908, "grad_norm": 8.562727928161621, "learning_rate": 7.066192363082123e-06, "loss": 0.5768, "mean_token_accuracy": 0.8293170660734177, "num_tokens": 49686743.0, "step": 41350 }, { "entropy": 1.8668039426207543, "epoch": 0.12821238120554046, "grad_norm": 8.329222679138184, "learning_rate": 7.0653380605675344e-06, "loss": 0.5288, "mean_token_accuracy": 0.8435063764452935, "num_tokens": 49697557.0, "step": 41360 }, { "entropy": 1.8395963311195374, "epoch": 0.12824338033059016, "grad_norm": 2.403559684753418, "learning_rate": 7.064484067833515e-06, "loss": 0.5249, "mean_token_accuracy": 0.8410920441150666, "num_tokens": 49710159.0, "step": 41370 }, { "entropy": 1.9411360412836074, "epoch": 0.12827437945563985, "grad_norm": 8.784658432006836, "learning_rate": 7.063630384692888e-06, "loss": 0.6002, "mean_token_accuracy": 0.824271696805954, "num_tokens": 49720451.0, "step": 41380 }, { "entropy": 1.8381685689091682, "epoch": 0.12830537858068955, "grad_norm": 7.8702497482299805, "learning_rate": 7.062777010958642e-06, "loss": 0.4757, "mean_token_accuracy": 0.8503654897212982, "num_tokens": 49732913.0, "step": 41390 }, { "entropy": 1.9066902339458465, "epoch": 0.12833637770573925, "grad_norm": 8.476966857910156, "learning_rate": 7.061923946443923e-06, "loss": 0.5498, "mean_token_accuracy": 0.8252906069159508, "num_tokens": 49744489.0, "step": 41400 }, { "entropy": 1.8539624221622943, "epoch": 0.12836737683078894, "grad_norm": 9.746798515319824, "learning_rate": 7.061071190962031e-06, "loss": 0.5353, "mean_token_accuracy": 0.8306064233183861, "num_tokens": 49757061.0, "step": 41410 }, { "entropy": 1.8506083533167839, "epoch": 0.12839837595583864, "grad_norm": 9.114572525024414, "learning_rate": 7.060218744326428e-06, "loss": 0.4466, "mean_token_accuracy": 0.8507398188114166, "num_tokens": 49769261.0, "step": 41420 }, { "entropy": 1.8710011199116707, "epoch": 0.12842937508088834, "grad_norm": 4.284439563751221, "learning_rate": 7.059366606350731e-06, "loss": 0.5173, "mean_token_accuracy": 0.8323528245091438, "num_tokens": 49781442.0, "step": 41430 }, { "entropy": 1.9052658289670945, "epoch": 0.12846037420593803, "grad_norm": 8.909741401672363, "learning_rate": 7.0585147768487165e-06, "loss": 0.6048, "mean_token_accuracy": 0.825349123775959, "num_tokens": 49794445.0, "step": 41440 }, { "entropy": 1.911386439204216, "epoch": 0.12849137333098773, "grad_norm": 9.72156810760498, "learning_rate": 7.057663255634316e-06, "loss": 0.5451, "mean_token_accuracy": 0.8314669832587243, "num_tokens": 49806414.0, "step": 41450 }, { "entropy": 1.8788209095597268, "epoch": 0.12852237245603743, "grad_norm": 4.481322288513184, "learning_rate": 7.056812042521619e-06, "loss": 0.4999, "mean_token_accuracy": 0.8367927253246308, "num_tokens": 49818629.0, "step": 41460 }, { "entropy": 1.958992251753807, "epoch": 0.12855337158108712, "grad_norm": 9.640412330627441, "learning_rate": 7.0559611373248725e-06, "loss": 0.607, "mean_token_accuracy": 0.8203074246644974, "num_tokens": 49829898.0, "step": 41470 }, { "entropy": 1.7837151035666465, "epoch": 0.12858437070613682, "grad_norm": 3.7444610595703125, "learning_rate": 7.05511053985848e-06, "loss": 0.3599, "mean_token_accuracy": 0.8547704428434372, "num_tokens": 49844096.0, "step": 41480 }, { "entropy": 1.8662296697497367, "epoch": 0.12861536983118652, "grad_norm": 4.482416152954102, "learning_rate": 7.054260249937003e-06, "loss": 0.5229, "mean_token_accuracy": 0.8283546611666679, "num_tokens": 49856112.0, "step": 41490 }, { "entropy": 1.8327308684587478, "epoch": 0.1286463689562362, "grad_norm": 4.7443342208862305, "learning_rate": 7.053410267375156e-06, "loss": 0.4376, "mean_token_accuracy": 0.8486336767673492, "num_tokens": 49869424.0, "step": 41500 }, { "entropy": 1.7724727407097816, "epoch": 0.1286773680812859, "grad_norm": 9.06193733215332, "learning_rate": 7.052560591987811e-06, "loss": 0.4431, "mean_token_accuracy": 0.8514471918344497, "num_tokens": 49882790.0, "step": 41510 }, { "entropy": 1.8597662687301635, "epoch": 0.1287083672063356, "grad_norm": 10.311935424804688, "learning_rate": 7.051711223589997e-06, "loss": 0.5304, "mean_token_accuracy": 0.83001659065485, "num_tokens": 49894927.0, "step": 41520 }, { "entropy": 1.8753248527646065, "epoch": 0.1287393663313853, "grad_norm": 9.605613708496094, "learning_rate": 7.050862161996901e-06, "loss": 0.5419, "mean_token_accuracy": 0.8333491206169128, "num_tokens": 49906710.0, "step": 41530 }, { "entropy": 1.904360829293728, "epoch": 0.128770365456435, "grad_norm": 7.814454078674316, "learning_rate": 7.050013407023859e-06, "loss": 0.5386, "mean_token_accuracy": 0.8288786053657532, "num_tokens": 49918562.0, "step": 41540 }, { "entropy": 1.9048177361488343, "epoch": 0.1288013645814847, "grad_norm": 9.190512657165527, "learning_rate": 7.049164958486372e-06, "loss": 0.5529, "mean_token_accuracy": 0.8335204407572746, "num_tokens": 49929488.0, "step": 41550 }, { "entropy": 1.8486354887485503, "epoch": 0.1288323637065344, "grad_norm": 9.387413024902344, "learning_rate": 7.048316816200086e-06, "loss": 0.5107, "mean_token_accuracy": 0.8399450510740281, "num_tokens": 49941628.0, "step": 41560 }, { "entropy": 1.8960034802556038, "epoch": 0.1288633628315841, "grad_norm": 4.520693302154541, "learning_rate": 7.047468979980812e-06, "loss": 0.5923, "mean_token_accuracy": 0.8215907469391823, "num_tokens": 49953693.0, "step": 41570 }, { "entropy": 1.838585540652275, "epoch": 0.12889436195663379, "grad_norm": 9.408111572265625, "learning_rate": 7.046621449644507e-06, "loss": 0.5268, "mean_token_accuracy": 0.825011870265007, "num_tokens": 49964922.0, "step": 41580 }, { "entropy": 1.8944636061787605, "epoch": 0.12892536108168348, "grad_norm": 8.773103713989258, "learning_rate": 7.045774225007293e-06, "loss": 0.5337, "mean_token_accuracy": 0.8297211557626725, "num_tokens": 49977343.0, "step": 41590 }, { "entropy": 1.8642171308398248, "epoch": 0.12895636020673318, "grad_norm": 9.104546546936035, "learning_rate": 7.044927305885436e-06, "loss": 0.5694, "mean_token_accuracy": 0.824115814268589, "num_tokens": 49990496.0, "step": 41600 }, { "entropy": 1.832181690633297, "epoch": 0.12898735933178285, "grad_norm": 4.260911464691162, "learning_rate": 7.044080692095364e-06, "loss": 0.4779, "mean_token_accuracy": 0.8464431285858154, "num_tokens": 50002745.0, "step": 41610 }, { "entropy": 1.878463228046894, "epoch": 0.12901835845683254, "grad_norm": 8.97909927368164, "learning_rate": 7.043234383453658e-06, "loss": 0.5373, "mean_token_accuracy": 0.8333162799477577, "num_tokens": 50014652.0, "step": 41620 }, { "entropy": 1.8109195098280906, "epoch": 0.12904935758188224, "grad_norm": 10.204434394836426, "learning_rate": 7.04238837977705e-06, "loss": 0.5412, "mean_token_accuracy": 0.8232007443904876, "num_tokens": 50026970.0, "step": 41630 }, { "entropy": 1.8154733762145043, "epoch": 0.12908035670693194, "grad_norm": 5.0115132331848145, "learning_rate": 7.041542680882431e-06, "loss": 0.5119, "mean_token_accuracy": 0.8351962670683861, "num_tokens": 50039078.0, "step": 41640 }, { "entropy": 1.854199130833149, "epoch": 0.12911135583198163, "grad_norm": 10.00779914855957, "learning_rate": 7.04069728658684e-06, "loss": 0.5641, "mean_token_accuracy": 0.8258049175143242, "num_tokens": 50050935.0, "step": 41650 }, { "entropy": 1.9253348022699357, "epoch": 0.12914235495703133, "grad_norm": 8.788981437683105, "learning_rate": 7.039852196707477e-06, "loss": 0.5403, "mean_token_accuracy": 0.8358196586370468, "num_tokens": 50062097.0, "step": 41660 }, { "entropy": 1.8359443858265876, "epoch": 0.12917335408208103, "grad_norm": 8.613965034484863, "learning_rate": 7.039007411061688e-06, "loss": 0.47, "mean_token_accuracy": 0.8417585626244545, "num_tokens": 50074289.0, "step": 41670 }, { "entropy": 1.7834791973233224, "epoch": 0.12920435320713072, "grad_norm": 5.170468807220459, "learning_rate": 7.038162929466977e-06, "loss": 0.4737, "mean_token_accuracy": 0.8395596250891686, "num_tokens": 50087113.0, "step": 41680 }, { "entropy": 1.8834406480193138, "epoch": 0.12923535233218042, "grad_norm": 9.960625648498535, "learning_rate": 7.037318751741002e-06, "loss": 0.5586, "mean_token_accuracy": 0.8236372783780098, "num_tokens": 50098957.0, "step": 41690 }, { "entropy": 1.8330654799938202, "epoch": 0.12926635145723012, "grad_norm": 8.917972564697266, "learning_rate": 7.036474877701568e-06, "loss": 0.5263, "mean_token_accuracy": 0.8239554926753044, "num_tokens": 50111327.0, "step": 41700 }, { "entropy": 1.9040823325514793, "epoch": 0.1292973505822798, "grad_norm": 10.261785507202148, "learning_rate": 7.03563130716664e-06, "loss": 0.5855, "mean_token_accuracy": 0.8225290685892105, "num_tokens": 50123086.0, "step": 41710 }, { "entropy": 1.91173397898674, "epoch": 0.1293283497073295, "grad_norm": 9.091620445251465, "learning_rate": 7.0347880399543345e-06, "loss": 0.5809, "mean_token_accuracy": 0.8207633405923843, "num_tokens": 50134801.0, "step": 41720 }, { "entropy": 1.8690212473273278, "epoch": 0.1293593488323792, "grad_norm": 8.664191246032715, "learning_rate": 7.0339450758829165e-06, "loss": 0.5377, "mean_token_accuracy": 0.8263664215803146, "num_tokens": 50146507.0, "step": 41730 }, { "entropy": 1.8848260268568993, "epoch": 0.1293903479574289, "grad_norm": 10.184765815734863, "learning_rate": 7.033102414770806e-06, "loss": 0.5267, "mean_token_accuracy": 0.8280350670218468, "num_tokens": 50158176.0, "step": 41740 }, { "entropy": 1.78608690649271, "epoch": 0.1294213470824786, "grad_norm": 9.575325012207031, "learning_rate": 7.032260056436574e-06, "loss": 0.4691, "mean_token_accuracy": 0.8456524163484573, "num_tokens": 50171381.0, "step": 41750 }, { "entropy": 1.9386123955249785, "epoch": 0.1294523462075283, "grad_norm": 10.201217651367188, "learning_rate": 7.031418000698947e-06, "loss": 0.5914, "mean_token_accuracy": 0.8301969483494759, "num_tokens": 50182781.0, "step": 41760 }, { "entropy": 1.847770369052887, "epoch": 0.129483345332578, "grad_norm": 4.556051254272461, "learning_rate": 7.0305762473768e-06, "loss": 0.5087, "mean_token_accuracy": 0.8307551980018616, "num_tokens": 50195460.0, "step": 41770 }, { "entropy": 1.816901859641075, "epoch": 0.1295143444576277, "grad_norm": 9.289469718933105, "learning_rate": 7.0297347962891595e-06, "loss": 0.5057, "mean_token_accuracy": 0.8283630818128586, "num_tokens": 50208808.0, "step": 41780 }, { "entropy": 1.929781760275364, "epoch": 0.12954534358267739, "grad_norm": 8.88871955871582, "learning_rate": 7.028893647255209e-06, "loss": 0.5515, "mean_token_accuracy": 0.8377395987510681, "num_tokens": 50220192.0, "step": 41790 }, { "entropy": 1.8346890568733216, "epoch": 0.12957634270772708, "grad_norm": 11.108233451843262, "learning_rate": 7.028052800094273e-06, "loss": 0.5077, "mean_token_accuracy": 0.8360419407486915, "num_tokens": 50232642.0, "step": 41800 }, { "entropy": 1.762622408568859, "epoch": 0.12960734183277678, "grad_norm": 3.646545886993408, "learning_rate": 7.027212254625838e-06, "loss": 0.4675, "mean_token_accuracy": 0.8363216891884804, "num_tokens": 50245870.0, "step": 41810 }, { "entropy": 1.7347578413784503, "epoch": 0.12963834095782648, "grad_norm": 10.43207836151123, "learning_rate": 7.026372010669536e-06, "loss": 0.4645, "mean_token_accuracy": 0.8484481438994408, "num_tokens": 50259093.0, "step": 41820 }, { "entropy": 1.8771490514278413, "epoch": 0.12966934008287617, "grad_norm": 10.095263481140137, "learning_rate": 7.025532068045149e-06, "loss": 0.5608, "mean_token_accuracy": 0.8258682683110237, "num_tokens": 50270396.0, "step": 41830 }, { "entropy": 1.8018823832273483, "epoch": 0.12970033920792587, "grad_norm": 10.791528701782227, "learning_rate": 7.024692426572615e-06, "loss": 0.5024, "mean_token_accuracy": 0.8406801581382751, "num_tokens": 50283135.0, "step": 41840 }, { "entropy": 1.8664998412132263, "epoch": 0.12973133833297557, "grad_norm": 11.595536231994629, "learning_rate": 7.023853086072019e-06, "loss": 0.554, "mean_token_accuracy": 0.820669724047184, "num_tokens": 50294875.0, "step": 41850 }, { "entropy": 1.8681924358010291, "epoch": 0.12976233745802523, "grad_norm": 9.37474536895752, "learning_rate": 7.0230140463635955e-06, "loss": 0.5866, "mean_token_accuracy": 0.8231520146131516, "num_tokens": 50307197.0, "step": 41860 }, { "entropy": 1.8410033360123634, "epoch": 0.12979333658307493, "grad_norm": 4.718626022338867, "learning_rate": 7.022175307267729e-06, "loss": 0.467, "mean_token_accuracy": 0.8391537711024284, "num_tokens": 50319844.0, "step": 41870 }, { "entropy": 1.9163996964693069, "epoch": 0.12982433570812463, "grad_norm": 13.432290077209473, "learning_rate": 7.021336868604959e-06, "loss": 0.6464, "mean_token_accuracy": 0.8215458050370217, "num_tokens": 50331051.0, "step": 41880 }, { "entropy": 1.8775553047657012, "epoch": 0.12985533483317432, "grad_norm": 9.11666488647461, "learning_rate": 7.0204987301959715e-06, "loss": 0.5546, "mean_token_accuracy": 0.8308037981390953, "num_tokens": 50342313.0, "step": 41890 }, { "entropy": 1.7834023706614972, "epoch": 0.12988633395822402, "grad_norm": 5.124344825744629, "learning_rate": 7.019660891861601e-06, "loss": 0.4445, "mean_token_accuracy": 0.8429507181048393, "num_tokens": 50356360.0, "step": 41900 }, { "entropy": 1.9084928244352342, "epoch": 0.12991733308327372, "grad_norm": 8.772656440734863, "learning_rate": 7.018823353422832e-06, "loss": 0.5841, "mean_token_accuracy": 0.8257664754986763, "num_tokens": 50367277.0, "step": 41910 }, { "entropy": 1.8719779431819916, "epoch": 0.1299483322083234, "grad_norm": 8.429474830627441, "learning_rate": 7.017986114700802e-06, "loss": 0.501, "mean_token_accuracy": 0.8469484955072403, "num_tokens": 50378849.0, "step": 41920 }, { "entropy": 1.8775894209742545, "epoch": 0.1299793313333731, "grad_norm": 8.218339920043945, "learning_rate": 7.0171491755167954e-06, "loss": 0.5791, "mean_token_accuracy": 0.813252392411232, "num_tokens": 50390771.0, "step": 41930 }, { "entropy": 1.8514499336481094, "epoch": 0.1300103304584228, "grad_norm": 8.702057838439941, "learning_rate": 7.016312535692245e-06, "loss": 0.5392, "mean_token_accuracy": 0.837669375538826, "num_tokens": 50402598.0, "step": 41940 }, { "entropy": 1.7951854154467584, "epoch": 0.1300413295834725, "grad_norm": 9.224007606506348, "learning_rate": 7.0154761950487325e-06, "loss": 0.4451, "mean_token_accuracy": 0.8424011334776879, "num_tokens": 50415365.0, "step": 41950 }, { "entropy": 1.941623505949974, "epoch": 0.1300723287085222, "grad_norm": 10.595728874206543, "learning_rate": 7.01464015340799e-06, "loss": 0.6463, "mean_token_accuracy": 0.8008893147110939, "num_tokens": 50426866.0, "step": 41960 }, { "entropy": 1.8391387566924096, "epoch": 0.1301033278335719, "grad_norm": 9.028034210205078, "learning_rate": 7.0138044105918975e-06, "loss": 0.5394, "mean_token_accuracy": 0.8306760504841805, "num_tokens": 50440268.0, "step": 41970 }, { "entropy": 1.7658006258308887, "epoch": 0.1301343269586216, "grad_norm": 2.6869993209838867, "learning_rate": 7.0129689664224855e-06, "loss": 0.4255, "mean_token_accuracy": 0.8479226931929589, "num_tokens": 50455088.0, "step": 41980 }, { "entropy": 1.8967247605323792, "epoch": 0.1301653260836713, "grad_norm": 7.826026916503906, "learning_rate": 7.012133820721929e-06, "loss": 0.5326, "mean_token_accuracy": 0.8311007678508758, "num_tokens": 50466652.0, "step": 41990 }, { "entropy": 1.958921417593956, "epoch": 0.130196325208721, "grad_norm": 8.257652282714844, "learning_rate": 7.011298973312554e-06, "loss": 0.5903, "mean_token_accuracy": 0.8199125394225121, "num_tokens": 50477973.0, "step": 42000 }, { "entropy": 1.8253989905118941, "epoch": 0.13022732433377068, "grad_norm": 3.0014355182647705, "learning_rate": 7.0104644240168294e-06, "loss": 0.5041, "mean_token_accuracy": 0.835718783736229, "num_tokens": 50490889.0, "step": 42010 }, { "entropy": 1.9271552190184593, "epoch": 0.13025832345882038, "grad_norm": 10.399243354797363, "learning_rate": 7.0096301726573835e-06, "loss": 0.6215, "mean_token_accuracy": 0.8131484746932983, "num_tokens": 50502026.0, "step": 42020 }, { "entropy": 1.9419305875897408, "epoch": 0.13028932258387008, "grad_norm": 7.492198467254639, "learning_rate": 7.008796219056981e-06, "loss": 0.6331, "mean_token_accuracy": 0.82049780189991, "num_tokens": 50513287.0, "step": 42030 }, { "entropy": 1.9706764385104178, "epoch": 0.13032032170891977, "grad_norm": 10.456650733947754, "learning_rate": 7.00796256303854e-06, "loss": 0.5711, "mean_token_accuracy": 0.8240112096071244, "num_tokens": 50524654.0, "step": 42040 }, { "entropy": 1.8961515158414841, "epoch": 0.13035132083396947, "grad_norm": 10.818836212158203, "learning_rate": 7.0071292044251215e-06, "loss": 0.5216, "mean_token_accuracy": 0.8393558323383331, "num_tokens": 50537697.0, "step": 42050 }, { "entropy": 1.9397668451070786, "epoch": 0.13038231995901917, "grad_norm": 7.472477912902832, "learning_rate": 7.006296143039939e-06, "loss": 0.5613, "mean_token_accuracy": 0.8343762636184693, "num_tokens": 50549295.0, "step": 42060 }, { "entropy": 1.8642112627625465, "epoch": 0.13041331908406886, "grad_norm": 2.947805643081665, "learning_rate": 7.00546337870635e-06, "loss": 0.5143, "mean_token_accuracy": 0.8320413127541542, "num_tokens": 50561911.0, "step": 42070 }, { "entropy": 1.8363241106271744, "epoch": 0.13044431820911856, "grad_norm": 4.179972171783447, "learning_rate": 7.0046309112478594e-06, "loss": 0.4682, "mean_token_accuracy": 0.8355718359351159, "num_tokens": 50574636.0, "step": 42080 }, { "entropy": 1.8695838272571563, "epoch": 0.13047531733416826, "grad_norm": 10.452752113342285, "learning_rate": 7.003798740488118e-06, "loss": 0.5269, "mean_token_accuracy": 0.8281556889414787, "num_tokens": 50586479.0, "step": 42090 }, { "entropy": 1.8435761332511902, "epoch": 0.13050631645921792, "grad_norm": 9.886906623840332, "learning_rate": 7.0029668662509255e-06, "loss": 0.5702, "mean_token_accuracy": 0.8295658230781555, "num_tokens": 50599307.0, "step": 42100 }, { "entropy": 1.8621219590306282, "epoch": 0.13053731558426762, "grad_norm": 7.598883628845215, "learning_rate": 7.002135288360228e-06, "loss": 0.4905, "mean_token_accuracy": 0.8365325689315796, "num_tokens": 50611631.0, "step": 42110 }, { "entropy": 1.926572097837925, "epoch": 0.13056831470931732, "grad_norm": 7.883036136627197, "learning_rate": 7.0013040066401135e-06, "loss": 0.5725, "mean_token_accuracy": 0.8214625731110573, "num_tokens": 50622981.0, "step": 42120 }, { "entropy": 1.9384382754564284, "epoch": 0.13059931383436701, "grad_norm": 8.586524963378906, "learning_rate": 7.000473020914823e-06, "loss": 0.585, "mean_token_accuracy": 0.8287076443433762, "num_tokens": 50633779.0, "step": 42130 }, { "entropy": 1.9370570868253707, "epoch": 0.1306303129594167, "grad_norm": 10.588152885437012, "learning_rate": 6.999642331008736e-06, "loss": 0.5732, "mean_token_accuracy": 0.8158860892057419, "num_tokens": 50645735.0, "step": 42140 }, { "entropy": 1.8363999262452126, "epoch": 0.1306613120844664, "grad_norm": 4.617247581481934, "learning_rate": 6.998811936746385e-06, "loss": 0.5028, "mean_token_accuracy": 0.8244207426905632, "num_tokens": 50659623.0, "step": 42150 }, { "entropy": 1.9262761980295182, "epoch": 0.1306923112095161, "grad_norm": 11.821525573730469, "learning_rate": 6.9979818379524435e-06, "loss": 0.5402, "mean_token_accuracy": 0.8174602270126343, "num_tokens": 50672505.0, "step": 42160 }, { "entropy": 1.8478658609092236, "epoch": 0.1307233103345658, "grad_norm": 5.417337894439697, "learning_rate": 6.997152034451732e-06, "loss": 0.4861, "mean_token_accuracy": 0.8429075211286545, "num_tokens": 50686049.0, "step": 42170 }, { "entropy": 1.897514969110489, "epoch": 0.1307543094596155, "grad_norm": 7.976996421813965, "learning_rate": 6.9963225260692145e-06, "loss": 0.5191, "mean_token_accuracy": 0.832906337082386, "num_tokens": 50698264.0, "step": 42180 }, { "entropy": 1.8931497901678085, "epoch": 0.1307853085846652, "grad_norm": 4.187560081481934, "learning_rate": 6.995493312630006e-06, "loss": 0.5075, "mean_token_accuracy": 0.8311450779438019, "num_tokens": 50710916.0, "step": 42190 }, { "entropy": 1.8941505983471871, "epoch": 0.1308163077097149, "grad_norm": 8.752896308898926, "learning_rate": 6.9946643939593606e-06, "loss": 0.5434, "mean_token_accuracy": 0.8309932291507721, "num_tokens": 50722894.0, "step": 42200 }, { "entropy": 1.897823777794838, "epoch": 0.1308473068347646, "grad_norm": 8.950542449951172, "learning_rate": 6.993835769882677e-06, "loss": 0.5178, "mean_token_accuracy": 0.8307448908686638, "num_tokens": 50735048.0, "step": 42210 }, { "entropy": 2.0057034313678743, "epoch": 0.13087830595981428, "grad_norm": 8.577478408813477, "learning_rate": 6.993007440225504e-06, "loss": 0.5908, "mean_token_accuracy": 0.8248661920428276, "num_tokens": 50745880.0, "step": 42220 }, { "entropy": 1.933904617279768, "epoch": 0.13090930508486398, "grad_norm": 7.418628692626953, "learning_rate": 6.99217940481353e-06, "loss": 0.534, "mean_token_accuracy": 0.8272743076086044, "num_tokens": 50757689.0, "step": 42230 }, { "entropy": 1.8607298329472541, "epoch": 0.13094030420991368, "grad_norm": 3.3645823001861572, "learning_rate": 6.991351663472591e-06, "loss": 0.5565, "mean_token_accuracy": 0.8338559105992317, "num_tokens": 50770252.0, "step": 42240 }, { "entropy": 1.7977669216692447, "epoch": 0.13097130333496337, "grad_norm": 8.085532188415527, "learning_rate": 6.990524216028667e-06, "loss": 0.4282, "mean_token_accuracy": 0.8425278753042221, "num_tokens": 50783859.0, "step": 42250 }, { "entropy": 1.9316506549715995, "epoch": 0.13100230246001307, "grad_norm": 9.128771781921387, "learning_rate": 6.989697062307879e-06, "loss": 0.553, "mean_token_accuracy": 0.8342802032828331, "num_tokens": 50795907.0, "step": 42260 }, { "entropy": 2.010502940416336, "epoch": 0.13103330158506277, "grad_norm": 9.819306373596191, "learning_rate": 6.988870202136493e-06, "loss": 0.6157, "mean_token_accuracy": 0.8241182819008828, "num_tokens": 50806819.0, "step": 42270 }, { "entropy": 1.879169872403145, "epoch": 0.13106430071011246, "grad_norm": 8.068976402282715, "learning_rate": 6.988043635340924e-06, "loss": 0.474, "mean_token_accuracy": 0.8331845104694366, "num_tokens": 50820181.0, "step": 42280 }, { "entropy": 1.9254466131329537, "epoch": 0.13109529983516216, "grad_norm": 8.121038436889648, "learning_rate": 6.987217361747725e-06, "loss": 0.5318, "mean_token_accuracy": 0.830107967555523, "num_tokens": 50831403.0, "step": 42290 }, { "entropy": 1.9769433185458183, "epoch": 0.13112629896021186, "grad_norm": 10.498136520385742, "learning_rate": 6.986391381183594e-06, "loss": 0.5892, "mean_token_accuracy": 0.8088495776057243, "num_tokens": 50843327.0, "step": 42300 }, { "entropy": 1.8907544031739234, "epoch": 0.13115729808526155, "grad_norm": 10.089003562927246, "learning_rate": 6.98556569347537e-06, "loss": 0.5238, "mean_token_accuracy": 0.8322427123785019, "num_tokens": 50856211.0, "step": 42310 }, { "entropy": 1.9495109453797341, "epoch": 0.13118829721031125, "grad_norm": 9.126409530639648, "learning_rate": 6.984740298450043e-06, "loss": 0.5702, "mean_token_accuracy": 0.8336078703403473, "num_tokens": 50867480.0, "step": 42320 }, { "entropy": 1.8435195326805114, "epoch": 0.13121929633536095, "grad_norm": 6.529050827026367, "learning_rate": 6.983915195934738e-06, "loss": 0.5314, "mean_token_accuracy": 0.8262422546744347, "num_tokens": 50880593.0, "step": 42330 }, { "entropy": 1.9224420145154, "epoch": 0.13125029546041064, "grad_norm": 10.954397201538086, "learning_rate": 6.983090385756723e-06, "loss": 0.5696, "mean_token_accuracy": 0.8218831300735474, "num_tokens": 50891387.0, "step": 42340 }, { "entropy": 1.8449978575110435, "epoch": 0.1312812945854603, "grad_norm": 8.803427696228027, "learning_rate": 6.982265867743417e-06, "loss": 0.4777, "mean_token_accuracy": 0.8435740604996681, "num_tokens": 50903726.0, "step": 42350 }, { "entropy": 1.8841196730732919, "epoch": 0.13131229371051, "grad_norm": 9.251840591430664, "learning_rate": 6.981441641722373e-06, "loss": 0.5113, "mean_token_accuracy": 0.8290949910879135, "num_tokens": 50916111.0, "step": 42360 }, { "entropy": 1.927403984963894, "epoch": 0.1313432928355597, "grad_norm": 8.574084281921387, "learning_rate": 6.98061770752129e-06, "loss": 0.5651, "mean_token_accuracy": 0.8280455321073532, "num_tokens": 50928284.0, "step": 42370 }, { "entropy": 1.9771150171756744, "epoch": 0.1313742919606094, "grad_norm": 8.762701988220215, "learning_rate": 6.9797940649680106e-06, "loss": 0.6193, "mean_token_accuracy": 0.8151744335889817, "num_tokens": 50939233.0, "step": 42380 }, { "entropy": 1.8265860572457313, "epoch": 0.1314052910856591, "grad_norm": 7.463681697845459, "learning_rate": 6.978970713890515e-06, "loss": 0.555, "mean_token_accuracy": 0.8328731596469879, "num_tokens": 50953036.0, "step": 42390 }, { "entropy": 1.8653569996356965, "epoch": 0.1314362902107088, "grad_norm": 8.80778694152832, "learning_rate": 6.978147654116929e-06, "loss": 0.47, "mean_token_accuracy": 0.8439178183674813, "num_tokens": 50964848.0, "step": 42400 }, { "entropy": 1.939113649725914, "epoch": 0.1314672893357585, "grad_norm": 10.408567428588867, "learning_rate": 6.977324885475521e-06, "loss": 0.5543, "mean_token_accuracy": 0.8219144955277443, "num_tokens": 50976553.0, "step": 42410 }, { "entropy": 1.862153060734272, "epoch": 0.1314982884608082, "grad_norm": 8.101511001586914, "learning_rate": 6.976502407794701e-06, "loss": 0.485, "mean_token_accuracy": 0.835495936870575, "num_tokens": 50988487.0, "step": 42420 }, { "entropy": 1.9604466244578362, "epoch": 0.13152928758585788, "grad_norm": 9.159782409667969, "learning_rate": 6.975680220903015e-06, "loss": 0.5845, "mean_token_accuracy": 0.8208717614412308, "num_tokens": 51000283.0, "step": 42430 }, { "entropy": 1.9139268666505813, "epoch": 0.13156028671090758, "grad_norm": 8.362812042236328, "learning_rate": 6.974858324629158e-06, "loss": 0.5689, "mean_token_accuracy": 0.8323205769062042, "num_tokens": 51012209.0, "step": 42440 }, { "entropy": 1.9230769395828247, "epoch": 0.13159128583595728, "grad_norm": 8.45923900604248, "learning_rate": 6.97403671880196e-06, "loss": 0.572, "mean_token_accuracy": 0.8329043105244637, "num_tokens": 51023308.0, "step": 42450 }, { "entropy": 1.9842663645744323, "epoch": 0.13162228496100697, "grad_norm": 8.042434692382812, "learning_rate": 6.973215403250397e-06, "loss": 0.5793, "mean_token_accuracy": 0.8260912299156189, "num_tokens": 51033767.0, "step": 42460 }, { "entropy": 1.9136120170354842, "epoch": 0.13165328408605667, "grad_norm": 8.325801849365234, "learning_rate": 6.972394377803584e-06, "loss": 0.5382, "mean_token_accuracy": 0.8208369135856628, "num_tokens": 51045947.0, "step": 42470 }, { "entropy": 1.8511994555592537, "epoch": 0.13168428321110637, "grad_norm": 3.737189531326294, "learning_rate": 6.9715736422907764e-06, "loss": 0.4705, "mean_token_accuracy": 0.8399424910545349, "num_tokens": 51057715.0, "step": 42480 }, { "entropy": 1.8442467346787452, "epoch": 0.13171528233615606, "grad_norm": 4.051930904388428, "learning_rate": 6.9707531965413695e-06, "loss": 0.4948, "mean_token_accuracy": 0.8524442002177238, "num_tokens": 51068742.0, "step": 42490 }, { "entropy": 1.8845135629177094, "epoch": 0.13174628146120576, "grad_norm": 9.697474479675293, "learning_rate": 6.969933040384902e-06, "loss": 0.5258, "mean_token_accuracy": 0.8385147228837013, "num_tokens": 51080521.0, "step": 42500 }, { "entropy": 1.7795793518424035, "epoch": 0.13177728058625546, "grad_norm": 9.526708602905273, "learning_rate": 6.96911317365105e-06, "loss": 0.4709, "mean_token_accuracy": 0.8431052088737487, "num_tokens": 51093716.0, "step": 42510 }, { "entropy": 1.9105603873729706, "epoch": 0.13180827971130515, "grad_norm": 8.357869148254395, "learning_rate": 6.968293596169631e-06, "loss": 0.5976, "mean_token_accuracy": 0.8246324315667153, "num_tokens": 51105239.0, "step": 42520 }, { "entropy": 1.9591112911701203, "epoch": 0.13183927883635485, "grad_norm": 9.68211841583252, "learning_rate": 6.967474307770603e-06, "loss": 0.6515, "mean_token_accuracy": 0.8138914480805397, "num_tokens": 51116502.0, "step": 42530 }, { "entropy": 1.9038333266973495, "epoch": 0.13187027796140455, "grad_norm": 8.462784767150879, "learning_rate": 6.966655308284064e-06, "loss": 0.5137, "mean_token_accuracy": 0.8396757513284683, "num_tokens": 51128381.0, "step": 42540 }, { "entropy": 1.8404983699321746, "epoch": 0.13190127708645424, "grad_norm": 8.014533042907715, "learning_rate": 6.965836597540249e-06, "loss": 0.4811, "mean_token_accuracy": 0.8473035827279091, "num_tokens": 51140787.0, "step": 42550 }, { "entropy": 1.9219214513897895, "epoch": 0.13193227621150394, "grad_norm": 8.607987403869629, "learning_rate": 6.965018175369538e-06, "loss": 0.5378, "mean_token_accuracy": 0.8373000919818878, "num_tokens": 51152611.0, "step": 42560 }, { "entropy": 1.8740851491689683, "epoch": 0.13196327533655364, "grad_norm": 8.044096946716309, "learning_rate": 6.9642000416024435e-06, "loss": 0.5307, "mean_token_accuracy": 0.8397703349590302, "num_tokens": 51164281.0, "step": 42570 }, { "entropy": 1.8905795052647592, "epoch": 0.13199427446160333, "grad_norm": 9.585006713867188, "learning_rate": 6.963382196069625e-06, "loss": 0.5458, "mean_token_accuracy": 0.8185367554426193, "num_tokens": 51175917.0, "step": 42580 }, { "entropy": 1.814333714544773, "epoch": 0.13202527358665303, "grad_norm": 10.914127349853516, "learning_rate": 6.962564638601874e-06, "loss": 0.5236, "mean_token_accuracy": 0.8299461394548416, "num_tokens": 51188596.0, "step": 42590 }, { "entropy": 1.8761808335781098, "epoch": 0.1320562727117027, "grad_norm": 8.772909164428711, "learning_rate": 6.961747369030127e-06, "loss": 0.5328, "mean_token_accuracy": 0.8223336800932884, "num_tokens": 51200294.0, "step": 42600 }, { "entropy": 1.8656433135271073, "epoch": 0.1320872718367524, "grad_norm": 4.012499809265137, "learning_rate": 6.960930387185456e-06, "loss": 0.5392, "mean_token_accuracy": 0.8211655914783478, "num_tokens": 51212299.0, "step": 42610 }, { "entropy": 1.8340272575616836, "epoch": 0.1321182709618021, "grad_norm": 5.148853302001953, "learning_rate": 6.960113692899071e-06, "loss": 0.4994, "mean_token_accuracy": 0.8375645697116851, "num_tokens": 51224694.0, "step": 42620 }, { "entropy": 1.8271689996123315, "epoch": 0.1321492700868518, "grad_norm": 8.511850357055664, "learning_rate": 6.9592972860023235e-06, "loss": 0.4601, "mean_token_accuracy": 0.8452630266547203, "num_tokens": 51237562.0, "step": 42630 }, { "entropy": 1.9014787808060647, "epoch": 0.13218026921190149, "grad_norm": 4.459075927734375, "learning_rate": 6.9584811663267015e-06, "loss": 0.5206, "mean_token_accuracy": 0.8347061321139335, "num_tokens": 51248656.0, "step": 42640 }, { "entropy": 1.9535549938678742, "epoch": 0.13221126833695118, "grad_norm": 4.374874114990234, "learning_rate": 6.9576653337038325e-06, "loss": 0.6071, "mean_token_accuracy": 0.8113219693303109, "num_tokens": 51259712.0, "step": 42650 }, { "entropy": 1.8858298167586327, "epoch": 0.13224226746200088, "grad_norm": 11.858930587768555, "learning_rate": 6.956849787965481e-06, "loss": 0.5066, "mean_token_accuracy": 0.8303458750247955, "num_tokens": 51272446.0, "step": 42660 }, { "entropy": 1.8564813017845154, "epoch": 0.13227326658705058, "grad_norm": 3.350623846054077, "learning_rate": 6.956034528943548e-06, "loss": 0.5335, "mean_token_accuracy": 0.8342030212283135, "num_tokens": 51285301.0, "step": 42670 }, { "entropy": 1.849312388896942, "epoch": 0.13230426571210027, "grad_norm": 9.046114921569824, "learning_rate": 6.955219556470077e-06, "loss": 0.4985, "mean_token_accuracy": 0.8443035989999771, "num_tokens": 51297696.0, "step": 42680 }, { "entropy": 1.9661724478006364, "epoch": 0.13233526483714997, "grad_norm": 9.500844955444336, "learning_rate": 6.954404870377246e-06, "loss": 0.6121, "mean_token_accuracy": 0.8136408120393753, "num_tokens": 51307961.0, "step": 42690 }, { "entropy": 1.939977452158928, "epoch": 0.13236626396219967, "grad_norm": 8.7796049118042, "learning_rate": 6.953590470497371e-06, "loss": 0.6126, "mean_token_accuracy": 0.8261479705572128, "num_tokens": 51318495.0, "step": 42700 }, { "entropy": 1.840474684536457, "epoch": 0.13239726308724936, "grad_norm": 11.025080680847168, "learning_rate": 6.952776356662905e-06, "loss": 0.5237, "mean_token_accuracy": 0.8339684426784515, "num_tokens": 51330208.0, "step": 42710 }, { "entropy": 1.947993564605713, "epoch": 0.13242826221229906, "grad_norm": 9.111898422241211, "learning_rate": 6.9519625287064375e-06, "loss": 0.5756, "mean_token_accuracy": 0.8302145257592202, "num_tokens": 51340861.0, "step": 42720 }, { "entropy": 1.8799726784229278, "epoch": 0.13245926133734875, "grad_norm": 9.264261245727539, "learning_rate": 6.951148986460699e-06, "loss": 0.5314, "mean_token_accuracy": 0.8337329775094986, "num_tokens": 51353102.0, "step": 42730 }, { "entropy": 1.959149533510208, "epoch": 0.13249026046239845, "grad_norm": 10.266985893249512, "learning_rate": 6.950335729758554e-06, "loss": 0.6076, "mean_token_accuracy": 0.821322014927864, "num_tokens": 51364079.0, "step": 42740 }, { "entropy": 1.937872663140297, "epoch": 0.13252125958744815, "grad_norm": 9.663759231567383, "learning_rate": 6.949522758433003e-06, "loss": 0.5499, "mean_token_accuracy": 0.8321762755513191, "num_tokens": 51375560.0, "step": 42750 }, { "entropy": 1.8674385949969292, "epoch": 0.13255225871249784, "grad_norm": 7.8949432373046875, "learning_rate": 6.948710072317184e-06, "loss": 0.4933, "mean_token_accuracy": 0.837440250813961, "num_tokens": 51388270.0, "step": 42760 }, { "entropy": 1.9320979446172715, "epoch": 0.13258325783754754, "grad_norm": 8.221677780151367, "learning_rate": 6.9478976712443755e-06, "loss": 0.5291, "mean_token_accuracy": 0.8377137213945389, "num_tokens": 51399663.0, "step": 42770 }, { "entropy": 1.8442893743515014, "epoch": 0.13261425696259724, "grad_norm": 7.525363445281982, "learning_rate": 6.947085555047985e-06, "loss": 0.5198, "mean_token_accuracy": 0.8382806748151779, "num_tokens": 51411959.0, "step": 42780 }, { "entropy": 1.8832678958773612, "epoch": 0.13264525608764693, "grad_norm": 8.396281242370605, "learning_rate": 6.946273723561562e-06, "loss": 0.486, "mean_token_accuracy": 0.8351016476750374, "num_tokens": 51423692.0, "step": 42790 }, { "entropy": 1.8216581985354423, "epoch": 0.13267625521269663, "grad_norm": 8.608697891235352, "learning_rate": 6.9454621766187904e-06, "loss": 0.481, "mean_token_accuracy": 0.8447302252054214, "num_tokens": 51436099.0, "step": 42800 }, { "entropy": 1.9112284123897552, "epoch": 0.13270725433774633, "grad_norm": 5.335425853729248, "learning_rate": 6.944650914053489e-06, "loss": 0.5714, "mean_token_accuracy": 0.8203225836157799, "num_tokens": 51448674.0, "step": 42810 }, { "entropy": 1.9009429544210434, "epoch": 0.13273825346279602, "grad_norm": 8.94002914428711, "learning_rate": 6.943839935699615e-06, "loss": 0.5229, "mean_token_accuracy": 0.8352271884679794, "num_tokens": 51460644.0, "step": 42820 }, { "entropy": 1.8937845051288604, "epoch": 0.13276925258784572, "grad_norm": 4.647216320037842, "learning_rate": 6.943029241391259e-06, "loss": 0.5719, "mean_token_accuracy": 0.8236138209700584, "num_tokens": 51473179.0, "step": 42830 }, { "entropy": 1.8776622116565704, "epoch": 0.1328002517128954, "grad_norm": 9.4772367477417, "learning_rate": 6.942218830962648e-06, "loss": 0.5519, "mean_token_accuracy": 0.8267105832695961, "num_tokens": 51486098.0, "step": 42840 }, { "entropy": 1.7324663981795312, "epoch": 0.1328312508379451, "grad_norm": 2.6686737537384033, "learning_rate": 6.941408704248144e-06, "loss": 0.3974, "mean_token_accuracy": 0.8572231993079186, "num_tokens": 51499848.0, "step": 42850 }, { "entropy": 1.8544006049633026, "epoch": 0.13286224996299478, "grad_norm": 9.000062942504883, "learning_rate": 6.940598861082245e-06, "loss": 0.4792, "mean_token_accuracy": 0.8476372644305229, "num_tokens": 51512048.0, "step": 42860 }, { "entropy": 1.9546643912792205, "epoch": 0.13289324908804448, "grad_norm": 11.137999534606934, "learning_rate": 6.939789301299585e-06, "loss": 0.6104, "mean_token_accuracy": 0.8146396011114121, "num_tokens": 51523110.0, "step": 42870 }, { "entropy": 1.9276799812912941, "epoch": 0.13292424821309418, "grad_norm": 10.72694206237793, "learning_rate": 6.938980024734927e-06, "loss": 0.5825, "mean_token_accuracy": 0.8259019777178764, "num_tokens": 51534581.0, "step": 42880 }, { "entropy": 1.7917723521590232, "epoch": 0.13295524733814387, "grad_norm": 9.843888282775879, "learning_rate": 6.938171031223178e-06, "loss": 0.4621, "mean_token_accuracy": 0.8381556645035744, "num_tokens": 51548074.0, "step": 42890 }, { "entropy": 1.9410393938422204, "epoch": 0.13298624646319357, "grad_norm": 10.020930290222168, "learning_rate": 6.937362320599377e-06, "loss": 0.5643, "mean_token_accuracy": 0.8282965987920761, "num_tokens": 51559680.0, "step": 42900 }, { "entropy": 1.92443605363369, "epoch": 0.13301724558824327, "grad_norm": 10.336004257202148, "learning_rate": 6.936553892698692e-06, "loss": 0.557, "mean_token_accuracy": 0.8238545969128609, "num_tokens": 51571461.0, "step": 42910 }, { "entropy": 1.833638320863247, "epoch": 0.13304824471329296, "grad_norm": 10.432703971862793, "learning_rate": 6.935745747356429e-06, "loss": 0.4945, "mean_token_accuracy": 0.836958509683609, "num_tokens": 51583715.0, "step": 42920 }, { "entropy": 1.9126337066292762, "epoch": 0.13307924383834266, "grad_norm": 4.075279712677002, "learning_rate": 6.934937884408032e-06, "loss": 0.511, "mean_token_accuracy": 0.8253563806414604, "num_tokens": 51595798.0, "step": 42930 }, { "entropy": 1.9492529153823852, "epoch": 0.13311024296339236, "grad_norm": 8.318151473999023, "learning_rate": 6.934130303689072e-06, "loss": 0.5273, "mean_token_accuracy": 0.8317864894866943, "num_tokens": 51606958.0, "step": 42940 }, { "entropy": 1.9043660476803779, "epoch": 0.13314124208844205, "grad_norm": 4.388843059539795, "learning_rate": 6.93332300503526e-06, "loss": 0.5195, "mean_token_accuracy": 0.8340587362647056, "num_tokens": 51619061.0, "step": 42950 }, { "entropy": 1.874220597743988, "epoch": 0.13317224121349175, "grad_norm": 3.5497045516967773, "learning_rate": 6.932515988282438e-06, "loss": 0.5506, "mean_token_accuracy": 0.8257475927472114, "num_tokens": 51630509.0, "step": 42960 }, { "entropy": 1.8349529922008514, "epoch": 0.13320324033854145, "grad_norm": 8.229619979858398, "learning_rate": 6.931709253266582e-06, "loss": 0.5153, "mean_token_accuracy": 0.836476719379425, "num_tokens": 51642250.0, "step": 42970 }, { "entropy": 1.9145320609211922, "epoch": 0.13323423946359114, "grad_norm": 8.655306816101074, "learning_rate": 6.930902799823801e-06, "loss": 0.5779, "mean_token_accuracy": 0.8278056040406228, "num_tokens": 51654224.0, "step": 42980 }, { "entropy": 1.9441554173827171, "epoch": 0.13326523858864084, "grad_norm": 10.367897987365723, "learning_rate": 6.9300966277903415e-06, "loss": 0.5136, "mean_token_accuracy": 0.8317381590604782, "num_tokens": 51665436.0, "step": 42990 }, { "entropy": 1.8716817542910575, "epoch": 0.13329623771369054, "grad_norm": 8.810930252075195, "learning_rate": 6.929290737002579e-06, "loss": 0.5147, "mean_token_accuracy": 0.8309359416365624, "num_tokens": 51678512.0, "step": 43000 }, { "entropy": 1.9239093586802483, "epoch": 0.13332723683874023, "grad_norm": 10.611305236816406, "learning_rate": 6.928485127297019e-06, "loss": 0.5381, "mean_token_accuracy": 0.828672143816948, "num_tokens": 51689903.0, "step": 43010 }, { "entropy": 1.9985890626907348, "epoch": 0.13335823596378993, "grad_norm": 9.104059219360352, "learning_rate": 6.92767979851031e-06, "loss": 0.6427, "mean_token_accuracy": 0.8104574084281921, "num_tokens": 51700881.0, "step": 43020 }, { "entropy": 1.8614750012755394, "epoch": 0.13338923508883962, "grad_norm": 3.7970998287200928, "learning_rate": 6.926874750479225e-06, "loss": 0.5172, "mean_token_accuracy": 0.8261910125613212, "num_tokens": 51713424.0, "step": 43030 }, { "entropy": 1.9317557483911514, "epoch": 0.13342023421388932, "grad_norm": 4.564321041107178, "learning_rate": 6.926069983040672e-06, "loss": 0.514, "mean_token_accuracy": 0.8412420928478241, "num_tokens": 51724984.0, "step": 43040 }, { "entropy": 1.8474729374051093, "epoch": 0.13345123333893902, "grad_norm": 4.81390905380249, "learning_rate": 6.925265496031692e-06, "loss": 0.473, "mean_token_accuracy": 0.8341810539364815, "num_tokens": 51738001.0, "step": 43050 }, { "entropy": 1.938997507095337, "epoch": 0.13348223246398871, "grad_norm": 10.528800010681152, "learning_rate": 6.92446128928946e-06, "loss": 0.611, "mean_token_accuracy": 0.8227848649024964, "num_tokens": 51749495.0, "step": 43060 }, { "entropy": 1.724546130001545, "epoch": 0.1335132315890384, "grad_norm": 2.564570188522339, "learning_rate": 6.9236573626512815e-06, "loss": 0.4273, "mean_token_accuracy": 0.8508135721087455, "num_tokens": 51763526.0, "step": 43070 }, { "entropy": 1.9087968587875366, "epoch": 0.1335442307140881, "grad_norm": 10.67532730102539, "learning_rate": 6.922853715954594e-06, "loss": 0.5305, "mean_token_accuracy": 0.8410237103700637, "num_tokens": 51775239.0, "step": 43080 }, { "entropy": 1.9154332160949707, "epoch": 0.13357522983913778, "grad_norm": 8.595878601074219, "learning_rate": 6.922050349036968e-06, "loss": 0.5283, "mean_token_accuracy": 0.8399403557181359, "num_tokens": 51786721.0, "step": 43090 }, { "entropy": 1.9540342479944228, "epoch": 0.13360622896418747, "grad_norm": 8.74820327758789, "learning_rate": 6.921247261736105e-06, "loss": 0.5451, "mean_token_accuracy": 0.8402211844921113, "num_tokens": 51797894.0, "step": 43100 }, { "entropy": 1.9339706212282182, "epoch": 0.13363722808923717, "grad_norm": 10.516135215759277, "learning_rate": 6.920444453889838e-06, "loss": 0.5889, "mean_token_accuracy": 0.8250453129410744, "num_tokens": 51808969.0, "step": 43110 }, { "entropy": 1.9804834306240082, "epoch": 0.13366822721428687, "grad_norm": 9.053743362426758, "learning_rate": 6.919641925336133e-06, "loss": 0.6111, "mean_token_accuracy": 0.8214120969176293, "num_tokens": 51820364.0, "step": 43120 }, { "entropy": 1.856301885843277, "epoch": 0.13369922633933656, "grad_norm": 9.301741600036621, "learning_rate": 6.9188396759130886e-06, "loss": 0.4907, "mean_token_accuracy": 0.8374891221523285, "num_tokens": 51832827.0, "step": 43130 }, { "entropy": 1.817978872358799, "epoch": 0.13373022546438626, "grad_norm": 9.00844955444336, "learning_rate": 6.918037705458932e-06, "loss": 0.5028, "mean_token_accuracy": 0.8360246539115905, "num_tokens": 51845614.0, "step": 43140 }, { "entropy": 1.9156497776508332, "epoch": 0.13376122458943596, "grad_norm": 9.389613151550293, "learning_rate": 6.9172360138120205e-06, "loss": 0.5771, "mean_token_accuracy": 0.8285941392183304, "num_tokens": 51857344.0, "step": 43150 }, { "entropy": 1.9007001653313638, "epoch": 0.13379222371448565, "grad_norm": 9.915302276611328, "learning_rate": 6.9164346008108465e-06, "loss": 0.5365, "mean_token_accuracy": 0.8260676816105843, "num_tokens": 51869381.0, "step": 43160 }, { "entropy": 1.8302084445953368, "epoch": 0.13382322283953535, "grad_norm": 9.13771915435791, "learning_rate": 6.915633466294033e-06, "loss": 0.502, "mean_token_accuracy": 0.841213583946228, "num_tokens": 51882099.0, "step": 43170 }, { "entropy": 1.9292520344257356, "epoch": 0.13385422196458505, "grad_norm": 9.99196720123291, "learning_rate": 6.914832610100331e-06, "loss": 0.562, "mean_token_accuracy": 0.82747802734375, "num_tokens": 51893348.0, "step": 43180 }, { "entropy": 1.8885715886950494, "epoch": 0.13388522108963474, "grad_norm": 9.511621475219727, "learning_rate": 6.914032032068623e-06, "loss": 0.5304, "mean_token_accuracy": 0.8361998051404953, "num_tokens": 51906070.0, "step": 43190 }, { "entropy": 1.8899712771177293, "epoch": 0.13391622021468444, "grad_norm": 12.438838005065918, "learning_rate": 6.913231732037921e-06, "loss": 0.5835, "mean_token_accuracy": 0.8116952329874039, "num_tokens": 51917513.0, "step": 43200 }, { "entropy": 1.8960667297244072, "epoch": 0.13394721933973414, "grad_norm": 10.735957145690918, "learning_rate": 6.912431709847373e-06, "loss": 0.545, "mean_token_accuracy": 0.8333559215068818, "num_tokens": 51929508.0, "step": 43210 }, { "entropy": 1.8930713683366776, "epoch": 0.13397821846478383, "grad_norm": 8.979076385498047, "learning_rate": 6.911631965336252e-06, "loss": 0.5303, "mean_token_accuracy": 0.8422817379236222, "num_tokens": 51941545.0, "step": 43220 }, { "entropy": 1.8466507881879806, "epoch": 0.13400921758983353, "grad_norm": 8.20052719116211, "learning_rate": 6.9108324983439605e-06, "loss": 0.5342, "mean_token_accuracy": 0.8367902502417565, "num_tokens": 51953888.0, "step": 43230 }, { "entropy": 1.9022724777460098, "epoch": 0.13404021671488323, "grad_norm": 8.77592945098877, "learning_rate": 6.910033308710034e-06, "loss": 0.5512, "mean_token_accuracy": 0.819185683131218, "num_tokens": 51966016.0, "step": 43240 }, { "entropy": 1.9349356770515442, "epoch": 0.13407121583993292, "grad_norm": 6.25929594039917, "learning_rate": 6.909234396274137e-06, "loss": 0.549, "mean_token_accuracy": 0.833576287329197, "num_tokens": 51978893.0, "step": 43250 }, { "entropy": 1.9031900018453598, "epoch": 0.13410221496498262, "grad_norm": 8.607972145080566, "learning_rate": 6.908435760876063e-06, "loss": 0.5548, "mean_token_accuracy": 0.8291514754295349, "num_tokens": 51990673.0, "step": 43260 }, { "entropy": 1.9223877504467963, "epoch": 0.13413321409003232, "grad_norm": 8.238653182983398, "learning_rate": 6.9076374023557366e-06, "loss": 0.5656, "mean_token_accuracy": 0.8217443853616715, "num_tokens": 52002484.0, "step": 43270 }, { "entropy": 1.9485286980867387, "epoch": 0.134164213215082, "grad_norm": 13.640851974487305, "learning_rate": 6.90683932055321e-06, "loss": 0.6573, "mean_token_accuracy": 0.8062111675739289, "num_tokens": 52013371.0, "step": 43280 }, { "entropy": 1.8719741210341454, "epoch": 0.1341952123401317, "grad_norm": 7.591920852661133, "learning_rate": 6.906041515308666e-06, "loss": 0.5224, "mean_token_accuracy": 0.8311554431915283, "num_tokens": 52025599.0, "step": 43290 }, { "entropy": 1.867848064005375, "epoch": 0.1342262114651814, "grad_norm": 6.9264068603515625, "learning_rate": 6.905243986462417e-06, "loss": 0.4803, "mean_token_accuracy": 0.8396237522363663, "num_tokens": 52038600.0, "step": 43300 }, { "entropy": 1.9002704456448556, "epoch": 0.1342572105902311, "grad_norm": 8.932539939880371, "learning_rate": 6.9044467338549005e-06, "loss": 0.5224, "mean_token_accuracy": 0.8397117152810096, "num_tokens": 52050301.0, "step": 43310 }, { "entropy": 1.839927391707897, "epoch": 0.1342882097152808, "grad_norm": 3.8596723079681396, "learning_rate": 6.903649757326689e-06, "loss": 0.5026, "mean_token_accuracy": 0.8357837229967118, "num_tokens": 52062178.0, "step": 43320 }, { "entropy": 1.9066562354564667, "epoch": 0.1343192088403305, "grad_norm": 9.047643661499023, "learning_rate": 6.902853056718479e-06, "loss": 0.5685, "mean_token_accuracy": 0.8244990050792694, "num_tokens": 52073513.0, "step": 43330 }, { "entropy": 1.9126151755452157, "epoch": 0.13435020796538016, "grad_norm": 7.049984455108643, "learning_rate": 6.9020566318711e-06, "loss": 0.5438, "mean_token_accuracy": 0.8374730840325355, "num_tokens": 52084870.0, "step": 43340 }, { "entropy": 1.9268995508551598, "epoch": 0.13438120709042986, "grad_norm": 8.504199981689453, "learning_rate": 6.901260482625506e-06, "loss": 0.5814, "mean_token_accuracy": 0.8227284833788872, "num_tokens": 52096377.0, "step": 43350 }, { "entropy": 1.8570437088608742, "epoch": 0.13441220621547956, "grad_norm": 4.3164873123168945, "learning_rate": 6.90046460882278e-06, "loss": 0.4649, "mean_token_accuracy": 0.8381210267543793, "num_tokens": 52109355.0, "step": 43360 }, { "entropy": 1.834708495438099, "epoch": 0.13444320534052925, "grad_norm": 8.836848258972168, "learning_rate": 6.8996690103041376e-06, "loss": 0.523, "mean_token_accuracy": 0.8306301310658455, "num_tokens": 52121829.0, "step": 43370 }, { "entropy": 1.9287886828184129, "epoch": 0.13447420446557895, "grad_norm": 10.67462158203125, "learning_rate": 6.898873686910913e-06, "loss": 0.5709, "mean_token_accuracy": 0.8287528812885284, "num_tokens": 52133362.0, "step": 43380 }, { "entropy": 1.855184331536293, "epoch": 0.13450520359062865, "grad_norm": 3.361020803451538, "learning_rate": 6.898078638484581e-06, "loss": 0.5023, "mean_token_accuracy": 0.8325510248541832, "num_tokens": 52145855.0, "step": 43390 }, { "entropy": 1.9872781455516815, "epoch": 0.13453620271567834, "grad_norm": 4.882389545440674, "learning_rate": 6.897283864866734e-06, "loss": 0.6029, "mean_token_accuracy": 0.8120069310069085, "num_tokens": 52157083.0, "step": 43400 }, { "entropy": 1.9066018536686897, "epoch": 0.13456720184072804, "grad_norm": 8.167949676513672, "learning_rate": 6.896489365899096e-06, "loss": 0.5342, "mean_token_accuracy": 0.8314293026924133, "num_tokens": 52169100.0, "step": 43410 }, { "entropy": 1.8998596712946891, "epoch": 0.13459820096577774, "grad_norm": 8.666936874389648, "learning_rate": 6.895695141423521e-06, "loss": 0.5258, "mean_token_accuracy": 0.8273270547389984, "num_tokens": 52181307.0, "step": 43420 }, { "entropy": 1.8541322633624078, "epoch": 0.13462920009082743, "grad_norm": 4.933404922485352, "learning_rate": 6.894901191281985e-06, "loss": 0.5296, "mean_token_accuracy": 0.8414107546210289, "num_tokens": 52193857.0, "step": 43430 }, { "entropy": 1.8842070639133452, "epoch": 0.13466019921587713, "grad_norm": 7.285519599914551, "learning_rate": 6.894107515316597e-06, "loss": 0.513, "mean_token_accuracy": 0.8374163269996643, "num_tokens": 52205689.0, "step": 43440 }, { "entropy": 1.8152766466140746, "epoch": 0.13469119834092683, "grad_norm": 7.589842796325684, "learning_rate": 6.893314113369588e-06, "loss": 0.4532, "mean_token_accuracy": 0.845301553606987, "num_tokens": 52219579.0, "step": 43450 }, { "entropy": 1.756292749941349, "epoch": 0.13472219746597652, "grad_norm": 6.797213554382324, "learning_rate": 6.89252098528332e-06, "loss": 0.4379, "mean_token_accuracy": 0.8514433890581131, "num_tokens": 52232565.0, "step": 43460 }, { "entropy": 1.8172416999936103, "epoch": 0.13475319659102622, "grad_norm": 7.267201900482178, "learning_rate": 6.891728130900279e-06, "loss": 0.4651, "mean_token_accuracy": 0.8358834356069564, "num_tokens": 52246625.0, "step": 43470 }, { "entropy": 1.9737324267625809, "epoch": 0.13478419571607592, "grad_norm": 7.657471656799316, "learning_rate": 6.89093555006308e-06, "loss": 0.5858, "mean_token_accuracy": 0.827315254509449, "num_tokens": 52257290.0, "step": 43480 }, { "entropy": 1.8453372538089752, "epoch": 0.1348151948411256, "grad_norm": 3.9033000469207764, "learning_rate": 6.890143242614467e-06, "loss": 0.4739, "mean_token_accuracy": 0.8415728509426117, "num_tokens": 52269698.0, "step": 43490 }, { "entropy": 1.912654523551464, "epoch": 0.1348461939661753, "grad_norm": 9.669717788696289, "learning_rate": 6.889351208397301e-06, "loss": 0.5615, "mean_token_accuracy": 0.8255023837089539, "num_tokens": 52281310.0, "step": 43500 }, { "entropy": 1.9051228925585746, "epoch": 0.134877193091225, "grad_norm": 9.635804176330566, "learning_rate": 6.888559447254581e-06, "loss": 0.5713, "mean_token_accuracy": 0.8345887675881386, "num_tokens": 52293144.0, "step": 43510 }, { "entropy": 1.8088420122861861, "epoch": 0.1349081922162747, "grad_norm": 7.230260372161865, "learning_rate": 6.887767959029426e-06, "loss": 0.4494, "mean_token_accuracy": 0.8482095554471016, "num_tokens": 52305778.0, "step": 43520 }, { "entropy": 1.7873062670230866, "epoch": 0.1349391913413244, "grad_norm": 7.785470008850098, "learning_rate": 6.886976743565082e-06, "loss": 0.4747, "mean_token_accuracy": 0.847273476421833, "num_tokens": 52318326.0, "step": 43530 }, { "entropy": 1.8959727630019187, "epoch": 0.1349701904663741, "grad_norm": 4.0128254890441895, "learning_rate": 6.886185800704923e-06, "loss": 0.5779, "mean_token_accuracy": 0.8261519700288773, "num_tokens": 52329198.0, "step": 43540 }, { "entropy": 1.8708949625492095, "epoch": 0.1350011895914238, "grad_norm": 8.658716201782227, "learning_rate": 6.8853951302924424e-06, "loss": 0.5234, "mean_token_accuracy": 0.8351444467902184, "num_tokens": 52340797.0, "step": 43550 }, { "entropy": 1.9420608460903168, "epoch": 0.1350321887164735, "grad_norm": 8.919011116027832, "learning_rate": 6.884604732171271e-06, "loss": 0.6175, "mean_token_accuracy": 0.8187188416719436, "num_tokens": 52352295.0, "step": 43560 }, { "entropy": 1.8305167585611344, "epoch": 0.13506318784152319, "grad_norm": 7.717691898345947, "learning_rate": 6.883814606185152e-06, "loss": 0.5156, "mean_token_accuracy": 0.8335830017924308, "num_tokens": 52364889.0, "step": 43570 }, { "entropy": 1.8474395513534545, "epoch": 0.13509418696657288, "grad_norm": 4.04104471206665, "learning_rate": 6.883024752177963e-06, "loss": 0.5319, "mean_token_accuracy": 0.8278833538293838, "num_tokens": 52377375.0, "step": 43580 }, { "entropy": 1.8440303042531014, "epoch": 0.13512518609162255, "grad_norm": 8.369099617004395, "learning_rate": 6.882235169993708e-06, "loss": 0.5352, "mean_token_accuracy": 0.838912108540535, "num_tokens": 52389129.0, "step": 43590 }, { "entropy": 1.7567294076085092, "epoch": 0.13515618521667225, "grad_norm": 8.187392234802246, "learning_rate": 6.881445859476506e-06, "loss": 0.4261, "mean_token_accuracy": 0.8403543129563331, "num_tokens": 52402761.0, "step": 43600 }, { "entropy": 1.854997991025448, "epoch": 0.13518718434172194, "grad_norm": 3.2255561351776123, "learning_rate": 6.880656820470614e-06, "loss": 0.4601, "mean_token_accuracy": 0.8469349488615989, "num_tokens": 52414592.0, "step": 43610 }, { "entropy": 1.9099668189883232, "epoch": 0.13521818346677164, "grad_norm": 8.775009155273438, "learning_rate": 6.8798680528204045e-06, "loss": 0.5708, "mean_token_accuracy": 0.8262275233864784, "num_tokens": 52425981.0, "step": 43620 }, { "entropy": 1.9568244695663453, "epoch": 0.13524918259182134, "grad_norm": 7.415703773498535, "learning_rate": 6.879079556370377e-06, "loss": 0.5595, "mean_token_accuracy": 0.8358083561062812, "num_tokens": 52436580.0, "step": 43630 }, { "entropy": 1.87071183770895, "epoch": 0.13528018171687103, "grad_norm": 12.355653762817383, "learning_rate": 6.87829133096516e-06, "loss": 0.5773, "mean_token_accuracy": 0.8205010086297989, "num_tokens": 52449363.0, "step": 43640 }, { "entropy": 1.7835715875029563, "epoch": 0.13531118084192073, "grad_norm": 10.02258014678955, "learning_rate": 6.877503376449503e-06, "loss": 0.4716, "mean_token_accuracy": 0.8421624451875687, "num_tokens": 52462468.0, "step": 43650 }, { "entropy": 1.834514120221138, "epoch": 0.13534217996697043, "grad_norm": 9.3626127243042, "learning_rate": 6.876715692668278e-06, "loss": 0.5309, "mean_token_accuracy": 0.8375687584280967, "num_tokens": 52475023.0, "step": 43660 }, { "entropy": 1.9362987548112869, "epoch": 0.13537317909202012, "grad_norm": 8.014119148254395, "learning_rate": 6.875928279466486e-06, "loss": 0.582, "mean_token_accuracy": 0.8320098280906677, "num_tokens": 52486718.0, "step": 43670 }, { "entropy": 1.9744535475969314, "epoch": 0.13540417821706982, "grad_norm": 9.7212495803833, "learning_rate": 6.87514113668925e-06, "loss": 0.5774, "mean_token_accuracy": 0.8281274557113647, "num_tokens": 52497664.0, "step": 43680 }, { "entropy": 1.893913634121418, "epoch": 0.13543517734211952, "grad_norm": 6.767265796661377, "learning_rate": 6.874354264181815e-06, "loss": 0.5278, "mean_token_accuracy": 0.835284897685051, "num_tokens": 52509266.0, "step": 43690 }, { "entropy": 1.985574059188366, "epoch": 0.1354661764671692, "grad_norm": 5.145320892333984, "learning_rate": 6.873567661789554e-06, "loss": 0.5915, "mean_token_accuracy": 0.8212827607989311, "num_tokens": 52520661.0, "step": 43700 }, { "entropy": 1.8134831815958024, "epoch": 0.1354971755922189, "grad_norm": 3.865335702896118, "learning_rate": 6.872781329357961e-06, "loss": 0.4826, "mean_token_accuracy": 0.839052714407444, "num_tokens": 52533364.0, "step": 43710 }, { "entropy": 1.9578730061650276, "epoch": 0.1355281747172686, "grad_norm": 10.674678802490234, "learning_rate": 6.871995266732656e-06, "loss": 0.6165, "mean_token_accuracy": 0.8204101115465164, "num_tokens": 52544383.0, "step": 43720 }, { "entropy": 1.7825062423944473, "epoch": 0.1355591738423183, "grad_norm": 8.773101806640625, "learning_rate": 6.871209473759379e-06, "loss": 0.4185, "mean_token_accuracy": 0.8400455713272095, "num_tokens": 52557813.0, "step": 43730 }, { "entropy": 1.9574034690856934, "epoch": 0.135590172967368, "grad_norm": 9.174714088439941, "learning_rate": 6.870423950283998e-06, "loss": 0.5867, "mean_token_accuracy": 0.8255863264203072, "num_tokens": 52569076.0, "step": 43740 }, { "entropy": 1.9655730903148652, "epoch": 0.1356211720924177, "grad_norm": 10.547797203063965, "learning_rate": 6.869638696152497e-06, "loss": 0.602, "mean_token_accuracy": 0.8277508243918419, "num_tokens": 52579434.0, "step": 43750 }, { "entropy": 1.8552222028374672, "epoch": 0.1356521712174674, "grad_norm": 8.642329216003418, "learning_rate": 6.868853711210994e-06, "loss": 0.4995, "mean_token_accuracy": 0.8307379022240639, "num_tokens": 52592462.0, "step": 43760 }, { "entropy": 1.897879421710968, "epoch": 0.1356831703425171, "grad_norm": 4.2802042961120605, "learning_rate": 6.868068995305721e-06, "loss": 0.504, "mean_token_accuracy": 0.830087074637413, "num_tokens": 52605129.0, "step": 43770 }, { "entropy": 1.9668879002332686, "epoch": 0.1357141694675668, "grad_norm": 10.869790077209473, "learning_rate": 6.8672845482830375e-06, "loss": 0.63, "mean_token_accuracy": 0.8210312753915787, "num_tokens": 52616024.0, "step": 43780 }, { "entropy": 1.909705390036106, "epoch": 0.13574516859261648, "grad_norm": 8.73975658416748, "learning_rate": 6.866500369989424e-06, "loss": 0.5014, "mean_token_accuracy": 0.8324807018041611, "num_tokens": 52627564.0, "step": 43790 }, { "entropy": 1.8812494575977325, "epoch": 0.13577616771766618, "grad_norm": 9.084870338439941, "learning_rate": 6.865716460271482e-06, "loss": 0.5365, "mean_token_accuracy": 0.8389278829097748, "num_tokens": 52639374.0, "step": 43800 }, { "entropy": 1.9060865387320518, "epoch": 0.13580716684271588, "grad_norm": 8.113476753234863, "learning_rate": 6.86493281897594e-06, "loss": 0.5409, "mean_token_accuracy": 0.8345111593604088, "num_tokens": 52650722.0, "step": 43810 }, { "entropy": 1.8717187106609345, "epoch": 0.13583816596776557, "grad_norm": 9.743805885314941, "learning_rate": 6.864149445949645e-06, "loss": 0.5383, "mean_token_accuracy": 0.8225096851587296, "num_tokens": 52663058.0, "step": 43820 }, { "entropy": 1.8713490128517152, "epoch": 0.13586916509281524, "grad_norm": 8.773591995239258, "learning_rate": 6.86336634103957e-06, "loss": 0.5154, "mean_token_accuracy": 0.8360903859138489, "num_tokens": 52675165.0, "step": 43830 }, { "entropy": 1.8310488358139991, "epoch": 0.13590016421786494, "grad_norm": 4.114312648773193, "learning_rate": 6.862583504092806e-06, "loss": 0.4769, "mean_token_accuracy": 0.8354193419218063, "num_tokens": 52686891.0, "step": 43840 }, { "entropy": 1.8633979707956314, "epoch": 0.13593116334291463, "grad_norm": 9.394161224365234, "learning_rate": 6.861800934956568e-06, "loss": 0.5265, "mean_token_accuracy": 0.8260962456464768, "num_tokens": 52698634.0, "step": 43850 }, { "entropy": 1.8397690102458, "epoch": 0.13596216246796433, "grad_norm": 8.046875, "learning_rate": 6.861018633478194e-06, "loss": 0.4864, "mean_token_accuracy": 0.8343220546841621, "num_tokens": 52711492.0, "step": 43860 }, { "entropy": 1.8941639497876168, "epoch": 0.13599316159301403, "grad_norm": 10.234258651733398, "learning_rate": 6.8602365995051445e-06, "loss": 0.5132, "mean_token_accuracy": 0.8383172944188118, "num_tokens": 52723250.0, "step": 43870 }, { "entropy": 1.8545018136501312, "epoch": 0.13602416071806372, "grad_norm": 13.808713912963867, "learning_rate": 6.8594548328849984e-06, "loss": 0.52, "mean_token_accuracy": 0.8330124363303184, "num_tokens": 52734990.0, "step": 43880 }, { "entropy": 1.8971347585320473, "epoch": 0.13605515984311342, "grad_norm": 9.022436141967773, "learning_rate": 6.858673333465455e-06, "loss": 0.5029, "mean_token_accuracy": 0.8370542734861374, "num_tokens": 52746095.0, "step": 43890 }, { "entropy": 1.8831921055912972, "epoch": 0.13608615896816312, "grad_norm": 11.447751998901367, "learning_rate": 6.8578921010943434e-06, "loss": 0.5254, "mean_token_accuracy": 0.8259268268942833, "num_tokens": 52758290.0, "step": 43900 }, { "entropy": 1.8786703854799272, "epoch": 0.13611715809321281, "grad_norm": 7.6989898681640625, "learning_rate": 6.857111135619603e-06, "loss": 0.4644, "mean_token_accuracy": 0.8493966236710548, "num_tokens": 52770682.0, "step": 43910 }, { "entropy": 1.8685062855482102, "epoch": 0.1361481572182625, "grad_norm": 9.15949821472168, "learning_rate": 6.856330436889304e-06, "loss": 0.555, "mean_token_accuracy": 0.8257764980196953, "num_tokens": 52782612.0, "step": 43920 }, { "entropy": 1.9584510385990144, "epoch": 0.1361791563433122, "grad_norm": 8.610404968261719, "learning_rate": 6.855550004751631e-06, "loss": 0.6142, "mean_token_accuracy": 0.8210650518536567, "num_tokens": 52793680.0, "step": 43930 }, { "entropy": 1.8771604105830193, "epoch": 0.1362101554683619, "grad_norm": 9.448259353637695, "learning_rate": 6.854769839054892e-06, "loss": 0.578, "mean_token_accuracy": 0.8262861162424088, "num_tokens": 52804679.0, "step": 43940 }, { "entropy": 1.9119238778948784, "epoch": 0.1362411545934116, "grad_norm": 7.940917015075684, "learning_rate": 6.853989939647518e-06, "loss": 0.5228, "mean_token_accuracy": 0.8279161244630814, "num_tokens": 52816435.0, "step": 43950 }, { "entropy": 1.9507309287786483, "epoch": 0.1362721537184613, "grad_norm": 8.971407890319824, "learning_rate": 6.853210306378055e-06, "loss": 0.5933, "mean_token_accuracy": 0.8229886546730996, "num_tokens": 52827096.0, "step": 43960 }, { "entropy": 2.0049089699983598, "epoch": 0.136303152843511, "grad_norm": 9.031655311584473, "learning_rate": 6.852430939095177e-06, "loss": 0.6456, "mean_token_accuracy": 0.8153550997376442, "num_tokens": 52838566.0, "step": 43970 }, { "entropy": 1.9605593144893647, "epoch": 0.1363341519685607, "grad_norm": 9.522255897521973, "learning_rate": 6.851651837647672e-06, "loss": 0.6225, "mean_token_accuracy": 0.8113553240895272, "num_tokens": 52848874.0, "step": 43980 }, { "entropy": 1.9099364891648292, "epoch": 0.1363651510936104, "grad_norm": 9.196107864379883, "learning_rate": 6.85087300188445e-06, "loss": 0.5653, "mean_token_accuracy": 0.8262931287288666, "num_tokens": 52861218.0, "step": 43990 }, { "entropy": 1.911503429710865, "epoch": 0.13639615021866008, "grad_norm": 9.200566291809082, "learning_rate": 6.850094431654544e-06, "loss": 0.5383, "mean_token_accuracy": 0.8331650167703628, "num_tokens": 52872651.0, "step": 44000 }, { "entropy": 1.8342396020889282, "epoch": 0.13642714934370978, "grad_norm": 4.3369903564453125, "learning_rate": 6.849316126807107e-06, "loss": 0.507, "mean_token_accuracy": 0.8284252122044563, "num_tokens": 52885546.0, "step": 44010 }, { "entropy": 1.8490222096443176, "epoch": 0.13645814846875948, "grad_norm": 7.278261184692383, "learning_rate": 6.848538087191405e-06, "loss": 0.4986, "mean_token_accuracy": 0.8356017455458641, "num_tokens": 52897211.0, "step": 44020 }, { "entropy": 1.8598289757966995, "epoch": 0.13648914759380917, "grad_norm": 8.271986961364746, "learning_rate": 6.8477603126568325e-06, "loss": 0.5433, "mean_token_accuracy": 0.8204525783658028, "num_tokens": 52908373.0, "step": 44030 }, { "entropy": 1.8169844523072243, "epoch": 0.13652014671885887, "grad_norm": 9.633807182312012, "learning_rate": 6.846982803052898e-06, "loss": 0.4595, "mean_token_accuracy": 0.8419378191232681, "num_tokens": 52921312.0, "step": 44040 }, { "entropy": 1.8356873735785484, "epoch": 0.13655114584390857, "grad_norm": 10.545353889465332, "learning_rate": 6.846205558229234e-06, "loss": 0.4975, "mean_token_accuracy": 0.8387926116585731, "num_tokens": 52934203.0, "step": 44050 }, { "entropy": 1.9119671627879142, "epoch": 0.13658214496895826, "grad_norm": 10.298490524291992, "learning_rate": 6.845428578035587e-06, "loss": 0.5507, "mean_token_accuracy": 0.8357212603092193, "num_tokens": 52945457.0, "step": 44060 }, { "entropy": 1.9574100762605666, "epoch": 0.13661314409400796, "grad_norm": 7.986303806304932, "learning_rate": 6.8446518623218284e-06, "loss": 0.5709, "mean_token_accuracy": 0.8229815408587455, "num_tokens": 52956675.0, "step": 44070 }, { "entropy": 1.9151659101247787, "epoch": 0.13664414321905763, "grad_norm": 8.7662935256958, "learning_rate": 6.843875410937946e-06, "loss": 0.5161, "mean_token_accuracy": 0.8375581040978431, "num_tokens": 52967730.0, "step": 44080 }, { "entropy": 1.8822844669222831, "epoch": 0.13667514234410733, "grad_norm": 8.069694519042969, "learning_rate": 6.8430992237340455e-06, "loss": 0.5308, "mean_token_accuracy": 0.8303315103054046, "num_tokens": 52979174.0, "step": 44090 }, { "entropy": 1.83747291713953, "epoch": 0.13670614146915702, "grad_norm": 8.96426773071289, "learning_rate": 6.8423233005603554e-06, "loss": 0.5115, "mean_token_accuracy": 0.8289276763796807, "num_tokens": 52991305.0, "step": 44100 }, { "entropy": 1.8574891343712807, "epoch": 0.13673714059420672, "grad_norm": 8.317970275878906, "learning_rate": 6.8415476412672185e-06, "loss": 0.5309, "mean_token_accuracy": 0.8376831218600274, "num_tokens": 53002731.0, "step": 44110 }, { "entropy": 1.929608154296875, "epoch": 0.13676813971925642, "grad_norm": 8.995396614074707, "learning_rate": 6.8407722457051005e-06, "loss": 0.5975, "mean_token_accuracy": 0.8171190902590751, "num_tokens": 53013927.0, "step": 44120 }, { "entropy": 1.777741050720215, "epoch": 0.1367991388443061, "grad_norm": 9.382816314697266, "learning_rate": 6.839997113724582e-06, "loss": 0.4223, "mean_token_accuracy": 0.8454552739858627, "num_tokens": 53026933.0, "step": 44130 }, { "entropy": 1.8432798728346824, "epoch": 0.1368301379693558, "grad_norm": 7.455301284790039, "learning_rate": 6.839222245176366e-06, "loss": 0.5518, "mean_token_accuracy": 0.837872776389122, "num_tokens": 53039377.0, "step": 44140 }, { "entropy": 1.8326244458556176, "epoch": 0.1368611370944055, "grad_norm": 8.096274375915527, "learning_rate": 6.838447639911271e-06, "loss": 0.5404, "mean_token_accuracy": 0.8261276423931122, "num_tokens": 53050778.0, "step": 44150 }, { "entropy": 1.7385433629155158, "epoch": 0.1368921362194552, "grad_norm": 8.745050430297852, "learning_rate": 6.837673297780233e-06, "loss": 0.4397, "mean_token_accuracy": 0.8421786040067673, "num_tokens": 53064304.0, "step": 44160 }, { "entropy": 1.8994769722223281, "epoch": 0.1369231353445049, "grad_norm": 9.911214828491211, "learning_rate": 6.836899218634308e-06, "loss": 0.5867, "mean_token_accuracy": 0.8285346269607544, "num_tokens": 53076071.0, "step": 44170 }, { "entropy": 1.87768052816391, "epoch": 0.1369541344695546, "grad_norm": 8.06799602508545, "learning_rate": 6.836125402324671e-06, "loss": 0.5769, "mean_token_accuracy": 0.8310927867889404, "num_tokens": 53087494.0, "step": 44180 }, { "entropy": 1.9110476791858673, "epoch": 0.1369851335946043, "grad_norm": 9.325477600097656, "learning_rate": 6.835351848702615e-06, "loss": 0.6014, "mean_token_accuracy": 0.824842332303524, "num_tokens": 53098984.0, "step": 44190 }, { "entropy": 1.8674473196268082, "epoch": 0.137016132719654, "grad_norm": 9.16952896118164, "learning_rate": 6.834578557619546e-06, "loss": 0.5374, "mean_token_accuracy": 0.8323730126023292, "num_tokens": 53110328.0, "step": 44200 }, { "entropy": 1.801041378080845, "epoch": 0.13704713184470368, "grad_norm": 4.72969388961792, "learning_rate": 6.8338055289269914e-06, "loss": 0.5129, "mean_token_accuracy": 0.840267626941204, "num_tokens": 53123078.0, "step": 44210 }, { "entropy": 1.9226965308189392, "epoch": 0.13707813096975338, "grad_norm": 9.973706245422363, "learning_rate": 6.8330327624765955e-06, "loss": 0.6247, "mean_token_accuracy": 0.8245911300182343, "num_tokens": 53133601.0, "step": 44220 }, { "entropy": 1.9381461694836617, "epoch": 0.13710913009480308, "grad_norm": 8.738226890563965, "learning_rate": 6.832260258120124e-06, "loss": 0.5691, "mean_token_accuracy": 0.8299503594636917, "num_tokens": 53145004.0, "step": 44230 }, { "entropy": 1.7989469826221467, "epoch": 0.13714012921985277, "grad_norm": 9.312309265136719, "learning_rate": 6.831488015709451e-06, "loss": 0.4809, "mean_token_accuracy": 0.8428971692919731, "num_tokens": 53158057.0, "step": 44240 }, { "entropy": 1.9178980767726899, "epoch": 0.13717112834490247, "grad_norm": 8.284287452697754, "learning_rate": 6.830716035096575e-06, "loss": 0.5569, "mean_token_accuracy": 0.8296015933156013, "num_tokens": 53169211.0, "step": 44250 }, { "entropy": 1.8435182958841323, "epoch": 0.13720212746995217, "grad_norm": 9.238348007202148, "learning_rate": 6.82994431613361e-06, "loss": 0.5224, "mean_token_accuracy": 0.8327393636107445, "num_tokens": 53180990.0, "step": 44260 }, { "entropy": 1.8937449261546135, "epoch": 0.13723312659500186, "grad_norm": 4.894327163696289, "learning_rate": 6.829172858672786e-06, "loss": 0.5545, "mean_token_accuracy": 0.827644082903862, "num_tokens": 53192974.0, "step": 44270 }, { "entropy": 1.910013110935688, "epoch": 0.13726412572005156, "grad_norm": 7.417638301849365, "learning_rate": 6.828401662566448e-06, "loss": 0.5548, "mean_token_accuracy": 0.8346083343029023, "num_tokens": 53204669.0, "step": 44280 }, { "entropy": 1.878174401819706, "epoch": 0.13729512484510126, "grad_norm": 8.256863594055176, "learning_rate": 6.827630727667063e-06, "loss": 0.5648, "mean_token_accuracy": 0.8261244371533394, "num_tokens": 53216346.0, "step": 44290 }, { "entropy": 1.8674972161650658, "epoch": 0.13732612397015095, "grad_norm": 8.606951713562012, "learning_rate": 6.826860053827209e-06, "loss": 0.5128, "mean_token_accuracy": 0.832400643825531, "num_tokens": 53228130.0, "step": 44300 }, { "entropy": 1.8570816084742545, "epoch": 0.13735712309520065, "grad_norm": 8.84512710571289, "learning_rate": 6.826089640899584e-06, "loss": 0.5438, "mean_token_accuracy": 0.8290506705641747, "num_tokens": 53240500.0, "step": 44310 }, { "entropy": 1.9010854482650756, "epoch": 0.13738812222025035, "grad_norm": 9.149563789367676, "learning_rate": 6.825319488737001e-06, "loss": 0.5429, "mean_token_accuracy": 0.8214688524603844, "num_tokens": 53252154.0, "step": 44320 }, { "entropy": 1.8761808782815934, "epoch": 0.13741912134530002, "grad_norm": 8.693964958190918, "learning_rate": 6.824549597192389e-06, "loss": 0.4767, "mean_token_accuracy": 0.8360074177384377, "num_tokens": 53264128.0, "step": 44330 }, { "entropy": 1.8591908350586892, "epoch": 0.1374501204703497, "grad_norm": 6.832935810089111, "learning_rate": 6.823779966118794e-06, "loss": 0.5326, "mean_token_accuracy": 0.831575682759285, "num_tokens": 53275450.0, "step": 44340 }, { "entropy": 1.840942220389843, "epoch": 0.1374811195953994, "grad_norm": 8.024560928344727, "learning_rate": 6.823010595369376e-06, "loss": 0.481, "mean_token_accuracy": 0.836443580687046, "num_tokens": 53287524.0, "step": 44350 }, { "entropy": 1.9482527136802674, "epoch": 0.1375121187204491, "grad_norm": 9.615019798278809, "learning_rate": 6.8222414847974136e-06, "loss": 0.565, "mean_token_accuracy": 0.8262650355696678, "num_tokens": 53298379.0, "step": 44360 }, { "entropy": 1.8953028246760368, "epoch": 0.1375431178454988, "grad_norm": 5.4591169357299805, "learning_rate": 6.821472634256301e-06, "loss": 0.5462, "mean_token_accuracy": 0.8326457425951957, "num_tokens": 53310130.0, "step": 44370 }, { "entropy": 1.9527557328343392, "epoch": 0.1375741169705485, "grad_norm": 9.788069725036621, "learning_rate": 6.820704043599545e-06, "loss": 0.5806, "mean_token_accuracy": 0.8213037893176078, "num_tokens": 53321390.0, "step": 44380 }, { "entropy": 1.8823087111115455, "epoch": 0.1376051160955982, "grad_norm": 9.298927307128906, "learning_rate": 6.819935712680769e-06, "loss": 0.5455, "mean_token_accuracy": 0.8245194494724274, "num_tokens": 53333069.0, "step": 44390 }, { "entropy": 1.920275841653347, "epoch": 0.1376361152206479, "grad_norm": 8.774785041809082, "learning_rate": 6.819167641353716e-06, "loss": 0.5892, "mean_token_accuracy": 0.8146108031272888, "num_tokens": 53344813.0, "step": 44400 }, { "entropy": 1.87754784822464, "epoch": 0.1376671143456976, "grad_norm": 4.192923545837402, "learning_rate": 6.818399829472239e-06, "loss": 0.5525, "mean_token_accuracy": 0.8271669283509254, "num_tokens": 53357822.0, "step": 44410 }, { "entropy": 1.7767975226044654, "epoch": 0.13769811347074729, "grad_norm": 9.163516998291016, "learning_rate": 6.8176322768903065e-06, "loss": 0.5023, "mean_token_accuracy": 0.8373310551047325, "num_tokens": 53371230.0, "step": 44420 }, { "entropy": 1.8346783101558686, "epoch": 0.13772911259579698, "grad_norm": 8.969266891479492, "learning_rate": 6.816864983462007e-06, "loss": 0.4936, "mean_token_accuracy": 0.8468596264719963, "num_tokens": 53383491.0, "step": 44430 }, { "entropy": 1.8975114315748214, "epoch": 0.13776011172084668, "grad_norm": 8.912147521972656, "learning_rate": 6.816097949041537e-06, "loss": 0.5409, "mean_token_accuracy": 0.8271856248378754, "num_tokens": 53395233.0, "step": 44440 }, { "entropy": 1.7993320614099502, "epoch": 0.13779111084589638, "grad_norm": 4.392065048217773, "learning_rate": 6.815331173483213e-06, "loss": 0.4544, "mean_token_accuracy": 0.8502204686403274, "num_tokens": 53407103.0, "step": 44450 }, { "entropy": 1.7951432079076768, "epoch": 0.13782210997094607, "grad_norm": 8.865302085876465, "learning_rate": 6.8145646566414645e-06, "loss": 0.5096, "mean_token_accuracy": 0.8381197184324265, "num_tokens": 53418949.0, "step": 44460 }, { "entropy": 1.787670373916626, "epoch": 0.13785310909599577, "grad_norm": 9.359115600585938, "learning_rate": 6.813798398370836e-06, "loss": 0.4616, "mean_token_accuracy": 0.8410790026187897, "num_tokens": 53431653.0, "step": 44470 }, { "entropy": 1.8918751433491707, "epoch": 0.13788410822104546, "grad_norm": 8.384889602661133, "learning_rate": 6.813032398525985e-06, "loss": 0.5482, "mean_token_accuracy": 0.8350237265229226, "num_tokens": 53443023.0, "step": 44480 }, { "entropy": 1.8514575868844987, "epoch": 0.13791510734609516, "grad_norm": 9.768966674804688, "learning_rate": 6.812266656961686e-06, "loss": 0.5369, "mean_token_accuracy": 0.8352868661284447, "num_tokens": 53454879.0, "step": 44490 }, { "entropy": 1.7858368843793868, "epoch": 0.13794610647114486, "grad_norm": 10.17784309387207, "learning_rate": 6.811501173532825e-06, "loss": 0.5214, "mean_token_accuracy": 0.8304376199841499, "num_tokens": 53467387.0, "step": 44500 }, { "entropy": 1.8139680430293084, "epoch": 0.13797710559619455, "grad_norm": 3.656538724899292, "learning_rate": 6.810735948094402e-06, "loss": 0.4672, "mean_token_accuracy": 0.8408811286091804, "num_tokens": 53479197.0, "step": 44510 }, { "entropy": 1.8897014811635018, "epoch": 0.13800810472124425, "grad_norm": 8.641545295715332, "learning_rate": 6.809970980501534e-06, "loss": 0.5551, "mean_token_accuracy": 0.8215809658169746, "num_tokens": 53490882.0, "step": 44520 }, { "entropy": 1.9230589419603348, "epoch": 0.13803910384629395, "grad_norm": 10.131928443908691, "learning_rate": 6.80920627060945e-06, "loss": 0.5744, "mean_token_accuracy": 0.8328511983156204, "num_tokens": 53502073.0, "step": 44530 }, { "entropy": 1.924891072511673, "epoch": 0.13807010297134364, "grad_norm": 8.855517387390137, "learning_rate": 6.808441818273496e-06, "loss": 0.5667, "mean_token_accuracy": 0.8318179234862327, "num_tokens": 53513609.0, "step": 44540 }, { "entropy": 1.9333911418914795, "epoch": 0.13810110209639334, "grad_norm": 8.190418243408203, "learning_rate": 6.807677623349122e-06, "loss": 0.6013, "mean_token_accuracy": 0.8251995086669922, "num_tokens": 53524500.0, "step": 44550 }, { "entropy": 1.8267549514770507, "epoch": 0.13813210122144304, "grad_norm": 9.940940856933594, "learning_rate": 6.806913685691902e-06, "loss": 0.5157, "mean_token_accuracy": 0.8373084679245949, "num_tokens": 53537133.0, "step": 44560 }, { "entropy": 1.8623538598418237, "epoch": 0.1381631003464927, "grad_norm": 9.177823066711426, "learning_rate": 6.806150005157519e-06, "loss": 0.5458, "mean_token_accuracy": 0.8380098685622215, "num_tokens": 53548906.0, "step": 44570 }, { "entropy": 1.8576925709843635, "epoch": 0.1381940994715424, "grad_norm": 4.811609268188477, "learning_rate": 6.805386581601771e-06, "loss": 0.5152, "mean_token_accuracy": 0.8375003471970558, "num_tokens": 53561573.0, "step": 44580 }, { "entropy": 1.8996227890253068, "epoch": 0.1382250985965921, "grad_norm": 9.413639068603516, "learning_rate": 6.804623414880566e-06, "loss": 0.5631, "mean_token_accuracy": 0.8216565132141114, "num_tokens": 53572897.0, "step": 44590 }, { "entropy": 1.9721861988306046, "epoch": 0.1382560977216418, "grad_norm": 8.800419807434082, "learning_rate": 6.803860504849928e-06, "loss": 0.5792, "mean_token_accuracy": 0.8295346736907959, "num_tokens": 53583631.0, "step": 44600 }, { "entropy": 1.848584523051977, "epoch": 0.1382870968466915, "grad_norm": 10.385255813598633, "learning_rate": 6.803097851365994e-06, "loss": 0.5099, "mean_token_accuracy": 0.8350134581327439, "num_tokens": 53596466.0, "step": 44610 }, { "entropy": 1.854781810939312, "epoch": 0.1383180959717412, "grad_norm": 9.363265991210938, "learning_rate": 6.8023354542850115e-06, "loss": 0.5288, "mean_token_accuracy": 0.8366737559437751, "num_tokens": 53608422.0, "step": 44620 }, { "entropy": 1.9337295114994049, "epoch": 0.1383490950967909, "grad_norm": 10.87320613861084, "learning_rate": 6.8015733134633434e-06, "loss": 0.5616, "mean_token_accuracy": 0.8331938117742539, "num_tokens": 53619179.0, "step": 44630 }, { "entropy": 1.864050853252411, "epoch": 0.13838009422184058, "grad_norm": 9.210199356079102, "learning_rate": 6.800811428757463e-06, "loss": 0.5562, "mean_token_accuracy": 0.8297410145401954, "num_tokens": 53631637.0, "step": 44640 }, { "entropy": 1.8213204242289067, "epoch": 0.13841109334689028, "grad_norm": 8.101444244384766, "learning_rate": 6.80004980002396e-06, "loss": 0.448, "mean_token_accuracy": 0.833535049855709, "num_tokens": 53644709.0, "step": 44650 }, { "entropy": 1.8359388574957847, "epoch": 0.13844209247193998, "grad_norm": 7.781729221343994, "learning_rate": 6.799288427119529e-06, "loss": 0.4765, "mean_token_accuracy": 0.8498460426926613, "num_tokens": 53656295.0, "step": 44660 }, { "entropy": 1.7105117201805116, "epoch": 0.13847309159698967, "grad_norm": 7.457498073577881, "learning_rate": 6.798527309900985e-06, "loss": 0.3764, "mean_token_accuracy": 0.8544366523623467, "num_tokens": 53670465.0, "step": 44670 }, { "entropy": 1.913690346479416, "epoch": 0.13850409072203937, "grad_norm": 9.645509719848633, "learning_rate": 6.797766448225251e-06, "loss": 0.5731, "mean_token_accuracy": 0.8224819481372834, "num_tokens": 53681032.0, "step": 44680 }, { "entropy": 1.7610852643847466, "epoch": 0.13853508984708907, "grad_norm": 9.441847801208496, "learning_rate": 6.797005841949362e-06, "loss": 0.5023, "mean_token_accuracy": 0.8374917894601822, "num_tokens": 53693660.0, "step": 44690 }, { "entropy": 1.8001830980181694, "epoch": 0.13856608897213876, "grad_norm": 7.770538806915283, "learning_rate": 6.796245490930466e-06, "loss": 0.5021, "mean_token_accuracy": 0.8382193520665169, "num_tokens": 53706447.0, "step": 44700 }, { "entropy": 1.8964716732501983, "epoch": 0.13859708809718846, "grad_norm": 10.237454414367676, "learning_rate": 6.795485395025823e-06, "loss": 0.63, "mean_token_accuracy": 0.8149755626916886, "num_tokens": 53717276.0, "step": 44710 }, { "entropy": 1.8226286932826041, "epoch": 0.13862808722223816, "grad_norm": 8.327227592468262, "learning_rate": 6.794725554092804e-06, "loss": 0.532, "mean_token_accuracy": 0.8249738708138465, "num_tokens": 53729351.0, "step": 44720 }, { "entropy": 1.8385003119707108, "epoch": 0.13865908634728785, "grad_norm": 8.358758926391602, "learning_rate": 6.793965967988893e-06, "loss": 0.5196, "mean_token_accuracy": 0.823200698196888, "num_tokens": 53741541.0, "step": 44730 }, { "entropy": 1.8498780086636544, "epoch": 0.13869008547233755, "grad_norm": 9.434165000915527, "learning_rate": 6.793206636571682e-06, "loss": 0.479, "mean_token_accuracy": 0.8404179245233536, "num_tokens": 53753633.0, "step": 44740 }, { "entropy": 1.7737918436527251, "epoch": 0.13872108459738725, "grad_norm": 9.19772720336914, "learning_rate": 6.792447559698879e-06, "loss": 0.4508, "mean_token_accuracy": 0.8522493004798889, "num_tokens": 53766858.0, "step": 44750 }, { "entropy": 1.928390011191368, "epoch": 0.13875208372243694, "grad_norm": 9.316328048706055, "learning_rate": 6.791688737228301e-06, "loss": 0.5819, "mean_token_accuracy": 0.8271341755986213, "num_tokens": 53777506.0, "step": 44760 }, { "entropy": 1.7646124705672264, "epoch": 0.13878308284748664, "grad_norm": 3.9269862174987793, "learning_rate": 6.790930169017873e-06, "loss": 0.4169, "mean_token_accuracy": 0.8518116131424904, "num_tokens": 53790737.0, "step": 44770 }, { "entropy": 1.8543679133057593, "epoch": 0.13881408197253634, "grad_norm": 4.760611534118652, "learning_rate": 6.790171854925639e-06, "loss": 0.492, "mean_token_accuracy": 0.8325586140155792, "num_tokens": 53803127.0, "step": 44780 }, { "entropy": 1.9045397624373437, "epoch": 0.13884508109758603, "grad_norm": 9.993179321289062, "learning_rate": 6.789413794809746e-06, "loss": 0.5373, "mean_token_accuracy": 0.8328649774193764, "num_tokens": 53814755.0, "step": 44790 }, { "entropy": 1.8256755113601684, "epoch": 0.13887608022263573, "grad_norm": 9.889800071716309, "learning_rate": 6.788655988528456e-06, "loss": 0.4687, "mean_token_accuracy": 0.8507124841213226, "num_tokens": 53827255.0, "step": 44800 }, { "entropy": 1.8313057616353035, "epoch": 0.13890707934768542, "grad_norm": 8.37853717803955, "learning_rate": 6.787898435940142e-06, "loss": 0.5314, "mean_token_accuracy": 0.8321856454014778, "num_tokens": 53838659.0, "step": 44810 }, { "entropy": 1.8516384482383728, "epoch": 0.1389380784727351, "grad_norm": 4.396270751953125, "learning_rate": 6.787141136903286e-06, "loss": 0.4627, "mean_token_accuracy": 0.8430250212550163, "num_tokens": 53851127.0, "step": 44820 }, { "entropy": 1.9117999702692032, "epoch": 0.1389690775977848, "grad_norm": 4.586118698120117, "learning_rate": 6.7863840912764766e-06, "loss": 0.5565, "mean_token_accuracy": 0.8265089154243469, "num_tokens": 53862914.0, "step": 44830 }, { "entropy": 1.956819300353527, "epoch": 0.1390000767228345, "grad_norm": 8.921218872070312, "learning_rate": 6.785627298918424e-06, "loss": 0.6126, "mean_token_accuracy": 0.81765276491642, "num_tokens": 53874681.0, "step": 44840 }, { "entropy": 1.8389661602675915, "epoch": 0.13903107584788418, "grad_norm": 3.82698917388916, "learning_rate": 6.784870759687936e-06, "loss": 0.4488, "mean_token_accuracy": 0.8415364250540733, "num_tokens": 53888519.0, "step": 44850 }, { "entropy": 1.8882953882217408, "epoch": 0.13906207497293388, "grad_norm": 13.667028427124023, "learning_rate": 6.78411447344394e-06, "loss": 0.5228, "mean_token_accuracy": 0.822119127213955, "num_tokens": 53900801.0, "step": 44860 }, { "entropy": 1.8657517284154892, "epoch": 0.13909307409798358, "grad_norm": 8.731566429138184, "learning_rate": 6.783358440045469e-06, "loss": 0.4694, "mean_token_accuracy": 0.8443102672696113, "num_tokens": 53912704.0, "step": 44870 }, { "entropy": 1.898390594124794, "epoch": 0.13912407322303327, "grad_norm": 8.59327507019043, "learning_rate": 6.782602659351665e-06, "loss": 0.5484, "mean_token_accuracy": 0.8349470987915992, "num_tokens": 53924654.0, "step": 44880 }, { "entropy": 1.8541928052902221, "epoch": 0.13915507234808297, "grad_norm": 4.574117660522461, "learning_rate": 6.781847131221781e-06, "loss": 0.5823, "mean_token_accuracy": 0.8248166784644126, "num_tokens": 53936606.0, "step": 44890 }, { "entropy": 1.929851384460926, "epoch": 0.13918607147313267, "grad_norm": 8.305390357971191, "learning_rate": 6.781091855515185e-06, "loss": 0.5662, "mean_token_accuracy": 0.8298721700906754, "num_tokens": 53948064.0, "step": 44900 }, { "entropy": 1.9231369107961656, "epoch": 0.13921707059818236, "grad_norm": 10.014037132263184, "learning_rate": 6.780336832091346e-06, "loss": 0.5509, "mean_token_accuracy": 0.8278407782316208, "num_tokens": 53959163.0, "step": 44910 }, { "entropy": 1.8753797337412834, "epoch": 0.13924806972323206, "grad_norm": 4.650575160980225, "learning_rate": 6.779582060809845e-06, "loss": 0.5268, "mean_token_accuracy": 0.8299560397863388, "num_tokens": 53970669.0, "step": 44920 }, { "entropy": 1.8801968157291413, "epoch": 0.13927906884828176, "grad_norm": 7.466404438018799, "learning_rate": 6.778827541530377e-06, "loss": 0.5665, "mean_token_accuracy": 0.8341574132442474, "num_tokens": 53981942.0, "step": 44930 }, { "entropy": 1.9134157732129098, "epoch": 0.13931006797333145, "grad_norm": 8.176107406616211, "learning_rate": 6.7780732741127416e-06, "loss": 0.5927, "mean_token_accuracy": 0.8287787228822708, "num_tokens": 53993567.0, "step": 44940 }, { "entropy": 1.86323581635952, "epoch": 0.13934106709838115, "grad_norm": 10.628385543823242, "learning_rate": 6.77731925841685e-06, "loss": 0.4713, "mean_token_accuracy": 0.8439363062381744, "num_tokens": 54005593.0, "step": 44950 }, { "entropy": 1.9236777603626252, "epoch": 0.13937206622343085, "grad_norm": 10.189666748046875, "learning_rate": 6.77656549430272e-06, "loss": 0.5409, "mean_token_accuracy": 0.8301165014505386, "num_tokens": 54017353.0, "step": 44960 }, { "entropy": 1.9137274518609046, "epoch": 0.13940306534848054, "grad_norm": 4.35982608795166, "learning_rate": 6.77581198163048e-06, "loss": 0.576, "mean_token_accuracy": 0.8243458449840546, "num_tokens": 54028577.0, "step": 44970 }, { "entropy": 1.881200096011162, "epoch": 0.13943406447353024, "grad_norm": 8.86418342590332, "learning_rate": 6.775058720260368e-06, "loss": 0.6144, "mean_token_accuracy": 0.831314592063427, "num_tokens": 54040844.0, "step": 44980 }, { "entropy": 1.8607298702001571, "epoch": 0.13946506359857994, "grad_norm": 8.576333999633789, "learning_rate": 6.77430571005273e-06, "loss": 0.5247, "mean_token_accuracy": 0.831570616364479, "num_tokens": 54052863.0, "step": 44990 }, { "entropy": 1.9090143859386444, "epoch": 0.13949606272362963, "grad_norm": 4.669756889343262, "learning_rate": 6.7735529508680195e-06, "loss": 0.5629, "mean_token_accuracy": 0.8271263659000396, "num_tokens": 54064516.0, "step": 45000 }, { "entropy": 1.8905669406056405, "epoch": 0.13952706184867933, "grad_norm": 10.381400108337402, "learning_rate": 6.772800442566799e-06, "loss": 0.5365, "mean_token_accuracy": 0.831216785311699, "num_tokens": 54076038.0, "step": 45010 }, { "entropy": 1.7761605516076089, "epoch": 0.13955806097372903, "grad_norm": 2.322737216949463, "learning_rate": 6.772048185009742e-06, "loss": 0.4652, "mean_token_accuracy": 0.8467302456498146, "num_tokens": 54089228.0, "step": 45020 }, { "entropy": 1.9183564558625221, "epoch": 0.13958906009877872, "grad_norm": 3.822122097015381, "learning_rate": 6.771296178057627e-06, "loss": 0.5374, "mean_token_accuracy": 0.8354233950376511, "num_tokens": 54100756.0, "step": 45030 }, { "entropy": 1.89002455919981, "epoch": 0.13962005922382842, "grad_norm": 10.499687194824219, "learning_rate": 6.770544421571341e-06, "loss": 0.5127, "mean_token_accuracy": 0.833263611793518, "num_tokens": 54112772.0, "step": 45040 }, { "entropy": 1.8945647366344929, "epoch": 0.13965105834887812, "grad_norm": 8.45938777923584, "learning_rate": 6.7697929154118806e-06, "loss": 0.5375, "mean_token_accuracy": 0.8283353626728058, "num_tokens": 54124878.0, "step": 45050 }, { "entropy": 1.9062055438756942, "epoch": 0.1396820574739278, "grad_norm": 8.998255729675293, "learning_rate": 6.769041659440348e-06, "loss": 0.5651, "mean_token_accuracy": 0.8295639351010322, "num_tokens": 54136958.0, "step": 45060 }, { "entropy": 1.9289535745978355, "epoch": 0.13971305659897748, "grad_norm": 7.8160505294799805, "learning_rate": 6.768290653517961e-06, "loss": 0.5361, "mean_token_accuracy": 0.8284988284111023, "num_tokens": 54148685.0, "step": 45070 }, { "entropy": 1.8776358231902122, "epoch": 0.13974405572402718, "grad_norm": 10.067404747009277, "learning_rate": 6.767539897506031e-06, "loss": 0.4766, "mean_token_accuracy": 0.8291481390595437, "num_tokens": 54161099.0, "step": 45080 }, { "entropy": 1.8345778226852416, "epoch": 0.13977505484907687, "grad_norm": 9.045985221862793, "learning_rate": 6.766789391265992e-06, "loss": 0.5164, "mean_token_accuracy": 0.8270506203174591, "num_tokens": 54173712.0, "step": 45090 }, { "entropy": 1.9202216163277626, "epoch": 0.13980605397412657, "grad_norm": 8.717995643615723, "learning_rate": 6.7660391346593745e-06, "loss": 0.6067, "mean_token_accuracy": 0.820853091776371, "num_tokens": 54186315.0, "step": 45100 }, { "entropy": 1.9440720230340958, "epoch": 0.13983705309917627, "grad_norm": 7.580258846282959, "learning_rate": 6.765289127547821e-06, "loss": 0.5868, "mean_token_accuracy": 0.8336421042680741, "num_tokens": 54197829.0, "step": 45110 }, { "entropy": 1.9380245164036751, "epoch": 0.13986805222422596, "grad_norm": 9.486980438232422, "learning_rate": 6.764539369793085e-06, "loss": 0.5743, "mean_token_accuracy": 0.818755242228508, "num_tokens": 54209626.0, "step": 45120 }, { "entropy": 1.9459102168679236, "epoch": 0.13989905134927566, "grad_norm": 8.933557510375977, "learning_rate": 6.7637898612570185e-06, "loss": 0.5483, "mean_token_accuracy": 0.8371159300208092, "num_tokens": 54221719.0, "step": 45130 }, { "entropy": 1.7940280467271805, "epoch": 0.13993005047432536, "grad_norm": 10.48863410949707, "learning_rate": 6.7630406018015884e-06, "loss": 0.4218, "mean_token_accuracy": 0.8437355652451515, "num_tokens": 54235073.0, "step": 45140 }, { "entropy": 1.7776363700628282, "epoch": 0.13996104959937505, "grad_norm": 8.434184074401855, "learning_rate": 6.762291591288863e-06, "loss": 0.434, "mean_token_accuracy": 0.8471253350377083, "num_tokens": 54248626.0, "step": 45150 }, { "entropy": 1.8552054047584534, "epoch": 0.13999204872442475, "grad_norm": 7.70085334777832, "learning_rate": 6.761542829581025e-06, "loss": 0.4975, "mean_token_accuracy": 0.8401309624314308, "num_tokens": 54260486.0, "step": 45160 }, { "entropy": 1.856016856431961, "epoch": 0.14002304784947445, "grad_norm": 10.124335289001465, "learning_rate": 6.760794316540352e-06, "loss": 0.5163, "mean_token_accuracy": 0.8326075285673141, "num_tokens": 54273686.0, "step": 45170 }, { "entropy": 1.845085544884205, "epoch": 0.14005404697452414, "grad_norm": 4.437714576721191, "learning_rate": 6.760046052029241e-06, "loss": 0.5034, "mean_token_accuracy": 0.8339051187038422, "num_tokens": 54286613.0, "step": 45180 }, { "entropy": 1.8965241134166717, "epoch": 0.14008504609957384, "grad_norm": 8.724405288696289, "learning_rate": 6.7592980359101864e-06, "loss": 0.5568, "mean_token_accuracy": 0.818605862557888, "num_tokens": 54298537.0, "step": 45190 }, { "entropy": 1.8351117700338364, "epoch": 0.14011604522462354, "grad_norm": 2.729274034500122, "learning_rate": 6.758550268045797e-06, "loss": 0.5117, "mean_token_accuracy": 0.8237488672137261, "num_tokens": 54312345.0, "step": 45200 }, { "entropy": 1.8630397498607636, "epoch": 0.14014704434967323, "grad_norm": 8.679328918457031, "learning_rate": 6.757802748298778e-06, "loss": 0.4905, "mean_token_accuracy": 0.8270406112074852, "num_tokens": 54324911.0, "step": 45210 }, { "entropy": 1.8825726188719272, "epoch": 0.14017804347472293, "grad_norm": 4.007488250732422, "learning_rate": 6.757055476531949e-06, "loss": 0.519, "mean_token_accuracy": 0.8377444759011269, "num_tokens": 54337224.0, "step": 45220 }, { "entropy": 1.978707179427147, "epoch": 0.14020904259977263, "grad_norm": 8.847635269165039, "learning_rate": 6.756308452608234e-06, "loss": 0.5984, "mean_token_accuracy": 0.8280522257089615, "num_tokens": 54348233.0, "step": 45230 }, { "entropy": 1.9876850560307502, "epoch": 0.14024004172482232, "grad_norm": 6.580830097198486, "learning_rate": 6.755561676390661e-06, "loss": 0.5664, "mean_token_accuracy": 0.8235482305288315, "num_tokens": 54359366.0, "step": 45240 }, { "entropy": 1.8557663679122924, "epoch": 0.14027104084987202, "grad_norm": 8.015181541442871, "learning_rate": 6.754815147742368e-06, "loss": 0.5147, "mean_token_accuracy": 0.8315120905637741, "num_tokens": 54372683.0, "step": 45250 }, { "entropy": 1.9107207596302032, "epoch": 0.14030203997492172, "grad_norm": 7.833492279052734, "learning_rate": 6.754068866526591e-06, "loss": 0.5812, "mean_token_accuracy": 0.8162291869521141, "num_tokens": 54384271.0, "step": 45260 }, { "entropy": 1.8534480392932893, "epoch": 0.1403330390999714, "grad_norm": 8.591071128845215, "learning_rate": 6.753322832606681e-06, "loss": 0.5102, "mean_token_accuracy": 0.8405616924166679, "num_tokens": 54396228.0, "step": 45270 }, { "entropy": 1.9540248334407806, "epoch": 0.1403640382250211, "grad_norm": 8.374753952026367, "learning_rate": 6.752577045846086e-06, "loss": 0.5839, "mean_token_accuracy": 0.8295043364167214, "num_tokens": 54407440.0, "step": 45280 }, { "entropy": 1.8010128349065782, "epoch": 0.1403950373500708, "grad_norm": 8.82718276977539, "learning_rate": 6.7518315061083694e-06, "loss": 0.4518, "mean_token_accuracy": 0.8495183497667312, "num_tokens": 54420654.0, "step": 45290 }, { "entropy": 1.9086335971951485, "epoch": 0.1404260364751205, "grad_norm": 9.302217483520508, "learning_rate": 6.751086213257192e-06, "loss": 0.5101, "mean_token_accuracy": 0.8401185050606728, "num_tokens": 54432469.0, "step": 45300 }, { "entropy": 1.84428121894598, "epoch": 0.14045703560017017, "grad_norm": 8.570785522460938, "learning_rate": 6.750341167156322e-06, "loss": 0.4681, "mean_token_accuracy": 0.840717563033104, "num_tokens": 54444860.0, "step": 45310 }, { "entropy": 1.890746709704399, "epoch": 0.14048803472521987, "grad_norm": 4.786993980407715, "learning_rate": 6.749596367669633e-06, "loss": 0.5198, "mean_token_accuracy": 0.8295010164380073, "num_tokens": 54456895.0, "step": 45320 }, { "entropy": 1.9245510414242744, "epoch": 0.14051903385026956, "grad_norm": 10.287075996398926, "learning_rate": 6.748851814661106e-06, "loss": 0.5775, "mean_token_accuracy": 0.8316981479525566, "num_tokens": 54469287.0, "step": 45330 }, { "entropy": 1.918796244263649, "epoch": 0.14055003297531926, "grad_norm": 7.518104553222656, "learning_rate": 6.748107507994823e-06, "loss": 0.5142, "mean_token_accuracy": 0.8400950893759728, "num_tokens": 54481147.0, "step": 45340 }, { "entropy": 1.878979854285717, "epoch": 0.14058103210036896, "grad_norm": 4.404152870178223, "learning_rate": 6.747363447534975e-06, "loss": 0.5295, "mean_token_accuracy": 0.8336639732122422, "num_tokens": 54492785.0, "step": 45350 }, { "entropy": 1.8506067097187042, "epoch": 0.14061203122541865, "grad_norm": 9.694655418395996, "learning_rate": 6.746619633145854e-06, "loss": 0.5202, "mean_token_accuracy": 0.831046088039875, "num_tokens": 54505486.0, "step": 45360 }, { "entropy": 1.851573745906353, "epoch": 0.14064303035046835, "grad_norm": 8.404928207397461, "learning_rate": 6.745876064691858e-06, "loss": 0.4996, "mean_token_accuracy": 0.8353975236415863, "num_tokens": 54518430.0, "step": 45370 }, { "entropy": 1.8510615780949593, "epoch": 0.14067402947551805, "grad_norm": 6.233858108520508, "learning_rate": 6.745132742037491e-06, "loss": 0.5096, "mean_token_accuracy": 0.8379595011472702, "num_tokens": 54530823.0, "step": 45380 }, { "entropy": 1.8410978406667708, "epoch": 0.14070502860056774, "grad_norm": 7.378570556640625, "learning_rate": 6.744389665047362e-06, "loss": 0.4424, "mean_token_accuracy": 0.8501470074057579, "num_tokens": 54543450.0, "step": 45390 }, { "entropy": 1.811590349674225, "epoch": 0.14073602772561744, "grad_norm": 9.43700122833252, "learning_rate": 6.743646833586182e-06, "loss": 0.4649, "mean_token_accuracy": 0.8426756337285042, "num_tokens": 54556418.0, "step": 45400 }, { "entropy": 1.8802534580230712, "epoch": 0.14076702685066714, "grad_norm": 8.894246101379395, "learning_rate": 6.742904247518765e-06, "loss": 0.5336, "mean_token_accuracy": 0.831374479830265, "num_tokens": 54568516.0, "step": 45410 }, { "entropy": 1.9234187602996826, "epoch": 0.14079802597571683, "grad_norm": 8.303701400756836, "learning_rate": 6.742161906710033e-06, "loss": 0.5486, "mean_token_accuracy": 0.8301004365086555, "num_tokens": 54580154.0, "step": 45420 }, { "entropy": 1.854028156399727, "epoch": 0.14082902510076653, "grad_norm": 6.356640815734863, "learning_rate": 6.741419811025011e-06, "loss": 0.5102, "mean_token_accuracy": 0.8381151556968689, "num_tokens": 54592717.0, "step": 45430 }, { "entropy": 1.9734964907169341, "epoch": 0.14086002422581623, "grad_norm": 10.328083038330078, "learning_rate": 6.740677960328828e-06, "loss": 0.5915, "mean_token_accuracy": 0.8209626346826553, "num_tokens": 54603430.0, "step": 45440 }, { "entropy": 1.973678994178772, "epoch": 0.14089102335086592, "grad_norm": 9.18193531036377, "learning_rate": 6.739936354486713e-06, "loss": 0.6025, "mean_token_accuracy": 0.8171725884079933, "num_tokens": 54614206.0, "step": 45450 }, { "entropy": 1.9304319381713868, "epoch": 0.14092202247591562, "grad_norm": 9.106557846069336, "learning_rate": 6.7391949933640045e-06, "loss": 0.5131, "mean_token_accuracy": 0.8340640947222709, "num_tokens": 54626347.0, "step": 45460 }, { "entropy": 1.931530450284481, "epoch": 0.14095302160096532, "grad_norm": 10.796926498413086, "learning_rate": 6.738453876826143e-06, "loss": 0.561, "mean_token_accuracy": 0.8309638902544976, "num_tokens": 54637794.0, "step": 45470 }, { "entropy": 1.9570015415549278, "epoch": 0.140984020726015, "grad_norm": 11.423550605773926, "learning_rate": 6.7377130047386695e-06, "loss": 0.5831, "mean_token_accuracy": 0.820400333404541, "num_tokens": 54648619.0, "step": 45480 }, { "entropy": 1.9563427597284317, "epoch": 0.1410150198510647, "grad_norm": 7.9254560470581055, "learning_rate": 6.7369723769672335e-06, "loss": 0.5795, "mean_token_accuracy": 0.8188218504190445, "num_tokens": 54659912.0, "step": 45490 }, { "entropy": 1.9148031935095786, "epoch": 0.1410460189761144, "grad_norm": 7.281384468078613, "learning_rate": 6.736231993377581e-06, "loss": 0.5117, "mean_token_accuracy": 0.8334671080112457, "num_tokens": 54671882.0, "step": 45500 }, { "entropy": 1.9391163542866707, "epoch": 0.1410770181011641, "grad_norm": 9.63330078125, "learning_rate": 6.735491853835571e-06, "loss": 0.5381, "mean_token_accuracy": 0.8372703313827514, "num_tokens": 54682803.0, "step": 45510 }, { "entropy": 1.8276369392871856, "epoch": 0.1411080172262138, "grad_norm": 8.537032127380371, "learning_rate": 6.734751958207155e-06, "loss": 0.5093, "mean_token_accuracy": 0.8435697957873345, "num_tokens": 54695679.0, "step": 45520 }, { "entropy": 1.8639580070972444, "epoch": 0.1411390163512635, "grad_norm": 7.994620323181152, "learning_rate": 6.7340123063583955e-06, "loss": 0.5023, "mean_token_accuracy": 0.8325873509049415, "num_tokens": 54708838.0, "step": 45530 }, { "entropy": 1.8180058419704437, "epoch": 0.1411700154763132, "grad_norm": 4.438790321350098, "learning_rate": 6.733272898155452e-06, "loss": 0.4698, "mean_token_accuracy": 0.8409150257706642, "num_tokens": 54721440.0, "step": 45540 }, { "entropy": 1.8847099259495734, "epoch": 0.1412010146013629, "grad_norm": 10.436013221740723, "learning_rate": 6.732533733464593e-06, "loss": 0.58, "mean_token_accuracy": 0.8249880090355873, "num_tokens": 54733585.0, "step": 45550 }, { "entropy": 1.913513371348381, "epoch": 0.14123201372641256, "grad_norm": 6.460671901702881, "learning_rate": 6.731794812152185e-06, "loss": 0.5792, "mean_token_accuracy": 0.8275411754846573, "num_tokens": 54744839.0, "step": 45560 }, { "entropy": 1.9194407686591148, "epoch": 0.14126301285146226, "grad_norm": 10.074481964111328, "learning_rate": 6.7310561340847e-06, "loss": 0.6015, "mean_token_accuracy": 0.8203522220253945, "num_tokens": 54755728.0, "step": 45570 }, { "entropy": 1.975143238902092, "epoch": 0.14129401197651195, "grad_norm": 7.195339679718018, "learning_rate": 6.73031769912871e-06, "loss": 0.6573, "mean_token_accuracy": 0.8187574937939643, "num_tokens": 54767221.0, "step": 45580 }, { "entropy": 1.9089319556951523, "epoch": 0.14132501110156165, "grad_norm": 9.949457168579102, "learning_rate": 6.729579507150891e-06, "loss": 0.5393, "mean_token_accuracy": 0.8359342351555824, "num_tokens": 54778850.0, "step": 45590 }, { "entropy": 1.9323441043496132, "epoch": 0.14135601022661134, "grad_norm": 4.49478816986084, "learning_rate": 6.728841558018021e-06, "loss": 0.5635, "mean_token_accuracy": 0.8376797735691071, "num_tokens": 54790559.0, "step": 45600 }, { "entropy": 1.9415909707546235, "epoch": 0.14138700935166104, "grad_norm": 8.446100234985352, "learning_rate": 6.72810385159698e-06, "loss": 0.5561, "mean_token_accuracy": 0.8303654089570045, "num_tokens": 54802533.0, "step": 45610 }, { "entropy": 1.917179961502552, "epoch": 0.14141800847671074, "grad_norm": 3.9077000617980957, "learning_rate": 6.7273663877547516e-06, "loss": 0.5538, "mean_token_accuracy": 0.8232975766062737, "num_tokens": 54814772.0, "step": 45620 }, { "entropy": 1.8237308233976364, "epoch": 0.14144900760176043, "grad_norm": 11.04135799407959, "learning_rate": 6.726629166358418e-06, "loss": 0.4777, "mean_token_accuracy": 0.8421555116772652, "num_tokens": 54827998.0, "step": 45630 }, { "entropy": 1.876150907576084, "epoch": 0.14148000672681013, "grad_norm": 6.673070907592773, "learning_rate": 6.725892187275168e-06, "loss": 0.4424, "mean_token_accuracy": 0.845268502831459, "num_tokens": 54840719.0, "step": 45640 }, { "entropy": 1.8938727974891663, "epoch": 0.14151100585185983, "grad_norm": 4.026385307312012, "learning_rate": 6.725155450372289e-06, "loss": 0.5532, "mean_token_accuracy": 0.8300805896520614, "num_tokens": 54852189.0, "step": 45650 }, { "entropy": 1.8416188895702361, "epoch": 0.14154200497690952, "grad_norm": 8.692848205566406, "learning_rate": 6.724418955517171e-06, "loss": 0.5154, "mean_token_accuracy": 0.8287687584757805, "num_tokens": 54864719.0, "step": 45660 }, { "entropy": 1.9456066131591796, "epoch": 0.14157300410195922, "grad_norm": 9.043981552124023, "learning_rate": 6.723682702577305e-06, "loss": 0.6013, "mean_token_accuracy": 0.8262223243713379, "num_tokens": 54876153.0, "step": 45670 }, { "entropy": 1.8887315690517426, "epoch": 0.14160400322700892, "grad_norm": 8.423904418945312, "learning_rate": 6.7229466914202864e-06, "loss": 0.4768, "mean_token_accuracy": 0.851503835618496, "num_tokens": 54888235.0, "step": 45680 }, { "entropy": 1.862038327753544, "epoch": 0.14163500235205861, "grad_norm": 3.738830804824829, "learning_rate": 6.722210921913808e-06, "loss": 0.4949, "mean_token_accuracy": 0.8433087572455407, "num_tokens": 54900600.0, "step": 45690 }, { "entropy": 1.9590093523263932, "epoch": 0.1416660014771083, "grad_norm": 4.451807498931885, "learning_rate": 6.721475393925665e-06, "loss": 0.5955, "mean_token_accuracy": 0.8190687134861946, "num_tokens": 54911971.0, "step": 45700 }, { "entropy": 1.8686208486557008, "epoch": 0.141697000602158, "grad_norm": 9.367164611816406, "learning_rate": 6.720740107323755e-06, "loss": 0.511, "mean_token_accuracy": 0.8351000919938087, "num_tokens": 54924188.0, "step": 45710 }, { "entropy": 1.9188775599002839, "epoch": 0.1417279997272077, "grad_norm": 10.714550018310547, "learning_rate": 6.720005061976077e-06, "loss": 0.5598, "mean_token_accuracy": 0.8303559988737106, "num_tokens": 54935495.0, "step": 45720 }, { "entropy": 1.8970612213015556, "epoch": 0.1417589988522574, "grad_norm": 4.424524307250977, "learning_rate": 6.7192702577507306e-06, "loss": 0.5181, "mean_token_accuracy": 0.8270143359899521, "num_tokens": 54948599.0, "step": 45730 }, { "entropy": 1.9638367608189582, "epoch": 0.1417899979773071, "grad_norm": 8.639518737792969, "learning_rate": 6.718535694515915e-06, "loss": 0.5418, "mean_token_accuracy": 0.8268617764115334, "num_tokens": 54960096.0, "step": 45740 }, { "entropy": 1.920237709581852, "epoch": 0.1418209971023568, "grad_norm": 8.430821418762207, "learning_rate": 6.717801372139931e-06, "loss": 0.5459, "mean_token_accuracy": 0.8220153465867043, "num_tokens": 54972068.0, "step": 45750 }, { "entropy": 1.7880508966743947, "epoch": 0.1418519962274065, "grad_norm": 4.553556442260742, "learning_rate": 6.717067290491183e-06, "loss": 0.4357, "mean_token_accuracy": 0.8418186247348786, "num_tokens": 54985926.0, "step": 45760 }, { "entropy": 1.9695019006729126, "epoch": 0.1418829953524562, "grad_norm": 8.724945068359375, "learning_rate": 6.716333449438172e-06, "loss": 0.5232, "mean_token_accuracy": 0.8376459792256356, "num_tokens": 54996931.0, "step": 45770 }, { "entropy": 1.8776928693056107, "epoch": 0.14191399447750588, "grad_norm": 4.291085720062256, "learning_rate": 6.715599848849499e-06, "loss": 0.5726, "mean_token_accuracy": 0.822805991768837, "num_tokens": 55009212.0, "step": 45780 }, { "entropy": 1.9443933725357057, "epoch": 0.14194499360255558, "grad_norm": 8.464522361755371, "learning_rate": 6.714866488593871e-06, "loss": 0.5886, "mean_token_accuracy": 0.8179412320256233, "num_tokens": 55019971.0, "step": 45790 }, { "entropy": 1.8963981166481971, "epoch": 0.14197599272760528, "grad_norm": 8.489872932434082, "learning_rate": 6.714133368540089e-06, "loss": 0.5414, "mean_token_accuracy": 0.830699360370636, "num_tokens": 55030869.0, "step": 45800 }, { "entropy": 1.8366913974285126, "epoch": 0.14200699185265495, "grad_norm": 4.7726216316223145, "learning_rate": 6.713400488557057e-06, "loss": 0.4756, "mean_token_accuracy": 0.8389330074191094, "num_tokens": 55043876.0, "step": 45810 }, { "entropy": 1.8235629141330718, "epoch": 0.14203799097770464, "grad_norm": 4.734610080718994, "learning_rate": 6.712667848513782e-06, "loss": 0.4653, "mean_token_accuracy": 0.8486215993762016, "num_tokens": 55056283.0, "step": 45820 }, { "entropy": 1.7981175884604454, "epoch": 0.14206899010275434, "grad_norm": 9.670796394348145, "learning_rate": 6.711935448279365e-06, "loss": 0.465, "mean_token_accuracy": 0.8392187222838402, "num_tokens": 55069610.0, "step": 45830 }, { "entropy": 1.8424267813563346, "epoch": 0.14209998922780404, "grad_norm": 3.722155809402466, "learning_rate": 6.711203287723014e-06, "loss": 0.4865, "mean_token_accuracy": 0.8339389190077782, "num_tokens": 55083021.0, "step": 45840 }, { "entropy": 1.877331507205963, "epoch": 0.14213098835285373, "grad_norm": 9.001885414123535, "learning_rate": 6.7104713667140285e-06, "loss": 0.5586, "mean_token_accuracy": 0.8358711794018745, "num_tokens": 55095426.0, "step": 45850 }, { "entropy": 1.9491191014647484, "epoch": 0.14216198747790343, "grad_norm": 4.518219470977783, "learning_rate": 6.709739685121816e-06, "loss": 0.6017, "mean_token_accuracy": 0.8208028897643089, "num_tokens": 55106699.0, "step": 45860 }, { "entropy": 1.8255974546074867, "epoch": 0.14219298660295313, "grad_norm": 8.47142505645752, "learning_rate": 6.70900824281588e-06, "loss": 0.4659, "mean_token_accuracy": 0.8375585183501244, "num_tokens": 55119196.0, "step": 45870 }, { "entropy": 1.844706942141056, "epoch": 0.14222398572800282, "grad_norm": 3.8983564376831055, "learning_rate": 6.70827703966582e-06, "loss": 0.5071, "mean_token_accuracy": 0.8367140933871269, "num_tokens": 55132280.0, "step": 45880 }, { "entropy": 1.8609853073954583, "epoch": 0.14225498485305252, "grad_norm": 4.357093811035156, "learning_rate": 6.707546075541341e-06, "loss": 0.4997, "mean_token_accuracy": 0.8403833657503128, "num_tokens": 55144004.0, "step": 45890 }, { "entropy": 1.9344159916043282, "epoch": 0.14228598397810222, "grad_norm": 8.425995826721191, "learning_rate": 6.706815350312245e-06, "loss": 0.5469, "mean_token_accuracy": 0.8343609884381294, "num_tokens": 55155185.0, "step": 45900 }, { "entropy": 1.9265785560011863, "epoch": 0.1423169831031519, "grad_norm": 8.693990707397461, "learning_rate": 6.706084863848432e-06, "loss": 0.5486, "mean_token_accuracy": 0.8264458447694778, "num_tokens": 55166299.0, "step": 45910 }, { "entropy": 1.9623035162687301, "epoch": 0.1423479822282016, "grad_norm": 7.973003387451172, "learning_rate": 6.705354616019903e-06, "loss": 0.589, "mean_token_accuracy": 0.8248161807656288, "num_tokens": 55177652.0, "step": 45920 }, { "entropy": 1.7614901930093765, "epoch": 0.1423789813532513, "grad_norm": 10.886859893798828, "learning_rate": 6.704624606696758e-06, "loss": 0.4666, "mean_token_accuracy": 0.8515376761555672, "num_tokens": 55191188.0, "step": 45930 }, { "entropy": 1.9051169365644456, "epoch": 0.142409980478301, "grad_norm": 9.506697654724121, "learning_rate": 6.7038948357491925e-06, "loss": 0.6262, "mean_token_accuracy": 0.8160225167870522, "num_tokens": 55202436.0, "step": 45940 }, { "entropy": 1.8652933433651924, "epoch": 0.1424409796033507, "grad_norm": 8.896991729736328, "learning_rate": 6.703165303047507e-06, "loss": 0.5543, "mean_token_accuracy": 0.8282453447580338, "num_tokens": 55213975.0, "step": 45950 }, { "entropy": 1.8330632865428924, "epoch": 0.1424719787284004, "grad_norm": 10.671113967895508, "learning_rate": 6.702436008462098e-06, "loss": 0.4907, "mean_token_accuracy": 0.8493910744786263, "num_tokens": 55226050.0, "step": 45960 }, { "entropy": 1.9106391370296478, "epoch": 0.1425029778534501, "grad_norm": 10.167787551879883, "learning_rate": 6.701706951863456e-06, "loss": 0.5872, "mean_token_accuracy": 0.8183905243873596, "num_tokens": 55237459.0, "step": 45970 }, { "entropy": 1.898222027719021, "epoch": 0.1425339769784998, "grad_norm": 9.797660827636719, "learning_rate": 6.700978133122177e-06, "loss": 0.5341, "mean_token_accuracy": 0.8333139047026634, "num_tokens": 55249387.0, "step": 45980 }, { "entropy": 1.9070947885513305, "epoch": 0.14256497610354948, "grad_norm": 9.02385139465332, "learning_rate": 6.700249552108953e-06, "loss": 0.5671, "mean_token_accuracy": 0.8196926236152648, "num_tokens": 55260722.0, "step": 45990 }, { "entropy": 1.8567352384328841, "epoch": 0.14259597522859918, "grad_norm": 4.3364458084106445, "learning_rate": 6.699521208694573e-06, "loss": 0.5167, "mean_token_accuracy": 0.8253255099058151, "num_tokens": 55273316.0, "step": 46000 }, { "entropy": 1.9003437846899032, "epoch": 0.14262697435364888, "grad_norm": 10.944622993469238, "learning_rate": 6.6987931027499264e-06, "loss": 0.5584, "mean_token_accuracy": 0.8321839943528175, "num_tokens": 55284892.0, "step": 46010 }, { "entropy": 1.9338586196303367, "epoch": 0.14265797347869857, "grad_norm": 9.371607780456543, "learning_rate": 6.698065234146e-06, "loss": 0.6125, "mean_token_accuracy": 0.8222884982824326, "num_tokens": 55296243.0, "step": 46020 }, { "entropy": 1.773510305583477, "epoch": 0.14268897260374827, "grad_norm": 4.4485602378845215, "learning_rate": 6.697337602753876e-06, "loss": 0.4525, "mean_token_accuracy": 0.8317829132080078, "num_tokens": 55310357.0, "step": 46030 }, { "entropy": 1.794416318833828, "epoch": 0.14271997172879797, "grad_norm": 8.086502075195312, "learning_rate": 6.696610208444741e-06, "loss": 0.4673, "mean_token_accuracy": 0.8388103753328323, "num_tokens": 55323254.0, "step": 46040 }, { "entropy": 1.8933477729558945, "epoch": 0.14275097085384764, "grad_norm": 4.526361465454102, "learning_rate": 6.695883051089873e-06, "loss": 0.5491, "mean_token_accuracy": 0.8254470497369766, "num_tokens": 55335499.0, "step": 46050 }, { "entropy": 1.8915483728051186, "epoch": 0.14278196997889733, "grad_norm": 8.898006439208984, "learning_rate": 6.695156130560652e-06, "loss": 0.5882, "mean_token_accuracy": 0.8290236473083497, "num_tokens": 55347737.0, "step": 46060 }, { "entropy": 1.8370354726910592, "epoch": 0.14281296910394703, "grad_norm": 8.612515449523926, "learning_rate": 6.694429446728551e-06, "loss": 0.4643, "mean_token_accuracy": 0.8437874168157578, "num_tokens": 55360168.0, "step": 46070 }, { "entropy": 1.8509749799966813, "epoch": 0.14284396822899673, "grad_norm": 8.452802658081055, "learning_rate": 6.6937029994651485e-06, "loss": 0.511, "mean_token_accuracy": 0.8318821370601654, "num_tokens": 55372897.0, "step": 46080 }, { "entropy": 1.819749604165554, "epoch": 0.14287496735404642, "grad_norm": 8.721274375915527, "learning_rate": 6.692976788642114e-06, "loss": 0.4727, "mean_token_accuracy": 0.8412142008543014, "num_tokens": 55385814.0, "step": 46090 }, { "entropy": 1.9298714756965638, "epoch": 0.14290596647909612, "grad_norm": 7.0825700759887695, "learning_rate": 6.692250814131215e-06, "loss": 0.5796, "mean_token_accuracy": 0.8247520595788955, "num_tokens": 55397111.0, "step": 46100 }, { "entropy": 1.9013961970806121, "epoch": 0.14293696560414582, "grad_norm": 10.107673645019531, "learning_rate": 6.691525075804319e-06, "loss": 0.5349, "mean_token_accuracy": 0.830139285326004, "num_tokens": 55408213.0, "step": 46110 }, { "entropy": 1.9001147076487541, "epoch": 0.1429679647291955, "grad_norm": 8.269566535949707, "learning_rate": 6.690799573533387e-06, "loss": 0.5766, "mean_token_accuracy": 0.8280939370393753, "num_tokens": 55419892.0, "step": 46120 }, { "entropy": 1.9132464528083801, "epoch": 0.1429989638542452, "grad_norm": 8.161412239074707, "learning_rate": 6.690074307190485e-06, "loss": 0.5669, "mean_token_accuracy": 0.825434684753418, "num_tokens": 55430279.0, "step": 46130 }, { "entropy": 1.8425243273377419, "epoch": 0.1430299629792949, "grad_norm": 4.587552070617676, "learning_rate": 6.689349276647765e-06, "loss": 0.4945, "mean_token_accuracy": 0.8376808211207389, "num_tokens": 55441827.0, "step": 46140 }, { "entropy": 1.946098005771637, "epoch": 0.1430609621043446, "grad_norm": 11.747321128845215, "learning_rate": 6.688624481777485e-06, "loss": 0.5897, "mean_token_accuracy": 0.8218299329280854, "num_tokens": 55452623.0, "step": 46150 }, { "entropy": 1.8413902148604393, "epoch": 0.1430919612293943, "grad_norm": 8.52793025970459, "learning_rate": 6.687899922451993e-06, "loss": 0.509, "mean_token_accuracy": 0.839706726372242, "num_tokens": 55464897.0, "step": 46160 }, { "entropy": 1.88809677362442, "epoch": 0.143122960354444, "grad_norm": 8.712146759033203, "learning_rate": 6.6871755985437425e-06, "loss": 0.5336, "mean_token_accuracy": 0.826761020720005, "num_tokens": 55476274.0, "step": 46170 }, { "entropy": 1.8601938232779502, "epoch": 0.1431539594794937, "grad_norm": 4.302032947540283, "learning_rate": 6.686451509925272e-06, "loss": 0.5246, "mean_token_accuracy": 0.8418595373630524, "num_tokens": 55487836.0, "step": 46180 }, { "entropy": 1.7940224602818489, "epoch": 0.1431849586045434, "grad_norm": 11.997045516967773, "learning_rate": 6.685727656469229e-06, "loss": 0.451, "mean_token_accuracy": 0.8457210063934326, "num_tokens": 55499979.0, "step": 46190 }, { "entropy": 1.8284522131085397, "epoch": 0.14321595772959309, "grad_norm": 8.663432121276855, "learning_rate": 6.685004038048349e-06, "loss": 0.4899, "mean_token_accuracy": 0.8391269639134407, "num_tokens": 55512352.0, "step": 46200 }, { "entropy": 1.8878327459096909, "epoch": 0.14324695685464278, "grad_norm": 8.888419151306152, "learning_rate": 6.684280654535462e-06, "loss": 0.5389, "mean_token_accuracy": 0.8362879782915116, "num_tokens": 55523451.0, "step": 46210 }, { "entropy": 1.8454574525356293, "epoch": 0.14327795597969248, "grad_norm": 8.546029090881348, "learning_rate": 6.683557505803507e-06, "loss": 0.5064, "mean_token_accuracy": 0.8269944965839386, "num_tokens": 55535264.0, "step": 46220 }, { "entropy": 1.8240334704518317, "epoch": 0.14330895510474218, "grad_norm": 4.8377885818481445, "learning_rate": 6.6828345917255045e-06, "loss": 0.498, "mean_token_accuracy": 0.8359172835946083, "num_tokens": 55547756.0, "step": 46230 }, { "entropy": 1.9085311383008956, "epoch": 0.14333995422979187, "grad_norm": 9.510826110839844, "learning_rate": 6.682111912174579e-06, "loss": 0.6108, "mean_token_accuracy": 0.828006249666214, "num_tokens": 55559998.0, "step": 46240 }, { "entropy": 1.8106452487409115, "epoch": 0.14337095335484157, "grad_norm": 9.17758846282959, "learning_rate": 6.681389467023951e-06, "loss": 0.4858, "mean_token_accuracy": 0.8463340416550637, "num_tokens": 55573054.0, "step": 46250 }, { "entropy": 1.8135211139917373, "epoch": 0.14340195247989126, "grad_norm": 8.731499671936035, "learning_rate": 6.680667256146936e-06, "loss": 0.4866, "mean_token_accuracy": 0.8303871005773544, "num_tokens": 55586890.0, "step": 46260 }, { "entropy": 1.940975472331047, "epoch": 0.14343295160494096, "grad_norm": 7.670884132385254, "learning_rate": 6.679945279416942e-06, "loss": 0.5781, "mean_token_accuracy": 0.8210515171289444, "num_tokens": 55598136.0, "step": 46270 }, { "entropy": 1.8406999617815019, "epoch": 0.14346395072999066, "grad_norm": 8.801468849182129, "learning_rate": 6.679223536707477e-06, "loss": 0.499, "mean_token_accuracy": 0.8413797855377197, "num_tokens": 55610122.0, "step": 46280 }, { "entropy": 1.8838975965976714, "epoch": 0.14349494985504035, "grad_norm": 7.5125908851623535, "learning_rate": 6.678502027892142e-06, "loss": 0.502, "mean_token_accuracy": 0.84541737139225, "num_tokens": 55621817.0, "step": 46290 }, { "entropy": 1.8918793559074403, "epoch": 0.14352594898009002, "grad_norm": 9.769289016723633, "learning_rate": 6.677780752844637e-06, "loss": 0.5443, "mean_token_accuracy": 0.8287898004055023, "num_tokens": 55633088.0, "step": 46300 }, { "entropy": 1.82922722697258, "epoch": 0.14355694810513972, "grad_norm": 10.320469856262207, "learning_rate": 6.677059711438752e-06, "loss": 0.5303, "mean_token_accuracy": 0.8273350238800049, "num_tokens": 55644796.0, "step": 46310 }, { "entropy": 1.8213071301579475, "epoch": 0.14358794723018942, "grad_norm": 8.950617790222168, "learning_rate": 6.676338903548379e-06, "loss": 0.4821, "mean_token_accuracy": 0.8466832935810089, "num_tokens": 55656040.0, "step": 46320 }, { "entropy": 1.7985667988657952, "epoch": 0.1436189463552391, "grad_norm": 11.022417068481445, "learning_rate": 6.675618329047501e-06, "loss": 0.485, "mean_token_accuracy": 0.8403051659464836, "num_tokens": 55668863.0, "step": 46330 }, { "entropy": 1.807174201309681, "epoch": 0.1436499454802888, "grad_norm": 11.001171112060547, "learning_rate": 6.674897987810195e-06, "loss": 0.5059, "mean_token_accuracy": 0.8363948926329613, "num_tokens": 55680958.0, "step": 46340 }, { "entropy": 1.8860581666231155, "epoch": 0.1436809446053385, "grad_norm": 9.26038646697998, "learning_rate": 6.674177879710637e-06, "loss": 0.5345, "mean_token_accuracy": 0.8372591659426689, "num_tokens": 55692194.0, "step": 46350 }, { "entropy": 1.8062124118208884, "epoch": 0.1437119437303882, "grad_norm": 9.093782424926758, "learning_rate": 6.6734580046230955e-06, "loss": 0.5286, "mean_token_accuracy": 0.8431164383888244, "num_tokens": 55704710.0, "step": 46360 }, { "entropy": 1.8013732418417931, "epoch": 0.1437429428554379, "grad_norm": 9.56591796875, "learning_rate": 6.672738362421936e-06, "loss": 0.4691, "mean_token_accuracy": 0.8444247037172318, "num_tokens": 55716737.0, "step": 46370 }, { "entropy": 1.8738340884447098, "epoch": 0.1437739419804876, "grad_norm": 9.894036293029785, "learning_rate": 6.672018952981613e-06, "loss": 0.5498, "mean_token_accuracy": 0.8256061196327209, "num_tokens": 55727558.0, "step": 46380 }, { "entropy": 1.9396587938070298, "epoch": 0.1438049411055373, "grad_norm": 8.471114158630371, "learning_rate": 6.671299776176685e-06, "loss": 0.6301, "mean_token_accuracy": 0.8130400836467743, "num_tokens": 55738367.0, "step": 46390 }, { "entropy": 1.8212429881095886, "epoch": 0.143835940230587, "grad_norm": 6.425068378448486, "learning_rate": 6.6705808318817975e-06, "loss": 0.5034, "mean_token_accuracy": 0.8256319522857666, "num_tokens": 55751238.0, "step": 46400 }, { "entropy": 1.8310098990797996, "epoch": 0.14386693935563669, "grad_norm": 4.085906982421875, "learning_rate": 6.669862119971694e-06, "loss": 0.5558, "mean_token_accuracy": 0.8291790023446083, "num_tokens": 55763570.0, "step": 46410 }, { "entropy": 1.808751115947962, "epoch": 0.14389793848068638, "grad_norm": 10.913744926452637, "learning_rate": 6.669143640321213e-06, "loss": 0.5168, "mean_token_accuracy": 0.8259348452091217, "num_tokens": 55776144.0, "step": 46420 }, { "entropy": 1.8655160009860992, "epoch": 0.14392893760573608, "grad_norm": 11.0983247756958, "learning_rate": 6.668425392805282e-06, "loss": 0.5305, "mean_token_accuracy": 0.8399639427661896, "num_tokens": 55787889.0, "step": 46430 }, { "entropy": 1.863385981321335, "epoch": 0.14395993673078578, "grad_norm": 8.552124977111816, "learning_rate": 6.667707377298932e-06, "loss": 0.513, "mean_token_accuracy": 0.8352120831608772, "num_tokens": 55800483.0, "step": 46440 }, { "entropy": 1.9188006550073624, "epoch": 0.14399093585583547, "grad_norm": 10.365525245666504, "learning_rate": 6.66698959367728e-06, "loss": 0.6035, "mean_token_accuracy": 0.8209626540541649, "num_tokens": 55811326.0, "step": 46450 }, { "entropy": 1.9086585596203804, "epoch": 0.14402193498088517, "grad_norm": 11.863431930541992, "learning_rate": 6.666272041815539e-06, "loss": 0.5813, "mean_token_accuracy": 0.8237880125641823, "num_tokens": 55823396.0, "step": 46460 }, { "entropy": 1.8327384784817695, "epoch": 0.14405293410593487, "grad_norm": 7.275395393371582, "learning_rate": 6.66555472158902e-06, "loss": 0.4894, "mean_token_accuracy": 0.8353081822395325, "num_tokens": 55836892.0, "step": 46470 }, { "entropy": 1.7775721468031407, "epoch": 0.14408393323098456, "grad_norm": 7.417864799499512, "learning_rate": 6.664837632873123e-06, "loss": 0.4326, "mean_token_accuracy": 0.8478566333651543, "num_tokens": 55849778.0, "step": 46480 }, { "entropy": 1.8510145902633668, "epoch": 0.14411493235603426, "grad_norm": 8.72498607635498, "learning_rate": 6.664120775543344e-06, "loss": 0.5191, "mean_token_accuracy": 0.835884952545166, "num_tokens": 55861927.0, "step": 46490 }, { "entropy": 1.811974573135376, "epoch": 0.14414593148108396, "grad_norm": 8.177042007446289, "learning_rate": 6.663404149475273e-06, "loss": 0.479, "mean_token_accuracy": 0.8483860984444618, "num_tokens": 55873403.0, "step": 46500 }, { "entropy": 1.86693923920393, "epoch": 0.14417693060613365, "grad_norm": 8.267274856567383, "learning_rate": 6.662687754544593e-06, "loss": 0.5635, "mean_token_accuracy": 0.8321180418133736, "num_tokens": 55885678.0, "step": 46510 }, { "entropy": 1.8252136752009391, "epoch": 0.14420792973118335, "grad_norm": 4.183609485626221, "learning_rate": 6.661971590627081e-06, "loss": 0.5056, "mean_token_accuracy": 0.8298415824770927, "num_tokens": 55897994.0, "step": 46520 }, { "entropy": 1.8707138195633888, "epoch": 0.14423892885623305, "grad_norm": 7.805715084075928, "learning_rate": 6.661255657598608e-06, "loss": 0.5584, "mean_token_accuracy": 0.8322044730186462, "num_tokens": 55909873.0, "step": 46530 }, { "entropy": 1.8549921035766601, "epoch": 0.14426992798128274, "grad_norm": 10.361091613769531, "learning_rate": 6.660539955335135e-06, "loss": 0.5159, "mean_token_accuracy": 0.8390928566455841, "num_tokens": 55921497.0, "step": 46540 }, { "entropy": 1.8014539882540703, "epoch": 0.1443009271063324, "grad_norm": 7.748378753662109, "learning_rate": 6.659824483712719e-06, "loss": 0.5017, "mean_token_accuracy": 0.8297727882862092, "num_tokens": 55933974.0, "step": 46550 }, { "entropy": 1.912005639076233, "epoch": 0.1443319262313821, "grad_norm": 8.874786376953125, "learning_rate": 6.659109242607511e-06, "loss": 0.5539, "mean_token_accuracy": 0.8200063824653625, "num_tokens": 55945880.0, "step": 46560 }, { "entropy": 1.8200358718633651, "epoch": 0.1443629253564318, "grad_norm": 4.0915913581848145, "learning_rate": 6.658394231895755e-06, "loss": 0.5076, "mean_token_accuracy": 0.8318188101053238, "num_tokens": 55958253.0, "step": 46570 }, { "entropy": 1.8874425664544106, "epoch": 0.1443939244814815, "grad_norm": 9.732337951660156, "learning_rate": 6.657679451453786e-06, "loss": 0.5342, "mean_token_accuracy": 0.8317007169127464, "num_tokens": 55970172.0, "step": 46580 }, { "entropy": 1.8596255108714104, "epoch": 0.1444249236065312, "grad_norm": 9.834501266479492, "learning_rate": 6.656964901158031e-06, "loss": 0.5135, "mean_token_accuracy": 0.8399197280406951, "num_tokens": 55981424.0, "step": 46590 }, { "entropy": 1.8636808514595031, "epoch": 0.1444559227315809, "grad_norm": 9.933505058288574, "learning_rate": 6.656250580885014e-06, "loss": 0.5678, "mean_token_accuracy": 0.8327499255537987, "num_tokens": 55993099.0, "step": 46600 }, { "entropy": 1.8671812251210214, "epoch": 0.1444869218566306, "grad_norm": 8.399667739868164, "learning_rate": 6.6555364905113505e-06, "loss": 0.549, "mean_token_accuracy": 0.8313580706715584, "num_tokens": 56004188.0, "step": 46610 }, { "entropy": 1.8309295520186424, "epoch": 0.1445179209816803, "grad_norm": 4.422349452972412, "learning_rate": 6.654822629913745e-06, "loss": 0.4595, "mean_token_accuracy": 0.8452798783779144, "num_tokens": 56016240.0, "step": 46620 }, { "entropy": 1.746955469250679, "epoch": 0.14454892010672998, "grad_norm": 5.409885406494141, "learning_rate": 6.654108998968999e-06, "loss": 0.4726, "mean_token_accuracy": 0.8372356191277504, "num_tokens": 56029851.0, "step": 46630 }, { "entropy": 1.8878102406859398, "epoch": 0.14457991923177968, "grad_norm": 9.581794738769531, "learning_rate": 6.653395597554003e-06, "loss": 0.5621, "mean_token_accuracy": 0.8266360089182854, "num_tokens": 56041081.0, "step": 46640 }, { "entropy": 1.84484543800354, "epoch": 0.14461091835682938, "grad_norm": 7.586556434631348, "learning_rate": 6.652682425545742e-06, "loss": 0.5313, "mean_token_accuracy": 0.8356327712535858, "num_tokens": 56053083.0, "step": 46650 }, { "entropy": 1.8373423531651496, "epoch": 0.14464191748187907, "grad_norm": 9.485453605651855, "learning_rate": 6.651969482821293e-06, "loss": 0.5017, "mean_token_accuracy": 0.8305898755788803, "num_tokens": 56065247.0, "step": 46660 }, { "entropy": 1.8729679718613625, "epoch": 0.14467291660692877, "grad_norm": 4.585037708282471, "learning_rate": 6.651256769257825e-06, "loss": 0.5095, "mean_token_accuracy": 0.8245977357029914, "num_tokens": 56077249.0, "step": 46670 }, { "entropy": 1.8576960027217866, "epoch": 0.14470391573197847, "grad_norm": 8.548659324645996, "learning_rate": 6.650544284732601e-06, "loss": 0.5327, "mean_token_accuracy": 0.8239507541060448, "num_tokens": 56089496.0, "step": 46680 }, { "entropy": 1.7963466018438339, "epoch": 0.14473491485702816, "grad_norm": 8.17918872833252, "learning_rate": 6.649832029122969e-06, "loss": 0.4707, "mean_token_accuracy": 0.8414889872074127, "num_tokens": 56102439.0, "step": 46690 }, { "entropy": 1.8463149771094323, "epoch": 0.14476591398207786, "grad_norm": 8.336082458496094, "learning_rate": 6.6491200023063785e-06, "loss": 0.532, "mean_token_accuracy": 0.8314433738589286, "num_tokens": 56115196.0, "step": 46700 }, { "entropy": 1.8520590156316756, "epoch": 0.14479691310712756, "grad_norm": 9.052793502807617, "learning_rate": 6.648408204160365e-06, "loss": 0.5665, "mean_token_accuracy": 0.8344256520271301, "num_tokens": 56127775.0, "step": 46710 }, { "entropy": 1.8827403590083123, "epoch": 0.14482791223217725, "grad_norm": 9.495018005371094, "learning_rate": 6.647696634562557e-06, "loss": 0.5524, "mean_token_accuracy": 0.825685128569603, "num_tokens": 56140171.0, "step": 46720 }, { "entropy": 1.9633108615875243, "epoch": 0.14485891135722695, "grad_norm": 8.340209007263184, "learning_rate": 6.646985293390675e-06, "loss": 0.5714, "mean_token_accuracy": 0.8286648213863372, "num_tokens": 56151174.0, "step": 46730 }, { "entropy": 1.912299408018589, "epoch": 0.14488991048227665, "grad_norm": 9.881667137145996, "learning_rate": 6.64627418052253e-06, "loss": 0.5312, "mean_token_accuracy": 0.834262129664421, "num_tokens": 56162304.0, "step": 46740 }, { "entropy": 1.9305769801139832, "epoch": 0.14492090960732634, "grad_norm": 6.763992786407471, "learning_rate": 6.6455632958360265e-06, "loss": 0.6003, "mean_token_accuracy": 0.8186121672391892, "num_tokens": 56173644.0, "step": 46750 }, { "entropy": 1.8460824131965636, "epoch": 0.14495190873237604, "grad_norm": 10.21903133392334, "learning_rate": 6.644852639209157e-06, "loss": 0.5388, "mean_token_accuracy": 0.8290549755096436, "num_tokens": 56185409.0, "step": 46760 }, { "entropy": 1.843972623348236, "epoch": 0.14498290785742574, "grad_norm": 5.34625244140625, "learning_rate": 6.6441422105200105e-06, "loss": 0.5037, "mean_token_accuracy": 0.8397470220923424, "num_tokens": 56197647.0, "step": 46770 }, { "entropy": 1.9071653231978416, "epoch": 0.14501390698247543, "grad_norm": 9.360774993896484, "learning_rate": 6.643432009646762e-06, "loss": 0.5526, "mean_token_accuracy": 0.8233714982867241, "num_tokens": 56209322.0, "step": 46780 }, { "entropy": 1.867496982216835, "epoch": 0.14504490610752513, "grad_norm": 9.754353523254395, "learning_rate": 6.642722036467681e-06, "loss": 0.5032, "mean_token_accuracy": 0.8340319588780403, "num_tokens": 56221453.0, "step": 46790 }, { "entropy": 1.7567606747150422, "epoch": 0.1450759052325748, "grad_norm": 12.389626502990723, "learning_rate": 6.642012290861126e-06, "loss": 0.4416, "mean_token_accuracy": 0.8503746286034584, "num_tokens": 56234727.0, "step": 46800 }, { "entropy": 1.9208232283592224, "epoch": 0.1451069043576245, "grad_norm": 4.7340803146362305, "learning_rate": 6.641302772705548e-06, "loss": 0.5905, "mean_token_accuracy": 0.8217899233102799, "num_tokens": 56245924.0, "step": 46810 }, { "entropy": 1.7486246049404144, "epoch": 0.1451379034826742, "grad_norm": 4.023608207702637, "learning_rate": 6.640593481879488e-06, "loss": 0.4293, "mean_token_accuracy": 0.8474720388650894, "num_tokens": 56259967.0, "step": 46820 }, { "entropy": 1.9232235804200173, "epoch": 0.1451689026077239, "grad_norm": 9.536555290222168, "learning_rate": 6.63988441826158e-06, "loss": 0.5834, "mean_token_accuracy": 0.8297638326883316, "num_tokens": 56271400.0, "step": 46830 }, { "entropy": 1.862909409403801, "epoch": 0.14519990173277358, "grad_norm": 9.564486503601074, "learning_rate": 6.639175581730542e-06, "loss": 0.5344, "mean_token_accuracy": 0.8356858551502228, "num_tokens": 56283115.0, "step": 46840 }, { "entropy": 1.8271246001124382, "epoch": 0.14523090085782328, "grad_norm": 4.301812648773193, "learning_rate": 6.638466972165192e-06, "loss": 0.4924, "mean_token_accuracy": 0.8387002035975456, "num_tokens": 56295992.0, "step": 46850 }, { "entropy": 1.901492816209793, "epoch": 0.14526189998287298, "grad_norm": 8.603260040283203, "learning_rate": 6.63775858944443e-06, "loss": 0.5661, "mean_token_accuracy": 0.8132639810442924, "num_tokens": 56307074.0, "step": 46860 }, { "entropy": 1.7843642815947534, "epoch": 0.14529289910792267, "grad_norm": 7.738562107086182, "learning_rate": 6.637050433447254e-06, "loss": 0.4475, "mean_token_accuracy": 0.8477630734443664, "num_tokens": 56320727.0, "step": 46870 }, { "entropy": 1.8238538324832916, "epoch": 0.14532389823297237, "grad_norm": 4.08509635925293, "learning_rate": 6.636342504052748e-06, "loss": 0.4876, "mean_token_accuracy": 0.8303026512265206, "num_tokens": 56333848.0, "step": 46880 }, { "entropy": 1.7951844319701196, "epoch": 0.14535489735802207, "grad_norm": 7.781752586364746, "learning_rate": 6.635634801140083e-06, "loss": 0.4462, "mean_token_accuracy": 0.8464242547750473, "num_tokens": 56346965.0, "step": 46890 }, { "entropy": 1.8057351917028428, "epoch": 0.14538589648307176, "grad_norm": 3.1438589096069336, "learning_rate": 6.634927324588528e-06, "loss": 0.4282, "mean_token_accuracy": 0.8528795570135117, "num_tokens": 56359959.0, "step": 46900 }, { "entropy": 1.9472463458776474, "epoch": 0.14541689560812146, "grad_norm": 9.385503768920898, "learning_rate": 6.634220074277438e-06, "loss": 0.5786, "mean_token_accuracy": 0.82788744866848, "num_tokens": 56371468.0, "step": 46910 }, { "entropy": 1.8034564316272736, "epoch": 0.14544789473317116, "grad_norm": 6.253968238830566, "learning_rate": 6.633513050086256e-06, "loss": 0.4302, "mean_token_accuracy": 0.8509827584028244, "num_tokens": 56384646.0, "step": 46920 }, { "entropy": 1.8610297739505768, "epoch": 0.14547889385822085, "grad_norm": 9.084609031677246, "learning_rate": 6.6328062518945195e-06, "loss": 0.4996, "mean_token_accuracy": 0.8404456228017807, "num_tokens": 56396918.0, "step": 46930 }, { "entropy": 1.901959379762411, "epoch": 0.14550989298327055, "grad_norm": 8.576674461364746, "learning_rate": 6.63209967958185e-06, "loss": 0.5396, "mean_token_accuracy": 0.8313698336482048, "num_tokens": 56409026.0, "step": 46940 }, { "entropy": 1.8410135254263877, "epoch": 0.14554089210832025, "grad_norm": 4.225868225097656, "learning_rate": 6.631393333027966e-06, "loss": 0.4869, "mean_token_accuracy": 0.838833749294281, "num_tokens": 56421316.0, "step": 46950 }, { "entropy": 1.8137712955474854, "epoch": 0.14557189123336994, "grad_norm": 7.888924598693848, "learning_rate": 6.630687212112668e-06, "loss": 0.4549, "mean_token_accuracy": 0.8446080341935158, "num_tokens": 56434495.0, "step": 46960 }, { "entropy": 1.9139295309782027, "epoch": 0.14560289035841964, "grad_norm": 7.617772579193115, "learning_rate": 6.629981316715853e-06, "loss": 0.5159, "mean_token_accuracy": 0.831653282046318, "num_tokens": 56447269.0, "step": 46970 }, { "entropy": 1.965524472296238, "epoch": 0.14563388948346934, "grad_norm": 9.798111915588379, "learning_rate": 6.629275646717503e-06, "loss": 0.5769, "mean_token_accuracy": 0.8235475957393646, "num_tokens": 56458884.0, "step": 46980 }, { "entropy": 1.8459963738918304, "epoch": 0.14566488860851903, "grad_norm": 8.215746879577637, "learning_rate": 6.628570201997693e-06, "loss": 0.4492, "mean_token_accuracy": 0.8458223536610603, "num_tokens": 56472093.0, "step": 46990 }, { "entropy": 1.956969639658928, "epoch": 0.14569588773356873, "grad_norm": 8.928169250488281, "learning_rate": 6.62786498243658e-06, "loss": 0.6133, "mean_token_accuracy": 0.8251758500933647, "num_tokens": 56482622.0, "step": 47000 }, { "entropy": 1.9019935339689256, "epoch": 0.14572688685861843, "grad_norm": 7.649944305419922, "learning_rate": 6.627159987914421e-06, "loss": 0.6096, "mean_token_accuracy": 0.8216905370354652, "num_tokens": 56495135.0, "step": 47010 }, { "entropy": 1.8709472745656968, "epoch": 0.14575788598366812, "grad_norm": 5.07206916809082, "learning_rate": 6.626455218311551e-06, "loss": 0.5006, "mean_token_accuracy": 0.8332753911614418, "num_tokens": 56507439.0, "step": 47020 }, { "entropy": 1.9712065637111664, "epoch": 0.14578888510871782, "grad_norm": 8.687137603759766, "learning_rate": 6.6257506735084055e-06, "loss": 0.6034, "mean_token_accuracy": 0.8207579106092453, "num_tokens": 56518700.0, "step": 47030 }, { "entropy": 1.7840657129883766, "epoch": 0.1458198842337675, "grad_norm": 8.681142807006836, "learning_rate": 6.625046353385498e-06, "loss": 0.4615, "mean_token_accuracy": 0.8308488741517067, "num_tokens": 56531438.0, "step": 47040 }, { "entropy": 1.9545001640915871, "epoch": 0.14585088335881718, "grad_norm": 11.037637710571289, "learning_rate": 6.624342257823438e-06, "loss": 0.5669, "mean_token_accuracy": 0.8239473134279252, "num_tokens": 56542655.0, "step": 47050 }, { "entropy": 1.8824947997927666, "epoch": 0.14588188248386688, "grad_norm": 8.642295837402344, "learning_rate": 6.623638386702921e-06, "loss": 0.5302, "mean_token_accuracy": 0.8258578881621361, "num_tokens": 56555705.0, "step": 47060 }, { "entropy": 1.871096746623516, "epoch": 0.14591288160891658, "grad_norm": 9.842241287231445, "learning_rate": 6.622934739904732e-06, "loss": 0.4928, "mean_token_accuracy": 0.836942833662033, "num_tokens": 56568268.0, "step": 47070 }, { "entropy": 1.9141883179545403, "epoch": 0.14594388073396627, "grad_norm": 8.952548027038574, "learning_rate": 6.6222313173097454e-06, "loss": 0.5371, "mean_token_accuracy": 0.8351935803890228, "num_tokens": 56579903.0, "step": 47080 }, { "entropy": 1.8572052717208862, "epoch": 0.14597487985901597, "grad_norm": 9.243520736694336, "learning_rate": 6.62152811879892e-06, "loss": 0.5324, "mean_token_accuracy": 0.8366377666592598, "num_tokens": 56592500.0, "step": 47090 }, { "entropy": 1.8863570004701615, "epoch": 0.14600587898406567, "grad_norm": 8.63272762298584, "learning_rate": 6.620825144253312e-06, "loss": 0.528, "mean_token_accuracy": 0.8290054813027382, "num_tokens": 56604096.0, "step": 47100 }, { "entropy": 1.8670645967125892, "epoch": 0.14603687810911536, "grad_norm": 8.21442985534668, "learning_rate": 6.620122393554056e-06, "loss": 0.6717, "mean_token_accuracy": 0.8223129764199257, "num_tokens": 56616718.0, "step": 47110 }, { "entropy": 1.9217035099864006, "epoch": 0.14606787723416506, "grad_norm": 4.886983871459961, "learning_rate": 6.6194198665823796e-06, "loss": 0.5741, "mean_token_accuracy": 0.8262522548437119, "num_tokens": 56627768.0, "step": 47120 }, { "entropy": 1.9420514121651649, "epoch": 0.14609887635921476, "grad_norm": 8.3319673538208, "learning_rate": 6.6187175632195985e-06, "loss": 0.602, "mean_token_accuracy": 0.8175177812576294, "num_tokens": 56638951.0, "step": 47130 }, { "entropy": 1.9218653574585915, "epoch": 0.14612987548426445, "grad_norm": 8.83938217163086, "learning_rate": 6.618015483347118e-06, "loss": 0.5549, "mean_token_accuracy": 0.82304065823555, "num_tokens": 56651297.0, "step": 47140 }, { "entropy": 1.9661997646093368, "epoch": 0.14616087460931415, "grad_norm": 9.71408462524414, "learning_rate": 6.6173136268464276e-06, "loss": 0.5889, "mean_token_accuracy": 0.8207120850682259, "num_tokens": 56662494.0, "step": 47150 }, { "entropy": 1.934261092543602, "epoch": 0.14619187373436385, "grad_norm": 8.855196952819824, "learning_rate": 6.616611993599109e-06, "loss": 0.5464, "mean_token_accuracy": 0.8397008880972863, "num_tokens": 56673808.0, "step": 47160 }, { "entropy": 1.9365019842982292, "epoch": 0.14622287285941354, "grad_norm": 8.653056144714355, "learning_rate": 6.6159105834868275e-06, "loss": 0.5803, "mean_token_accuracy": 0.8273678243160247, "num_tokens": 56685577.0, "step": 47170 }, { "entropy": 1.872480408847332, "epoch": 0.14625387198446324, "grad_norm": 8.392419815063477, "learning_rate": 6.615209396391338e-06, "loss": 0.511, "mean_token_accuracy": 0.8286007553339004, "num_tokens": 56698302.0, "step": 47180 }, { "entropy": 1.9219907984137534, "epoch": 0.14628487110951294, "grad_norm": 11.029183387756348, "learning_rate": 6.614508432194486e-06, "loss": 0.5493, "mean_token_accuracy": 0.8315992474555969, "num_tokens": 56709524.0, "step": 47190 }, { "entropy": 1.928644596040249, "epoch": 0.14631587023456263, "grad_norm": 8.550175666809082, "learning_rate": 6.613807690778199e-06, "loss": 0.5016, "mean_token_accuracy": 0.8338589072227478, "num_tokens": 56721586.0, "step": 47200 }, { "entropy": 1.8833053424954413, "epoch": 0.14634686935961233, "grad_norm": 4.840978622436523, "learning_rate": 6.613107172024497e-06, "loss": 0.5268, "mean_token_accuracy": 0.8266768530011177, "num_tokens": 56733887.0, "step": 47210 }, { "entropy": 1.8826899453997612, "epoch": 0.14637786848466203, "grad_norm": 6.925222396850586, "learning_rate": 6.6124068758154836e-06, "loss": 0.5489, "mean_token_accuracy": 0.8324401840567589, "num_tokens": 56745661.0, "step": 47220 }, { "entropy": 1.8747637838125228, "epoch": 0.14640886760971172, "grad_norm": 9.450230598449707, "learning_rate": 6.611706802033354e-06, "loss": 0.5337, "mean_token_accuracy": 0.8353126794099808, "num_tokens": 56758002.0, "step": 47230 }, { "entropy": 1.8948550507426263, "epoch": 0.14643986673476142, "grad_norm": 4.941351890563965, "learning_rate": 6.611006950560388e-06, "loss": 0.5553, "mean_token_accuracy": 0.8193787977099418, "num_tokens": 56770680.0, "step": 47240 }, { "entropy": 1.9120934292674066, "epoch": 0.14647086585981112, "grad_norm": 10.706798553466797, "learning_rate": 6.610307321278952e-06, "loss": 0.6107, "mean_token_accuracy": 0.8241473525762558, "num_tokens": 56782436.0, "step": 47250 }, { "entropy": 1.9104921489953994, "epoch": 0.1465018649848608, "grad_norm": 8.724833488464355, "learning_rate": 6.6096079140715005e-06, "loss": 0.5747, "mean_token_accuracy": 0.8173437684774398, "num_tokens": 56794146.0, "step": 47260 }, { "entropy": 1.826416552066803, "epoch": 0.1465328641099105, "grad_norm": 7.878024578094482, "learning_rate": 6.6089087288205766e-06, "loss": 0.4277, "mean_token_accuracy": 0.8480478748679161, "num_tokens": 56806763.0, "step": 47270 }, { "entropy": 1.8449065156280995, "epoch": 0.1465638632349602, "grad_norm": 8.807769775390625, "learning_rate": 6.608209765408807e-06, "loss": 0.4694, "mean_token_accuracy": 0.8426515519618988, "num_tokens": 56819546.0, "step": 47280 }, { "entropy": 1.9609465137124062, "epoch": 0.14659486236000988, "grad_norm": 9.253771781921387, "learning_rate": 6.607511023718909e-06, "loss": 0.5784, "mean_token_accuracy": 0.8272478267550468, "num_tokens": 56830641.0, "step": 47290 }, { "entropy": 1.8133751511573792, "epoch": 0.14662586148505957, "grad_norm": 4.237439155578613, "learning_rate": 6.6068125036336824e-06, "loss": 0.479, "mean_token_accuracy": 0.8408553779125214, "num_tokens": 56843286.0, "step": 47300 }, { "entropy": 1.8579260095953942, "epoch": 0.14665686061010927, "grad_norm": 8.59593677520752, "learning_rate": 6.6061142050360174e-06, "loss": 0.492, "mean_token_accuracy": 0.8416462868452073, "num_tokens": 56855446.0, "step": 47310 }, { "entropy": 1.9909762263298034, "epoch": 0.14668785973515897, "grad_norm": 9.792447090148926, "learning_rate": 6.60541612780889e-06, "loss": 0.5909, "mean_token_accuracy": 0.8207048043608666, "num_tokens": 56866314.0, "step": 47320 }, { "entropy": 1.8144005626440047, "epoch": 0.14671885886020866, "grad_norm": 6.8003644943237305, "learning_rate": 6.604718271835362e-06, "loss": 0.4639, "mean_token_accuracy": 0.8494958788156509, "num_tokens": 56879467.0, "step": 47330 }, { "entropy": 1.934771877527237, "epoch": 0.14674985798525836, "grad_norm": 7.686990261077881, "learning_rate": 6.60402063699858e-06, "loss": 0.5566, "mean_token_accuracy": 0.8413501441478729, "num_tokens": 56890100.0, "step": 47340 }, { "entropy": 1.8710382498800755, "epoch": 0.14678085711030806, "grad_norm": 9.023658752441406, "learning_rate": 6.603323223181781e-06, "loss": 0.5367, "mean_token_accuracy": 0.8303638845682144, "num_tokens": 56903039.0, "step": 47350 }, { "entropy": 1.853154082596302, "epoch": 0.14681185623535775, "grad_norm": 7.490630626678467, "learning_rate": 6.6026260302682866e-06, "loss": 0.4761, "mean_token_accuracy": 0.8393293723464013, "num_tokens": 56915063.0, "step": 47360 }, { "entropy": 1.8372676715254783, "epoch": 0.14684285536040745, "grad_norm": 6.07124137878418, "learning_rate": 6.601929058141503e-06, "loss": 0.5217, "mean_token_accuracy": 0.8287788331508636, "num_tokens": 56928744.0, "step": 47370 }, { "entropy": 1.831204354763031, "epoch": 0.14687385448545714, "grad_norm": 4.461570739746094, "learning_rate": 6.601232306684922e-06, "loss": 0.4442, "mean_token_accuracy": 0.8416302114725113, "num_tokens": 56941775.0, "step": 47380 }, { "entropy": 1.8040766946971416, "epoch": 0.14690485361050684, "grad_norm": 3.481854200363159, "learning_rate": 6.600535775782128e-06, "loss": 0.4513, "mean_token_accuracy": 0.8418020218610763, "num_tokens": 56955721.0, "step": 47390 }, { "entropy": 1.8790962159633637, "epoch": 0.14693585273555654, "grad_norm": 8.983044624328613, "learning_rate": 6.599839465316782e-06, "loss": 0.4852, "mean_token_accuracy": 0.8474810421466827, "num_tokens": 56968021.0, "step": 47400 }, { "entropy": 1.8795790463685988, "epoch": 0.14696685186060623, "grad_norm": 9.99634838104248, "learning_rate": 6.599143375172638e-06, "loss": 0.5054, "mean_token_accuracy": 0.8363196149468421, "num_tokens": 56979842.0, "step": 47410 }, { "entropy": 1.9174206882715226, "epoch": 0.14699785098565593, "grad_norm": 8.22517204284668, "learning_rate": 6.598447505233533e-06, "loss": 0.5717, "mean_token_accuracy": 0.8306628108024597, "num_tokens": 56991796.0, "step": 47420 }, { "entropy": 1.9805866748094558, "epoch": 0.14702885011070563, "grad_norm": 9.243436813354492, "learning_rate": 6.59775185538339e-06, "loss": 0.5826, "mean_token_accuracy": 0.829287999868393, "num_tokens": 57003139.0, "step": 47430 }, { "entropy": 1.8765076369047164, "epoch": 0.14705984923575532, "grad_norm": 8.006113052368164, "learning_rate": 6.597056425506216e-06, "loss": 0.5153, "mean_token_accuracy": 0.8328446924686432, "num_tokens": 57015872.0, "step": 47440 }, { "entropy": 1.8864967197179794, "epoch": 0.14709084836080502, "grad_norm": 3.926126003265381, "learning_rate": 6.596361215486107e-06, "loss": 0.4721, "mean_token_accuracy": 0.8418742284178734, "num_tokens": 57027981.0, "step": 47450 }, { "entropy": 1.8978651389479637, "epoch": 0.14712184748585472, "grad_norm": 4.1893134117126465, "learning_rate": 6.595666225207241e-06, "loss": 0.5869, "mean_token_accuracy": 0.8199717015028, "num_tokens": 57039934.0, "step": 47460 }, { "entropy": 1.9076218917965888, "epoch": 0.14715284661090441, "grad_norm": 11.420867919921875, "learning_rate": 6.594971454553885e-06, "loss": 0.5421, "mean_token_accuracy": 0.830481293797493, "num_tokens": 57052007.0, "step": 47470 }, { "entropy": 2.0099540084600447, "epoch": 0.1471838457359541, "grad_norm": 8.73011589050293, "learning_rate": 6.5942769034103895e-06, "loss": 0.6084, "mean_token_accuracy": 0.8137494072318077, "num_tokens": 57062753.0, "step": 47480 }, { "entropy": 1.8646449223160744, "epoch": 0.1472148448610038, "grad_norm": 10.318194389343262, "learning_rate": 6.593582571661188e-06, "loss": 0.4844, "mean_token_accuracy": 0.842064967751503, "num_tokens": 57075443.0, "step": 47490 }, { "entropy": 1.8991115644574166, "epoch": 0.1472458439860535, "grad_norm": 6.439994812011719, "learning_rate": 6.592888459190802e-06, "loss": 0.5384, "mean_token_accuracy": 0.8363965794444084, "num_tokens": 57088273.0, "step": 47500 }, { "entropy": 1.9018190592527389, "epoch": 0.1472768431111032, "grad_norm": 5.49556827545166, "learning_rate": 6.592194565883839e-06, "loss": 0.4666, "mean_token_accuracy": 0.8406558021903038, "num_tokens": 57100663.0, "step": 47510 }, { "entropy": 2.0055468559265135, "epoch": 0.1473078422361529, "grad_norm": 8.085359573364258, "learning_rate": 6.591500891624989e-06, "loss": 0.612, "mean_token_accuracy": 0.8196608617901802, "num_tokens": 57111772.0, "step": 47520 }, { "entropy": 1.944721657037735, "epoch": 0.1473388413612026, "grad_norm": 8.445950508117676, "learning_rate": 6.590807436299027e-06, "loss": 0.5381, "mean_token_accuracy": 0.824900957942009, "num_tokens": 57123451.0, "step": 47530 }, { "entropy": 1.9133238226175309, "epoch": 0.14736984048625226, "grad_norm": 4.461698532104492, "learning_rate": 6.590114199790815e-06, "loss": 0.4967, "mean_token_accuracy": 0.8348732620477677, "num_tokens": 57135025.0, "step": 47540 }, { "entropy": 1.862838363647461, "epoch": 0.14740083961130196, "grad_norm": 3.581024408340454, "learning_rate": 6.589421181985297e-06, "loss": 0.4594, "mean_token_accuracy": 0.8478377997875214, "num_tokens": 57146927.0, "step": 47550 }, { "entropy": 1.9030270993709564, "epoch": 0.14743183873635166, "grad_norm": 7.672686576843262, "learning_rate": 6.588728382767504e-06, "loss": 0.5129, "mean_token_accuracy": 0.8314098447561264, "num_tokens": 57158627.0, "step": 47560 }, { "entropy": 1.9589687079191207, "epoch": 0.14746283786140135, "grad_norm": 9.731719970703125, "learning_rate": 6.58803580202255e-06, "loss": 0.5612, "mean_token_accuracy": 0.8271705433726311, "num_tokens": 57169934.0, "step": 47570 }, { "entropy": 1.934072805941105, "epoch": 0.14749383698645105, "grad_norm": 8.698883056640625, "learning_rate": 6.587343439635634e-06, "loss": 0.504, "mean_token_accuracy": 0.8397659227252007, "num_tokens": 57182220.0, "step": 47580 }, { "entropy": 1.9594342321157456, "epoch": 0.14752483611150075, "grad_norm": 9.804793357849121, "learning_rate": 6.586651295492042e-06, "loss": 0.5365, "mean_token_accuracy": 0.8358360469341278, "num_tokens": 57193785.0, "step": 47590 }, { "entropy": 1.985189399123192, "epoch": 0.14755583523655044, "grad_norm": 8.428631782531738, "learning_rate": 6.585959369477139e-06, "loss": 0.5789, "mean_token_accuracy": 0.8262299597263336, "num_tokens": 57205399.0, "step": 47600 }, { "entropy": 2.0018600046634676, "epoch": 0.14758683436160014, "grad_norm": 7.76343297958374, "learning_rate": 6.585267661476379e-06, "loss": 0.5989, "mean_token_accuracy": 0.8301928460597991, "num_tokens": 57216010.0, "step": 47610 }, { "entropy": 1.9304006546735764, "epoch": 0.14761783348664984, "grad_norm": 7.311439514160156, "learning_rate": 6.584576171375298e-06, "loss": 0.5318, "mean_token_accuracy": 0.8509150952100754, "num_tokens": 57227397.0, "step": 47620 }, { "entropy": 1.924649564921856, "epoch": 0.14764883261169953, "grad_norm": 8.823622703552246, "learning_rate": 6.5838848990595135e-06, "loss": 0.527, "mean_token_accuracy": 0.8323667719960213, "num_tokens": 57239462.0, "step": 47630 }, { "entropy": 1.9695616766810418, "epoch": 0.14767983173674923, "grad_norm": 9.303346633911133, "learning_rate": 6.583193844414736e-06, "loss": 0.5739, "mean_token_accuracy": 0.8252382263541221, "num_tokens": 57250697.0, "step": 47640 }, { "entropy": 1.9715566843748094, "epoch": 0.14771083086179893, "grad_norm": 8.258867263793945, "learning_rate": 6.582503007326752e-06, "loss": 0.579, "mean_token_accuracy": 0.8235922992229462, "num_tokens": 57261910.0, "step": 47650 }, { "entropy": 1.9681269809603692, "epoch": 0.14774182998684862, "grad_norm": 4.465610504150391, "learning_rate": 6.58181238768143e-06, "loss": 0.5258, "mean_token_accuracy": 0.8352336034178733, "num_tokens": 57273666.0, "step": 47660 }, { "entropy": 1.8807261288166046, "epoch": 0.14777282911189832, "grad_norm": 8.713099479675293, "learning_rate": 6.5811219853647315e-06, "loss": 0.4857, "mean_token_accuracy": 0.8416456952691078, "num_tokens": 57286174.0, "step": 47670 }, { "entropy": 1.9564361080527306, "epoch": 0.14780382823694801, "grad_norm": 4.097822189331055, "learning_rate": 6.580431800262694e-06, "loss": 0.5526, "mean_token_accuracy": 0.8282778993248939, "num_tokens": 57298085.0, "step": 47680 }, { "entropy": 1.9153534591197967, "epoch": 0.1478348273619977, "grad_norm": 8.843733787536621, "learning_rate": 6.57974183226144e-06, "loss": 0.5322, "mean_token_accuracy": 0.832582226395607, "num_tokens": 57310164.0, "step": 47690 }, { "entropy": 1.8493407145142555, "epoch": 0.1478658264870474, "grad_norm": 10.445718765258789, "learning_rate": 6.579052081247181e-06, "loss": 0.4756, "mean_token_accuracy": 0.8435613483190536, "num_tokens": 57323308.0, "step": 47700 }, { "entropy": 1.87943754196167, "epoch": 0.1478968256120971, "grad_norm": 7.931858062744141, "learning_rate": 6.578362547106202e-06, "loss": 0.5018, "mean_token_accuracy": 0.8353148818016052, "num_tokens": 57336086.0, "step": 47710 }, { "entropy": 1.9901050910353661, "epoch": 0.1479278247371468, "grad_norm": 8.560962677001953, "learning_rate": 6.5776732297248805e-06, "loss": 0.5716, "mean_token_accuracy": 0.8269386544823647, "num_tokens": 57347719.0, "step": 47720 }, { "entropy": 2.0061254844069483, "epoch": 0.1479588238621965, "grad_norm": 7.655875205993652, "learning_rate": 6.576984128989673e-06, "loss": 0.5468, "mean_token_accuracy": 0.837123441696167, "num_tokens": 57359271.0, "step": 47730 }, { "entropy": 2.0044953674077988, "epoch": 0.1479898229872462, "grad_norm": 8.174701690673828, "learning_rate": 6.576295244787121e-06, "loss": 0.5825, "mean_token_accuracy": 0.8275741189718246, "num_tokens": 57370109.0, "step": 47740 }, { "entropy": 1.8147762969136239, "epoch": 0.1480208221122959, "grad_norm": 9.397214889526367, "learning_rate": 6.575606577003847e-06, "loss": 0.4674, "mean_token_accuracy": 0.8475957497954368, "num_tokens": 57383452.0, "step": 47750 }, { "entropy": 1.9344044476747513, "epoch": 0.1480518212373456, "grad_norm": 9.833171844482422, "learning_rate": 6.574918125526558e-06, "loss": 0.5411, "mean_token_accuracy": 0.8328167483210563, "num_tokens": 57394798.0, "step": 47760 }, { "entropy": 1.8636110588908195, "epoch": 0.14808282036239528, "grad_norm": 3.9326298236846924, "learning_rate": 6.574229890242045e-06, "loss": 0.4629, "mean_token_accuracy": 0.8495140254497529, "num_tokens": 57407317.0, "step": 47770 }, { "entropy": 1.8715795949101448, "epoch": 0.14811381948744495, "grad_norm": 4.289327621459961, "learning_rate": 6.57354187103718e-06, "loss": 0.517, "mean_token_accuracy": 0.8313918590545655, "num_tokens": 57419698.0, "step": 47780 }, { "entropy": 1.8456829696893693, "epoch": 0.14814481861249465, "grad_norm": 2.9080862998962402, "learning_rate": 6.57285406779892e-06, "loss": 0.4923, "mean_token_accuracy": 0.8416327074170112, "num_tokens": 57432183.0, "step": 47790 }, { "entropy": 1.8631455272436142, "epoch": 0.14817581773754435, "grad_norm": 8.882286071777344, "learning_rate": 6.5721664804143015e-06, "loss": 0.4566, "mean_token_accuracy": 0.8395325362682342, "num_tokens": 57444434.0, "step": 47800 }, { "entropy": 1.8976523950695992, "epoch": 0.14820681686259404, "grad_norm": 9.217388153076172, "learning_rate": 6.5714791087704465e-06, "loss": 0.5584, "mean_token_accuracy": 0.8214605495333671, "num_tokens": 57456138.0, "step": 47810 }, { "entropy": 1.9854907482862472, "epoch": 0.14823781598764374, "grad_norm": 9.021675109863281, "learning_rate": 6.570791952754559e-06, "loss": 0.5766, "mean_token_accuracy": 0.8244616031646729, "num_tokens": 57466708.0, "step": 47820 }, { "entropy": 1.9569459453225135, "epoch": 0.14826881511269344, "grad_norm": 9.533801078796387, "learning_rate": 6.570105012253927e-06, "loss": 0.5676, "mean_token_accuracy": 0.8229658871889114, "num_tokens": 57478273.0, "step": 47830 }, { "entropy": 1.85792086571455, "epoch": 0.14829981423774313, "grad_norm": 9.635293960571289, "learning_rate": 6.569418287155915e-06, "loss": 0.518, "mean_token_accuracy": 0.8407026007771492, "num_tokens": 57491220.0, "step": 47840 }, { "entropy": 1.906779670715332, "epoch": 0.14833081336279283, "grad_norm": 9.097919464111328, "learning_rate": 6.568731777347978e-06, "loss": 0.5183, "mean_token_accuracy": 0.8338428869843483, "num_tokens": 57503621.0, "step": 47850 }, { "entropy": 1.8580841287970542, "epoch": 0.14836181248784253, "grad_norm": 9.157151222229004, "learning_rate": 6.568045482717649e-06, "loss": 0.4643, "mean_token_accuracy": 0.8528664171695709, "num_tokens": 57515610.0, "step": 47860 }, { "entropy": 1.839845283329487, "epoch": 0.14839281161289222, "grad_norm": 4.290517807006836, "learning_rate": 6.567359403152542e-06, "loss": 0.4947, "mean_token_accuracy": 0.8425501629710197, "num_tokens": 57528868.0, "step": 47870 }, { "entropy": 1.8454848155379295, "epoch": 0.14842381073794192, "grad_norm": 4.269301891326904, "learning_rate": 6.566673538540357e-06, "loss": 0.4629, "mean_token_accuracy": 0.8409277200698853, "num_tokens": 57542104.0, "step": 47880 }, { "entropy": 1.8853970304131509, "epoch": 0.14845480986299162, "grad_norm": 8.715253829956055, "learning_rate": 6.5659878887688726e-06, "loss": 0.4744, "mean_token_accuracy": 0.8417058885097504, "num_tokens": 57555226.0, "step": 47890 }, { "entropy": 1.905742235481739, "epoch": 0.1484858089880413, "grad_norm": 8.63672161102295, "learning_rate": 6.56530245372595e-06, "loss": 0.5077, "mean_token_accuracy": 0.8448764130473136, "num_tokens": 57567249.0, "step": 47900 }, { "entropy": 1.9830147728323937, "epoch": 0.148516808113091, "grad_norm": 7.22813606262207, "learning_rate": 6.564617233299536e-06, "loss": 0.5724, "mean_token_accuracy": 0.8194261863827705, "num_tokens": 57579388.0, "step": 47910 }, { "entropy": 1.769668859243393, "epoch": 0.1485478072381407, "grad_norm": 4.5545125007629395, "learning_rate": 6.563932227377654e-06, "loss": 0.4876, "mean_token_accuracy": 0.8449043944478035, "num_tokens": 57594365.0, "step": 47920 }, { "entropy": 1.802185168862343, "epoch": 0.1485788063631904, "grad_norm": 10.426836013793945, "learning_rate": 6.563247435848412e-06, "loss": 0.4355, "mean_token_accuracy": 0.8285350769758224, "num_tokens": 57608164.0, "step": 47930 }, { "entropy": 1.88195166811347, "epoch": 0.1486098054882401, "grad_norm": 7.976967811584473, "learning_rate": 6.5625628586e-06, "loss": 0.5016, "mean_token_accuracy": 0.8379166424274445, "num_tokens": 57621398.0, "step": 47940 }, { "entropy": 1.8815200373530387, "epoch": 0.1486408046132898, "grad_norm": 4.0617876052856445, "learning_rate": 6.561878495520689e-06, "loss": 0.5044, "mean_token_accuracy": 0.8357270896434784, "num_tokens": 57633368.0, "step": 47950 }, { "entropy": 1.8966475576162338, "epoch": 0.1486718037383395, "grad_norm": 7.131133079528809, "learning_rate": 6.56119434649883e-06, "loss": 0.5513, "mean_token_accuracy": 0.8241571202874184, "num_tokens": 57645429.0, "step": 47960 }, { "entropy": 1.970197968184948, "epoch": 0.1487028028633892, "grad_norm": 8.698470115661621, "learning_rate": 6.5605104114228565e-06, "loss": 0.5695, "mean_token_accuracy": 0.8253947734832764, "num_tokens": 57656725.0, "step": 47970 }, { "entropy": 1.8682085782289506, "epoch": 0.14873380198843889, "grad_norm": 7.869755744934082, "learning_rate": 6.5598266901812866e-06, "loss": 0.4861, "mean_token_accuracy": 0.8386846721172333, "num_tokens": 57668875.0, "step": 47980 }, { "entropy": 1.9324923619627952, "epoch": 0.14876480111348858, "grad_norm": 8.918335914611816, "learning_rate": 6.559143182662716e-06, "loss": 0.5732, "mean_token_accuracy": 0.8326233476400375, "num_tokens": 57680511.0, "step": 47990 }, { "entropy": 1.8512993976473808, "epoch": 0.14879580023853828, "grad_norm": 3.6257503032684326, "learning_rate": 6.55845988875582e-06, "loss": 0.4992, "mean_token_accuracy": 0.8437419295310974, "num_tokens": 57693483.0, "step": 48000 }, { "entropy": 1.877199736237526, "epoch": 0.14882679936358797, "grad_norm": 4.235811233520508, "learning_rate": 6.557776808349361e-06, "loss": 0.48, "mean_token_accuracy": 0.8381939142942428, "num_tokens": 57705721.0, "step": 48010 }, { "entropy": 1.969959369301796, "epoch": 0.14885779848863767, "grad_norm": 8.612361907958984, "learning_rate": 6.557093941332177e-06, "loss": 0.594, "mean_token_accuracy": 0.8242570266127587, "num_tokens": 57716430.0, "step": 48020 }, { "entropy": 1.9459771722555161, "epoch": 0.14888879761368734, "grad_norm": 4.5910749435424805, "learning_rate": 6.556411287593189e-06, "loss": 0.5514, "mean_token_accuracy": 0.824884532392025, "num_tokens": 57728220.0, "step": 48030 }, { "entropy": 1.8936843484640122, "epoch": 0.14891979673873704, "grad_norm": 9.532984733581543, "learning_rate": 6.5557288470214e-06, "loss": 0.5743, "mean_token_accuracy": 0.8205350682139396, "num_tokens": 57740471.0, "step": 48040 }, { "entropy": 1.8339029610157014, "epoch": 0.14895079586378673, "grad_norm": 3.72184157371521, "learning_rate": 6.555046619505892e-06, "loss": 0.5027, "mean_token_accuracy": 0.8391190290451049, "num_tokens": 57752802.0, "step": 48050 }, { "entropy": 1.9061024576425551, "epoch": 0.14898179498883643, "grad_norm": 7.634225845336914, "learning_rate": 6.554364604935828e-06, "loss": 0.5458, "mean_token_accuracy": 0.8316650420427323, "num_tokens": 57764533.0, "step": 48060 }, { "entropy": 1.9208767369389534, "epoch": 0.14901279411388613, "grad_norm": 9.036537170410156, "learning_rate": 6.5536828032004554e-06, "loss": 0.5611, "mean_token_accuracy": 0.831520353257656, "num_tokens": 57776014.0, "step": 48070 }, { "entropy": 1.8474348559975624, "epoch": 0.14904379323893582, "grad_norm": 4.276791095733643, "learning_rate": 6.553001214189095e-06, "loss": 0.507, "mean_token_accuracy": 0.8321506321430207, "num_tokens": 57788876.0, "step": 48080 }, { "entropy": 1.8484969601035117, "epoch": 0.14907479236398552, "grad_norm": 9.312860488891602, "learning_rate": 6.552319837791156e-06, "loss": 0.5018, "mean_token_accuracy": 0.839774203300476, "num_tokens": 57801080.0, "step": 48090 }, { "entropy": 1.8271625474095345, "epoch": 0.14910579148903522, "grad_norm": 3.9336740970611572, "learning_rate": 6.551638673896124e-06, "loss": 0.4867, "mean_token_accuracy": 0.8387017279863358, "num_tokens": 57813838.0, "step": 48100 }, { "entropy": 2.0055576503276824, "epoch": 0.1491367906140849, "grad_norm": 8.840508460998535, "learning_rate": 6.550957722393561e-06, "loss": 0.602, "mean_token_accuracy": 0.823385763168335, "num_tokens": 57824304.0, "step": 48110 }, { "entropy": 1.8011847533285619, "epoch": 0.1491677897391346, "grad_norm": 7.309523582458496, "learning_rate": 6.55027698317312e-06, "loss": 0.447, "mean_token_accuracy": 0.8421018213033676, "num_tokens": 57838533.0, "step": 48120 }, { "entropy": 1.8618897780776025, "epoch": 0.1491987888641843, "grad_norm": 8.072103500366211, "learning_rate": 6.549596456124524e-06, "loss": 0.5082, "mean_token_accuracy": 0.8348542749881744, "num_tokens": 57850689.0, "step": 48130 }, { "entropy": 1.9547663122415542, "epoch": 0.149229787989234, "grad_norm": 7.6012349128723145, "learning_rate": 6.548916141137581e-06, "loss": 0.5302, "mean_token_accuracy": 0.8345204189419746, "num_tokens": 57861907.0, "step": 48140 }, { "entropy": 1.9206694543361664, "epoch": 0.1492607871142837, "grad_norm": 9.59740924835205, "learning_rate": 6.548236038102178e-06, "loss": 0.4998, "mean_token_accuracy": 0.8426736682653427, "num_tokens": 57873874.0, "step": 48150 }, { "entropy": 1.9666424721479416, "epoch": 0.1492917862393334, "grad_norm": 9.023880958557129, "learning_rate": 6.547556146908285e-06, "loss": 0.5622, "mean_token_accuracy": 0.8249856010079384, "num_tokens": 57885436.0, "step": 48160 }, { "entropy": 1.926368948817253, "epoch": 0.1493227853643831, "grad_norm": 9.346840858459473, "learning_rate": 6.546876467445947e-06, "loss": 0.6086, "mean_token_accuracy": 0.8210124105215073, "num_tokens": 57896563.0, "step": 48170 }, { "entropy": 1.967759844660759, "epoch": 0.1493537844894328, "grad_norm": 8.98388385772705, "learning_rate": 6.546196999605291e-06, "loss": 0.5438, "mean_token_accuracy": 0.8355759829282761, "num_tokens": 57907605.0, "step": 48180 }, { "entropy": 1.9306065008044242, "epoch": 0.14938478361448249, "grad_norm": 6.92803955078125, "learning_rate": 6.545517743276522e-06, "loss": 0.5503, "mean_token_accuracy": 0.8288378581404686, "num_tokens": 57918939.0, "step": 48190 }, { "entropy": 1.871036571264267, "epoch": 0.14941578273953218, "grad_norm": 5.162753582000732, "learning_rate": 6.544838698349932e-06, "loss": 0.5035, "mean_token_accuracy": 0.8356782793998718, "num_tokens": 57931205.0, "step": 48200 }, { "entropy": 1.8842888280749321, "epoch": 0.14944678186458188, "grad_norm": 9.374017715454102, "learning_rate": 6.5441598647158835e-06, "loss": 0.5492, "mean_token_accuracy": 0.8327220484614373, "num_tokens": 57942778.0, "step": 48210 }, { "entropy": 1.8962712571024896, "epoch": 0.14947778098963158, "grad_norm": 8.113261222839355, "learning_rate": 6.543481242264823e-06, "loss": 0.5285, "mean_token_accuracy": 0.8363337010145188, "num_tokens": 57955128.0, "step": 48220 }, { "entropy": 1.9268214851617813, "epoch": 0.14950878011468127, "grad_norm": 9.561257362365723, "learning_rate": 6.542802830887277e-06, "loss": 0.5674, "mean_token_accuracy": 0.8345524996519089, "num_tokens": 57966427.0, "step": 48230 }, { "entropy": 1.8193738594651223, "epoch": 0.14953977923973097, "grad_norm": 7.488965034484863, "learning_rate": 6.542124630473848e-06, "loss": 0.4613, "mean_token_accuracy": 0.8413304805755615, "num_tokens": 57980642.0, "step": 48240 }, { "entropy": 1.8412556283175945, "epoch": 0.14957077836478067, "grad_norm": 4.181149959564209, "learning_rate": 6.541446640915224e-06, "loss": 0.4584, "mean_token_accuracy": 0.840462064743042, "num_tokens": 57994002.0, "step": 48250 }, { "entropy": 1.8988297596573829, "epoch": 0.14960177748983036, "grad_norm": 7.6107025146484375, "learning_rate": 6.540768862102166e-06, "loss": 0.4761, "mean_token_accuracy": 0.8412007540464401, "num_tokens": 58005701.0, "step": 48260 }, { "entropy": 1.860750602185726, "epoch": 0.14963277661488006, "grad_norm": 9.468915939331055, "learning_rate": 6.5400912939255156e-06, "loss": 0.4706, "mean_token_accuracy": 0.8450239017605782, "num_tokens": 58018246.0, "step": 48270 }, { "entropy": 1.9176729574799538, "epoch": 0.14966377573992973, "grad_norm": 5.558367729187012, "learning_rate": 6.5394139362761964e-06, "loss": 0.5402, "mean_token_accuracy": 0.8284741297364235, "num_tokens": 58030012.0, "step": 48280 }, { "entropy": 1.8357847198843955, "epoch": 0.14969477486497942, "grad_norm": 4.411746501922607, "learning_rate": 6.5387367890452105e-06, "loss": 0.4603, "mean_token_accuracy": 0.8405596747994423, "num_tokens": 58042890.0, "step": 48290 }, { "entropy": 1.8932993397116662, "epoch": 0.14972577399002912, "grad_norm": 8.260334014892578, "learning_rate": 6.538059852123636e-06, "loss": 0.4953, "mean_token_accuracy": 0.8374821558594704, "num_tokens": 58055225.0, "step": 48300 }, { "entropy": 1.7874238356947898, "epoch": 0.14975677311507882, "grad_norm": 10.80783462524414, "learning_rate": 6.537383125402632e-06, "loss": 0.4407, "mean_token_accuracy": 0.8481561884284019, "num_tokens": 58068456.0, "step": 48310 }, { "entropy": 1.9409461870789528, "epoch": 0.14978777224012851, "grad_norm": 8.024087905883789, "learning_rate": 6.536706608773437e-06, "loss": 0.5432, "mean_token_accuracy": 0.835419024527073, "num_tokens": 58079470.0, "step": 48320 }, { "entropy": 1.9292469948530198, "epoch": 0.1498187713651782, "grad_norm": 7.896714687347412, "learning_rate": 6.536030302127366e-06, "loss": 0.5654, "mean_token_accuracy": 0.8290838211774826, "num_tokens": 58090946.0, "step": 48330 }, { "entropy": 1.878405897319317, "epoch": 0.1498497704902279, "grad_norm": 11.065820693969727, "learning_rate": 6.535354205355815e-06, "loss": 0.5846, "mean_token_accuracy": 0.8164193764328956, "num_tokens": 58102826.0, "step": 48340 }, { "entropy": 1.9426892310380937, "epoch": 0.1498807696152776, "grad_norm": 8.384618759155273, "learning_rate": 6.534678318350258e-06, "loss": 0.5697, "mean_token_accuracy": 0.825680835545063, "num_tokens": 58113587.0, "step": 48350 }, { "entropy": 1.8836750328540801, "epoch": 0.1499117687403273, "grad_norm": 9.055339813232422, "learning_rate": 6.534002641002247e-06, "loss": 0.5524, "mean_token_accuracy": 0.8372475564479828, "num_tokens": 58125471.0, "step": 48360 }, { "entropy": 1.9558273077011108, "epoch": 0.149942767865377, "grad_norm": 8.898198127746582, "learning_rate": 6.533327173203413e-06, "loss": 0.5413, "mean_token_accuracy": 0.8336478710174561, "num_tokens": 58137376.0, "step": 48370 }, { "entropy": 1.9481865465641022, "epoch": 0.1499737669904267, "grad_norm": 10.591567993164062, "learning_rate": 6.532651914845465e-06, "loss": 0.5301, "mean_token_accuracy": 0.8392414048314094, "num_tokens": 58148589.0, "step": 48380 }, { "entropy": 1.8466496154665948, "epoch": 0.1500047661154764, "grad_norm": 9.16861629486084, "learning_rate": 6.531976865820191e-06, "loss": 0.5262, "mean_token_accuracy": 0.836252911388874, "num_tokens": 58160303.0, "step": 48390 }, { "entropy": 1.8699641510844232, "epoch": 0.1500357652405261, "grad_norm": 8.319191932678223, "learning_rate": 6.531302026019457e-06, "loss": 0.4918, "mean_token_accuracy": 0.8408505141735076, "num_tokens": 58172442.0, "step": 48400 }, { "entropy": 1.9123701184988022, "epoch": 0.15006676436557578, "grad_norm": 7.481304168701172, "learning_rate": 6.530627395335206e-06, "loss": 0.5081, "mean_token_accuracy": 0.83611781001091, "num_tokens": 58184383.0, "step": 48410 }, { "entropy": 1.9331264093518257, "epoch": 0.15009776349062548, "grad_norm": 8.620382308959961, "learning_rate": 6.529952973659459e-06, "loss": 0.5176, "mean_token_accuracy": 0.8453733563423157, "num_tokens": 58195683.0, "step": 48420 }, { "entropy": 1.8736334875226022, "epoch": 0.15012876261567518, "grad_norm": 7.600832939147949, "learning_rate": 6.52927876088432e-06, "loss": 0.5108, "mean_token_accuracy": 0.8397956430912018, "num_tokens": 58207322.0, "step": 48430 }, { "entropy": 1.887729911506176, "epoch": 0.15015976174072487, "grad_norm": 9.004258155822754, "learning_rate": 6.5286047569019626e-06, "loss": 0.5415, "mean_token_accuracy": 0.8307413533329964, "num_tokens": 58219063.0, "step": 48440 }, { "entropy": 1.8791867524385453, "epoch": 0.15019076086577457, "grad_norm": 8.612053871154785, "learning_rate": 6.5279309616046475e-06, "loss": 0.5384, "mean_token_accuracy": 0.8294597789645195, "num_tokens": 58230800.0, "step": 48450 }, { "entropy": 1.8117628186941146, "epoch": 0.15022175999082427, "grad_norm": 7.665522575378418, "learning_rate": 6.527257374884704e-06, "loss": 0.5374, "mean_token_accuracy": 0.8396688923239708, "num_tokens": 58243429.0, "step": 48460 }, { "entropy": 1.961036217212677, "epoch": 0.15025275911587396, "grad_norm": 9.192258834838867, "learning_rate": 6.5265839966345466e-06, "loss": 0.5953, "mean_token_accuracy": 0.819936765730381, "num_tokens": 58254745.0, "step": 48470 }, { "entropy": 1.671479968726635, "epoch": 0.15028375824092366, "grad_norm": 4.934950828552246, "learning_rate": 6.5259108267466635e-06, "loss": 0.4365, "mean_token_accuracy": 0.8538110002875328, "num_tokens": 58270043.0, "step": 48480 }, { "entropy": 1.9253578931093216, "epoch": 0.15031475736597336, "grad_norm": 8.208556175231934, "learning_rate": 6.525237865113621e-06, "loss": 0.5404, "mean_token_accuracy": 0.8374222457408905, "num_tokens": 58281799.0, "step": 48490 }, { "entropy": 1.8729837134480476, "epoch": 0.15034575649102305, "grad_norm": 9.998933792114258, "learning_rate": 6.524565111628065e-06, "loss": 0.5488, "mean_token_accuracy": 0.8276918828487396, "num_tokens": 58293623.0, "step": 48500 }, { "entropy": 1.7981028646230697, "epoch": 0.15037675561607275, "grad_norm": 5.204981327056885, "learning_rate": 6.523892566182717e-06, "loss": 0.4219, "mean_token_accuracy": 0.8532559484243393, "num_tokens": 58306840.0, "step": 48510 }, { "entropy": 1.884388868510723, "epoch": 0.15040775474112242, "grad_norm": 9.6846342086792, "learning_rate": 6.523220228670375e-06, "loss": 0.5532, "mean_token_accuracy": 0.8274782180786133, "num_tokens": 58318898.0, "step": 48520 }, { "entropy": 1.9351076558232307, "epoch": 0.15043875386617211, "grad_norm": 8.531515121459961, "learning_rate": 6.522548098983917e-06, "loss": 0.5733, "mean_token_accuracy": 0.8250018388032914, "num_tokens": 58330362.0, "step": 48530 }, { "entropy": 1.8293303191661834, "epoch": 0.1504697529912218, "grad_norm": 9.502824783325195, "learning_rate": 6.521876177016295e-06, "loss": 0.4581, "mean_token_accuracy": 0.8435877069830895, "num_tokens": 58343242.0, "step": 48540 }, { "entropy": 1.890285351872444, "epoch": 0.1505007521162715, "grad_norm": 9.041160583496094, "learning_rate": 6.521204462660542e-06, "loss": 0.501, "mean_token_accuracy": 0.8346679538488389, "num_tokens": 58355866.0, "step": 48550 }, { "entropy": 1.8444904461503029, "epoch": 0.1505317512413212, "grad_norm": 3.8206639289855957, "learning_rate": 6.520532955809765e-06, "loss": 0.4764, "mean_token_accuracy": 0.8404908493161202, "num_tokens": 58368640.0, "step": 48560 }, { "entropy": 1.835204230248928, "epoch": 0.1505627503663709, "grad_norm": 8.419897079467773, "learning_rate": 6.5198616563571505e-06, "loss": 0.4638, "mean_token_accuracy": 0.8446251839399338, "num_tokens": 58381253.0, "step": 48570 }, { "entropy": 1.8946281239390372, "epoch": 0.1505937494914206, "grad_norm": 9.88506031036377, "learning_rate": 6.519190564195959e-06, "loss": 0.5422, "mean_token_accuracy": 0.8362071871757507, "num_tokens": 58393895.0, "step": 48580 }, { "entropy": 1.82159626185894, "epoch": 0.1506247486164703, "grad_norm": 3.63238787651062, "learning_rate": 6.518519679219528e-06, "loss": 0.4447, "mean_token_accuracy": 0.8525608211755753, "num_tokens": 58407125.0, "step": 48590 }, { "entropy": 1.8428357735276222, "epoch": 0.15065574774152, "grad_norm": 9.18499755859375, "learning_rate": 6.517849001321278e-06, "loss": 0.4695, "mean_token_accuracy": 0.8396696642041206, "num_tokens": 58420081.0, "step": 48600 }, { "entropy": 1.8873658314347268, "epoch": 0.1506867468665697, "grad_norm": 10.179847717285156, "learning_rate": 6.517178530394698e-06, "loss": 0.4804, "mean_token_accuracy": 0.8444565415382386, "num_tokens": 58432042.0, "step": 48610 }, { "entropy": 1.844616176187992, "epoch": 0.15071774599161938, "grad_norm": 3.9476633071899414, "learning_rate": 6.516508266333358e-06, "loss": 0.4762, "mean_token_accuracy": 0.844041819870472, "num_tokens": 58444698.0, "step": 48620 }, { "entropy": 1.8296439558267594, "epoch": 0.15074874511666908, "grad_norm": 9.040117263793945, "learning_rate": 6.5158382090309035e-06, "loss": 0.4992, "mean_token_accuracy": 0.8471789911389351, "num_tokens": 58456911.0, "step": 48630 }, { "entropy": 1.891487744450569, "epoch": 0.15077974424171878, "grad_norm": 8.067523956298828, "learning_rate": 6.515168358381057e-06, "loss": 0.5691, "mean_token_accuracy": 0.822045773267746, "num_tokens": 58468808.0, "step": 48640 }, { "entropy": 1.9148033902049064, "epoch": 0.15081074336676847, "grad_norm": 8.858370780944824, "learning_rate": 6.514498714277619e-06, "loss": 0.5551, "mean_token_accuracy": 0.8241749078035354, "num_tokens": 58480696.0, "step": 48650 }, { "entropy": 1.9157651707530021, "epoch": 0.15084174249181817, "grad_norm": 9.38106918334961, "learning_rate": 6.5138292766144615e-06, "loss": 0.54, "mean_token_accuracy": 0.8287394896149636, "num_tokens": 58492144.0, "step": 48660 }, { "entropy": 1.9842921257019044, "epoch": 0.15087274161686787, "grad_norm": 8.547111511230469, "learning_rate": 6.5131600452855394e-06, "loss": 0.5597, "mean_token_accuracy": 0.8356396153569221, "num_tokens": 58502452.0, "step": 48670 }, { "entropy": 1.972530573606491, "epoch": 0.15090374074191756, "grad_norm": 9.455718040466309, "learning_rate": 6.512491020184877e-06, "loss": 0.5894, "mean_token_accuracy": 0.8308357432484627, "num_tokens": 58513985.0, "step": 48680 }, { "entropy": 1.9101537883281707, "epoch": 0.15093473986696726, "grad_norm": 8.880309104919434, "learning_rate": 6.5118222012065825e-06, "loss": 0.5024, "mean_token_accuracy": 0.8405994296073913, "num_tokens": 58525140.0, "step": 48690 }, { "entropy": 1.946905219554901, "epoch": 0.15096573899201696, "grad_norm": 7.35371732711792, "learning_rate": 6.511153588244832e-06, "loss": 0.5481, "mean_token_accuracy": 0.8271761432290077, "num_tokens": 58536580.0, "step": 48700 }, { "entropy": 1.947824102640152, "epoch": 0.15099673811706665, "grad_norm": 9.245978355407715, "learning_rate": 6.510485181193884e-06, "loss": 0.6137, "mean_token_accuracy": 0.821645250916481, "num_tokens": 58548298.0, "step": 48710 }, { "entropy": 1.8544338196516037, "epoch": 0.15102773724211635, "grad_norm": 10.973575592041016, "learning_rate": 6.50981697994807e-06, "loss": 0.5094, "mean_token_accuracy": 0.8355946585536003, "num_tokens": 58560605.0, "step": 48720 }, { "entropy": 1.8731532111763953, "epoch": 0.15105873636716605, "grad_norm": 8.846532821655273, "learning_rate": 6.5091489844017984e-06, "loss": 0.5354, "mean_token_accuracy": 0.838229563832283, "num_tokens": 58573583.0, "step": 48730 }, { "entropy": 1.8979535773396492, "epoch": 0.15108973549221574, "grad_norm": 8.72420883178711, "learning_rate": 6.5084811944495515e-06, "loss": 0.5351, "mean_token_accuracy": 0.8275813281536102, "num_tokens": 58585343.0, "step": 48740 }, { "entropy": 1.9084653094410897, "epoch": 0.15112073461726544, "grad_norm": 8.924386978149414, "learning_rate": 6.50781360998589e-06, "loss": 0.5136, "mean_token_accuracy": 0.8385712206363678, "num_tokens": 58596594.0, "step": 48750 }, { "entropy": 1.9596381425857543, "epoch": 0.15115173374231514, "grad_norm": 8.702342987060547, "learning_rate": 6.50714623090545e-06, "loss": 0.5923, "mean_token_accuracy": 0.8301157906651497, "num_tokens": 58607471.0, "step": 48760 }, { "entropy": 1.8487385302782058, "epoch": 0.1511827328673648, "grad_norm": 8.954679489135742, "learning_rate": 6.506479057102942e-06, "loss": 0.5111, "mean_token_accuracy": 0.8388300389051437, "num_tokens": 58619562.0, "step": 48770 }, { "entropy": 1.910484978556633, "epoch": 0.1512137319924145, "grad_norm": 9.405644416809082, "learning_rate": 6.505812088473151e-06, "loss": 0.5814, "mean_token_accuracy": 0.8201380014419556, "num_tokens": 58630856.0, "step": 48780 }, { "entropy": 1.816239494085312, "epoch": 0.1512447311174642, "grad_norm": 4.988278388977051, "learning_rate": 6.505145324910941e-06, "loss": 0.4222, "mean_token_accuracy": 0.8467860907316208, "num_tokens": 58643774.0, "step": 48790 }, { "entropy": 1.889238955080509, "epoch": 0.1512757302425139, "grad_norm": 8.861750602722168, "learning_rate": 6.504478766311248e-06, "loss": 0.5108, "mean_token_accuracy": 0.8347546219825744, "num_tokens": 58655783.0, "step": 48800 }, { "entropy": 1.9085273638367652, "epoch": 0.1513067293675636, "grad_norm": 9.441707611083984, "learning_rate": 6.503812412569084e-06, "loss": 0.523, "mean_token_accuracy": 0.8369966968894005, "num_tokens": 58667819.0, "step": 48810 }, { "entropy": 1.9685264229774475, "epoch": 0.1513377284926133, "grad_norm": 10.141180038452148, "learning_rate": 6.503146263579539e-06, "loss": 0.5336, "mean_token_accuracy": 0.8229795888066291, "num_tokens": 58679528.0, "step": 48820 }, { "entropy": 1.8758042559027672, "epoch": 0.15136872761766298, "grad_norm": 11.18869400024414, "learning_rate": 6.502480319237775e-06, "loss": 0.5305, "mean_token_accuracy": 0.8300367653369903, "num_tokens": 58691911.0, "step": 48830 }, { "entropy": 1.8802335485816002, "epoch": 0.15139972674271268, "grad_norm": 8.065637588500977, "learning_rate": 6.5018145794390305e-06, "loss": 0.501, "mean_token_accuracy": 0.8387890234589577, "num_tokens": 58704390.0, "step": 48840 }, { "entropy": 1.8963167145848274, "epoch": 0.15143072586776238, "grad_norm": 5.054469585418701, "learning_rate": 6.501149044078618e-06, "loss": 0.5604, "mean_token_accuracy": 0.8216091677546501, "num_tokens": 58717245.0, "step": 48850 }, { "entropy": 1.8458565592765808, "epoch": 0.15146172499281207, "grad_norm": 4.077347278594971, "learning_rate": 6.500483713051927e-06, "loss": 0.4924, "mean_token_accuracy": 0.8441688820719719, "num_tokens": 58729499.0, "step": 48860 }, { "entropy": 1.9472675323486328, "epoch": 0.15149272411786177, "grad_norm": 8.823904991149902, "learning_rate": 6.499818586254422e-06, "loss": 0.513, "mean_token_accuracy": 0.8328737184405327, "num_tokens": 58741160.0, "step": 48870 }, { "entropy": 1.9384382277727128, "epoch": 0.15152372324291147, "grad_norm": 8.922835350036621, "learning_rate": 6.499153663581638e-06, "loss": 0.49, "mean_token_accuracy": 0.8393163681030273, "num_tokens": 58752947.0, "step": 48880 }, { "entropy": 1.9302797958254814, "epoch": 0.15155472236796116, "grad_norm": 8.566751480102539, "learning_rate": 6.49848894492919e-06, "loss": 0.5269, "mean_token_accuracy": 0.8295559108257293, "num_tokens": 58765340.0, "step": 48890 }, { "entropy": 1.8543498650193215, "epoch": 0.15158572149301086, "grad_norm": 8.253662109375, "learning_rate": 6.497824430192765e-06, "loss": 0.4847, "mean_token_accuracy": 0.8355495229363441, "num_tokens": 58778000.0, "step": 48900 }, { "entropy": 1.874028617143631, "epoch": 0.15161672061806056, "grad_norm": 5.289228916168213, "learning_rate": 6.497160119268126e-06, "loss": 0.4513, "mean_token_accuracy": 0.8463690742850304, "num_tokens": 58791118.0, "step": 48910 }, { "entropy": 1.9725682616233826, "epoch": 0.15164771974311025, "grad_norm": 8.66529655456543, "learning_rate": 6.496496012051109e-06, "loss": 0.6059, "mean_token_accuracy": 0.8233085095882415, "num_tokens": 58802602.0, "step": 48920 }, { "entropy": 1.9239532873034477, "epoch": 0.15167871886815995, "grad_norm": 9.758983612060547, "learning_rate": 6.495832108437626e-06, "loss": 0.479, "mean_token_accuracy": 0.8342737898230552, "num_tokens": 58814927.0, "step": 48930 }, { "entropy": 1.846892774105072, "epoch": 0.15170971799320965, "grad_norm": 4.085844993591309, "learning_rate": 6.495168408323665e-06, "loss": 0.4897, "mean_token_accuracy": 0.8468069404363632, "num_tokens": 58827118.0, "step": 48940 }, { "entropy": 1.9423293888568878, "epoch": 0.15174071711825934, "grad_norm": 9.48098373413086, "learning_rate": 6.4945049116052795e-06, "loss": 0.5816, "mean_token_accuracy": 0.821548655629158, "num_tokens": 58838717.0, "step": 48950 }, { "entropy": 1.9620625630021096, "epoch": 0.15177171624330904, "grad_norm": 7.364303112030029, "learning_rate": 6.493841618178611e-06, "loss": 0.5329, "mean_token_accuracy": 0.8329650431871414, "num_tokens": 58850165.0, "step": 48960 }, { "entropy": 1.7987186387181282, "epoch": 0.15180271536835874, "grad_norm": 8.708568572998047, "learning_rate": 6.4931785279398666e-06, "loss": 0.4112, "mean_token_accuracy": 0.8545974805951119, "num_tokens": 58863232.0, "step": 48970 }, { "entropy": 1.92343827188015, "epoch": 0.15183371449340843, "grad_norm": 8.546026229858398, "learning_rate": 6.4925156407853275e-06, "loss": 0.5071, "mean_token_accuracy": 0.8306387692689896, "num_tokens": 58874670.0, "step": 48980 }, { "entropy": 1.8594204097986222, "epoch": 0.15186471361845813, "grad_norm": 3.8005247116088867, "learning_rate": 6.491852956611351e-06, "loss": 0.5176, "mean_token_accuracy": 0.8334858074784279, "num_tokens": 58887318.0, "step": 48990 }, { "entropy": 1.939525455236435, "epoch": 0.15189571274350783, "grad_norm": 7.96549129486084, "learning_rate": 6.4911904753143696e-06, "loss": 0.5349, "mean_token_accuracy": 0.8367563143372536, "num_tokens": 58898840.0, "step": 49000 }, { "entropy": 1.8316703870892526, "epoch": 0.15192671186855752, "grad_norm": 7.305845737457275, "learning_rate": 6.490528196790886e-06, "loss": 0.5014, "mean_token_accuracy": 0.8301277205348014, "num_tokens": 58911661.0, "step": 49010 }, { "entropy": 1.808566428720951, "epoch": 0.1519577109936072, "grad_norm": 7.7132792472839355, "learning_rate": 6.489866120937483e-06, "loss": 0.4316, "mean_token_accuracy": 0.851980808377266, "num_tokens": 58924530.0, "step": 49020 }, { "entropy": 1.9115712746977807, "epoch": 0.1519887101186569, "grad_norm": 8.088088989257812, "learning_rate": 6.489204247650809e-06, "loss": 0.5556, "mean_token_accuracy": 0.832783767580986, "num_tokens": 58936710.0, "step": 49030 }, { "entropy": 1.7950451903045177, "epoch": 0.15201970924370659, "grad_norm": 7.494925022125244, "learning_rate": 6.4885425768275945e-06, "loss": 0.4237, "mean_token_accuracy": 0.8493603855371475, "num_tokens": 58950735.0, "step": 49040 }, { "entropy": 1.8894059650599957, "epoch": 0.15205070836875628, "grad_norm": 7.540571212768555, "learning_rate": 6.487881108364637e-06, "loss": 0.5165, "mean_token_accuracy": 0.835996463894844, "num_tokens": 58962971.0, "step": 49050 }, { "entropy": 1.7538513764739037, "epoch": 0.15208170749380598, "grad_norm": 4.750351905822754, "learning_rate": 6.487219842158812e-06, "loss": 0.3752, "mean_token_accuracy": 0.8489253923296929, "num_tokens": 58976766.0, "step": 49060 }, { "entropy": 1.7878063380718232, "epoch": 0.15211270661885568, "grad_norm": 3.8629848957061768, "learning_rate": 6.486558778107066e-06, "loss": 0.4389, "mean_token_accuracy": 0.8460006847977638, "num_tokens": 58990175.0, "step": 49070 }, { "entropy": 1.8979044020175935, "epoch": 0.15214370574390537, "grad_norm": 10.183228492736816, "learning_rate": 6.485897916106419e-06, "loss": 0.5256, "mean_token_accuracy": 0.8209685668349266, "num_tokens": 59002434.0, "step": 49080 }, { "entropy": 1.936001867055893, "epoch": 0.15217470486895507, "grad_norm": 10.987131118774414, "learning_rate": 6.485237256053968e-06, "loss": 0.5699, "mean_token_accuracy": 0.8229846864938736, "num_tokens": 59013865.0, "step": 49090 }, { "entropy": 1.7930354595184326, "epoch": 0.15220570399400477, "grad_norm": 8.664037704467773, "learning_rate": 6.484576797846879e-06, "loss": 0.4632, "mean_token_accuracy": 0.8444106802344322, "num_tokens": 59026905.0, "step": 49100 }, { "entropy": 1.848856683075428, "epoch": 0.15223670311905446, "grad_norm": 8.302072525024414, "learning_rate": 6.4839165413823935e-06, "loss": 0.551, "mean_token_accuracy": 0.8347058981657028, "num_tokens": 59039328.0, "step": 49110 }, { "entropy": 1.754729336500168, "epoch": 0.15226770224410416, "grad_norm": 8.616592407226562, "learning_rate": 6.483256486557824e-06, "loss": 0.4711, "mean_token_accuracy": 0.8488777309656144, "num_tokens": 59052355.0, "step": 49120 }, { "entropy": 1.854713924229145, "epoch": 0.15229870136915385, "grad_norm": 10.268636703491211, "learning_rate": 6.482596633270561e-06, "loss": 0.5598, "mean_token_accuracy": 0.8265779912471771, "num_tokens": 59063971.0, "step": 49130 }, { "entropy": 1.860026153922081, "epoch": 0.15232970049420355, "grad_norm": 7.569554805755615, "learning_rate": 6.481936981418064e-06, "loss": 0.5103, "mean_token_accuracy": 0.8441597208380699, "num_tokens": 59075534.0, "step": 49140 }, { "entropy": 1.9424165233969688, "epoch": 0.15236069961925325, "grad_norm": 7.921428680419922, "learning_rate": 6.481277530897865e-06, "loss": 0.603, "mean_token_accuracy": 0.8199049532413483, "num_tokens": 59086811.0, "step": 49150 }, { "entropy": 1.862628909945488, "epoch": 0.15239169874430294, "grad_norm": 7.977451801300049, "learning_rate": 6.480618281607572e-06, "loss": 0.494, "mean_token_accuracy": 0.8402055114507675, "num_tokens": 59099011.0, "step": 49160 }, { "entropy": 1.8622022837400436, "epoch": 0.15242269786935264, "grad_norm": 8.97208309173584, "learning_rate": 6.479959233444862e-06, "loss": 0.5266, "mean_token_accuracy": 0.8342375844717026, "num_tokens": 59110548.0, "step": 49170 }, { "entropy": 1.8860869765281678, "epoch": 0.15245369699440234, "grad_norm": 8.571884155273438, "learning_rate": 6.47930038630749e-06, "loss": 0.5412, "mean_token_accuracy": 0.8293202951550483, "num_tokens": 59121640.0, "step": 49180 }, { "entropy": 1.8871512919664384, "epoch": 0.15248469611945203, "grad_norm": 8.496338844299316, "learning_rate": 6.478641740093281e-06, "loss": 0.5286, "mean_token_accuracy": 0.8324317663908005, "num_tokens": 59133528.0, "step": 49190 }, { "entropy": 1.9237959653139114, "epoch": 0.15251569524450173, "grad_norm": 7.444377422332764, "learning_rate": 6.4779832947001306e-06, "loss": 0.5512, "mean_token_accuracy": 0.8336405113339425, "num_tokens": 59144958.0, "step": 49200 }, { "entropy": 1.9096000641584396, "epoch": 0.15254669436955143, "grad_norm": 10.638242721557617, "learning_rate": 6.47732505002601e-06, "loss": 0.5257, "mean_token_accuracy": 0.836242513358593, "num_tokens": 59155880.0, "step": 49210 }, { "entropy": 1.895295462012291, "epoch": 0.15257769349460112, "grad_norm": 8.52206802368164, "learning_rate": 6.4766670059689615e-06, "loss": 0.5051, "mean_token_accuracy": 0.834420631825924, "num_tokens": 59168200.0, "step": 49220 }, { "entropy": 1.8659065082669257, "epoch": 0.15260869261965082, "grad_norm": 8.12820816040039, "learning_rate": 6.476009162427102e-06, "loss": 0.6056, "mean_token_accuracy": 0.8158292233943939, "num_tokens": 59180782.0, "step": 49230 }, { "entropy": 1.8979418486356736, "epoch": 0.15263969174470052, "grad_norm": 10.203886985778809, "learning_rate": 6.475351519298617e-06, "loss": 0.5562, "mean_token_accuracy": 0.836780446767807, "num_tokens": 59192007.0, "step": 49240 }, { "entropy": 1.7612644746899604, "epoch": 0.15267069086975021, "grad_norm": 7.155702114105225, "learning_rate": 6.474694076481769e-06, "loss": 0.4834, "mean_token_accuracy": 0.8396282330155372, "num_tokens": 59205137.0, "step": 49250 }, { "entropy": 1.8855042546987533, "epoch": 0.15270168999479988, "grad_norm": 10.510095596313477, "learning_rate": 6.474036833874888e-06, "loss": 0.5486, "mean_token_accuracy": 0.8316191598773003, "num_tokens": 59216398.0, "step": 49260 }, { "entropy": 1.9063111320137978, "epoch": 0.15273268911984958, "grad_norm": 8.204238891601562, "learning_rate": 6.4733797913763806e-06, "loss": 0.5644, "mean_token_accuracy": 0.8286448165774345, "num_tokens": 59228148.0, "step": 49270 }, { "entropy": 1.8491440996527673, "epoch": 0.15276368824489928, "grad_norm": 4.404919147491455, "learning_rate": 6.472722948884723e-06, "loss": 0.4648, "mean_token_accuracy": 0.8485887929797172, "num_tokens": 59240379.0, "step": 49280 }, { "entropy": 1.8710645034909248, "epoch": 0.15279468736994897, "grad_norm": 4.196066856384277, "learning_rate": 6.472066306298462e-06, "loss": 0.5844, "mean_token_accuracy": 0.821666119992733, "num_tokens": 59251646.0, "step": 49290 }, { "entropy": 1.8651590749621392, "epoch": 0.15282568649499867, "grad_norm": 9.875333786010742, "learning_rate": 6.471409863516221e-06, "loss": 0.5306, "mean_token_accuracy": 0.8275208979845047, "num_tokens": 59263652.0, "step": 49300 }, { "entropy": 1.9215037196874618, "epoch": 0.15285668562004837, "grad_norm": 9.746886253356934, "learning_rate": 6.470753620436694e-06, "loss": 0.5378, "mean_token_accuracy": 0.8245725408196449, "num_tokens": 59275186.0, "step": 49310 }, { "entropy": 1.9342552363872527, "epoch": 0.15288768474509806, "grad_norm": 8.793591499328613, "learning_rate": 6.470097576958641e-06, "loss": 0.5609, "mean_token_accuracy": 0.8296951711177826, "num_tokens": 59286058.0, "step": 49320 }, { "entropy": 1.8291342303156852, "epoch": 0.15291868387014776, "grad_norm": 8.690282821655273, "learning_rate": 6.469441732980904e-06, "loss": 0.4498, "mean_token_accuracy": 0.8339086413383484, "num_tokens": 59298932.0, "step": 49330 }, { "entropy": 1.8255377933382988, "epoch": 0.15294968299519746, "grad_norm": 10.050409317016602, "learning_rate": 6.468786088402388e-06, "loss": 0.4545, "mean_token_accuracy": 0.8509842753410339, "num_tokens": 59311303.0, "step": 49340 }, { "entropy": 1.871800681948662, "epoch": 0.15298068212024715, "grad_norm": 8.856807708740234, "learning_rate": 6.468130643122074e-06, "loss": 0.5604, "mean_token_accuracy": 0.8348386570811271, "num_tokens": 59322970.0, "step": 49350 }, { "entropy": 1.8413175642490387, "epoch": 0.15301168124529685, "grad_norm": 10.395611763000488, "learning_rate": 6.4674753970390126e-06, "loss": 0.5536, "mean_token_accuracy": 0.825497391819954, "num_tokens": 59335548.0, "step": 49360 }, { "entropy": 1.9431413426995277, "epoch": 0.15304268037034655, "grad_norm": 8.011473655700684, "learning_rate": 6.46682035005233e-06, "loss": 0.6076, "mean_token_accuracy": 0.8263207510113716, "num_tokens": 59347158.0, "step": 49370 }, { "entropy": 1.8601020842790603, "epoch": 0.15307367949539624, "grad_norm": 10.249993324279785, "learning_rate": 6.466165502061217e-06, "loss": 0.5366, "mean_token_accuracy": 0.833003306388855, "num_tokens": 59359361.0, "step": 49380 }, { "entropy": 1.8623602211475372, "epoch": 0.15310467862044594, "grad_norm": 9.118306159973145, "learning_rate": 6.465510852964943e-06, "loss": 0.5088, "mean_token_accuracy": 0.8340568155050277, "num_tokens": 59371827.0, "step": 49390 }, { "entropy": 1.9150095269083978, "epoch": 0.15313567774549564, "grad_norm": 8.470681190490723, "learning_rate": 6.464856402662844e-06, "loss": 0.5721, "mean_token_accuracy": 0.8367661386728287, "num_tokens": 59383425.0, "step": 49400 }, { "entropy": 1.9471410617232323, "epoch": 0.15316667687054533, "grad_norm": 3.6562201976776123, "learning_rate": 6.4642021510543284e-06, "loss": 0.5508, "mean_token_accuracy": 0.8367534473538398, "num_tokens": 59394778.0, "step": 49410 }, { "entropy": 1.831431895494461, "epoch": 0.15319767599559503, "grad_norm": 7.588027000427246, "learning_rate": 6.463548098038879e-06, "loss": 0.4789, "mean_token_accuracy": 0.8452267602086068, "num_tokens": 59407172.0, "step": 49420 }, { "entropy": 1.8925320595502853, "epoch": 0.15322867512064473, "grad_norm": 8.88277816772461, "learning_rate": 6.462894243516044e-06, "loss": 0.5003, "mean_token_accuracy": 0.8345058739185334, "num_tokens": 59418806.0, "step": 49430 }, { "entropy": 1.8566480353474617, "epoch": 0.15325967424569442, "grad_norm": 4.175265789031982, "learning_rate": 6.462240587385448e-06, "loss": 0.5153, "mean_token_accuracy": 0.8374285340309143, "num_tokens": 59430897.0, "step": 49440 }, { "entropy": 1.8792894035577774, "epoch": 0.15329067337074412, "grad_norm": 11.97266674041748, "learning_rate": 6.461587129546784e-06, "loss": 0.5335, "mean_token_accuracy": 0.8309213414788246, "num_tokens": 59442625.0, "step": 49450 }, { "entropy": 1.8707554250955583, "epoch": 0.15332167249579381, "grad_norm": 9.689730644226074, "learning_rate": 6.460933869899815e-06, "loss": 0.5504, "mean_token_accuracy": 0.8266577154397965, "num_tokens": 59454969.0, "step": 49460 }, { "entropy": 1.912343481183052, "epoch": 0.1533526716208435, "grad_norm": 9.543248176574707, "learning_rate": 6.460280808344378e-06, "loss": 0.5702, "mean_token_accuracy": 0.8356236189603805, "num_tokens": 59467285.0, "step": 49470 }, { "entropy": 1.8799829974770546, "epoch": 0.1533836707458932, "grad_norm": 7.834064960479736, "learning_rate": 6.459627944780378e-06, "loss": 0.5296, "mean_token_accuracy": 0.8366628587245941, "num_tokens": 59480058.0, "step": 49480 }, { "entropy": 1.8075303509831429, "epoch": 0.1534146698709429, "grad_norm": 3.9849984645843506, "learning_rate": 6.458975279107794e-06, "loss": 0.5013, "mean_token_accuracy": 0.8453790143132209, "num_tokens": 59493616.0, "step": 49490 }, { "entropy": 1.8907064393162727, "epoch": 0.1534456689959926, "grad_norm": 8.766815185546875, "learning_rate": 6.458322811226673e-06, "loss": 0.5133, "mean_token_accuracy": 0.838210554420948, "num_tokens": 59506402.0, "step": 49500 }, { "entropy": 1.8980057820677758, "epoch": 0.15347666812104227, "grad_norm": 9.522225379943848, "learning_rate": 6.457670541037133e-06, "loss": 0.522, "mean_token_accuracy": 0.8347028121352196, "num_tokens": 59518065.0, "step": 49510 }, { "entropy": 1.9182281613349914, "epoch": 0.15350766724609197, "grad_norm": 8.175332069396973, "learning_rate": 6.457018468439363e-06, "loss": 0.5776, "mean_token_accuracy": 0.8285543143749237, "num_tokens": 59528843.0, "step": 49520 }, { "entropy": 1.8754586443305015, "epoch": 0.15353866637114166, "grad_norm": 8.504158020019531, "learning_rate": 6.456366593333622e-06, "loss": 0.5776, "mean_token_accuracy": 0.8261403411626815, "num_tokens": 59541901.0, "step": 49530 }, { "entropy": 1.816955418884754, "epoch": 0.15356966549619136, "grad_norm": 8.501500129699707, "learning_rate": 6.455714915620241e-06, "loss": 0.4729, "mean_token_accuracy": 0.840816356241703, "num_tokens": 59555193.0, "step": 49540 }, { "entropy": 1.8059712588787078, "epoch": 0.15360066462124106, "grad_norm": 4.024796009063721, "learning_rate": 6.45506343519962e-06, "loss": 0.4305, "mean_token_accuracy": 0.8564706295728683, "num_tokens": 59568219.0, "step": 49550 }, { "entropy": 1.932602970302105, "epoch": 0.15363166374629075, "grad_norm": 8.853862762451172, "learning_rate": 6.4544121519722305e-06, "loss": 0.5538, "mean_token_accuracy": 0.8353964149951935, "num_tokens": 59579262.0, "step": 49560 }, { "entropy": 1.8641065925359726, "epoch": 0.15366266287134045, "grad_norm": 7.992645263671875, "learning_rate": 6.453761065838612e-06, "loss": 0.5253, "mean_token_accuracy": 0.842236676812172, "num_tokens": 59590979.0, "step": 49570 }, { "entropy": 1.8351068049669266, "epoch": 0.15369366199639015, "grad_norm": 11.022222518920898, "learning_rate": 6.453110176699378e-06, "loss": 0.5175, "mean_token_accuracy": 0.836832246184349, "num_tokens": 59603268.0, "step": 49580 }, { "entropy": 1.8691863656044005, "epoch": 0.15372466112143984, "grad_norm": 10.182769775390625, "learning_rate": 6.452459484455208e-06, "loss": 0.4967, "mean_token_accuracy": 0.8315588355064392, "num_tokens": 59615230.0, "step": 49590 }, { "entropy": 1.867138534784317, "epoch": 0.15375566024648954, "grad_norm": 8.996567726135254, "learning_rate": 6.451808989006854e-06, "loss": 0.5441, "mean_token_accuracy": 0.8165970057249069, "num_tokens": 59628275.0, "step": 49600 }, { "entropy": 1.8459146052598954, "epoch": 0.15378665937153924, "grad_norm": 4.567706108093262, "learning_rate": 6.451158690255139e-06, "loss": 0.4739, "mean_token_accuracy": 0.8464686393737793, "num_tokens": 59640506.0, "step": 49610 }, { "entropy": 1.9115084454417228, "epoch": 0.15381765849658893, "grad_norm": 8.155994415283203, "learning_rate": 6.450508588100953e-06, "loss": 0.5458, "mean_token_accuracy": 0.8274505391716958, "num_tokens": 59651786.0, "step": 49620 }, { "entropy": 1.8388926222920419, "epoch": 0.15384865762163863, "grad_norm": 3.681293487548828, "learning_rate": 6.449858682445258e-06, "loss": 0.4744, "mean_token_accuracy": 0.842620424926281, "num_tokens": 59664048.0, "step": 49630 }, { "entropy": 1.836945366859436, "epoch": 0.15387965674668833, "grad_norm": 9.795785903930664, "learning_rate": 6.449208973189086e-06, "loss": 0.4704, "mean_token_accuracy": 0.8397533014416695, "num_tokens": 59676318.0, "step": 49640 }, { "entropy": 1.9197256535291671, "epoch": 0.15391065587173802, "grad_norm": 9.240389823913574, "learning_rate": 6.448559460233536e-06, "loss": 0.5298, "mean_token_accuracy": 0.8350172653794289, "num_tokens": 59686954.0, "step": 49650 }, { "entropy": 1.9680290162563323, "epoch": 0.15394165499678772, "grad_norm": 8.425545692443848, "learning_rate": 6.447910143479779e-06, "loss": 0.568, "mean_token_accuracy": 0.8262524694204331, "num_tokens": 59697663.0, "step": 49660 }, { "entropy": 1.8470780551433563, "epoch": 0.15397265412183742, "grad_norm": 4.306032180786133, "learning_rate": 6.447261022829057e-06, "loss": 0.4315, "mean_token_accuracy": 0.847578053176403, "num_tokens": 59710534.0, "step": 49670 }, { "entropy": 1.9209625497460365, "epoch": 0.1540036532468871, "grad_norm": 8.741031646728516, "learning_rate": 6.446612098182679e-06, "loss": 0.5573, "mean_token_accuracy": 0.8277770847082138, "num_tokens": 59721380.0, "step": 49680 }, { "entropy": 1.7767624616622926, "epoch": 0.1540346523719368, "grad_norm": 9.159987449645996, "learning_rate": 6.445963369442024e-06, "loss": 0.4768, "mean_token_accuracy": 0.8386806547641754, "num_tokens": 59734270.0, "step": 49690 }, { "entropy": 1.9001009285449981, "epoch": 0.1540656514969865, "grad_norm": 13.77109432220459, "learning_rate": 6.4453148365085425e-06, "loss": 0.5171, "mean_token_accuracy": 0.8450674623250961, "num_tokens": 59746960.0, "step": 49700 }, { "entropy": 1.9309724509716033, "epoch": 0.1540966506220362, "grad_norm": 4.395373821258545, "learning_rate": 6.444666499283752e-06, "loss": 0.5595, "mean_token_accuracy": 0.8193286895751953, "num_tokens": 59758480.0, "step": 49710 }, { "entropy": 1.8494668424129486, "epoch": 0.1541276497470859, "grad_norm": 4.505528926849365, "learning_rate": 6.444018357669239e-06, "loss": 0.4911, "mean_token_accuracy": 0.8275418624281883, "num_tokens": 59772380.0, "step": 49720 }, { "entropy": 1.9033274337649346, "epoch": 0.1541586488721356, "grad_norm": 9.880914688110352, "learning_rate": 6.443370411566663e-06, "loss": 0.5613, "mean_token_accuracy": 0.8315735951066017, "num_tokens": 59784089.0, "step": 49730 }, { "entropy": 1.888562636077404, "epoch": 0.1541896479971853, "grad_norm": 8.08756160736084, "learning_rate": 6.442722660877747e-06, "loss": 0.5053, "mean_token_accuracy": 0.8301805481314659, "num_tokens": 59796171.0, "step": 49740 }, { "entropy": 1.90590338408947, "epoch": 0.154220647122235, "grad_norm": 9.111113548278809, "learning_rate": 6.44207510550429e-06, "loss": 0.5416, "mean_token_accuracy": 0.8257271036505699, "num_tokens": 59807917.0, "step": 49750 }, { "entropy": 1.9074049085378646, "epoch": 0.15425164624728466, "grad_norm": 9.030037879943848, "learning_rate": 6.441427745348153e-06, "loss": 0.5366, "mean_token_accuracy": 0.8337802454829216, "num_tokens": 59818916.0, "step": 49760 }, { "entropy": 1.8746358096599578, "epoch": 0.15428264537233435, "grad_norm": 7.909918308258057, "learning_rate": 6.440780580311269e-06, "loss": 0.5091, "mean_token_accuracy": 0.8407538250088692, "num_tokens": 59830862.0, "step": 49770 }, { "entropy": 1.875384160876274, "epoch": 0.15431364449738405, "grad_norm": 8.324300765991211, "learning_rate": 6.4401336102956434e-06, "loss": 0.5317, "mean_token_accuracy": 0.8312161207199097, "num_tokens": 59842486.0, "step": 49780 }, { "entropy": 1.9422027677297593, "epoch": 0.15434464362243375, "grad_norm": 9.501036643981934, "learning_rate": 6.439486835203346e-06, "loss": 0.5842, "mean_token_accuracy": 0.8298379242420196, "num_tokens": 59853228.0, "step": 49790 }, { "entropy": 1.854721449315548, "epoch": 0.15437564274748344, "grad_norm": 7.378803730010986, "learning_rate": 6.438840254936516e-06, "loss": 0.509, "mean_token_accuracy": 0.8368782594799995, "num_tokens": 59864876.0, "step": 49800 }, { "entropy": 1.8604190409183503, "epoch": 0.15440664187253314, "grad_norm": 11.659805297851562, "learning_rate": 6.438193869397364e-06, "loss": 0.5423, "mean_token_accuracy": 0.8247652351856232, "num_tokens": 59877865.0, "step": 49810 }, { "entropy": 1.8469142317771912, "epoch": 0.15443764099758284, "grad_norm": 4.957024097442627, "learning_rate": 6.437547678488166e-06, "loss": 0.5404, "mean_token_accuracy": 0.8259366884827614, "num_tokens": 59890976.0, "step": 49820 }, { "entropy": 1.8666848599910737, "epoch": 0.15446864012263253, "grad_norm": 9.470239639282227, "learning_rate": 6.436901682111268e-06, "loss": 0.4959, "mean_token_accuracy": 0.8429308265447617, "num_tokens": 59903295.0, "step": 49830 }, { "entropy": 1.9163474783301353, "epoch": 0.15449963924768223, "grad_norm": 9.075465202331543, "learning_rate": 6.436255880169087e-06, "loss": 0.5747, "mean_token_accuracy": 0.8243316933512688, "num_tokens": 59914765.0, "step": 49840 }, { "entropy": 1.8828782141208649, "epoch": 0.15453063837273193, "grad_norm": 9.262798309326172, "learning_rate": 6.4356102725641035e-06, "loss": 0.5203, "mean_token_accuracy": 0.837623517215252, "num_tokens": 59925835.0, "step": 49850 }, { "entropy": 1.8749229982495308, "epoch": 0.15456163749778162, "grad_norm": 8.3054780960083, "learning_rate": 6.434964859198871e-06, "loss": 0.4989, "mean_token_accuracy": 0.8357605487108231, "num_tokens": 59937496.0, "step": 49860 }, { "entropy": 1.9846446454524993, "epoch": 0.15459263662283132, "grad_norm": 7.847235679626465, "learning_rate": 6.434319639976007e-06, "loss": 0.5728, "mean_token_accuracy": 0.8296759322285652, "num_tokens": 59948007.0, "step": 49870 }, { "entropy": 1.8958945728838443, "epoch": 0.15462363574788102, "grad_norm": 8.949000358581543, "learning_rate": 6.433674614798204e-06, "loss": 0.5476, "mean_token_accuracy": 0.8374200582504272, "num_tokens": 59960116.0, "step": 49880 }, { "entropy": 1.8910530745983123, "epoch": 0.1546546348729307, "grad_norm": 11.521330833435059, "learning_rate": 6.433029783568216e-06, "loss": 0.5068, "mean_token_accuracy": 0.8429587453603744, "num_tokens": 59971792.0, "step": 49890 }, { "entropy": 1.9622877299785615, "epoch": 0.1546856339979804, "grad_norm": 8.818026542663574, "learning_rate": 6.4323851461888694e-06, "loss": 0.5706, "mean_token_accuracy": 0.8334489464759827, "num_tokens": 59982734.0, "step": 49900 }, { "entropy": 1.9317353338003158, "epoch": 0.1547166331230301, "grad_norm": 9.778739929199219, "learning_rate": 6.431740702563056e-06, "loss": 0.5676, "mean_token_accuracy": 0.8346163481473923, "num_tokens": 59993700.0, "step": 49910 }, { "entropy": 1.8633641496300697, "epoch": 0.1547476322480798, "grad_norm": 8.956582069396973, "learning_rate": 6.431096452593738e-06, "loss": 0.4891, "mean_token_accuracy": 0.8405205994844437, "num_tokens": 60005298.0, "step": 49920 }, { "entropy": 1.9589593350887298, "epoch": 0.1547786313731295, "grad_norm": 7.157927989959717, "learning_rate": 6.4304523961839436e-06, "loss": 0.5482, "mean_token_accuracy": 0.8311053574085235, "num_tokens": 60016287.0, "step": 49930 }, { "entropy": 1.9007370486855506, "epoch": 0.1548096304981792, "grad_norm": 11.051389694213867, "learning_rate": 6.429808533236771e-06, "loss": 0.536, "mean_token_accuracy": 0.830623884499073, "num_tokens": 60028218.0, "step": 49940 }, { "entropy": 1.8595467865467072, "epoch": 0.1548406296232289, "grad_norm": 10.759744644165039, "learning_rate": 6.429164863655384e-06, "loss": 0.5172, "mean_token_accuracy": 0.8303674578666687, "num_tokens": 60040116.0, "step": 49950 }, { "entropy": 1.8307231336832046, "epoch": 0.1548716287482786, "grad_norm": 9.683849334716797, "learning_rate": 6.428521387343016e-06, "loss": 0.4501, "mean_token_accuracy": 0.847040268778801, "num_tokens": 60053053.0, "step": 49960 }, { "entropy": 1.855046309530735, "epoch": 0.15490262787332829, "grad_norm": 7.891843318939209, "learning_rate": 6.427878104202968e-06, "loss": 0.5095, "mean_token_accuracy": 0.8327124953269959, "num_tokens": 60065345.0, "step": 49970 }, { "entropy": 1.9063360676169396, "epoch": 0.15493362699837798, "grad_norm": 10.191269874572754, "learning_rate": 6.4272350141386095e-06, "loss": 0.5739, "mean_token_accuracy": 0.8289344042539597, "num_tokens": 60077605.0, "step": 49980 }, { "entropy": 1.8241621538996697, "epoch": 0.15496462612342768, "grad_norm": 10.132854461669922, "learning_rate": 6.4265921170533755e-06, "loss": 0.4474, "mean_token_accuracy": 0.8447253108024597, "num_tokens": 60089882.0, "step": 49990 }, { "entropy": 1.9241633802652358, "epoch": 0.15499562524847738, "grad_norm": 8.673702239990234, "learning_rate": 6.425949412850768e-06, "loss": 0.5634, "mean_token_accuracy": 0.8289055705070496, "num_tokens": 60100890.0, "step": 50000 }, { "entropy": 1.9505061358213425, "epoch": 0.15502662437352704, "grad_norm": 8.550101280212402, "learning_rate": 6.4253069014343615e-06, "loss": 0.5529, "mean_token_accuracy": 0.8236529782414437, "num_tokens": 60112909.0, "step": 50010 }, { "entropy": 1.9289822548627853, "epoch": 0.15505762349857674, "grad_norm": 8.745688438415527, "learning_rate": 6.424664582707793e-06, "loss": 0.5462, "mean_token_accuracy": 0.8206963390111923, "num_tokens": 60124202.0, "step": 50020 }, { "entropy": 1.7902291625738145, "epoch": 0.15508862262362644, "grad_norm": 3.6163487434387207, "learning_rate": 6.424022456574768e-06, "loss": 0.4297, "mean_token_accuracy": 0.8540971964597702, "num_tokens": 60137010.0, "step": 50030 }, { "entropy": 1.907183986902237, "epoch": 0.15511962174867613, "grad_norm": 9.091652870178223, "learning_rate": 6.42338052293906e-06, "loss": 0.5648, "mean_token_accuracy": 0.8403370261192322, "num_tokens": 60148040.0, "step": 50040 }, { "entropy": 1.8011170402169228, "epoch": 0.15515062087372583, "grad_norm": 4.003586769104004, "learning_rate": 6.4227387817045115e-06, "loss": 0.4439, "mean_token_accuracy": 0.8478186964988709, "num_tokens": 60160541.0, "step": 50050 }, { "entropy": 1.8726102083921432, "epoch": 0.15518161999877553, "grad_norm": 9.244813919067383, "learning_rate": 6.42209723277503e-06, "loss": 0.5469, "mean_token_accuracy": 0.8411934614181519, "num_tokens": 60171909.0, "step": 50060 }, { "entropy": 1.8079619467258454, "epoch": 0.15521261912382522, "grad_norm": 8.226319313049316, "learning_rate": 6.421455876054589e-06, "loss": 0.4337, "mean_token_accuracy": 0.8530865862965584, "num_tokens": 60184414.0, "step": 50070 }, { "entropy": 1.9148479044437408, "epoch": 0.15524361824887492, "grad_norm": 9.0443696975708, "learning_rate": 6.420814711447232e-06, "loss": 0.549, "mean_token_accuracy": 0.8360567212104797, "num_tokens": 60196202.0, "step": 50080 }, { "entropy": 1.754614818096161, "epoch": 0.15527461737392462, "grad_norm": 9.89665412902832, "learning_rate": 6.42017373885707e-06, "loss": 0.4003, "mean_token_accuracy": 0.849841496348381, "num_tokens": 60209012.0, "step": 50090 }, { "entropy": 1.7963035687804223, "epoch": 0.1553056164989743, "grad_norm": 3.856391429901123, "learning_rate": 6.419532958188275e-06, "loss": 0.4344, "mean_token_accuracy": 0.8433475911617279, "num_tokens": 60221933.0, "step": 50100 }, { "entropy": 1.9359655752778053, "epoch": 0.155336615624024, "grad_norm": 8.836137771606445, "learning_rate": 6.418892369345093e-06, "loss": 0.5711, "mean_token_accuracy": 0.823367503285408, "num_tokens": 60232952.0, "step": 50110 }, { "entropy": 1.9027406126260757, "epoch": 0.1553676147490737, "grad_norm": 8.101412773132324, "learning_rate": 6.418251972231836e-06, "loss": 0.5538, "mean_token_accuracy": 0.8346571624279022, "num_tokens": 60244389.0, "step": 50120 }, { "entropy": 1.8543214410543443, "epoch": 0.1553986138741234, "grad_norm": 4.314391613006592, "learning_rate": 6.417611766752878e-06, "loss": 0.4862, "mean_token_accuracy": 0.8420602694153786, "num_tokens": 60256224.0, "step": 50130 }, { "entropy": 1.9107976794242858, "epoch": 0.1554296129991731, "grad_norm": 9.208541870117188, "learning_rate": 6.416971752812663e-06, "loss": 0.5544, "mean_token_accuracy": 0.8198239028453826, "num_tokens": 60267830.0, "step": 50140 }, { "entropy": 1.8322341233491897, "epoch": 0.1554606121242228, "grad_norm": 4.257328033447266, "learning_rate": 6.416331930315704e-06, "loss": 0.4799, "mean_token_accuracy": 0.8367450326681137, "num_tokens": 60281007.0, "step": 50150 }, { "entropy": 1.9688672095537185, "epoch": 0.1554916112492725, "grad_norm": 7.649997711181641, "learning_rate": 6.415692299166574e-06, "loss": 0.6013, "mean_token_accuracy": 0.8196437805891037, "num_tokens": 60291878.0, "step": 50160 }, { "entropy": 1.9667943209409713, "epoch": 0.1555226103743222, "grad_norm": 9.782792091369629, "learning_rate": 6.41505285926992e-06, "loss": 0.5785, "mean_token_accuracy": 0.8302391171455383, "num_tokens": 60302517.0, "step": 50170 }, { "entropy": 1.8588977128267288, "epoch": 0.1555536094993719, "grad_norm": 7.907834053039551, "learning_rate": 6.41441361053045e-06, "loss": 0.4772, "mean_token_accuracy": 0.8453133404254913, "num_tokens": 60314120.0, "step": 50180 }, { "entropy": 1.8701641455292701, "epoch": 0.15558460862442158, "grad_norm": 8.567540168762207, "learning_rate": 6.413774552852943e-06, "loss": 0.5398, "mean_token_accuracy": 0.8220806911587715, "num_tokens": 60326477.0, "step": 50190 }, { "entropy": 1.8901820540428163, "epoch": 0.15561560774947128, "grad_norm": 13.7083740234375, "learning_rate": 6.41313568614224e-06, "loss": 0.5659, "mean_token_accuracy": 0.8234784409403801, "num_tokens": 60338791.0, "step": 50200 }, { "entropy": 1.8873229175806046, "epoch": 0.15564660687452098, "grad_norm": 7.920546054840088, "learning_rate": 6.4124970103032505e-06, "loss": 0.5607, "mean_token_accuracy": 0.8268739849328994, "num_tokens": 60350606.0, "step": 50210 }, { "entropy": 1.8220305427908898, "epoch": 0.15567760599957067, "grad_norm": 9.152536392211914, "learning_rate": 6.411858525240952e-06, "loss": 0.4489, "mean_token_accuracy": 0.8429689288139344, "num_tokens": 60362867.0, "step": 50220 }, { "entropy": 1.9848755061626435, "epoch": 0.15570860512462037, "grad_norm": 8.138653755187988, "learning_rate": 6.411220230860381e-06, "loss": 0.5795, "mean_token_accuracy": 0.8322038248181343, "num_tokens": 60373606.0, "step": 50230 }, { "entropy": 1.9133560702204704, "epoch": 0.15573960424967007, "grad_norm": 8.208351135253906, "learning_rate": 6.410582127066652e-06, "loss": 0.5564, "mean_token_accuracy": 0.8335413232445716, "num_tokens": 60385315.0, "step": 50240 }, { "entropy": 1.8043673783540726, "epoch": 0.15577060337471973, "grad_norm": 8.482664108276367, "learning_rate": 6.4099442137649356e-06, "loss": 0.505, "mean_token_accuracy": 0.8454105794429779, "num_tokens": 60398022.0, "step": 50250 }, { "entropy": 1.9254692614078521, "epoch": 0.15580160249976943, "grad_norm": 10.395545959472656, "learning_rate": 6.409306490860473e-06, "loss": 0.5454, "mean_token_accuracy": 0.8282811924815178, "num_tokens": 60409280.0, "step": 50260 }, { "entropy": 1.8486803263425826, "epoch": 0.15583260162481913, "grad_norm": 10.327423095703125, "learning_rate": 6.408668958258571e-06, "loss": 0.5141, "mean_token_accuracy": 0.8356290921568871, "num_tokens": 60421673.0, "step": 50270 }, { "entropy": 1.8446889415383338, "epoch": 0.15586360074986882, "grad_norm": 10.985127449035645, "learning_rate": 6.408031615864598e-06, "loss": 0.5038, "mean_token_accuracy": 0.8322935044765473, "num_tokens": 60434775.0, "step": 50280 }, { "entropy": 1.943482118844986, "epoch": 0.15589459987491852, "grad_norm": 9.617402076721191, "learning_rate": 6.407394463583996e-06, "loss": 0.5226, "mean_token_accuracy": 0.8407940044999123, "num_tokens": 60445972.0, "step": 50290 }, { "entropy": 1.85861434340477, "epoch": 0.15592559899996822, "grad_norm": 9.958732604980469, "learning_rate": 6.406757501322266e-06, "loss": 0.5453, "mean_token_accuracy": 0.8299056336283683, "num_tokens": 60458006.0, "step": 50300 }, { "entropy": 1.887197096645832, "epoch": 0.15595659812501791, "grad_norm": 11.390666961669922, "learning_rate": 6.406120728984979e-06, "loss": 0.5547, "mean_token_accuracy": 0.8242016166448594, "num_tokens": 60470382.0, "step": 50310 }, { "entropy": 1.8762848794460296, "epoch": 0.1559875972500676, "grad_norm": 7.51845121383667, "learning_rate": 6.4054841464777696e-06, "loss": 0.5142, "mean_token_accuracy": 0.8328775450587272, "num_tokens": 60482717.0, "step": 50320 }, { "entropy": 1.9675488024950027, "epoch": 0.1560185963751173, "grad_norm": 9.021512985229492, "learning_rate": 6.404847753706339e-06, "loss": 0.5867, "mean_token_accuracy": 0.8280714631080628, "num_tokens": 60493388.0, "step": 50330 }, { "entropy": 1.9901121139526368, "epoch": 0.156049595500167, "grad_norm": 8.092247009277344, "learning_rate": 6.404211550576453e-06, "loss": 0.5651, "mean_token_accuracy": 0.8340759441256523, "num_tokens": 60504429.0, "step": 50340 }, { "entropy": 1.8783787608146667, "epoch": 0.1560805946252167, "grad_norm": 9.310537338256836, "learning_rate": 6.4035755369939425e-06, "loss": 0.5001, "mean_token_accuracy": 0.8348104074597359, "num_tokens": 60516387.0, "step": 50350 }, { "entropy": 1.8168802306056022, "epoch": 0.1561115937502664, "grad_norm": 3.784146785736084, "learning_rate": 6.4029397128647065e-06, "loss": 0.4781, "mean_token_accuracy": 0.8457161352038384, "num_tokens": 60529393.0, "step": 50360 }, { "entropy": 1.7647080093622207, "epoch": 0.1561425928753161, "grad_norm": 9.681499481201172, "learning_rate": 6.402304078094705e-06, "loss": 0.3944, "mean_token_accuracy": 0.8579833477735519, "num_tokens": 60543184.0, "step": 50370 }, { "entropy": 1.8571357518434524, "epoch": 0.1561735920003658, "grad_norm": 8.170286178588867, "learning_rate": 6.40166863258997e-06, "loss": 0.4846, "mean_token_accuracy": 0.8396312475204468, "num_tokens": 60555841.0, "step": 50380 }, { "entropy": 1.9185105443000794, "epoch": 0.1562045911254155, "grad_norm": 7.637692451477051, "learning_rate": 6.401033376256593e-06, "loss": 0.5627, "mean_token_accuracy": 0.8341576635837555, "num_tokens": 60567373.0, "step": 50390 }, { "entropy": 1.951904332637787, "epoch": 0.15623559025046518, "grad_norm": 7.780064105987549, "learning_rate": 6.40039830900073e-06, "loss": 0.5698, "mean_token_accuracy": 0.8260944783687592, "num_tokens": 60578359.0, "step": 50400 }, { "entropy": 1.937084037065506, "epoch": 0.15626658937551488, "grad_norm": 8.136171340942383, "learning_rate": 6.399763430728608e-06, "loss": 0.5558, "mean_token_accuracy": 0.8284446790814399, "num_tokens": 60590095.0, "step": 50410 }, { "entropy": 1.8329094797372818, "epoch": 0.15629758850056458, "grad_norm": 7.345213890075684, "learning_rate": 6.399128741346514e-06, "loss": 0.486, "mean_token_accuracy": 0.8410397842526436, "num_tokens": 60603043.0, "step": 50420 }, { "entropy": 1.8990901306271553, "epoch": 0.15632858762561427, "grad_norm": 8.570768356323242, "learning_rate": 6.398494240760803e-06, "loss": 0.5062, "mean_token_accuracy": 0.8417127162218094, "num_tokens": 60614516.0, "step": 50430 }, { "entropy": 1.8423827588558197, "epoch": 0.15635958675066397, "grad_norm": 4.247179985046387, "learning_rate": 6.397859928877893e-06, "loss": 0.4673, "mean_token_accuracy": 0.8417179524898529, "num_tokens": 60626871.0, "step": 50440 }, { "entropy": 2.0029925853013992, "epoch": 0.15639058587571367, "grad_norm": 8.819866180419922, "learning_rate": 6.3972258056042655e-06, "loss": 0.5682, "mean_token_accuracy": 0.829233068227768, "num_tokens": 60637756.0, "step": 50450 }, { "entropy": 1.9364984780550003, "epoch": 0.15642158500076336, "grad_norm": 9.162713050842285, "learning_rate": 6.396591870846475e-06, "loss": 0.5832, "mean_token_accuracy": 0.8301840484142303, "num_tokens": 60648877.0, "step": 50460 }, { "entropy": 1.9391376033425332, "epoch": 0.15645258412581306, "grad_norm": 8.27598762512207, "learning_rate": 6.395958124511129e-06, "loss": 0.5563, "mean_token_accuracy": 0.8327331587672233, "num_tokens": 60659919.0, "step": 50470 }, { "entropy": 1.9072021529078484, "epoch": 0.15648358325086276, "grad_norm": 9.113195419311523, "learning_rate": 6.395324566504908e-06, "loss": 0.5266, "mean_token_accuracy": 0.831612104177475, "num_tokens": 60671627.0, "step": 50480 }, { "entropy": 1.9049764469265937, "epoch": 0.15651458237591245, "grad_norm": 4.894548416137695, "learning_rate": 6.394691196734555e-06, "loss": 0.5166, "mean_token_accuracy": 0.8367709815502167, "num_tokens": 60683963.0, "step": 50490 }, { "entropy": 1.8629326492547988, "epoch": 0.15654558150096212, "grad_norm": 5.698431968688965, "learning_rate": 6.394058015106876e-06, "loss": 0.461, "mean_token_accuracy": 0.8405030608177185, "num_tokens": 60696888.0, "step": 50500 }, { "entropy": 1.9971718460321426, "epoch": 0.15657658062601182, "grad_norm": 7.774511337280273, "learning_rate": 6.393425021528746e-06, "loss": 0.5977, "mean_token_accuracy": 0.8220291540026665, "num_tokens": 60707257.0, "step": 50510 }, { "entropy": 1.9252537876367568, "epoch": 0.15660757975106152, "grad_norm": 4.02286434173584, "learning_rate": 6.392792215907099e-06, "loss": 0.51, "mean_token_accuracy": 0.8373642593622208, "num_tokens": 60719173.0, "step": 50520 }, { "entropy": 1.9036842539906502, "epoch": 0.1566385788761112, "grad_norm": 9.243278503417969, "learning_rate": 6.392159598148937e-06, "loss": 0.5501, "mean_token_accuracy": 0.8288576260209084, "num_tokens": 60730887.0, "step": 50530 }, { "entropy": 1.950339911878109, "epoch": 0.1566695780011609, "grad_norm": 9.252776145935059, "learning_rate": 6.391527168161323e-06, "loss": 0.6262, "mean_token_accuracy": 0.8195764616131782, "num_tokens": 60742662.0, "step": 50540 }, { "entropy": 1.9394678846001625, "epoch": 0.1567005771262106, "grad_norm": 9.990178108215332, "learning_rate": 6.390894925851392e-06, "loss": 0.5444, "mean_token_accuracy": 0.8261475265026093, "num_tokens": 60753958.0, "step": 50550 }, { "entropy": 1.8357860133051873, "epoch": 0.1567315762512603, "grad_norm": 6.805025100708008, "learning_rate": 6.390262871126333e-06, "loss": 0.4584, "mean_token_accuracy": 0.8525337189435959, "num_tokens": 60766091.0, "step": 50560 }, { "entropy": 1.8700495898723601, "epoch": 0.15676257537631, "grad_norm": 8.728248596191406, "learning_rate": 6.3896310038934085e-06, "loss": 0.5036, "mean_token_accuracy": 0.8417010590434074, "num_tokens": 60777610.0, "step": 50570 }, { "entropy": 1.8651820093393325, "epoch": 0.1567935745013597, "grad_norm": 9.503561019897461, "learning_rate": 6.388999324059937e-06, "loss": 0.4944, "mean_token_accuracy": 0.8336687937378884, "num_tokens": 60789545.0, "step": 50580 }, { "entropy": 1.8762501239776612, "epoch": 0.1568245736264094, "grad_norm": 10.497973442077637, "learning_rate": 6.3883678315333085e-06, "loss": 0.5375, "mean_token_accuracy": 0.8256103843450546, "num_tokens": 60801947.0, "step": 50590 }, { "entropy": 1.8612007051706314, "epoch": 0.1568555727514591, "grad_norm": 8.15131950378418, "learning_rate": 6.387736526220971e-06, "loss": 0.4962, "mean_token_accuracy": 0.8382210031151771, "num_tokens": 60814283.0, "step": 50600 }, { "entropy": 1.7936832830309868, "epoch": 0.15688657187650878, "grad_norm": 8.772533416748047, "learning_rate": 6.387105408030442e-06, "loss": 0.4245, "mean_token_accuracy": 0.8425097793340683, "num_tokens": 60827792.0, "step": 50610 }, { "entropy": 1.969840306043625, "epoch": 0.15691757100155848, "grad_norm": 8.165382385253906, "learning_rate": 6.386474476869298e-06, "loss": 0.5996, "mean_token_accuracy": 0.8277344390749931, "num_tokens": 60838984.0, "step": 50620 }, { "entropy": 1.8312163800001144, "epoch": 0.15694857012660818, "grad_norm": 3.5525267124176025, "learning_rate": 6.3858437326451805e-06, "loss": 0.5051, "mean_token_accuracy": 0.8387185871601105, "num_tokens": 60852094.0, "step": 50630 }, { "entropy": 1.8953583613038063, "epoch": 0.15697956925165787, "grad_norm": 9.432021141052246, "learning_rate": 6.3852131752658e-06, "loss": 0.4871, "mean_token_accuracy": 0.8444772154092789, "num_tokens": 60863876.0, "step": 50640 }, { "entropy": 1.8836359083652496, "epoch": 0.15701056837670757, "grad_norm": 9.322160720825195, "learning_rate": 6.384582804638923e-06, "loss": 0.5454, "mean_token_accuracy": 0.8311738058924675, "num_tokens": 60876801.0, "step": 50650 }, { "entropy": 1.92313295006752, "epoch": 0.15704156750175727, "grad_norm": 8.350409507751465, "learning_rate": 6.383952620672385e-06, "loss": 0.5855, "mean_token_accuracy": 0.8254999339580535, "num_tokens": 60887997.0, "step": 50660 }, { "entropy": 1.9507147312164306, "epoch": 0.15707256662680696, "grad_norm": 8.378862380981445, "learning_rate": 6.383322623274081e-06, "loss": 0.5762, "mean_token_accuracy": 0.8160288318991661, "num_tokens": 60899916.0, "step": 50670 }, { "entropy": 1.7835942827165128, "epoch": 0.15710356575185666, "grad_norm": 7.95631217956543, "learning_rate": 6.382692812351976e-06, "loss": 0.4651, "mean_token_accuracy": 0.8510005518794059, "num_tokens": 60914587.0, "step": 50680 }, { "entropy": 1.8697744831442833, "epoch": 0.15713456487690636, "grad_norm": 4.098609924316406, "learning_rate": 6.382063187814093e-06, "loss": 0.4901, "mean_token_accuracy": 0.8406439542770385, "num_tokens": 60926953.0, "step": 50690 }, { "entropy": 1.8595498703420161, "epoch": 0.15716556400195605, "grad_norm": 8.888344764709473, "learning_rate": 6.381433749568522e-06, "loss": 0.4505, "mean_token_accuracy": 0.8456152617931366, "num_tokens": 60939063.0, "step": 50700 }, { "entropy": 1.8515797272324561, "epoch": 0.15719656312700575, "grad_norm": 6.374711513519287, "learning_rate": 6.380804497523409e-06, "loss": 0.4767, "mean_token_accuracy": 0.844392117857933, "num_tokens": 60950485.0, "step": 50710 }, { "entropy": 1.8461545348167419, "epoch": 0.15722756225205545, "grad_norm": 9.635364532470703, "learning_rate": 6.380175431586977e-06, "loss": 0.4855, "mean_token_accuracy": 0.8438180416822434, "num_tokens": 60962994.0, "step": 50720 }, { "entropy": 1.9317549824714662, "epoch": 0.15725856137710514, "grad_norm": 9.940279960632324, "learning_rate": 6.379546551667498e-06, "loss": 0.5661, "mean_token_accuracy": 0.8308566614985466, "num_tokens": 60974445.0, "step": 50730 }, { "entropy": 1.919760164618492, "epoch": 0.15728956050215484, "grad_norm": 17.22686004638672, "learning_rate": 6.3789178576733166e-06, "loss": 0.5501, "mean_token_accuracy": 0.8357348814606667, "num_tokens": 60985676.0, "step": 50740 }, { "entropy": 1.893989272415638, "epoch": 0.1573205596272045, "grad_norm": 8.417506217956543, "learning_rate": 6.378289349512838e-06, "loss": 0.5289, "mean_token_accuracy": 0.840349805355072, "num_tokens": 60997353.0, "step": 50750 }, { "entropy": 1.9457985952496528, "epoch": 0.1573515587522542, "grad_norm": 9.5839204788208, "learning_rate": 6.377661027094528e-06, "loss": 0.5644, "mean_token_accuracy": 0.8289625599980355, "num_tokens": 61008540.0, "step": 50760 }, { "entropy": 1.8832007065415381, "epoch": 0.1573825578773039, "grad_norm": 8.148798942565918, "learning_rate": 6.377032890326919e-06, "loss": 0.4871, "mean_token_accuracy": 0.8396278038620949, "num_tokens": 61020653.0, "step": 50770 }, { "entropy": 1.8223789624869824, "epoch": 0.1574135570023536, "grad_norm": 8.812091827392578, "learning_rate": 6.376404939118606e-06, "loss": 0.4469, "mean_token_accuracy": 0.8405194833874703, "num_tokens": 61033783.0, "step": 50780 }, { "entropy": 1.908838202059269, "epoch": 0.1574445561274033, "grad_norm": 9.226709365844727, "learning_rate": 6.3757771733782435e-06, "loss": 0.5498, "mean_token_accuracy": 0.8301179006695747, "num_tokens": 61045896.0, "step": 50790 }, { "entropy": 1.8385404348373413, "epoch": 0.157475555252453, "grad_norm": 3.6262776851654053, "learning_rate": 6.375149593014555e-06, "loss": 0.4527, "mean_token_accuracy": 0.844711446762085, "num_tokens": 61059216.0, "step": 50800 }, { "entropy": 1.8200566530227662, "epoch": 0.1575065543775027, "grad_norm": 4.00075626373291, "learning_rate": 6.3745221979363226e-06, "loss": 0.4372, "mean_token_accuracy": 0.8521258249878884, "num_tokens": 61072093.0, "step": 50810 }, { "entropy": 1.9493874102830886, "epoch": 0.15753755350255239, "grad_norm": 7.502165794372559, "learning_rate": 6.373894988052391e-06, "loss": 0.5919, "mean_token_accuracy": 0.8284584790468216, "num_tokens": 61083605.0, "step": 50820 }, { "entropy": 1.8854053810238838, "epoch": 0.15756855262760208, "grad_norm": 8.985443115234375, "learning_rate": 6.373267963271668e-06, "loss": 0.4746, "mean_token_accuracy": 0.8452239811420441, "num_tokens": 61095760.0, "step": 50830 }, { "entropy": 1.9016931489109994, "epoch": 0.15759955175265178, "grad_norm": 8.221643447875977, "learning_rate": 6.372641123503127e-06, "loss": 0.4685, "mean_token_accuracy": 0.8430635541677475, "num_tokens": 61108080.0, "step": 50840 }, { "entropy": 1.9211482793092727, "epoch": 0.15763055087770148, "grad_norm": 10.981581687927246, "learning_rate": 6.372014468655801e-06, "loss": 0.5592, "mean_token_accuracy": 0.825599467754364, "num_tokens": 61120063.0, "step": 50850 }, { "entropy": 1.948448945581913, "epoch": 0.15766155000275117, "grad_norm": 8.988844871520996, "learning_rate": 6.371387998638789e-06, "loss": 0.58, "mean_token_accuracy": 0.8233562901616096, "num_tokens": 61131416.0, "step": 50860 }, { "entropy": 1.923220631480217, "epoch": 0.15769254912780087, "grad_norm": 8.43247127532959, "learning_rate": 6.3707617133612456e-06, "loss": 0.5363, "mean_token_accuracy": 0.831841279566288, "num_tokens": 61143372.0, "step": 50870 }, { "entropy": 1.890692350268364, "epoch": 0.15772354825285057, "grad_norm": 4.421103477478027, "learning_rate": 6.370135612732394e-06, "loss": 0.5208, "mean_token_accuracy": 0.8370485186576844, "num_tokens": 61155644.0, "step": 50880 }, { "entropy": 1.9700417637825012, "epoch": 0.15775454737790026, "grad_norm": 4.649320602416992, "learning_rate": 6.36950969666152e-06, "loss": 0.5609, "mean_token_accuracy": 0.828958623111248, "num_tokens": 61166750.0, "step": 50890 }, { "entropy": 1.8618547976017, "epoch": 0.15778554650294996, "grad_norm": 7.937415599822998, "learning_rate": 6.368883965057968e-06, "loss": 0.5103, "mean_token_accuracy": 0.8421330273151397, "num_tokens": 61179716.0, "step": 50900 }, { "entropy": 1.9229750469326974, "epoch": 0.15781654562799965, "grad_norm": 7.491119861602783, "learning_rate": 6.368258417831149e-06, "loss": 0.5214, "mean_token_accuracy": 0.8261383548378944, "num_tokens": 61191546.0, "step": 50910 }, { "entropy": 1.9325373709201812, "epoch": 0.15784754475304935, "grad_norm": 9.881427764892578, "learning_rate": 6.367633054890532e-06, "loss": 0.5417, "mean_token_accuracy": 0.8323504284024239, "num_tokens": 61203143.0, "step": 50920 }, { "entropy": 1.8541507571935654, "epoch": 0.15787854387809905, "grad_norm": 4.566098690032959, "learning_rate": 6.367007876145651e-06, "loss": 0.5094, "mean_token_accuracy": 0.8295344039797783, "num_tokens": 61216098.0, "step": 50930 }, { "entropy": 1.9541061252355576, "epoch": 0.15790954300314874, "grad_norm": 8.65689754486084, "learning_rate": 6.3663828815061e-06, "loss": 0.6162, "mean_token_accuracy": 0.8173148021101951, "num_tokens": 61227492.0, "step": 50940 }, { "entropy": 1.9283887058496476, "epoch": 0.15794054212819844, "grad_norm": 8.396080017089844, "learning_rate": 6.36575807088154e-06, "loss": 0.513, "mean_token_accuracy": 0.8281655997037888, "num_tokens": 61239330.0, "step": 50950 }, { "entropy": 1.8624481573700904, "epoch": 0.15797154125324814, "grad_norm": 9.15777587890625, "learning_rate": 6.365133444181688e-06, "loss": 0.4972, "mean_token_accuracy": 0.8356254741549491, "num_tokens": 61252054.0, "step": 50960 }, { "entropy": 1.9177945956587792, "epoch": 0.15800254037829783, "grad_norm": 9.632381439208984, "learning_rate": 6.364509001316326e-06, "loss": 0.5361, "mean_token_accuracy": 0.8355124577879905, "num_tokens": 61263663.0, "step": 50970 }, { "entropy": 1.8173129588365555, "epoch": 0.15803353950334753, "grad_norm": 9.271222114562988, "learning_rate": 6.363884742195296e-06, "loss": 0.4709, "mean_token_accuracy": 0.8449875220656395, "num_tokens": 61277606.0, "step": 50980 }, { "entropy": 1.97777401804924, "epoch": 0.1580645386283972, "grad_norm": 11.567782402038574, "learning_rate": 6.363260666728507e-06, "loss": 0.5419, "mean_token_accuracy": 0.8399723425507546, "num_tokens": 61287908.0, "step": 50990 }, { "entropy": 1.8491854876279832, "epoch": 0.1580955377534469, "grad_norm": 3.4037585258483887, "learning_rate": 6.362636774825923e-06, "loss": 0.5092, "mean_token_accuracy": 0.8377619609236717, "num_tokens": 61300804.0, "step": 51000 }, { "entropy": 1.9141796737909318, "epoch": 0.1581265368784966, "grad_norm": 4.402615547180176, "learning_rate": 6.362013066397575e-06, "loss": 0.5334, "mean_token_accuracy": 0.8279877364635467, "num_tokens": 61312498.0, "step": 51010 }, { "entropy": 1.9528733968734742, "epoch": 0.1581575360035463, "grad_norm": 8.701175689697266, "learning_rate": 6.361389541353552e-06, "loss": 0.5713, "mean_token_accuracy": 0.836171668767929, "num_tokens": 61323367.0, "step": 51020 }, { "entropy": 1.8788547486066818, "epoch": 0.158188535128596, "grad_norm": 9.006681442260742, "learning_rate": 6.360766199604007e-06, "loss": 0.549, "mean_token_accuracy": 0.8283375024795532, "num_tokens": 61336492.0, "step": 51030 }, { "entropy": 1.856010665744543, "epoch": 0.15821953425364568, "grad_norm": 4.132041931152344, "learning_rate": 6.360143041059156e-06, "loss": 0.4608, "mean_token_accuracy": 0.8421657636761666, "num_tokens": 61349803.0, "step": 51040 }, { "entropy": 1.8853138819336892, "epoch": 0.15825053337869538, "grad_norm": 4.42047643661499, "learning_rate": 6.359520065629272e-06, "loss": 0.5469, "mean_token_accuracy": 0.8277313068509102, "num_tokens": 61361430.0, "step": 51050 }, { "entropy": 1.8170242205262184, "epoch": 0.15828153250374508, "grad_norm": 3.389427423477173, "learning_rate": 6.358897273224693e-06, "loss": 0.4527, "mean_token_accuracy": 0.8529497593641281, "num_tokens": 61374400.0, "step": 51060 }, { "entropy": 1.8062128841876983, "epoch": 0.15831253162879477, "grad_norm": 3.821852207183838, "learning_rate": 6.358274663755817e-06, "loss": 0.4512, "mean_token_accuracy": 0.8493776023387909, "num_tokens": 61388410.0, "step": 51070 }, { "entropy": 1.904846543073654, "epoch": 0.15834353075384447, "grad_norm": 11.317872047424316, "learning_rate": 6.357652237133105e-06, "loss": 0.5412, "mean_token_accuracy": 0.830742597579956, "num_tokens": 61399765.0, "step": 51080 }, { "entropy": 1.9227543294429779, "epoch": 0.15837452987889417, "grad_norm": 9.054058074951172, "learning_rate": 6.357029993267079e-06, "loss": 0.594, "mean_token_accuracy": 0.8182252869009972, "num_tokens": 61411205.0, "step": 51090 }, { "entropy": 1.7773767858743668, "epoch": 0.15840552900394386, "grad_norm": 3.7279295921325684, "learning_rate": 6.356407932068319e-06, "loss": 0.4533, "mean_token_accuracy": 0.8401523649692535, "num_tokens": 61425116.0, "step": 51100 }, { "entropy": 1.932927194237709, "epoch": 0.15843652812899356, "grad_norm": 8.091573715209961, "learning_rate": 6.35578605344747e-06, "loss": 0.5335, "mean_token_accuracy": 0.8386119574308395, "num_tokens": 61436160.0, "step": 51110 }, { "entropy": 1.9222755640745164, "epoch": 0.15846752725404326, "grad_norm": 9.271939277648926, "learning_rate": 6.355164357315238e-06, "loss": 0.5488, "mean_token_accuracy": 0.8323445156216621, "num_tokens": 61447404.0, "step": 51120 }, { "entropy": 1.8717841893434524, "epoch": 0.15849852637909295, "grad_norm": 4.64661979675293, "learning_rate": 6.354542843582387e-06, "loss": 0.5269, "mean_token_accuracy": 0.8282085090875626, "num_tokens": 61459307.0, "step": 51130 }, { "entropy": 1.7520866304636002, "epoch": 0.15852952550414265, "grad_norm": 8.69471263885498, "learning_rate": 6.353921512159747e-06, "loss": 0.392, "mean_token_accuracy": 0.8573095768690109, "num_tokens": 61473555.0, "step": 51140 }, { "entropy": 1.933935022354126, "epoch": 0.15856052462919235, "grad_norm": 9.661917686462402, "learning_rate": 6.353300362958204e-06, "loss": 0.5802, "mean_token_accuracy": 0.8254079014062882, "num_tokens": 61485440.0, "step": 51150 }, { "entropy": 1.8875377878546715, "epoch": 0.15859152375424204, "grad_norm": 9.894033432006836, "learning_rate": 6.352679395888709e-06, "loss": 0.5426, "mean_token_accuracy": 0.8381941422820092, "num_tokens": 61497948.0, "step": 51160 }, { "entropy": 1.954133327305317, "epoch": 0.15862252287929174, "grad_norm": 13.155253410339355, "learning_rate": 6.3520586108622695e-06, "loss": 0.5527, "mean_token_accuracy": 0.8287410020828248, "num_tokens": 61509102.0, "step": 51170 }, { "entropy": 1.9011639848351478, "epoch": 0.15865352200434144, "grad_norm": 9.405858039855957, "learning_rate": 6.351438007789959e-06, "loss": 0.506, "mean_token_accuracy": 0.8395056575536728, "num_tokens": 61521308.0, "step": 51180 }, { "entropy": 1.9040748074650764, "epoch": 0.15868452112939113, "grad_norm": 8.241883277893066, "learning_rate": 6.350817586582909e-06, "loss": 0.5341, "mean_token_accuracy": 0.8309203192591668, "num_tokens": 61533828.0, "step": 51190 }, { "entropy": 1.9427318632602693, "epoch": 0.15871552025444083, "grad_norm": 8.195354461669922, "learning_rate": 6.35019734715231e-06, "loss": 0.5027, "mean_token_accuracy": 0.8373890981078148, "num_tokens": 61545278.0, "step": 51200 }, { "entropy": 1.904130506515503, "epoch": 0.15874651937949052, "grad_norm": 7.979343891143799, "learning_rate": 6.349577289409418e-06, "loss": 0.5099, "mean_token_accuracy": 0.8309107288718224, "num_tokens": 61557125.0, "step": 51210 }, { "entropy": 1.9387200504541398, "epoch": 0.15877751850454022, "grad_norm": 8.9723539352417, "learning_rate": 6.348957413265544e-06, "loss": 0.5938, "mean_token_accuracy": 0.8150834292173386, "num_tokens": 61569250.0, "step": 51220 }, { "entropy": 1.8315011352300643, "epoch": 0.15880851762958992, "grad_norm": 4.4107136726379395, "learning_rate": 6.348337718632065e-06, "loss": 0.4648, "mean_token_accuracy": 0.8472426652908325, "num_tokens": 61582476.0, "step": 51230 }, { "entropy": 1.8459465608000756, "epoch": 0.1588395167546396, "grad_norm": 8.931220054626465, "learning_rate": 6.347718205420413e-06, "loss": 0.4387, "mean_token_accuracy": 0.8469175025820732, "num_tokens": 61595011.0, "step": 51240 }, { "entropy": 1.9565140813589097, "epoch": 0.15887051587968928, "grad_norm": 8.753551483154297, "learning_rate": 6.347098873542088e-06, "loss": 0.5644, "mean_token_accuracy": 0.8283399358391762, "num_tokens": 61606389.0, "step": 51250 }, { "entropy": 1.9582069575786591, "epoch": 0.15890151500473898, "grad_norm": 7.78233003616333, "learning_rate": 6.346479722908642e-06, "loss": 0.5422, "mean_token_accuracy": 0.8336837723851204, "num_tokens": 61617999.0, "step": 51260 }, { "entropy": 1.9196703389286995, "epoch": 0.15893251412978868, "grad_norm": 8.762555122375488, "learning_rate": 6.345860753431693e-06, "loss": 0.5081, "mean_token_accuracy": 0.8311418369412422, "num_tokens": 61630045.0, "step": 51270 }, { "entropy": 1.9337300062179565, "epoch": 0.15896351325483837, "grad_norm": 9.353802680969238, "learning_rate": 6.345241965022917e-06, "loss": 0.5269, "mean_token_accuracy": 0.8350248277187348, "num_tokens": 61642241.0, "step": 51280 }, { "entropy": 1.903528119623661, "epoch": 0.15899451237988807, "grad_norm": 8.561066627502441, "learning_rate": 6.344623357594051e-06, "loss": 0.5307, "mean_token_accuracy": 0.8348277062177658, "num_tokens": 61654226.0, "step": 51290 }, { "entropy": 1.9056920766830445, "epoch": 0.15902551150493777, "grad_norm": 4.238293170928955, "learning_rate": 6.344004931056894e-06, "loss": 0.4882, "mean_token_accuracy": 0.8365932151675224, "num_tokens": 61666601.0, "step": 51300 }, { "entropy": 1.900420580804348, "epoch": 0.15905651062998746, "grad_norm": 8.474109649658203, "learning_rate": 6.343386685323301e-06, "loss": 0.5645, "mean_token_accuracy": 0.8356752887368202, "num_tokens": 61678955.0, "step": 51310 }, { "entropy": 1.7979067623615266, "epoch": 0.15908750975503716, "grad_norm": 7.289546012878418, "learning_rate": 6.34276862030519e-06, "loss": 0.4091, "mean_token_accuracy": 0.8524860993027688, "num_tokens": 61692933.0, "step": 51320 }, { "entropy": 1.9506612807512282, "epoch": 0.15911850888008686, "grad_norm": 7.817605018615723, "learning_rate": 6.342150735914539e-06, "loss": 0.5449, "mean_token_accuracy": 0.8349381595849991, "num_tokens": 61703549.0, "step": 51330 }, { "entropy": 1.9035085812211037, "epoch": 0.15914950800513655, "grad_norm": 8.978363037109375, "learning_rate": 6.341533032063384e-06, "loss": 0.5406, "mean_token_accuracy": 0.8332462772727013, "num_tokens": 61715518.0, "step": 51340 }, { "entropy": 1.9238849043846131, "epoch": 0.15918050713018625, "grad_norm": 10.23372745513916, "learning_rate": 6.3409155086638244e-06, "loss": 0.5827, "mean_token_accuracy": 0.8245359167456627, "num_tokens": 61726468.0, "step": 51350 }, { "entropy": 1.912158764898777, "epoch": 0.15921150625523595, "grad_norm": 8.236334800720215, "learning_rate": 6.3402981656280174e-06, "loss": 0.5213, "mean_token_accuracy": 0.8354252070188523, "num_tokens": 61738604.0, "step": 51360 }, { "entropy": 1.9760198339819908, "epoch": 0.15924250538028564, "grad_norm": 9.30893611907959, "learning_rate": 6.339681002868179e-06, "loss": 0.561, "mean_token_accuracy": 0.8204899609088898, "num_tokens": 61749706.0, "step": 51370 }, { "entropy": 1.9051903694868089, "epoch": 0.15927350450533534, "grad_norm": 9.52764892578125, "learning_rate": 6.3390640202965856e-06, "loss": 0.5108, "mean_token_accuracy": 0.8380590423941612, "num_tokens": 61761965.0, "step": 51380 }, { "entropy": 1.9080309465527534, "epoch": 0.15930450363038504, "grad_norm": 3.284393072128296, "learning_rate": 6.338447217825577e-06, "loss": 0.5122, "mean_token_accuracy": 0.8422815844416618, "num_tokens": 61773196.0, "step": 51390 }, { "entropy": 1.848653295636177, "epoch": 0.15933550275543473, "grad_norm": 9.541685104370117, "learning_rate": 6.337830595367548e-06, "loss": 0.52, "mean_token_accuracy": 0.835855670273304, "num_tokens": 61785726.0, "step": 51400 }, { "entropy": 1.8705054201185702, "epoch": 0.15936650188048443, "grad_norm": 8.726213455200195, "learning_rate": 6.337214152834954e-06, "loss": 0.5133, "mean_token_accuracy": 0.838579186797142, "num_tokens": 61798381.0, "step": 51410 }, { "entropy": 1.9128465965390204, "epoch": 0.15939750100553413, "grad_norm": 4.461379528045654, "learning_rate": 6.336597890140311e-06, "loss": 0.5283, "mean_token_accuracy": 0.8308456495404244, "num_tokens": 61809810.0, "step": 51420 }, { "entropy": 1.8830090552568435, "epoch": 0.15942850013058382, "grad_norm": 7.6069135665893555, "learning_rate": 6.335981807196195e-06, "loss": 0.4831, "mean_token_accuracy": 0.8455763354897499, "num_tokens": 61821564.0, "step": 51430 }, { "entropy": 1.879130232334137, "epoch": 0.15945949925563352, "grad_norm": 8.862757682800293, "learning_rate": 6.335365903915241e-06, "loss": 0.4903, "mean_token_accuracy": 0.841680321097374, "num_tokens": 61833721.0, "step": 51440 }, { "entropy": 1.8420701980590821, "epoch": 0.15949049838068322, "grad_norm": 9.544261932373047, "learning_rate": 6.334750180210142e-06, "loss": 0.4785, "mean_token_accuracy": 0.8434771358966827, "num_tokens": 61845911.0, "step": 51450 }, { "entropy": 1.8399369686841964, "epoch": 0.1595214975057329, "grad_norm": 8.646150588989258, "learning_rate": 6.334134635993651e-06, "loss": 0.4886, "mean_token_accuracy": 0.8362537592649459, "num_tokens": 61858452.0, "step": 51460 }, { "entropy": 1.8282588481903077, "epoch": 0.1595524966307826, "grad_norm": 8.844307899475098, "learning_rate": 6.333519271178583e-06, "loss": 0.4811, "mean_token_accuracy": 0.8485338285565376, "num_tokens": 61871207.0, "step": 51470 }, { "entropy": 1.9945002496242523, "epoch": 0.1595834957558323, "grad_norm": 9.600845336914062, "learning_rate": 6.332904085677809e-06, "loss": 0.6231, "mean_token_accuracy": 0.8192642524838447, "num_tokens": 61881849.0, "step": 51480 }, { "entropy": 1.866794577240944, "epoch": 0.15961449488088197, "grad_norm": 8.577037811279297, "learning_rate": 6.33228907940426e-06, "loss": 0.5177, "mean_token_accuracy": 0.8294419586658478, "num_tokens": 61894411.0, "step": 51490 }, { "entropy": 1.9821971401572227, "epoch": 0.15964549400593167, "grad_norm": 7.966568470001221, "learning_rate": 6.3316742522709295e-06, "loss": 0.5735, "mean_token_accuracy": 0.8234156042337417, "num_tokens": 61905974.0, "step": 51500 }, { "entropy": 1.928943158686161, "epoch": 0.15967649313098137, "grad_norm": 8.466790199279785, "learning_rate": 6.331059604190863e-06, "loss": 0.5206, "mean_token_accuracy": 0.83587566614151, "num_tokens": 61918230.0, "step": 51510 }, { "entropy": 1.8466455608606338, "epoch": 0.15970749225603106, "grad_norm": 8.579726219177246, "learning_rate": 6.330445135077171e-06, "loss": 0.4981, "mean_token_accuracy": 0.8351440250873565, "num_tokens": 61930533.0, "step": 51520 }, { "entropy": 1.9407551258802413, "epoch": 0.15973849138108076, "grad_norm": 9.855902671813965, "learning_rate": 6.329830844843021e-06, "loss": 0.5384, "mean_token_accuracy": 0.8276364892721176, "num_tokens": 61941838.0, "step": 51530 }, { "entropy": 1.8404507592320443, "epoch": 0.15976949050613046, "grad_norm": 7.8672261238098145, "learning_rate": 6.329216733401641e-06, "loss": 0.4824, "mean_token_accuracy": 0.8361625537276268, "num_tokens": 61954790.0, "step": 51540 }, { "entropy": 1.8961133405566215, "epoch": 0.15980048963118015, "grad_norm": 9.657519340515137, "learning_rate": 6.328602800666316e-06, "loss": 0.538, "mean_token_accuracy": 0.8254346460103988, "num_tokens": 61967113.0, "step": 51550 }, { "entropy": 1.887896779179573, "epoch": 0.15983148875622985, "grad_norm": 7.927542209625244, "learning_rate": 6.32798904655039e-06, "loss": 0.4913, "mean_token_accuracy": 0.8375117152929306, "num_tokens": 61979299.0, "step": 51560 }, { "entropy": 1.8948641210794448, "epoch": 0.15986248788127955, "grad_norm": 7.149417400360107, "learning_rate": 6.327375470967267e-06, "loss": 0.5088, "mean_token_accuracy": 0.8403451159596443, "num_tokens": 61990608.0, "step": 51570 }, { "entropy": 1.8972643151879311, "epoch": 0.15989348700632924, "grad_norm": 10.748580932617188, "learning_rate": 6.326762073830408e-06, "loss": 0.5441, "mean_token_accuracy": 0.8346004873514176, "num_tokens": 62002236.0, "step": 51580 }, { "entropy": 1.9104709938168525, "epoch": 0.15992448613137894, "grad_norm": 9.303431510925293, "learning_rate": 6.326148855053335e-06, "loss": 0.542, "mean_token_accuracy": 0.8291504085063934, "num_tokens": 62013316.0, "step": 51590 }, { "entropy": 1.9200339168310165, "epoch": 0.15995548525642864, "grad_norm": 8.650131225585938, "learning_rate": 6.325535814549628e-06, "loss": 0.5125, "mean_token_accuracy": 0.8344766482710838, "num_tokens": 62024979.0, "step": 51600 }, { "entropy": 1.9062774524092674, "epoch": 0.15998648438147833, "grad_norm": 9.20338249206543, "learning_rate": 6.324922952232924e-06, "loss": 0.5443, "mean_token_accuracy": 0.8257553368806839, "num_tokens": 62037183.0, "step": 51610 }, { "entropy": 1.8092211425304412, "epoch": 0.16001748350652803, "grad_norm": 8.952025413513184, "learning_rate": 6.32431026801692e-06, "loss": 0.4568, "mean_token_accuracy": 0.8433082342147827, "num_tokens": 62050479.0, "step": 51620 }, { "entropy": 1.9418270736932755, "epoch": 0.16004848263157773, "grad_norm": 9.070456504821777, "learning_rate": 6.3236977618153725e-06, "loss": 0.5269, "mean_token_accuracy": 0.8335362508893013, "num_tokens": 62062292.0, "step": 51630 }, { "entropy": 1.8717383340001106, "epoch": 0.16007948175662742, "grad_norm": 4.403903961181641, "learning_rate": 6.323085433542092e-06, "loss": 0.4676, "mean_token_accuracy": 0.8389642179012299, "num_tokens": 62074628.0, "step": 51640 }, { "entropy": 1.9106849119067193, "epoch": 0.16011048088167712, "grad_norm": 9.37865924835205, "learning_rate": 6.3224732831109535e-06, "loss": 0.5259, "mean_token_accuracy": 0.8324133858084679, "num_tokens": 62086818.0, "step": 51650 }, { "entropy": 1.885637989640236, "epoch": 0.16014148000672682, "grad_norm": 4.711241722106934, "learning_rate": 6.321861310435887e-06, "loss": 0.4828, "mean_token_accuracy": 0.8371865153312683, "num_tokens": 62098896.0, "step": 51660 }, { "entropy": 1.9405640929937362, "epoch": 0.1601724791317765, "grad_norm": 8.641385078430176, "learning_rate": 6.3212495154308804e-06, "loss": 0.5958, "mean_token_accuracy": 0.8278973504900933, "num_tokens": 62109979.0, "step": 51670 }, { "entropy": 1.934703852236271, "epoch": 0.1602034782568262, "grad_norm": 9.276317596435547, "learning_rate": 6.32063789800998e-06, "loss": 0.5185, "mean_token_accuracy": 0.8384066224098206, "num_tokens": 62121358.0, "step": 51680 }, { "entropy": 1.885528053343296, "epoch": 0.1602344773818759, "grad_norm": 8.702717781066895, "learning_rate": 6.320026458087292e-06, "loss": 0.5103, "mean_token_accuracy": 0.8399693235754967, "num_tokens": 62132820.0, "step": 51690 }, { "entropy": 1.955546210706234, "epoch": 0.1602654765069256, "grad_norm": 9.13160228729248, "learning_rate": 6.319415195576981e-06, "loss": 0.5602, "mean_token_accuracy": 0.8254757478833199, "num_tokens": 62143735.0, "step": 51700 }, { "entropy": 1.9146698236465454, "epoch": 0.1602964756319753, "grad_norm": 8.36129379272461, "learning_rate": 6.318804110393267e-06, "loss": 0.4777, "mean_token_accuracy": 0.8527568742632866, "num_tokens": 62155133.0, "step": 51710 }, { "entropy": 1.859630098938942, "epoch": 0.160327474757025, "grad_norm": 4.852166175842285, "learning_rate": 6.318193202450428e-06, "loss": 0.5001, "mean_token_accuracy": 0.8412252008914948, "num_tokens": 62167946.0, "step": 51720 }, { "entropy": 1.9424054473638535, "epoch": 0.16035847388207466, "grad_norm": 10.02141284942627, "learning_rate": 6.317582471662803e-06, "loss": 0.5314, "mean_token_accuracy": 0.839588788151741, "num_tokens": 62179109.0, "step": 51730 }, { "entropy": 1.9220136627554893, "epoch": 0.16038947300712436, "grad_norm": 12.059248924255371, "learning_rate": 6.3169719179447885e-06, "loss": 0.5252, "mean_token_accuracy": 0.8367831781506538, "num_tokens": 62190468.0, "step": 51740 }, { "entropy": 1.85854529440403, "epoch": 0.16042047213217406, "grad_norm": 9.116106986999512, "learning_rate": 6.316361541210837e-06, "loss": 0.4878, "mean_token_accuracy": 0.8309743300080299, "num_tokens": 62202630.0, "step": 51750 }, { "entropy": 1.9260679721832275, "epoch": 0.16045147125722375, "grad_norm": 9.218046188354492, "learning_rate": 6.315751341375458e-06, "loss": 0.5193, "mean_token_accuracy": 0.8390769004821778, "num_tokens": 62214408.0, "step": 51760 }, { "entropy": 1.9168349742889403, "epoch": 0.16048247038227345, "grad_norm": 8.362269401550293, "learning_rate": 6.315141318353224e-06, "loss": 0.5034, "mean_token_accuracy": 0.8344539046287537, "num_tokens": 62226824.0, "step": 51770 }, { "entropy": 1.847986987233162, "epoch": 0.16051346950732315, "grad_norm": 8.942157745361328, "learning_rate": 6.314531472058758e-06, "loss": 0.5114, "mean_token_accuracy": 0.8420057892799377, "num_tokens": 62239194.0, "step": 51780 }, { "entropy": 1.890682005882263, "epoch": 0.16054446863237284, "grad_norm": 8.703591346740723, "learning_rate": 6.313921802406747e-06, "loss": 0.5093, "mean_token_accuracy": 0.8423530831933022, "num_tokens": 62251150.0, "step": 51790 }, { "entropy": 1.8866778627038001, "epoch": 0.16057546775742254, "grad_norm": 9.444439888000488, "learning_rate": 6.313312309311932e-06, "loss": 0.5286, "mean_token_accuracy": 0.8364355772733688, "num_tokens": 62262711.0, "step": 51800 }, { "entropy": 1.8455047011375427, "epoch": 0.16060646688247224, "grad_norm": 8.43193244934082, "learning_rate": 6.312702992689113e-06, "loss": 0.4802, "mean_token_accuracy": 0.8457055032253266, "num_tokens": 62274824.0, "step": 51810 }, { "entropy": 1.850141017138958, "epoch": 0.16063746600752193, "grad_norm": 10.177520751953125, "learning_rate": 6.312093852453148e-06, "loss": 0.4953, "mean_token_accuracy": 0.832466046512127, "num_tokens": 62286953.0, "step": 51820 }, { "entropy": 1.8169025957584382, "epoch": 0.16066846513257163, "grad_norm": 2.502871036529541, "learning_rate": 6.31148488851895e-06, "loss": 0.4419, "mean_token_accuracy": 0.8574206903576851, "num_tokens": 62300292.0, "step": 51830 }, { "entropy": 1.8187232732772827, "epoch": 0.16069946425762133, "grad_norm": 7.720560550689697, "learning_rate": 6.3108761008014915e-06, "loss": 0.4664, "mean_token_accuracy": 0.8429305925965309, "num_tokens": 62312236.0, "step": 51840 }, { "entropy": 1.879136349260807, "epoch": 0.16073046338267102, "grad_norm": 7.850799083709717, "learning_rate": 6.310267489215804e-06, "loss": 0.4894, "mean_token_accuracy": 0.8317036911845207, "num_tokens": 62324178.0, "step": 51850 }, { "entropy": 1.7745792135596274, "epoch": 0.16076146250772072, "grad_norm": 4.761654853820801, "learning_rate": 6.309659053676972e-06, "loss": 0.4362, "mean_token_accuracy": 0.8426973283290863, "num_tokens": 62338322.0, "step": 51860 }, { "entropy": 1.8939239963889123, "epoch": 0.16079246163277042, "grad_norm": 3.76540470123291, "learning_rate": 6.309050794100141e-06, "loss": 0.5121, "mean_token_accuracy": 0.8374624997377396, "num_tokens": 62349756.0, "step": 51870 }, { "entropy": 1.9361850887537002, "epoch": 0.1608234607578201, "grad_norm": 7.414719581604004, "learning_rate": 6.308442710400513e-06, "loss": 0.5092, "mean_token_accuracy": 0.8424745500087738, "num_tokens": 62360166.0, "step": 51880 }, { "entropy": 1.816766545176506, "epoch": 0.1608544598828698, "grad_norm": 9.440898895263672, "learning_rate": 6.3078348024933465e-06, "loss": 0.4855, "mean_token_accuracy": 0.8390632942318916, "num_tokens": 62373447.0, "step": 51890 }, { "entropy": 1.9127420842647553, "epoch": 0.1608854590079195, "grad_norm": 8.341848373413086, "learning_rate": 6.307227070293956e-06, "loss": 0.5394, "mean_token_accuracy": 0.8356220990419387, "num_tokens": 62384460.0, "step": 51900 }, { "entropy": 1.8038682281970977, "epoch": 0.1609164581329692, "grad_norm": 3.9414637088775635, "learning_rate": 6.3066195137177146e-06, "loss": 0.4422, "mean_token_accuracy": 0.841181357204914, "num_tokens": 62397300.0, "step": 51910 }, { "entropy": 1.9235049843788148, "epoch": 0.1609474572580189, "grad_norm": 4.492739677429199, "learning_rate": 6.306012132680054e-06, "loss": 0.5551, "mean_token_accuracy": 0.8201109856367111, "num_tokens": 62409356.0, "step": 51920 }, { "entropy": 1.8290775090456008, "epoch": 0.1609784563830686, "grad_norm": 3.958709955215454, "learning_rate": 6.3054049270964605e-06, "loss": 0.4805, "mean_token_accuracy": 0.83324736058712, "num_tokens": 62423295.0, "step": 51930 }, { "entropy": 1.933160249888897, "epoch": 0.1610094555081183, "grad_norm": 7.670217037200928, "learning_rate": 6.304797896882477e-06, "loss": 0.5262, "mean_token_accuracy": 0.8253769144415856, "num_tokens": 62434328.0, "step": 51940 }, { "entropy": 1.8668658897280692, "epoch": 0.161040454633168, "grad_norm": 4.041083812713623, "learning_rate": 6.3041910419537055e-06, "loss": 0.4744, "mean_token_accuracy": 0.8454795464873314, "num_tokens": 62446093.0, "step": 51950 }, { "entropy": 1.9969589874148368, "epoch": 0.1610714537582177, "grad_norm": 10.19937801361084, "learning_rate": 6.3035843622258045e-06, "loss": 0.5644, "mean_token_accuracy": 0.8266312971711158, "num_tokens": 62457402.0, "step": 51960 }, { "entropy": 1.911833395063877, "epoch": 0.16110245288326738, "grad_norm": 9.627182006835938, "learning_rate": 6.302977857614485e-06, "loss": 0.5166, "mean_token_accuracy": 0.8356106966733933, "num_tokens": 62470041.0, "step": 51970 }, { "entropy": 1.9067854672670363, "epoch": 0.16113345200831705, "grad_norm": 3.0671586990356445, "learning_rate": 6.302371528035522e-06, "loss": 0.52, "mean_token_accuracy": 0.8428362473845482, "num_tokens": 62482431.0, "step": 51980 }, { "entropy": 1.9820441797375679, "epoch": 0.16116445113336675, "grad_norm": 11.670134544372559, "learning_rate": 6.301765373404741e-06, "loss": 0.5354, "mean_token_accuracy": 0.8310918644070625, "num_tokens": 62494113.0, "step": 51990 }, { "entropy": 1.9489408016204834, "epoch": 0.16119545025841645, "grad_norm": 9.345502853393555, "learning_rate": 6.301159393638029e-06, "loss": 0.5212, "mean_token_accuracy": 0.8365255728363991, "num_tokens": 62505895.0, "step": 52000 }, { "entropy": 1.9901140004396438, "epoch": 0.16122644938346614, "grad_norm": 8.158758163452148, "learning_rate": 6.300553588651326e-06, "loss": 0.5582, "mean_token_accuracy": 0.8245610594749451, "num_tokens": 62516556.0, "step": 52010 }, { "entropy": 1.8812553852796554, "epoch": 0.16125744850851584, "grad_norm": 4.510929584503174, "learning_rate": 6.2999479583606295e-06, "loss": 0.4991, "mean_token_accuracy": 0.8425406232476235, "num_tokens": 62529781.0, "step": 52020 }, { "entropy": 1.9245860621333122, "epoch": 0.16128844763356553, "grad_norm": 5.81223726272583, "learning_rate": 6.299342502681993e-06, "loss": 0.5076, "mean_token_accuracy": 0.8385741889476777, "num_tokens": 62541343.0, "step": 52030 }, { "entropy": 1.8486530616879464, "epoch": 0.16131944675861523, "grad_norm": 9.269059181213379, "learning_rate": 6.298737221531529e-06, "loss": 0.4251, "mean_token_accuracy": 0.8453450709581375, "num_tokens": 62554053.0, "step": 52040 }, { "entropy": 1.9573129430413245, "epoch": 0.16135044588366493, "grad_norm": 4.898494720458984, "learning_rate": 6.298132114825405e-06, "loss": 0.5759, "mean_token_accuracy": 0.8290790885686874, "num_tokens": 62565444.0, "step": 52050 }, { "entropy": 1.9822146385908126, "epoch": 0.16138144500871462, "grad_norm": 9.721142768859863, "learning_rate": 6.2975271824798425e-06, "loss": 0.5361, "mean_token_accuracy": 0.8361834734678268, "num_tokens": 62576680.0, "step": 52060 }, { "entropy": 1.9362813726067543, "epoch": 0.16141244413376432, "grad_norm": 9.152141571044922, "learning_rate": 6.2969224244111225e-06, "loss": 0.524, "mean_token_accuracy": 0.8337451457977295, "num_tokens": 62588501.0, "step": 52070 }, { "entropy": 1.9137926280498505, "epoch": 0.16144344325881402, "grad_norm": 7.9217047691345215, "learning_rate": 6.296317840535582e-06, "loss": 0.4653, "mean_token_accuracy": 0.8420775726437568, "num_tokens": 62601151.0, "step": 52080 }, { "entropy": 2.0264596685767176, "epoch": 0.16147444238386371, "grad_norm": 9.242731094360352, "learning_rate": 6.295713430769611e-06, "loss": 0.5702, "mean_token_accuracy": 0.8302224919199943, "num_tokens": 62612685.0, "step": 52090 }, { "entropy": 1.9726679027080536, "epoch": 0.1615054415089134, "grad_norm": 3.5792272090911865, "learning_rate": 6.29510919502966e-06, "loss": 0.5095, "mean_token_accuracy": 0.8441024616360664, "num_tokens": 62624794.0, "step": 52100 }, { "entropy": 2.029518485069275, "epoch": 0.1615364406339631, "grad_norm": 9.071422576904297, "learning_rate": 6.294505133232234e-06, "loss": 0.6116, "mean_token_accuracy": 0.815070490539074, "num_tokens": 62636376.0, "step": 52110 }, { "entropy": 1.9669608414173125, "epoch": 0.1615674397590128, "grad_norm": 8.340578079223633, "learning_rate": 6.293901245293893e-06, "loss": 0.5875, "mean_token_accuracy": 0.827280244231224, "num_tokens": 62648432.0, "step": 52120 }, { "entropy": 1.9860535800457, "epoch": 0.1615984388840625, "grad_norm": 8.199106216430664, "learning_rate": 6.293297531131253e-06, "loss": 0.5733, "mean_token_accuracy": 0.83171975761652, "num_tokens": 62660966.0, "step": 52130 }, { "entropy": 2.0739176899194716, "epoch": 0.1616294380091122, "grad_norm": 10.330364227294922, "learning_rate": 6.292693990660986e-06, "loss": 0.5954, "mean_token_accuracy": 0.8177470460534095, "num_tokens": 62671756.0, "step": 52140 }, { "entropy": 1.9170552849769593, "epoch": 0.1616604371341619, "grad_norm": 8.208378791809082, "learning_rate": 6.292090623799823e-06, "loss": 0.4852, "mean_token_accuracy": 0.8416977405548096, "num_tokens": 62683638.0, "step": 52150 }, { "entropy": 1.8894746892154217, "epoch": 0.1616914362592116, "grad_norm": 6.65895414352417, "learning_rate": 6.291487430464548e-06, "loss": 0.4755, "mean_token_accuracy": 0.845618699491024, "num_tokens": 62696586.0, "step": 52160 }, { "entropy": 1.8414909780025481, "epoch": 0.1617224353842613, "grad_norm": 9.130861282348633, "learning_rate": 6.290884410572e-06, "loss": 0.4207, "mean_token_accuracy": 0.8525190591812134, "num_tokens": 62709495.0, "step": 52170 }, { "entropy": 1.9621912389993668, "epoch": 0.16175343450931098, "grad_norm": 9.641398429870605, "learning_rate": 6.290281564039078e-06, "loss": 0.551, "mean_token_accuracy": 0.8189822494983673, "num_tokens": 62721137.0, "step": 52180 }, { "entropy": 1.869258552789688, "epoch": 0.16178443363436068, "grad_norm": 5.695123672485352, "learning_rate": 6.28967889078273e-06, "loss": 0.5037, "mean_token_accuracy": 0.8320344671607017, "num_tokens": 62734333.0, "step": 52190 }, { "entropy": 1.860531361401081, "epoch": 0.16181543275941038, "grad_norm": 7.216394424438477, "learning_rate": 6.289076390719966e-06, "loss": 0.4677, "mean_token_accuracy": 0.8472234353423118, "num_tokens": 62746055.0, "step": 52200 }, { "entropy": 1.9746491819620133, "epoch": 0.16184643188446007, "grad_norm": 9.281171798706055, "learning_rate": 6.2884740637678486e-06, "loss": 0.5562, "mean_token_accuracy": 0.8372055932879447, "num_tokens": 62757053.0, "step": 52210 }, { "entropy": 1.9339981719851493, "epoch": 0.16187743100950977, "grad_norm": 10.073769569396973, "learning_rate": 6.2878719098434975e-06, "loss": 0.577, "mean_token_accuracy": 0.8267990037798881, "num_tokens": 62768818.0, "step": 52220 }, { "entropy": 1.75873264670372, "epoch": 0.16190843013455944, "grad_norm": 3.918210744857788, "learning_rate": 6.287269928864085e-06, "loss": 0.4269, "mean_token_accuracy": 0.8522544384002686, "num_tokens": 62782705.0, "step": 52230 }, { "entropy": 1.9759118229150772, "epoch": 0.16193942925960914, "grad_norm": 8.20750904083252, "learning_rate": 6.286668120746842e-06, "loss": 0.5464, "mean_token_accuracy": 0.8221010401844978, "num_tokens": 62794524.0, "step": 52240 }, { "entropy": 1.8256248250603675, "epoch": 0.16197042838465883, "grad_norm": 3.0523853302001953, "learning_rate": 6.286066485409056e-06, "loss": 0.4388, "mean_token_accuracy": 0.849830961227417, "num_tokens": 62807370.0, "step": 52250 }, { "entropy": 1.864981435239315, "epoch": 0.16200142750970853, "grad_norm": 8.436219215393066, "learning_rate": 6.285465022768064e-06, "loss": 0.524, "mean_token_accuracy": 0.8273715361952781, "num_tokens": 62820130.0, "step": 52260 }, { "entropy": 1.9281801611185074, "epoch": 0.16203242663475823, "grad_norm": 8.644983291625977, "learning_rate": 6.284863732741263e-06, "loss": 0.4807, "mean_token_accuracy": 0.8462273836135864, "num_tokens": 62832096.0, "step": 52270 }, { "entropy": 1.9527894288301468, "epoch": 0.16206342575980792, "grad_norm": 9.219314575195312, "learning_rate": 6.284262615246107e-06, "loss": 0.5651, "mean_token_accuracy": 0.8327635273337364, "num_tokens": 62843243.0, "step": 52280 }, { "entropy": 1.823768149316311, "epoch": 0.16209442488485762, "grad_norm": 3.6631810665130615, "learning_rate": 6.283661670200099e-06, "loss": 0.4393, "mean_token_accuracy": 0.8509679839015007, "num_tokens": 62856797.0, "step": 52290 }, { "entropy": 1.9193030267953872, "epoch": 0.16212542400990732, "grad_norm": 19.060598373413086, "learning_rate": 6.283060897520804e-06, "loss": 0.5283, "mean_token_accuracy": 0.8333917066454888, "num_tokens": 62869635.0, "step": 52300 }, { "entropy": 1.9955947354435921, "epoch": 0.162156423134957, "grad_norm": 4.73981237411499, "learning_rate": 6.282460297125835e-06, "loss": 0.5073, "mean_token_accuracy": 0.8443202391266823, "num_tokens": 62881179.0, "step": 52310 }, { "entropy": 1.9485764712095262, "epoch": 0.1621874222600067, "grad_norm": 8.990504264831543, "learning_rate": 6.281859868932869e-06, "loss": 0.5494, "mean_token_accuracy": 0.8265508517622948, "num_tokens": 62893144.0, "step": 52320 }, { "entropy": 1.9622452184557915, "epoch": 0.1622184213850564, "grad_norm": 8.875653266906738, "learning_rate": 6.281259612859629e-06, "loss": 0.5053, "mean_token_accuracy": 0.8386439487338067, "num_tokens": 62904857.0, "step": 52330 }, { "entropy": 1.889234210550785, "epoch": 0.1622494205101061, "grad_norm": 10.868480682373047, "learning_rate": 6.2806595288239e-06, "loss": 0.4941, "mean_token_accuracy": 0.8372628569602967, "num_tokens": 62917145.0, "step": 52340 }, { "entropy": 1.8264701277017594, "epoch": 0.1622804196351558, "grad_norm": 9.613685607910156, "learning_rate": 6.2800596167435165e-06, "loss": 0.4617, "mean_token_accuracy": 0.850235678255558, "num_tokens": 62930488.0, "step": 52350 }, { "entropy": 1.854723860323429, "epoch": 0.1623114187602055, "grad_norm": 9.742049217224121, "learning_rate": 6.279459876536374e-06, "loss": 0.4825, "mean_token_accuracy": 0.8460089027881622, "num_tokens": 62942658.0, "step": 52360 }, { "entropy": 1.9286055207252502, "epoch": 0.1623424178852552, "grad_norm": 4.1667304039001465, "learning_rate": 6.278860308120416e-06, "loss": 0.5199, "mean_token_accuracy": 0.8262563273310661, "num_tokens": 62953922.0, "step": 52370 }, { "entropy": 1.7510716319084167, "epoch": 0.1623734170103049, "grad_norm": 8.469949722290039, "learning_rate": 6.278260911413646e-06, "loss": 0.415, "mean_token_accuracy": 0.8467779159545898, "num_tokens": 62967483.0, "step": 52380 }, { "entropy": 1.9073357090353966, "epoch": 0.16240441613535458, "grad_norm": 8.063495635986328, "learning_rate": 6.27766168633412e-06, "loss": 0.5298, "mean_token_accuracy": 0.8321097016334533, "num_tokens": 62979720.0, "step": 52390 }, { "entropy": 1.9408669352531434, "epoch": 0.16243541526040428, "grad_norm": 10.30274772644043, "learning_rate": 6.277062632799949e-06, "loss": 0.515, "mean_token_accuracy": 0.8371255546808243, "num_tokens": 62991264.0, "step": 52400 }, { "entropy": 1.9547599002718925, "epoch": 0.16246641438545398, "grad_norm": 8.85878849029541, "learning_rate": 6.276463750729301e-06, "loss": 0.5293, "mean_token_accuracy": 0.8315024048089981, "num_tokens": 63003018.0, "step": 52410 }, { "entropy": 1.9163083508610725, "epoch": 0.16249741351050367, "grad_norm": 9.163949966430664, "learning_rate": 6.2758650400403964e-06, "loss": 0.4808, "mean_token_accuracy": 0.8458400592207909, "num_tokens": 63014741.0, "step": 52420 }, { "entropy": 1.9846520096063613, "epoch": 0.16252841263555337, "grad_norm": 7.502941131591797, "learning_rate": 6.275266500651508e-06, "loss": 0.5835, "mean_token_accuracy": 0.8262620836496353, "num_tokens": 63025808.0, "step": 52430 }, { "entropy": 1.8606478095054626, "epoch": 0.16255941176060307, "grad_norm": 4.25178861618042, "learning_rate": 6.274668132480967e-06, "loss": 0.4987, "mean_token_accuracy": 0.8410593420267105, "num_tokens": 63038607.0, "step": 52440 }, { "entropy": 1.8719539806246757, "epoch": 0.16259041088565276, "grad_norm": 4.510512828826904, "learning_rate": 6.274069935447157e-06, "loss": 0.491, "mean_token_accuracy": 0.8395673424005509, "num_tokens": 63050950.0, "step": 52450 }, { "entropy": 1.8893551722168922, "epoch": 0.16262141001070246, "grad_norm": 3.8029367923736572, "learning_rate": 6.273471909468518e-06, "loss": 0.487, "mean_token_accuracy": 0.8418749555945396, "num_tokens": 63063475.0, "step": 52460 }, { "entropy": 1.8528440594673157, "epoch": 0.16265240913575213, "grad_norm": 8.126982688903809, "learning_rate": 6.272874054463543e-06, "loss": 0.458, "mean_token_accuracy": 0.8420227542519569, "num_tokens": 63076738.0, "step": 52470 }, { "entropy": 1.9618181601166724, "epoch": 0.16268340826080183, "grad_norm": 11.542217254638672, "learning_rate": 6.272276370350776e-06, "loss": 0.5376, "mean_token_accuracy": 0.8369494050741195, "num_tokens": 63087700.0, "step": 52480 }, { "entropy": 1.9238998390734197, "epoch": 0.16271440738585152, "grad_norm": 7.640765190124512, "learning_rate": 6.271678857048824e-06, "loss": 0.5036, "mean_token_accuracy": 0.8411808148026466, "num_tokens": 63099944.0, "step": 52490 }, { "entropy": 1.888495209813118, "epoch": 0.16274540651090122, "grad_norm": 7.534304141998291, "learning_rate": 6.271081514476341e-06, "loss": 0.4903, "mean_token_accuracy": 0.8466553881764411, "num_tokens": 63112697.0, "step": 52500 }, { "entropy": 1.9347805023193358, "epoch": 0.16277640563595092, "grad_norm": 7.860371112823486, "learning_rate": 6.270484342552038e-06, "loss": 0.535, "mean_token_accuracy": 0.8298186644911766, "num_tokens": 63124515.0, "step": 52510 }, { "entropy": 1.8718351736664771, "epoch": 0.1628074047610006, "grad_norm": 11.632791519165039, "learning_rate": 6.269887341194678e-06, "loss": 0.4855, "mean_token_accuracy": 0.8416498139500618, "num_tokens": 63137607.0, "step": 52520 }, { "entropy": 1.8613640531897544, "epoch": 0.1628384038860503, "grad_norm": 5.130488872528076, "learning_rate": 6.269290510323079e-06, "loss": 0.5124, "mean_token_accuracy": 0.8313415363430977, "num_tokens": 63150710.0, "step": 52530 }, { "entropy": 1.8332793086767196, "epoch": 0.1628694030111, "grad_norm": 8.796858787536621, "learning_rate": 6.2686938498561155e-06, "loss": 0.4807, "mean_token_accuracy": 0.8430872440338135, "num_tokens": 63163425.0, "step": 52540 }, { "entropy": 1.8761805295944214, "epoch": 0.1629004021361497, "grad_norm": 4.210749626159668, "learning_rate": 6.268097359712715e-06, "loss": 0.4861, "mean_token_accuracy": 0.8424938842654228, "num_tokens": 63175765.0, "step": 52550 }, { "entropy": 1.9241129815578462, "epoch": 0.1629314012611994, "grad_norm": 7.9560465812683105, "learning_rate": 6.267501039811856e-06, "loss": 0.5341, "mean_token_accuracy": 0.836355759203434, "num_tokens": 63187161.0, "step": 52560 }, { "entropy": 1.9873510360717774, "epoch": 0.1629624003862491, "grad_norm": 8.924347877502441, "learning_rate": 6.2669048900725745e-06, "loss": 0.6066, "mean_token_accuracy": 0.8214996859431267, "num_tokens": 63198468.0, "step": 52570 }, { "entropy": 1.9314937353134156, "epoch": 0.1629933995112988, "grad_norm": 5.659384727478027, "learning_rate": 6.266308910413959e-06, "loss": 0.5159, "mean_token_accuracy": 0.8333692610263824, "num_tokens": 63210134.0, "step": 52580 }, { "entropy": 2.028110182285309, "epoch": 0.1630243986363485, "grad_norm": 8.73976993560791, "learning_rate": 6.2657131007551516e-06, "loss": 0.6023, "mean_token_accuracy": 0.8240557476878166, "num_tokens": 63221318.0, "step": 52590 }, { "entropy": 1.8795430675148963, "epoch": 0.16305539776139819, "grad_norm": 7.525078296661377, "learning_rate": 6.265117461015348e-06, "loss": 0.4751, "mean_token_accuracy": 0.8401163816452026, "num_tokens": 63234281.0, "step": 52600 }, { "entropy": 1.9891944900155067, "epoch": 0.16308639688644788, "grad_norm": 8.702574729919434, "learning_rate": 6.2645219911138e-06, "loss": 0.5248, "mean_token_accuracy": 0.8414581835269928, "num_tokens": 63245528.0, "step": 52610 }, { "entropy": 1.9499772146344185, "epoch": 0.16311739601149758, "grad_norm": 8.397680282592773, "learning_rate": 6.263926690969809e-06, "loss": 0.5396, "mean_token_accuracy": 0.8338237330317497, "num_tokens": 63257825.0, "step": 52620 }, { "entropy": 1.9680059745907783, "epoch": 0.16314839513654728, "grad_norm": 8.269797325134277, "learning_rate": 6.263331560502734e-06, "loss": 0.5671, "mean_token_accuracy": 0.8288321003317833, "num_tokens": 63269246.0, "step": 52630 }, { "entropy": 1.9180641785264014, "epoch": 0.16317939426159697, "grad_norm": 7.361063003540039, "learning_rate": 6.262736599631985e-06, "loss": 0.4958, "mean_token_accuracy": 0.8423007354140282, "num_tokens": 63281078.0, "step": 52640 }, { "entropy": 1.9701405078172685, "epoch": 0.16321039338664667, "grad_norm": 9.259462356567383, "learning_rate": 6.262141808277028e-06, "loss": 0.5578, "mean_token_accuracy": 0.8284604266285897, "num_tokens": 63292492.0, "step": 52650 }, { "entropy": 1.8732146829366685, "epoch": 0.16324139251169636, "grad_norm": 4.93553352355957, "learning_rate": 6.261547186357378e-06, "loss": 0.5442, "mean_token_accuracy": 0.8330485403537751, "num_tokens": 63305365.0, "step": 52660 }, { "entropy": 1.8874418213963509, "epoch": 0.16327239163674606, "grad_norm": 12.51389217376709, "learning_rate": 6.260952733792611e-06, "loss": 0.4924, "mean_token_accuracy": 0.8417252108454705, "num_tokens": 63317381.0, "step": 52670 }, { "entropy": 1.9065624982118607, "epoch": 0.16330339076179576, "grad_norm": 4.349983215332031, "learning_rate": 6.26035845050235e-06, "loss": 0.4946, "mean_token_accuracy": 0.83489740639925, "num_tokens": 63329554.0, "step": 52680 }, { "entropy": 1.9560609757900238, "epoch": 0.16333438988684545, "grad_norm": 8.065526008605957, "learning_rate": 6.259764336406272e-06, "loss": 0.5565, "mean_token_accuracy": 0.8307097449898719, "num_tokens": 63341290.0, "step": 52690 }, { "entropy": 1.9741875648498535, "epoch": 0.16336538901189515, "grad_norm": 4.957176685333252, "learning_rate": 6.259170391424109e-06, "loss": 0.5308, "mean_token_accuracy": 0.8235329478979111, "num_tokens": 63352756.0, "step": 52700 }, { "entropy": 1.897114197909832, "epoch": 0.16339638813694485, "grad_norm": 7.078670024871826, "learning_rate": 6.25857661547565e-06, "loss": 0.4845, "mean_token_accuracy": 0.8442688778042793, "num_tokens": 63364598.0, "step": 52710 }, { "entropy": 1.9124154165387153, "epoch": 0.16342738726199452, "grad_norm": 8.284811973571777, "learning_rate": 6.257983008480728e-06, "loss": 0.5272, "mean_token_accuracy": 0.8364136472344399, "num_tokens": 63376703.0, "step": 52720 }, { "entropy": 1.9177636936306954, "epoch": 0.1634583863870442, "grad_norm": 6.819045066833496, "learning_rate": 6.257389570359238e-06, "loss": 0.4727, "mean_token_accuracy": 0.846918910741806, "num_tokens": 63389432.0, "step": 52730 }, { "entropy": 1.9418374314904212, "epoch": 0.1634893855120939, "grad_norm": 9.605525970458984, "learning_rate": 6.256796301031124e-06, "loss": 0.5572, "mean_token_accuracy": 0.8311137303709983, "num_tokens": 63401131.0, "step": 52740 }, { "entropy": 1.8737993687391281, "epoch": 0.1635203846371436, "grad_norm": 8.54281234741211, "learning_rate": 6.256203200416383e-06, "loss": 0.5133, "mean_token_accuracy": 0.8313992112874985, "num_tokens": 63413232.0, "step": 52750 }, { "entropy": 1.8898098900914193, "epoch": 0.1635513837621933, "grad_norm": 8.377137184143066, "learning_rate": 6.255610268435066e-06, "loss": 0.5053, "mean_token_accuracy": 0.8360078081488609, "num_tokens": 63425498.0, "step": 52760 }, { "entropy": 1.9383550241589547, "epoch": 0.163582382887243, "grad_norm": 8.232552528381348, "learning_rate": 6.255017505007278e-06, "loss": 0.5117, "mean_token_accuracy": 0.8423770934343338, "num_tokens": 63436970.0, "step": 52770 }, { "entropy": 1.9232452899217605, "epoch": 0.1636133820122927, "grad_norm": 7.204452991485596, "learning_rate": 6.254424910053175e-06, "loss": 0.5308, "mean_token_accuracy": 0.8377453580498695, "num_tokens": 63448237.0, "step": 52780 }, { "entropy": 1.9503621995449065, "epoch": 0.1636443811373424, "grad_norm": 7.743916034698486, "learning_rate": 6.253832483492968e-06, "loss": 0.604, "mean_token_accuracy": 0.818188302218914, "num_tokens": 63459542.0, "step": 52790 }, { "entropy": 1.9227148860692977, "epoch": 0.1636753802623921, "grad_norm": 8.97805404663086, "learning_rate": 6.253240225246917e-06, "loss": 0.5535, "mean_token_accuracy": 0.835084454715252, "num_tokens": 63470819.0, "step": 52800 }, { "entropy": 1.8931695908308028, "epoch": 0.16370637938744179, "grad_norm": 8.5215482711792, "learning_rate": 6.25264813523534e-06, "loss": 0.4692, "mean_token_accuracy": 0.841637410223484, "num_tokens": 63483021.0, "step": 52810 }, { "entropy": 1.9346808150410653, "epoch": 0.16373737851249148, "grad_norm": 3.57350754737854, "learning_rate": 6.252056213378607e-06, "loss": 0.5261, "mean_token_accuracy": 0.8383998513221741, "num_tokens": 63495186.0, "step": 52820 }, { "entropy": 1.8804344907402992, "epoch": 0.16376837763754118, "grad_norm": 8.589835166931152, "learning_rate": 6.251464459597134e-06, "loss": 0.4566, "mean_token_accuracy": 0.8427335307002067, "num_tokens": 63507869.0, "step": 52830 }, { "entropy": 1.8359813764691353, "epoch": 0.16379937676259088, "grad_norm": 9.559521675109863, "learning_rate": 6.2508728738114e-06, "loss": 0.4589, "mean_token_accuracy": 0.8516914382576942, "num_tokens": 63519894.0, "step": 52840 }, { "entropy": 1.9041066259145736, "epoch": 0.16383037588764057, "grad_norm": 4.24696683883667, "learning_rate": 6.250281455941929e-06, "loss": 0.5371, "mean_token_accuracy": 0.8393527343869209, "num_tokens": 63532566.0, "step": 52850 }, { "entropy": 1.857696245610714, "epoch": 0.16386137501269027, "grad_norm": 13.18340015411377, "learning_rate": 6.249690205909301e-06, "loss": 0.4958, "mean_token_accuracy": 0.8314845860004425, "num_tokens": 63545311.0, "step": 52860 }, { "entropy": 1.9533265694975852, "epoch": 0.16389237413773997, "grad_norm": 8.314949035644531, "learning_rate": 6.249099123634147e-06, "loss": 0.5793, "mean_token_accuracy": 0.8232409819960594, "num_tokens": 63556591.0, "step": 52870 }, { "entropy": 1.9042376592755317, "epoch": 0.16392337326278966, "grad_norm": 7.390749454498291, "learning_rate": 6.248508209037151e-06, "loss": 0.5547, "mean_token_accuracy": 0.827096089720726, "num_tokens": 63568839.0, "step": 52880 }, { "entropy": 1.8934383913874626, "epoch": 0.16395437238783936, "grad_norm": 8.171467781066895, "learning_rate": 6.2479174620390516e-06, "loss": 0.5367, "mean_token_accuracy": 0.8317389547824859, "num_tokens": 63580636.0, "step": 52890 }, { "entropy": 1.8028965070843697, "epoch": 0.16398537151288906, "grad_norm": 9.49335765838623, "learning_rate": 6.247326882560637e-06, "loss": 0.5213, "mean_token_accuracy": 0.8371429294347763, "num_tokens": 63593629.0, "step": 52900 }, { "entropy": 1.848221817612648, "epoch": 0.16401637063793875, "grad_norm": 8.622106552124023, "learning_rate": 6.246736470522748e-06, "loss": 0.4548, "mean_token_accuracy": 0.8468390449881553, "num_tokens": 63606139.0, "step": 52910 }, { "entropy": 1.8176067188382148, "epoch": 0.16404736976298845, "grad_norm": 9.547903060913086, "learning_rate": 6.2461462258462804e-06, "loss": 0.4316, "mean_token_accuracy": 0.854963929951191, "num_tokens": 63619469.0, "step": 52920 }, { "entropy": 1.9598242774605752, "epoch": 0.16407836888803815, "grad_norm": 8.696121215820312, "learning_rate": 6.245556148452177e-06, "loss": 0.5457, "mean_token_accuracy": 0.8299388602375984, "num_tokens": 63630770.0, "step": 52930 }, { "entropy": 1.9861217468976975, "epoch": 0.16410936801308784, "grad_norm": 7.65268611907959, "learning_rate": 6.244966238261442e-06, "loss": 0.5992, "mean_token_accuracy": 0.8203268766403198, "num_tokens": 63641633.0, "step": 52940 }, { "entropy": 1.933871328830719, "epoch": 0.16414036713813754, "grad_norm": 4.0689263343811035, "learning_rate": 6.2443764951951215e-06, "loss": 0.5137, "mean_token_accuracy": 0.8379737511277199, "num_tokens": 63653263.0, "step": 52950 }, { "entropy": 1.924587444961071, "epoch": 0.16417136626318724, "grad_norm": 9.592199325561523, "learning_rate": 6.24378691917432e-06, "loss": 0.5423, "mean_token_accuracy": 0.8331802487373352, "num_tokens": 63664170.0, "step": 52960 }, { "entropy": 1.922967004776001, "epoch": 0.1642023653882369, "grad_norm": 3.9042298793792725, "learning_rate": 6.2431975101201926e-06, "loss": 0.534, "mean_token_accuracy": 0.8360741794109344, "num_tokens": 63675465.0, "step": 52970 }, { "entropy": 1.8445443019270897, "epoch": 0.1642333645132866, "grad_norm": 7.786768913269043, "learning_rate": 6.242608267953947e-06, "loss": 0.4816, "mean_token_accuracy": 0.8440708309412003, "num_tokens": 63688438.0, "step": 52980 }, { "entropy": 1.9318128436803819, "epoch": 0.1642643636383363, "grad_norm": 7.483189582824707, "learning_rate": 6.242019192596842e-06, "loss": 0.5058, "mean_token_accuracy": 0.8481845185160637, "num_tokens": 63700364.0, "step": 52990 }, { "entropy": 1.9713671594858169, "epoch": 0.164295362763386, "grad_norm": 7.716215133666992, "learning_rate": 6.241430283970189e-06, "loss": 0.5784, "mean_token_accuracy": 0.8340044632554054, "num_tokens": 63711655.0, "step": 53000 }, { "entropy": 1.916495531797409, "epoch": 0.1643263618884357, "grad_norm": 7.8675971031188965, "learning_rate": 6.24084154199535e-06, "loss": 0.5383, "mean_token_accuracy": 0.8242413744330406, "num_tokens": 63723415.0, "step": 53010 }, { "entropy": 1.8746812611818313, "epoch": 0.1643573610134854, "grad_norm": 4.13045072555542, "learning_rate": 6.240252966593741e-06, "loss": 0.4903, "mean_token_accuracy": 0.8390706807374955, "num_tokens": 63735381.0, "step": 53020 }, { "entropy": 1.9430684581398965, "epoch": 0.16438836013853508, "grad_norm": 8.874407768249512, "learning_rate": 6.23966455768683e-06, "loss": 0.5869, "mean_token_accuracy": 0.8245584949851036, "num_tokens": 63746701.0, "step": 53030 }, { "entropy": 1.9074441373348237, "epoch": 0.16441935926358478, "grad_norm": 3.734499454498291, "learning_rate": 6.239076315196135e-06, "loss": 0.5123, "mean_token_accuracy": 0.8352471068501472, "num_tokens": 63758681.0, "step": 53040 }, { "entropy": 1.9198872432112695, "epoch": 0.16445035838863448, "grad_norm": 7.779430866241455, "learning_rate": 6.2384882390432265e-06, "loss": 0.5311, "mean_token_accuracy": 0.8286612689495086, "num_tokens": 63770142.0, "step": 53050 }, { "entropy": 1.9856867283582686, "epoch": 0.16448135751368417, "grad_norm": 8.502123832702637, "learning_rate": 6.2379003291497265e-06, "loss": 0.5602, "mean_token_accuracy": 0.8293860018253326, "num_tokens": 63781495.0, "step": 53060 }, { "entropy": 1.8787162870168685, "epoch": 0.16451235663873387, "grad_norm": 3.1227447986602783, "learning_rate": 6.237312585437309e-06, "loss": 0.5165, "mean_token_accuracy": 0.8432197883725167, "num_tokens": 63793956.0, "step": 53070 }, { "entropy": 1.8861326739192008, "epoch": 0.16454335576378357, "grad_norm": 9.353623390197754, "learning_rate": 6.236725007827702e-06, "loss": 0.4855, "mean_token_accuracy": 0.8440233051776886, "num_tokens": 63806818.0, "step": 53080 }, { "entropy": 1.7985064759850502, "epoch": 0.16457435488883326, "grad_norm": 10.218011856079102, "learning_rate": 6.23613759624268e-06, "loss": 0.4521, "mean_token_accuracy": 0.8478423848748207, "num_tokens": 63820679.0, "step": 53090 }, { "entropy": 1.8236029118299484, "epoch": 0.16460535401388296, "grad_norm": 4.523934841156006, "learning_rate": 6.235550350604071e-06, "loss": 0.4179, "mean_token_accuracy": 0.8463995724916458, "num_tokens": 63833871.0, "step": 53100 }, { "entropy": 1.8776141807436943, "epoch": 0.16463635313893266, "grad_norm": 4.4371161460876465, "learning_rate": 6.234963270833758e-06, "loss": 0.474, "mean_token_accuracy": 0.8449958503246308, "num_tokens": 63845568.0, "step": 53110 }, { "entropy": 1.8257340744137764, "epoch": 0.16466735226398235, "grad_norm": 9.007696151733398, "learning_rate": 6.234376356853673e-06, "loss": 0.431, "mean_token_accuracy": 0.8533379226922989, "num_tokens": 63858077.0, "step": 53120 }, { "entropy": 1.8883861318230628, "epoch": 0.16469835138903205, "grad_norm": 8.40960693359375, "learning_rate": 6.233789608585796e-06, "loss": 0.5674, "mean_token_accuracy": 0.8298670023679733, "num_tokens": 63870245.0, "step": 53130 }, { "entropy": 1.993936476111412, "epoch": 0.16472935051408175, "grad_norm": 9.627812385559082, "learning_rate": 6.233203025952166e-06, "loss": 0.6206, "mean_token_accuracy": 0.8264122769236565, "num_tokens": 63881175.0, "step": 53140 }, { "entropy": 1.8338624104857444, "epoch": 0.16476034963913144, "grad_norm": 10.100017547607422, "learning_rate": 6.232616608874865e-06, "loss": 0.4989, "mean_token_accuracy": 0.8356154605746269, "num_tokens": 63894063.0, "step": 53150 }, { "entropy": 1.8793131604790687, "epoch": 0.16479134876418114, "grad_norm": 8.098730087280273, "learning_rate": 6.232030357276034e-06, "loss": 0.4788, "mean_token_accuracy": 0.8461975052952766, "num_tokens": 63905919.0, "step": 53160 }, { "entropy": 1.9642018005251884, "epoch": 0.16482234788923084, "grad_norm": 7.914994716644287, "learning_rate": 6.231444271077859e-06, "loss": 0.555, "mean_token_accuracy": 0.829823549091816, "num_tokens": 63916952.0, "step": 53170 }, { "entropy": 1.8831199273467063, "epoch": 0.16485334701428053, "grad_norm": 10.928045272827148, "learning_rate": 6.23085835020258e-06, "loss": 0.5219, "mean_token_accuracy": 0.8363705515861511, "num_tokens": 63928969.0, "step": 53180 }, { "entropy": 1.9851090759038925, "epoch": 0.16488434613933023, "grad_norm": 9.08737564086914, "learning_rate": 6.230272594572488e-06, "loss": 0.5901, "mean_token_accuracy": 0.8261380925774574, "num_tokens": 63939337.0, "step": 53190 }, { "entropy": 1.908937330543995, "epoch": 0.16491534526437993, "grad_norm": 11.058307647705078, "learning_rate": 6.229687004109927e-06, "loss": 0.5673, "mean_token_accuracy": 0.8325806826353073, "num_tokens": 63951300.0, "step": 53200 }, { "entropy": 1.9652216613292695, "epoch": 0.16494634438942962, "grad_norm": 9.384000778198242, "learning_rate": 6.229101578737288e-06, "loss": 0.5616, "mean_token_accuracy": 0.8316969543695449, "num_tokens": 63961965.0, "step": 53210 }, { "entropy": 1.9054602891206742, "epoch": 0.1649773435144793, "grad_norm": 4.693671226501465, "learning_rate": 6.228516318377016e-06, "loss": 0.5676, "mean_token_accuracy": 0.8300312593579292, "num_tokens": 63974012.0, "step": 53220 }, { "entropy": 1.87213394343853, "epoch": 0.165008342639529, "grad_norm": 8.082072257995605, "learning_rate": 6.227931222951605e-06, "loss": 0.522, "mean_token_accuracy": 0.836493344604969, "num_tokens": 63985663.0, "step": 53230 }, { "entropy": 1.9317936196923255, "epoch": 0.16503934176457868, "grad_norm": 8.324505805969238, "learning_rate": 6.227346292383604e-06, "loss": 0.5724, "mean_token_accuracy": 0.8266930311918259, "num_tokens": 63996929.0, "step": 53240 }, { "entropy": 1.8709998071193694, "epoch": 0.16507034088962838, "grad_norm": 8.387805938720703, "learning_rate": 6.226761526595607e-06, "loss": 0.557, "mean_token_accuracy": 0.8292562425136566, "num_tokens": 64008847.0, "step": 53250 }, { "entropy": 1.8826777443289757, "epoch": 0.16510134001467808, "grad_norm": 7.2748541831970215, "learning_rate": 6.2261769255102635e-06, "loss": 0.513, "mean_token_accuracy": 0.8443334132432938, "num_tokens": 64020608.0, "step": 53260 }, { "entropy": 1.8901048377156258, "epoch": 0.16513233913972777, "grad_norm": 4.1918625831604, "learning_rate": 6.225592489050273e-06, "loss": 0.5043, "mean_token_accuracy": 0.8356076464056968, "num_tokens": 64032582.0, "step": 53270 }, { "entropy": 1.8856966137886046, "epoch": 0.16516333826477747, "grad_norm": 9.814857482910156, "learning_rate": 6.225008217138383e-06, "loss": 0.5243, "mean_token_accuracy": 0.8268331870436668, "num_tokens": 64044568.0, "step": 53280 }, { "entropy": 1.9235829666256905, "epoch": 0.16519433738982717, "grad_norm": 8.297536849975586, "learning_rate": 6.224424109697395e-06, "loss": 0.509, "mean_token_accuracy": 0.8338031157851219, "num_tokens": 64055682.0, "step": 53290 }, { "entropy": 1.8362517848610878, "epoch": 0.16522533651487686, "grad_norm": 4.987396717071533, "learning_rate": 6.2238401666501594e-06, "loss": 0.4394, "mean_token_accuracy": 0.842101874947548, "num_tokens": 64068519.0, "step": 53300 }, { "entropy": 1.8466239094734191, "epoch": 0.16525633563992656, "grad_norm": 7.831912517547607, "learning_rate": 6.2232563879195784e-06, "loss": 0.4627, "mean_token_accuracy": 0.8441867157816887, "num_tokens": 64081113.0, "step": 53310 }, { "entropy": 1.901265199482441, "epoch": 0.16528733476497626, "grad_norm": 9.943422317504883, "learning_rate": 6.222672773428604e-06, "loss": 0.5043, "mean_token_accuracy": 0.8303415447473526, "num_tokens": 64093471.0, "step": 53320 }, { "entropy": 1.8734540060162543, "epoch": 0.16531833389002595, "grad_norm": 9.80716609954834, "learning_rate": 6.222089323100241e-06, "loss": 0.5508, "mean_token_accuracy": 0.8340218141674995, "num_tokens": 64106551.0, "step": 53330 }, { "entropy": 1.8805134430527688, "epoch": 0.16534933301507565, "grad_norm": 8.605008125305176, "learning_rate": 6.221506036857539e-06, "loss": 0.4991, "mean_token_accuracy": 0.8431431293487549, "num_tokens": 64119113.0, "step": 53340 }, { "entropy": 1.8234007403254509, "epoch": 0.16538033214012535, "grad_norm": 4.009090423583984, "learning_rate": 6.220922914623604e-06, "loss": 0.4587, "mean_token_accuracy": 0.8466459035873413, "num_tokens": 64132616.0, "step": 53350 }, { "entropy": 1.8845936864614488, "epoch": 0.16541133126517504, "grad_norm": 10.148909568786621, "learning_rate": 6.2203399563215905e-06, "loss": 0.5034, "mean_token_accuracy": 0.8377220943570137, "num_tokens": 64145357.0, "step": 53360 }, { "entropy": 1.959778368473053, "epoch": 0.16544233039022474, "grad_norm": 8.213615417480469, "learning_rate": 6.219757161874702e-06, "loss": 0.5736, "mean_token_accuracy": 0.8356760248541832, "num_tokens": 64156979.0, "step": 53370 }, { "entropy": 1.9542056560516357, "epoch": 0.16547332951527444, "grad_norm": 7.512001991271973, "learning_rate": 6.219174531206195e-06, "loss": 0.5212, "mean_token_accuracy": 0.8426282212138176, "num_tokens": 64167936.0, "step": 53380 }, { "entropy": 1.8867290809750557, "epoch": 0.16550432864032413, "grad_norm": 11.55746078491211, "learning_rate": 6.2185920642393724e-06, "loss": 0.538, "mean_token_accuracy": 0.8335390016436577, "num_tokens": 64178732.0, "step": 53390 }, { "entropy": 1.9342323437333107, "epoch": 0.16553532776537383, "grad_norm": 8.260815620422363, "learning_rate": 6.218009760897592e-06, "loss": 0.5032, "mean_token_accuracy": 0.8423149228096009, "num_tokens": 64189523.0, "step": 53400 }, { "entropy": 1.8842595234513282, "epoch": 0.16556632689042353, "grad_norm": 8.171215057373047, "learning_rate": 6.21742762110426e-06, "loss": 0.4739, "mean_token_accuracy": 0.852245531976223, "num_tokens": 64200851.0, "step": 53410 }, { "entropy": 1.9514251738786696, "epoch": 0.16559732601547322, "grad_norm": 11.585762977600098, "learning_rate": 6.216845644782831e-06, "loss": 0.6304, "mean_token_accuracy": 0.8217004343867302, "num_tokens": 64212043.0, "step": 53420 }, { "entropy": 1.9424443751573564, "epoch": 0.16562832514052292, "grad_norm": 7.723189353942871, "learning_rate": 6.216263831856811e-06, "loss": 0.5982, "mean_token_accuracy": 0.830877748131752, "num_tokens": 64223504.0, "step": 53430 }, { "entropy": 1.9373986154794693, "epoch": 0.16565932426557262, "grad_norm": 8.541019439697266, "learning_rate": 6.215682182249758e-06, "loss": 0.5521, "mean_token_accuracy": 0.8272872015833854, "num_tokens": 64235303.0, "step": 53440 }, { "entropy": 1.926292023062706, "epoch": 0.1656903233906223, "grad_norm": 9.712891578674316, "learning_rate": 6.215100695885277e-06, "loss": 0.5625, "mean_token_accuracy": 0.8317703247070313, "num_tokens": 64246713.0, "step": 53450 }, { "entropy": 1.8299464777112007, "epoch": 0.16572132251567198, "grad_norm": 8.690601348876953, "learning_rate": 6.214519372687023e-06, "loss": 0.4614, "mean_token_accuracy": 0.8461026042699814, "num_tokens": 64259907.0, "step": 53460 }, { "entropy": 1.9531344637274741, "epoch": 0.16575232164072168, "grad_norm": 8.977042198181152, "learning_rate": 6.2139382125787065e-06, "loss": 0.5531, "mean_token_accuracy": 0.8346742272377015, "num_tokens": 64271538.0, "step": 53470 }, { "entropy": 1.870873971283436, "epoch": 0.16578332076577137, "grad_norm": 10.686627388000488, "learning_rate": 6.213357215484079e-06, "loss": 0.5115, "mean_token_accuracy": 0.8391006410121917, "num_tokens": 64283477.0, "step": 53480 }, { "entropy": 1.884416152536869, "epoch": 0.16581431989082107, "grad_norm": 9.072136878967285, "learning_rate": 6.2127763813269494e-06, "loss": 0.5146, "mean_token_accuracy": 0.8330440178513527, "num_tokens": 64295950.0, "step": 53490 }, { "entropy": 1.8264600470662118, "epoch": 0.16584531901587077, "grad_norm": 3.7949202060699463, "learning_rate": 6.212195710031174e-06, "loss": 0.4127, "mean_token_accuracy": 0.8554823577404023, "num_tokens": 64308519.0, "step": 53500 }, { "entropy": 1.8256630197167396, "epoch": 0.16587631814092046, "grad_norm": 8.502163887023926, "learning_rate": 6.211615201520656e-06, "loss": 0.4813, "mean_token_accuracy": 0.8437755182385445, "num_tokens": 64321187.0, "step": 53510 }, { "entropy": 1.7614564910531043, "epoch": 0.16590731726597016, "grad_norm": 9.997268676757812, "learning_rate": 6.211034855719356e-06, "loss": 0.466, "mean_token_accuracy": 0.8389209792017936, "num_tokens": 64334994.0, "step": 53520 }, { "entropy": 1.934888207912445, "epoch": 0.16593831639101986, "grad_norm": 7.5674519538879395, "learning_rate": 6.210454672551274e-06, "loss": 0.5638, "mean_token_accuracy": 0.8264615863561631, "num_tokens": 64345597.0, "step": 53530 }, { "entropy": 1.8772856667637825, "epoch": 0.16596931551606955, "grad_norm": 10.031050682067871, "learning_rate": 6.209874651940466e-06, "loss": 0.4958, "mean_token_accuracy": 0.8403290301561356, "num_tokens": 64357564.0, "step": 53540 }, { "entropy": 1.8533400312066077, "epoch": 0.16600031464111925, "grad_norm": 10.334144592285156, "learning_rate": 6.209294793811038e-06, "loss": 0.4953, "mean_token_accuracy": 0.8353090777993202, "num_tokens": 64369611.0, "step": 53550 }, { "entropy": 1.940661406517029, "epoch": 0.16603131376616895, "grad_norm": 11.576028823852539, "learning_rate": 6.208715098087144e-06, "loss": 0.5575, "mean_token_accuracy": 0.8241597592830658, "num_tokens": 64380934.0, "step": 53560 }, { "entropy": 1.9238759770989418, "epoch": 0.16606231289121864, "grad_norm": 6.117959022521973, "learning_rate": 6.208135564692989e-06, "loss": 0.581, "mean_token_accuracy": 0.824966461956501, "num_tokens": 64392121.0, "step": 53570 }, { "entropy": 1.9680738180875779, "epoch": 0.16609331201626834, "grad_norm": 6.631012916564941, "learning_rate": 6.207556193552824e-06, "loss": 0.5566, "mean_token_accuracy": 0.8322376728057861, "num_tokens": 64403048.0, "step": 53580 }, { "entropy": 1.895513379573822, "epoch": 0.16612431114131804, "grad_norm": 9.87430191040039, "learning_rate": 6.206976984590952e-06, "loss": 0.6129, "mean_token_accuracy": 0.8248256593942642, "num_tokens": 64414999.0, "step": 53590 }, { "entropy": 1.8739793948829173, "epoch": 0.16615531026636773, "grad_norm": 10.234819412231445, "learning_rate": 6.206397937731726e-06, "loss": 0.4756, "mean_token_accuracy": 0.8436161160469056, "num_tokens": 64427895.0, "step": 53600 }, { "entropy": 1.885368101298809, "epoch": 0.16618630939141743, "grad_norm": 7.840350151062012, "learning_rate": 6.205819052899549e-06, "loss": 0.5118, "mean_token_accuracy": 0.8369289055466652, "num_tokens": 64440003.0, "step": 53610 }, { "entropy": 1.9216457679867744, "epoch": 0.16621730851646713, "grad_norm": 9.213172912597656, "learning_rate": 6.205240330018869e-06, "loss": 0.5225, "mean_token_accuracy": 0.840261846780777, "num_tokens": 64451450.0, "step": 53620 }, { "entropy": 1.9085844457149506, "epoch": 0.16624830764151682, "grad_norm": 8.908021926879883, "learning_rate": 6.2046617690141876e-06, "loss": 0.5254, "mean_token_accuracy": 0.8355207294225693, "num_tokens": 64463262.0, "step": 53630 }, { "entropy": 1.932268613576889, "epoch": 0.16627930676656652, "grad_norm": 8.519464492797852, "learning_rate": 6.204083369810055e-06, "loss": 0.5445, "mean_token_accuracy": 0.8326381966471672, "num_tokens": 64474641.0, "step": 53640 }, { "entropy": 1.8569614686071874, "epoch": 0.16631030589161622, "grad_norm": 4.804983139038086, "learning_rate": 6.203505132331069e-06, "loss": 0.4986, "mean_token_accuracy": 0.8352816164493561, "num_tokens": 64487602.0, "step": 53650 }, { "entropy": 1.858240360021591, "epoch": 0.1663413050166659, "grad_norm": 10.345724105834961, "learning_rate": 6.202927056501878e-06, "loss": 0.5177, "mean_token_accuracy": 0.8289370179176331, "num_tokens": 64500395.0, "step": 53660 }, { "entropy": 1.807939064502716, "epoch": 0.1663723041417156, "grad_norm": 7.798347473144531, "learning_rate": 6.202349142247179e-06, "loss": 0.4629, "mean_token_accuracy": 0.8403982803225517, "num_tokens": 64513854.0, "step": 53670 }, { "entropy": 1.972070437669754, "epoch": 0.1664033032667653, "grad_norm": 7.832572937011719, "learning_rate": 6.201771389491718e-06, "loss": 0.5794, "mean_token_accuracy": 0.8308521166443825, "num_tokens": 64524527.0, "step": 53680 }, { "entropy": 1.8211177855730056, "epoch": 0.166434302391815, "grad_norm": 7.727509498596191, "learning_rate": 6.20119379816029e-06, "loss": 0.4504, "mean_token_accuracy": 0.8501279547810554, "num_tokens": 64536806.0, "step": 53690 }, { "entropy": 1.9093163907527924, "epoch": 0.1664653015168647, "grad_norm": 8.725934982299805, "learning_rate": 6.20061636817774e-06, "loss": 0.5555, "mean_token_accuracy": 0.8283161029219628, "num_tokens": 64547580.0, "step": 53700 }, { "entropy": 1.8534967467188834, "epoch": 0.16649630064191437, "grad_norm": 9.739397048950195, "learning_rate": 6.200039099468959e-06, "loss": 0.4736, "mean_token_accuracy": 0.8432888746261596, "num_tokens": 64559322.0, "step": 53710 }, { "entropy": 1.8829628586769105, "epoch": 0.16652729976696407, "grad_norm": 9.243392944335938, "learning_rate": 6.1994619919588925e-06, "loss": 0.5197, "mean_token_accuracy": 0.8410766318440437, "num_tokens": 64571411.0, "step": 53720 }, { "entropy": 1.850360631942749, "epoch": 0.16655829889201376, "grad_norm": 8.87844467163086, "learning_rate": 6.19888504557253e-06, "loss": 0.4892, "mean_token_accuracy": 0.8418926939368248, "num_tokens": 64583534.0, "step": 53730 }, { "entropy": 1.7772665143013, "epoch": 0.16658929801706346, "grad_norm": 10.492433547973633, "learning_rate": 6.198308260234912e-06, "loss": 0.4102, "mean_token_accuracy": 0.8469656750559806, "num_tokens": 64596994.0, "step": 53740 }, { "entropy": 1.8638063728809358, "epoch": 0.16662029714211316, "grad_norm": 4.035283088684082, "learning_rate": 6.197731635871126e-06, "loss": 0.5047, "mean_token_accuracy": 0.8372615680098534, "num_tokens": 64608787.0, "step": 53750 }, { "entropy": 1.8116811349987985, "epoch": 0.16665129626716285, "grad_norm": 3.7760744094848633, "learning_rate": 6.197155172406311e-06, "loss": 0.483, "mean_token_accuracy": 0.8445352002978325, "num_tokens": 64620766.0, "step": 53760 }, { "entropy": 1.764721181988716, "epoch": 0.16668229539221255, "grad_norm": 9.610373497009277, "learning_rate": 6.19657886976565e-06, "loss": 0.4509, "mean_token_accuracy": 0.8372749388217926, "num_tokens": 64634866.0, "step": 53770 }, { "entropy": 1.9135527536273003, "epoch": 0.16671329451726224, "grad_norm": 7.860201358795166, "learning_rate": 6.196002727874382e-06, "loss": 0.528, "mean_token_accuracy": 0.8299287378787994, "num_tokens": 64646883.0, "step": 53780 }, { "entropy": 1.9007146388292313, "epoch": 0.16674429364231194, "grad_norm": 3.8875370025634766, "learning_rate": 6.195426746657789e-06, "loss": 0.5254, "mean_token_accuracy": 0.8316178724169732, "num_tokens": 64658973.0, "step": 53790 }, { "entropy": 1.826033054292202, "epoch": 0.16677529276736164, "grad_norm": 8.549474716186523, "learning_rate": 6.194850926041201e-06, "loss": 0.4855, "mean_token_accuracy": 0.848523524403572, "num_tokens": 64671112.0, "step": 53800 }, { "entropy": 1.9412501826882362, "epoch": 0.16680629189241133, "grad_norm": 8.226346015930176, "learning_rate": 6.194275265950003e-06, "loss": 0.5854, "mean_token_accuracy": 0.817940565943718, "num_tokens": 64682585.0, "step": 53810 }, { "entropy": 1.9826242715120315, "epoch": 0.16683729101746103, "grad_norm": 7.8780622482299805, "learning_rate": 6.193699766309622e-06, "loss": 0.5808, "mean_token_accuracy": 0.8211043074727058, "num_tokens": 64693614.0, "step": 53820 }, { "entropy": 1.9336616814136505, "epoch": 0.16686829014251073, "grad_norm": 10.512943267822266, "learning_rate": 6.193124427045535e-06, "loss": 0.576, "mean_token_accuracy": 0.8250611051917076, "num_tokens": 64705081.0, "step": 53830 }, { "entropy": 1.8315666154026986, "epoch": 0.16689928926756042, "grad_norm": 9.55489730834961, "learning_rate": 6.1925492480832705e-06, "loss": 0.5247, "mean_token_accuracy": 0.8315412878990174, "num_tokens": 64717880.0, "step": 53840 }, { "entropy": 1.878394363820553, "epoch": 0.16693028839261012, "grad_norm": 7.533677101135254, "learning_rate": 6.1919742293484e-06, "loss": 0.5741, "mean_token_accuracy": 0.8287183776497841, "num_tokens": 64728991.0, "step": 53850 }, { "entropy": 1.9339822947978973, "epoch": 0.16696128751765982, "grad_norm": 4.745457172393799, "learning_rate": 6.1913993707665485e-06, "loss": 0.5554, "mean_token_accuracy": 0.8289982140064239, "num_tokens": 64740494.0, "step": 53860 }, { "entropy": 1.8400120854377746, "epoch": 0.16699228664270951, "grad_norm": 8.681490898132324, "learning_rate": 6.190824672263388e-06, "loss": 0.4626, "mean_token_accuracy": 0.8391780480742455, "num_tokens": 64753203.0, "step": 53870 }, { "entropy": 1.9312397927045821, "epoch": 0.1670232857677592, "grad_norm": 9.676896095275879, "learning_rate": 6.190250133764637e-06, "loss": 0.5628, "mean_token_accuracy": 0.832855261862278, "num_tokens": 64764277.0, "step": 53880 }, { "entropy": 1.8353455513715744, "epoch": 0.1670542848928089, "grad_norm": 7.824866771697998, "learning_rate": 6.189675755196064e-06, "loss": 0.4281, "mean_token_accuracy": 0.854548205435276, "num_tokens": 64776606.0, "step": 53890 }, { "entropy": 1.9409961223602294, "epoch": 0.1670852840178586, "grad_norm": 9.087993621826172, "learning_rate": 6.189101536483484e-06, "loss": 0.5796, "mean_token_accuracy": 0.8235799849033356, "num_tokens": 64788237.0, "step": 53900 }, { "entropy": 1.8919391304254531, "epoch": 0.1671162831429083, "grad_norm": 7.690661430358887, "learning_rate": 6.188527477552761e-06, "loss": 0.564, "mean_token_accuracy": 0.832922050356865, "num_tokens": 64800456.0, "step": 53910 }, { "entropy": 1.8101703599095345, "epoch": 0.167147282267958, "grad_norm": 8.9353609085083, "learning_rate": 6.18795357832981e-06, "loss": 0.4504, "mean_token_accuracy": 0.8428716316819191, "num_tokens": 64813777.0, "step": 53920 }, { "entropy": 1.8997833237051964, "epoch": 0.1671782813930077, "grad_norm": 5.414516448974609, "learning_rate": 6.187379838740587e-06, "loss": 0.5056, "mean_token_accuracy": 0.8327810063958168, "num_tokens": 64826025.0, "step": 53930 }, { "entropy": 1.9362305417656898, "epoch": 0.1672092805180574, "grad_norm": 7.484461784362793, "learning_rate": 6.186806258711105e-06, "loss": 0.5841, "mean_token_accuracy": 0.8236838817596436, "num_tokens": 64837897.0, "step": 53940 }, { "entropy": 1.7711832091212272, "epoch": 0.1672402796431071, "grad_norm": 3.344655990600586, "learning_rate": 6.186232838167419e-06, "loss": 0.3899, "mean_token_accuracy": 0.847083292901516, "num_tokens": 64852441.0, "step": 53950 }, { "entropy": 1.8284263715147973, "epoch": 0.16727127876815676, "grad_norm": 7.023416042327881, "learning_rate": 6.185659577035632e-06, "loss": 0.4613, "mean_token_accuracy": 0.8410525426268578, "num_tokens": 64865349.0, "step": 53960 }, { "entropy": 1.9532892093062402, "epoch": 0.16730227789320645, "grad_norm": 7.449609756469727, "learning_rate": 6.185086475241898e-06, "loss": 0.5544, "mean_token_accuracy": 0.8271322563290596, "num_tokens": 64876306.0, "step": 53970 }, { "entropy": 1.943205328285694, "epoch": 0.16733327701825615, "grad_norm": 9.61139965057373, "learning_rate": 6.184513532712416e-06, "loss": 0.5391, "mean_token_accuracy": 0.8366443976759911, "num_tokens": 64888292.0, "step": 53980 }, { "entropy": 1.9256486117839813, "epoch": 0.16736427614330585, "grad_norm": 5.424153804779053, "learning_rate": 6.183940749373436e-06, "loss": 0.5322, "mean_token_accuracy": 0.836666439473629, "num_tokens": 64899609.0, "step": 53990 }, { "entropy": 1.9205065920948983, "epoch": 0.16739527526835554, "grad_norm": 8.4097900390625, "learning_rate": 6.1833681251512516e-06, "loss": 0.4996, "mean_token_accuracy": 0.8369838654994964, "num_tokens": 64911133.0, "step": 54000 }, { "entropy": 1.837955741584301, "epoch": 0.16742627439340524, "grad_norm": 4.225302696228027, "learning_rate": 6.182795659972208e-06, "loss": 0.4938, "mean_token_accuracy": 0.8357640966773033, "num_tokens": 64923539.0, "step": 54010 }, { "entropy": 1.87107597514987, "epoch": 0.16745727351845494, "grad_norm": 9.220149993896484, "learning_rate": 6.182223353762697e-06, "loss": 0.4919, "mean_token_accuracy": 0.8416076585650444, "num_tokens": 64935263.0, "step": 54020 }, { "entropy": 1.8615114361047744, "epoch": 0.16748827264350463, "grad_norm": 8.91309928894043, "learning_rate": 6.181651206449155e-06, "loss": 0.4344, "mean_token_accuracy": 0.8517223030328751, "num_tokens": 64947376.0, "step": 54030 }, { "entropy": 1.8877154156565665, "epoch": 0.16751927176855433, "grad_norm": 4.553346633911133, "learning_rate": 6.181079217958073e-06, "loss": 0.4656, "mean_token_accuracy": 0.8457445085048676, "num_tokens": 64959541.0, "step": 54040 }, { "entropy": 1.8997020781040193, "epoch": 0.16755027089360403, "grad_norm": 9.368854522705078, "learning_rate": 6.180507388215983e-06, "loss": 0.5, "mean_token_accuracy": 0.8364048600196838, "num_tokens": 64972259.0, "step": 54050 }, { "entropy": 1.8942409604787827, "epoch": 0.16758127001865372, "grad_norm": 8.198491096496582, "learning_rate": 6.1799357171494655e-06, "loss": 0.5034, "mean_token_accuracy": 0.8378923922777176, "num_tokens": 64983781.0, "step": 54060 }, { "entropy": 1.8540250420570374, "epoch": 0.16761226914370342, "grad_norm": 10.47086238861084, "learning_rate": 6.179364204685151e-06, "loss": 0.5252, "mean_token_accuracy": 0.8316616043448448, "num_tokens": 64996187.0, "step": 54070 }, { "entropy": 1.9425106346607208, "epoch": 0.16764326826875312, "grad_norm": 9.815423965454102, "learning_rate": 6.1787928507497175e-06, "loss": 0.5643, "mean_token_accuracy": 0.8328223183751107, "num_tokens": 65007642.0, "step": 54080 }, { "entropy": 1.7625815600156785, "epoch": 0.1676742673938028, "grad_norm": 4.722161769866943, "learning_rate": 6.178221655269889e-06, "loss": 0.4629, "mean_token_accuracy": 0.851328319311142, "num_tokens": 65021921.0, "step": 54090 }, { "entropy": 1.877830518782139, "epoch": 0.1677052665188525, "grad_norm": 9.134954452514648, "learning_rate": 6.177650618172437e-06, "loss": 0.4936, "mean_token_accuracy": 0.8438004121184349, "num_tokens": 65033368.0, "step": 54100 }, { "entropy": 1.8599343091249465, "epoch": 0.1677362656439022, "grad_norm": 10.030871391296387, "learning_rate": 6.177079739384181e-06, "loss": 0.5236, "mean_token_accuracy": 0.8409288361668587, "num_tokens": 65045878.0, "step": 54110 }, { "entropy": 1.7290209725499153, "epoch": 0.1677672647689519, "grad_norm": 4.038937568664551, "learning_rate": 6.176509018831986e-06, "loss": 0.4231, "mean_token_accuracy": 0.8499686598777771, "num_tokens": 65060347.0, "step": 54120 }, { "entropy": 1.8978177532553673, "epoch": 0.1677982638940016, "grad_norm": 8.696316719055176, "learning_rate": 6.175938456442767e-06, "loss": 0.5338, "mean_token_accuracy": 0.8306406125426292, "num_tokens": 65071304.0, "step": 54130 }, { "entropy": 1.8237817287445068, "epoch": 0.1678292630190513, "grad_norm": 5.508317947387695, "learning_rate": 6.175368052143486e-06, "loss": 0.4792, "mean_token_accuracy": 0.84134771078825, "num_tokens": 65083895.0, "step": 54140 }, { "entropy": 1.8195566333830357, "epoch": 0.167860262144101, "grad_norm": 3.8358309268951416, "learning_rate": 6.174797805861148e-06, "loss": 0.464, "mean_token_accuracy": 0.8476643204689026, "num_tokens": 65096233.0, "step": 54150 }, { "entropy": 1.8712603464722632, "epoch": 0.1678912612691507, "grad_norm": 8.841750144958496, "learning_rate": 6.174227717522813e-06, "loss": 0.5239, "mean_token_accuracy": 0.834831403195858, "num_tokens": 65108051.0, "step": 54160 }, { "entropy": 1.8733776569366456, "epoch": 0.16792226039420038, "grad_norm": 7.8258280754089355, "learning_rate": 6.173657787055579e-06, "loss": 0.5251, "mean_token_accuracy": 0.8338091254234314, "num_tokens": 65120148.0, "step": 54170 }, { "entropy": 1.7442950963974, "epoch": 0.16795325951925008, "grad_norm": 9.016060829162598, "learning_rate": 6.173088014386599e-06, "loss": 0.3985, "mean_token_accuracy": 0.85818080753088, "num_tokens": 65133990.0, "step": 54180 }, { "entropy": 1.8510777726769447, "epoch": 0.16798425864429978, "grad_norm": 4.038900852203369, "learning_rate": 6.1725183994430695e-06, "loss": 0.5051, "mean_token_accuracy": 0.8405930832028389, "num_tokens": 65145866.0, "step": 54190 }, { "entropy": 1.874160850048065, "epoch": 0.16801525776934945, "grad_norm": 9.50784969329834, "learning_rate": 6.1719489421522305e-06, "loss": 0.5411, "mean_token_accuracy": 0.8350860238075256, "num_tokens": 65157550.0, "step": 54200 }, { "entropy": 1.8412716314196587, "epoch": 0.16804625689439914, "grad_norm": 4.003512859344482, "learning_rate": 6.1713796424413765e-06, "loss": 0.4365, "mean_token_accuracy": 0.8513872250914574, "num_tokens": 65169597.0, "step": 54210 }, { "entropy": 1.9005790546536445, "epoch": 0.16807725601944884, "grad_norm": 9.995362281799316, "learning_rate": 6.170810500237844e-06, "loss": 0.5528, "mean_token_accuracy": 0.8309820532798767, "num_tokens": 65180854.0, "step": 54220 }, { "entropy": 1.8263572067022324, "epoch": 0.16810825514449854, "grad_norm": 7.676513195037842, "learning_rate": 6.170241515469018e-06, "loss": 0.4498, "mean_token_accuracy": 0.8502917841076851, "num_tokens": 65192720.0, "step": 54230 }, { "entropy": 1.832256343960762, "epoch": 0.16813925426954823, "grad_norm": 4.75320291519165, "learning_rate": 6.1696726880623285e-06, "loss": 0.4836, "mean_token_accuracy": 0.8385294854640961, "num_tokens": 65204655.0, "step": 54240 }, { "entropy": 1.8810152530670166, "epoch": 0.16817025339459793, "grad_norm": 9.239448547363281, "learning_rate": 6.1691040179452545e-06, "loss": 0.5374, "mean_token_accuracy": 0.832660236954689, "num_tokens": 65216115.0, "step": 54250 }, { "entropy": 1.7937642320990563, "epoch": 0.16820125251964763, "grad_norm": 8.329822540283203, "learning_rate": 6.168535505045321e-06, "loss": 0.4775, "mean_token_accuracy": 0.8271018177270889, "num_tokens": 65229542.0, "step": 54260 }, { "entropy": 1.8577524468302726, "epoch": 0.16823225164469732, "grad_norm": 3.8464255332946777, "learning_rate": 6.1679671492901005e-06, "loss": 0.5141, "mean_token_accuracy": 0.8379319593310356, "num_tokens": 65241000.0, "step": 54270 }, { "entropy": 1.8444037914276123, "epoch": 0.16826325076974702, "grad_norm": 9.02126407623291, "learning_rate": 6.167398950607211e-06, "loss": 0.4539, "mean_token_accuracy": 0.8464455410838128, "num_tokens": 65253095.0, "step": 54280 }, { "entropy": 1.8382619485259055, "epoch": 0.16829424989479672, "grad_norm": 8.693649291992188, "learning_rate": 6.166830908924317e-06, "loss": 0.5167, "mean_token_accuracy": 0.8406226679682731, "num_tokens": 65265949.0, "step": 54290 }, { "entropy": 1.9115242898464202, "epoch": 0.1683252490198464, "grad_norm": 3.702934741973877, "learning_rate": 6.16626302416913e-06, "loss": 0.5333, "mean_token_accuracy": 0.833556392788887, "num_tokens": 65277257.0, "step": 54300 }, { "entropy": 1.8722372770309448, "epoch": 0.1683562481448961, "grad_norm": 9.893916130065918, "learning_rate": 6.16569529626941e-06, "loss": 0.495, "mean_token_accuracy": 0.8460499599575997, "num_tokens": 65288847.0, "step": 54310 }, { "entropy": 1.8180507212877273, "epoch": 0.1683872472699458, "grad_norm": 9.089763641357422, "learning_rate": 6.165127725152958e-06, "loss": 0.5071, "mean_token_accuracy": 0.8375856161117554, "num_tokens": 65301059.0, "step": 54320 }, { "entropy": 1.8814466312527656, "epoch": 0.1684182463949955, "grad_norm": 8.537693977355957, "learning_rate": 6.1645603107476316e-06, "loss": 0.4984, "mean_token_accuracy": 0.8440789937973022, "num_tokens": 65313029.0, "step": 54330 }, { "entropy": 1.8159365728497505, "epoch": 0.1684492455200452, "grad_norm": 3.978240489959717, "learning_rate": 6.163993052981323e-06, "loss": 0.483, "mean_token_accuracy": 0.8402403250336647, "num_tokens": 65325995.0, "step": 54340 }, { "entropy": 1.8512892082333565, "epoch": 0.1684802446450949, "grad_norm": 3.7935879230499268, "learning_rate": 6.163425951781979e-06, "loss": 0.5167, "mean_token_accuracy": 0.8360736206173897, "num_tokens": 65338118.0, "step": 54350 }, { "entropy": 1.802795946598053, "epoch": 0.1685112437701446, "grad_norm": 8.774079322814941, "learning_rate": 6.162859007077591e-06, "loss": 0.5186, "mean_token_accuracy": 0.8323176607489586, "num_tokens": 65350124.0, "step": 54360 }, { "entropy": 1.9113412827253342, "epoch": 0.1685422428951943, "grad_norm": 9.643198013305664, "learning_rate": 6.162292218796194e-06, "loss": 0.6128, "mean_token_accuracy": 0.821797750890255, "num_tokens": 65361216.0, "step": 54370 }, { "entropy": 1.880385261774063, "epoch": 0.16857324202024399, "grad_norm": 4.427012920379639, "learning_rate": 6.161725586865874e-06, "loss": 0.514, "mean_token_accuracy": 0.8347649067640305, "num_tokens": 65372951.0, "step": 54380 }, { "entropy": 1.8761203557252883, "epoch": 0.16860424114529368, "grad_norm": 7.965682506561279, "learning_rate": 6.1611591112147576e-06, "loss": 0.5154, "mean_token_accuracy": 0.8341512382030487, "num_tokens": 65384596.0, "step": 54390 }, { "entropy": 1.9145608723163605, "epoch": 0.16863524027034338, "grad_norm": 9.253949165344238, "learning_rate": 6.160592791771022e-06, "loss": 0.5824, "mean_token_accuracy": 0.8162755355238914, "num_tokens": 65396034.0, "step": 54400 }, { "entropy": 1.831618282198906, "epoch": 0.16866623939539307, "grad_norm": 8.794281005859375, "learning_rate": 6.16002662846289e-06, "loss": 0.4911, "mean_token_accuracy": 0.8415708675980568, "num_tokens": 65408333.0, "step": 54410 }, { "entropy": 1.8698890820145606, "epoch": 0.16869723852044277, "grad_norm": 9.203094482421875, "learning_rate": 6.159460621218628e-06, "loss": 0.5114, "mean_token_accuracy": 0.8259828209877014, "num_tokens": 65420329.0, "step": 54420 }, { "entropy": 1.8370443254709243, "epoch": 0.16872823764549247, "grad_norm": 4.42335319519043, "learning_rate": 6.158894769966554e-06, "loss": 0.51, "mean_token_accuracy": 0.8294874534010888, "num_tokens": 65432554.0, "step": 54430 }, { "entropy": 1.929424050450325, "epoch": 0.16875923677054216, "grad_norm": 8.103418350219727, "learning_rate": 6.158329074635024e-06, "loss": 0.6025, "mean_token_accuracy": 0.8246657729148865, "num_tokens": 65443420.0, "step": 54440 }, { "entropy": 1.9033832669258117, "epoch": 0.16879023589559183, "grad_norm": 11.029186248779297, "learning_rate": 6.157763535152448e-06, "loss": 0.5328, "mean_token_accuracy": 0.8389760866761208, "num_tokens": 65455561.0, "step": 54450 }, { "entropy": 1.8955723971128464, "epoch": 0.16882123502064153, "grad_norm": 10.687283515930176, "learning_rate": 6.1571981514472765e-06, "loss": 0.5405, "mean_token_accuracy": 0.8430651158094407, "num_tokens": 65467045.0, "step": 54460 }, { "entropy": 1.9213042184710503, "epoch": 0.16885223414569123, "grad_norm": 8.316679000854492, "learning_rate": 6.156632923448008e-06, "loss": 0.5787, "mean_token_accuracy": 0.8268252417445183, "num_tokens": 65479119.0, "step": 54470 }, { "entropy": 1.8560591742396355, "epoch": 0.16888323327074092, "grad_norm": 4.520045757293701, "learning_rate": 6.156067851083189e-06, "loss": 0.5089, "mean_token_accuracy": 0.8373791992664337, "num_tokens": 65491055.0, "step": 54480 }, { "entropy": 1.8120755463838578, "epoch": 0.16891423239579062, "grad_norm": 9.259931564331055, "learning_rate": 6.1555029342814085e-06, "loss": 0.4521, "mean_token_accuracy": 0.8388746708631516, "num_tokens": 65503547.0, "step": 54490 }, { "entropy": 1.8446748018264771, "epoch": 0.16894523152084032, "grad_norm": 5.609341144561768, "learning_rate": 6.154938172971303e-06, "loss": 0.517, "mean_token_accuracy": 0.8386319100856781, "num_tokens": 65516410.0, "step": 54500 }, { "entropy": 1.7979810684919357, "epoch": 0.16897623064589, "grad_norm": 8.738170623779297, "learning_rate": 6.154373567081555e-06, "loss": 0.4665, "mean_token_accuracy": 0.8370955526828766, "num_tokens": 65529389.0, "step": 54510 }, { "entropy": 1.9188962578773499, "epoch": 0.1690072297709397, "grad_norm": 8.479401588439941, "learning_rate": 6.15380911654089e-06, "loss": 0.536, "mean_token_accuracy": 0.8272537529468537, "num_tokens": 65541159.0, "step": 54520 }, { "entropy": 1.8414244174957275, "epoch": 0.1690382288959894, "grad_norm": 9.82351016998291, "learning_rate": 6.153244821278084e-06, "loss": 0.4987, "mean_token_accuracy": 0.8392342895269393, "num_tokens": 65553128.0, "step": 54530 }, { "entropy": 1.9202020570635796, "epoch": 0.1690692280210391, "grad_norm": 7.951665878295898, "learning_rate": 6.152680681221957e-06, "loss": 0.5564, "mean_token_accuracy": 0.8311819225549698, "num_tokens": 65564486.0, "step": 54540 }, { "entropy": 1.7439219117164613, "epoch": 0.1691002271460888, "grad_norm": 7.696526527404785, "learning_rate": 6.1521166963013704e-06, "loss": 0.4214, "mean_token_accuracy": 0.8509344890713691, "num_tokens": 65578178.0, "step": 54550 }, { "entropy": 1.8539911389350892, "epoch": 0.1691312262711385, "grad_norm": 8.005914688110352, "learning_rate": 6.151552866445237e-06, "loss": 0.487, "mean_token_accuracy": 0.8328571185469628, "num_tokens": 65590436.0, "step": 54560 }, { "entropy": 1.7372994154691697, "epoch": 0.1691622253961882, "grad_norm": 8.57450008392334, "learning_rate": 6.150989191582515e-06, "loss": 0.4778, "mean_token_accuracy": 0.8435987919569016, "num_tokens": 65604247.0, "step": 54570 }, { "entropy": 1.8149765402078628, "epoch": 0.1691932245212379, "grad_norm": 9.524360656738281, "learning_rate": 6.150425671642202e-06, "loss": 0.4857, "mean_token_accuracy": 0.8407787919044495, "num_tokens": 65616338.0, "step": 54580 }, { "entropy": 1.7159005388617516, "epoch": 0.16922422364628759, "grad_norm": 8.246034622192383, "learning_rate": 6.1498623065533485e-06, "loss": 0.4175, "mean_token_accuracy": 0.8674893081188202, "num_tokens": 65629499.0, "step": 54590 }, { "entropy": 1.7766979977488517, "epoch": 0.16925522277133728, "grad_norm": 3.937098741531372, "learning_rate": 6.1492990962450465e-06, "loss": 0.4736, "mean_token_accuracy": 0.8527488321065902, "num_tokens": 65641287.0, "step": 54600 }, { "entropy": 1.8135869204998016, "epoch": 0.16928622189638698, "grad_norm": 9.094282150268555, "learning_rate": 6.148736040646432e-06, "loss": 0.5036, "mean_token_accuracy": 0.8407742142677307, "num_tokens": 65653614.0, "step": 54610 }, { "entropy": 1.7666838884353637, "epoch": 0.16931722102143668, "grad_norm": 8.157979011535645, "learning_rate": 6.148173139686692e-06, "loss": 0.4819, "mean_token_accuracy": 0.845091213285923, "num_tokens": 65666332.0, "step": 54620 }, { "entropy": 1.7945738911628724, "epoch": 0.16934822014648637, "grad_norm": 4.526710510253906, "learning_rate": 6.147610393295055e-06, "loss": 0.5045, "mean_token_accuracy": 0.8315363183617592, "num_tokens": 65679667.0, "step": 54630 }, { "entropy": 1.8847228914499283, "epoch": 0.16937921927153607, "grad_norm": 8.596298217773438, "learning_rate": 6.147047801400793e-06, "loss": 0.569, "mean_token_accuracy": 0.8253517493605613, "num_tokens": 65690829.0, "step": 54640 }, { "entropy": 1.9206166476011277, "epoch": 0.16941021839658577, "grad_norm": 7.514388561248779, "learning_rate": 6.146485363933227e-06, "loss": 0.5221, "mean_token_accuracy": 0.8408697739243507, "num_tokens": 65702308.0, "step": 54650 }, { "entropy": 1.8065496653318405, "epoch": 0.16944121752163546, "grad_norm": 8.206125259399414, "learning_rate": 6.145923080821722e-06, "loss": 0.5337, "mean_token_accuracy": 0.8288014903664589, "num_tokens": 65715353.0, "step": 54660 }, { "entropy": 1.8215434283018113, "epoch": 0.16947221664668516, "grad_norm": 10.296819686889648, "learning_rate": 6.145360951995688e-06, "loss": 0.5272, "mean_token_accuracy": 0.8371994882822037, "num_tokens": 65727578.0, "step": 54670 }, { "entropy": 1.8798654437065125, "epoch": 0.16950321577173486, "grad_norm": 8.519831657409668, "learning_rate": 6.144798977384581e-06, "loss": 0.4857, "mean_token_accuracy": 0.8545126497745514, "num_tokens": 65739521.0, "step": 54680 }, { "entropy": 1.9068218991160393, "epoch": 0.16953421489678455, "grad_norm": 7.611881256103516, "learning_rate": 6.144237156917899e-06, "loss": 0.5547, "mean_token_accuracy": 0.8298189043998718, "num_tokens": 65751188.0, "step": 54690 }, { "entropy": 1.8502942398190498, "epoch": 0.16956521402183422, "grad_norm": 8.908893585205078, "learning_rate": 6.143675490525191e-06, "loss": 0.4892, "mean_token_accuracy": 0.8421571254730225, "num_tokens": 65763519.0, "step": 54700 }, { "entropy": 1.926678617298603, "epoch": 0.16959621314688392, "grad_norm": 8.78310489654541, "learning_rate": 6.143113978136046e-06, "loss": 0.5346, "mean_token_accuracy": 0.8322173491120338, "num_tokens": 65775010.0, "step": 54710 }, { "entropy": 1.9700055122375488, "epoch": 0.16962721227193361, "grad_norm": 9.067830085754395, "learning_rate": 6.1425526196801e-06, "loss": 0.5819, "mean_token_accuracy": 0.824844454228878, "num_tokens": 65785503.0, "step": 54720 }, { "entropy": 1.8520893216133119, "epoch": 0.1696582113969833, "grad_norm": 4.218345642089844, "learning_rate": 6.1419914150870315e-06, "loss": 0.5206, "mean_token_accuracy": 0.8238402858376503, "num_tokens": 65799211.0, "step": 54730 }, { "entropy": 1.9024053156375884, "epoch": 0.169689210522033, "grad_norm": 5.589046001434326, "learning_rate": 6.141430364286569e-06, "loss": 0.5102, "mean_token_accuracy": 0.8389298126101494, "num_tokens": 65811181.0, "step": 54740 }, { "entropy": 1.8264511436223985, "epoch": 0.1697202096470827, "grad_norm": 9.439501762390137, "learning_rate": 6.140869467208483e-06, "loss": 0.5153, "mean_token_accuracy": 0.839516569674015, "num_tokens": 65824036.0, "step": 54750 }, { "entropy": 1.9636918157339096, "epoch": 0.1697512087721324, "grad_norm": 8.578853607177734, "learning_rate": 6.140308723782587e-06, "loss": 0.5827, "mean_token_accuracy": 0.8268523201346397, "num_tokens": 65834646.0, "step": 54760 }, { "entropy": 1.8637161239981652, "epoch": 0.1697822078971821, "grad_norm": 8.782843589782715, "learning_rate": 6.1397481339387444e-06, "loss": 0.5552, "mean_token_accuracy": 0.8350477933883667, "num_tokens": 65846256.0, "step": 54770 }, { "entropy": 1.8629076793789863, "epoch": 0.1698132070222318, "grad_norm": 8.186110496520996, "learning_rate": 6.139187697606855e-06, "loss": 0.4779, "mean_token_accuracy": 0.84441829174757, "num_tokens": 65858192.0, "step": 54780 }, { "entropy": 1.8901616916060449, "epoch": 0.1698442061472815, "grad_norm": 9.074200630187988, "learning_rate": 6.138627414716874e-06, "loss": 0.5114, "mean_token_accuracy": 0.8320909798145294, "num_tokens": 65869497.0, "step": 54790 }, { "entropy": 1.8830202847719193, "epoch": 0.1698752052723312, "grad_norm": 8.882548332214355, "learning_rate": 6.138067285198796e-06, "loss": 0.5194, "mean_token_accuracy": 0.8349261358380318, "num_tokens": 65881627.0, "step": 54800 }, { "entropy": 1.8568723633885384, "epoch": 0.16990620439738088, "grad_norm": 8.651144981384277, "learning_rate": 6.1375073089826556e-06, "loss": 0.5063, "mean_token_accuracy": 0.8417845577001571, "num_tokens": 65894013.0, "step": 54810 }, { "entropy": 1.6487878575921058, "epoch": 0.16993720352243058, "grad_norm": 2.7366909980773926, "learning_rate": 6.13694748599854e-06, "loss": 0.3365, "mean_token_accuracy": 0.862517063319683, "num_tokens": 65909433.0, "step": 54820 }, { "entropy": 1.9101332992315292, "epoch": 0.16996820264748028, "grad_norm": 9.56188678741455, "learning_rate": 6.136387816176578e-06, "loss": 0.596, "mean_token_accuracy": 0.8215406507253646, "num_tokens": 65921625.0, "step": 54830 }, { "entropy": 1.7827743321657181, "epoch": 0.16999920177252997, "grad_norm": 8.454669952392578, "learning_rate": 6.135828299446942e-06, "loss": 0.439, "mean_token_accuracy": 0.8514051124453544, "num_tokens": 65934681.0, "step": 54840 }, { "entropy": 1.8705067232251167, "epoch": 0.17003020089757967, "grad_norm": 8.229545593261719, "learning_rate": 6.135268935739851e-06, "loss": 0.4988, "mean_token_accuracy": 0.847279503941536, "num_tokens": 65946087.0, "step": 54850 }, { "entropy": 1.7427136436104775, "epoch": 0.17006120002262937, "grad_norm": 10.23863697052002, "learning_rate": 6.134709724985567e-06, "loss": 0.4771, "mean_token_accuracy": 0.8420625299215316, "num_tokens": 65959753.0, "step": 54860 }, { "entropy": 1.8604324340820313, "epoch": 0.17009219914767906, "grad_norm": 9.759727478027344, "learning_rate": 6.134150667114395e-06, "loss": 0.5228, "mean_token_accuracy": 0.8294590935111046, "num_tokens": 65971696.0, "step": 54870 }, { "entropy": 1.8827021196484566, "epoch": 0.17012319827272876, "grad_norm": 8.212512969970703, "learning_rate": 6.133591762056689e-06, "loss": 0.4834, "mean_token_accuracy": 0.8433489888906479, "num_tokens": 65983076.0, "step": 54880 }, { "entropy": 1.8750552967190743, "epoch": 0.17015419739777846, "grad_norm": 9.474791526794434, "learning_rate": 6.133033009742842e-06, "loss": 0.4965, "mean_token_accuracy": 0.8393171012401581, "num_tokens": 65994772.0, "step": 54890 }, { "entropy": 1.7940944850444793, "epoch": 0.17018519652282815, "grad_norm": 4.888198375701904, "learning_rate": 6.132474410103298e-06, "loss": 0.4454, "mean_token_accuracy": 0.8466850072145462, "num_tokens": 66007645.0, "step": 54900 }, { "entropy": 1.8010751977562904, "epoch": 0.17021619564787785, "grad_norm": 8.661337852478027, "learning_rate": 6.131915963068537e-06, "loss": 0.4722, "mean_token_accuracy": 0.8513104304671287, "num_tokens": 66020326.0, "step": 54910 }, { "entropy": 1.7909870207309724, "epoch": 0.17024719477292755, "grad_norm": 7.634841442108154, "learning_rate": 6.13135766856909e-06, "loss": 0.4178, "mean_token_accuracy": 0.8531160518527031, "num_tokens": 66033075.0, "step": 54920 }, { "entropy": 1.8581923857331275, "epoch": 0.17027819389797724, "grad_norm": 8.779280662536621, "learning_rate": 6.130799526535529e-06, "loss": 0.5948, "mean_token_accuracy": 0.8206439360976219, "num_tokens": 66046620.0, "step": 54930 }, { "entropy": 1.8611151441931724, "epoch": 0.1703091930230269, "grad_norm": 4.603653430938721, "learning_rate": 6.1302415368984725e-06, "loss": 0.5261, "mean_token_accuracy": 0.8282294601202012, "num_tokens": 66059390.0, "step": 54940 }, { "entropy": 1.9167177721858024, "epoch": 0.1703401921480766, "grad_norm": 3.782064199447632, "learning_rate": 6.129683699588581e-06, "loss": 0.5186, "mean_token_accuracy": 0.8340412959456444, "num_tokens": 66071324.0, "step": 54950 }, { "entropy": 1.7558236002922059, "epoch": 0.1703711912731263, "grad_norm": 3.8991594314575195, "learning_rate": 6.129126014536561e-06, "loss": 0.403, "mean_token_accuracy": 0.8545200228691101, "num_tokens": 66084375.0, "step": 54960 }, { "entropy": 1.9062550529837607, "epoch": 0.170402190398176, "grad_norm": 3.855738639831543, "learning_rate": 6.12856848167316e-06, "loss": 0.5194, "mean_token_accuracy": 0.8293624669313431, "num_tokens": 66096293.0, "step": 54970 }, { "entropy": 1.9391276210546493, "epoch": 0.1704331895232257, "grad_norm": 8.4966402053833, "learning_rate": 6.1280111009291744e-06, "loss": 0.5505, "mean_token_accuracy": 0.8367387875914574, "num_tokens": 66106920.0, "step": 54980 }, { "entropy": 1.8991558268666267, "epoch": 0.1704641886482754, "grad_norm": 5.249369144439697, "learning_rate": 6.127453872235442e-06, "loss": 0.5751, "mean_token_accuracy": 0.8212674275040627, "num_tokens": 66118337.0, "step": 54990 }, { "entropy": 1.8485107704997064, "epoch": 0.1704951877733251, "grad_norm": 9.159661293029785, "learning_rate": 6.1268967955228405e-06, "loss": 0.5451, "mean_token_accuracy": 0.8363189935684204, "num_tokens": 66130596.0, "step": 55000 }, { "entropy": 1.851409375667572, "epoch": 0.1705261868983748, "grad_norm": 8.441533088684082, "learning_rate": 6.126339870722301e-06, "loss": 0.4712, "mean_token_accuracy": 0.8494085937738418, "num_tokens": 66142188.0, "step": 55010 }, { "entropy": 1.9319586962461472, "epoch": 0.17055718602342448, "grad_norm": 9.650931358337402, "learning_rate": 6.12578309776479e-06, "loss": 0.5737, "mean_token_accuracy": 0.8261142611503601, "num_tokens": 66153013.0, "step": 55020 }, { "entropy": 1.8354326650500297, "epoch": 0.17058818514847418, "grad_norm": 3.9305970668792725, "learning_rate": 6.125226476581324e-06, "loss": 0.4783, "mean_token_accuracy": 0.8330473214387893, "num_tokens": 66165874.0, "step": 55030 }, { "entropy": 1.9324169605970383, "epoch": 0.17061918427352388, "grad_norm": 4.871352672576904, "learning_rate": 6.124670007102958e-06, "loss": 0.5163, "mean_token_accuracy": 0.8405327409505844, "num_tokens": 66176898.0, "step": 55040 }, { "entropy": 1.8468404799699782, "epoch": 0.17065018339857357, "grad_norm": 9.20655632019043, "learning_rate": 6.124113689260793e-06, "loss": 0.5028, "mean_token_accuracy": 0.8357664123177528, "num_tokens": 66188431.0, "step": 55050 }, { "entropy": 1.7879943639039992, "epoch": 0.17068118252362327, "grad_norm": 3.959418296813965, "learning_rate": 6.123557522985977e-06, "loss": 0.4314, "mean_token_accuracy": 0.8417293280363083, "num_tokens": 66201593.0, "step": 55060 }, { "entropy": 1.924604320526123, "epoch": 0.17071218164867297, "grad_norm": 8.743783950805664, "learning_rate": 6.123001508209696e-06, "loss": 0.6023, "mean_token_accuracy": 0.8270179316401481, "num_tokens": 66212878.0, "step": 55070 }, { "entropy": 1.9165123641490935, "epoch": 0.17074318077372266, "grad_norm": 7.5204949378967285, "learning_rate": 6.122445644863187e-06, "loss": 0.5624, "mean_token_accuracy": 0.8256003141403199, "num_tokens": 66224720.0, "step": 55080 }, { "entropy": 1.889313419163227, "epoch": 0.17077417989877236, "grad_norm": 8.590926170349121, "learning_rate": 6.121889932877719e-06, "loss": 0.5542, "mean_token_accuracy": 0.8257635533809662, "num_tokens": 66236384.0, "step": 55090 }, { "entropy": 1.870141127705574, "epoch": 0.17080517902382206, "grad_norm": 3.6843600273132324, "learning_rate": 6.121334372184618e-06, "loss": 0.5093, "mean_token_accuracy": 0.8345419481396675, "num_tokens": 66248946.0, "step": 55100 }, { "entropy": 1.8741691306233406, "epoch": 0.17083617814887175, "grad_norm": 9.643596649169922, "learning_rate": 6.120778962715248e-06, "loss": 0.4838, "mean_token_accuracy": 0.8437133207917213, "num_tokens": 66260837.0, "step": 55110 }, { "entropy": 1.8679534777998925, "epoch": 0.17086717727392145, "grad_norm": 4.9696149826049805, "learning_rate": 6.120223704401012e-06, "loss": 0.4552, "mean_token_accuracy": 0.8462657079100608, "num_tokens": 66272827.0, "step": 55120 }, { "entropy": 1.8148210868239403, "epoch": 0.17089817639897115, "grad_norm": 8.890388488769531, "learning_rate": 6.119668597173365e-06, "loss": 0.4812, "mean_token_accuracy": 0.8454957276582717, "num_tokens": 66285297.0, "step": 55130 }, { "entropy": 1.8707555949687957, "epoch": 0.17092917552402084, "grad_norm": 8.708013534545898, "learning_rate": 6.119113640963797e-06, "loss": 0.5601, "mean_token_accuracy": 0.8221472725272179, "num_tokens": 66296618.0, "step": 55140 }, { "entropy": 1.8355353966355323, "epoch": 0.17096017464907054, "grad_norm": 9.65738296508789, "learning_rate": 6.11855883570385e-06, "loss": 0.5414, "mean_token_accuracy": 0.8310455739498138, "num_tokens": 66309122.0, "step": 55150 }, { "entropy": 1.8585526466369628, "epoch": 0.17099117377412024, "grad_norm": 9.257162094116211, "learning_rate": 6.118004181325103e-06, "loss": 0.5515, "mean_token_accuracy": 0.8375745385885238, "num_tokens": 66321433.0, "step": 55160 }, { "entropy": 1.8798829302191735, "epoch": 0.17102217289916993, "grad_norm": 8.34717845916748, "learning_rate": 6.117449677759181e-06, "loss": 0.5329, "mean_token_accuracy": 0.8309325784444809, "num_tokens": 66333576.0, "step": 55170 }, { "entropy": 1.9052462548017501, "epoch": 0.17105317202421963, "grad_norm": 9.19088363647461, "learning_rate": 6.11689532493775e-06, "loss": 0.5508, "mean_token_accuracy": 0.8245049402117729, "num_tokens": 66345424.0, "step": 55180 }, { "entropy": 1.8513251379132272, "epoch": 0.1710841711492693, "grad_norm": 4.772027969360352, "learning_rate": 6.1163411227925265e-06, "loss": 0.5349, "mean_token_accuracy": 0.8387829244136811, "num_tokens": 66357556.0, "step": 55190 }, { "entropy": 1.8194545328617096, "epoch": 0.171115170274319, "grad_norm": 8.34814453125, "learning_rate": 6.11578707125526e-06, "loss": 0.5304, "mean_token_accuracy": 0.8422711014747619, "num_tokens": 66370273.0, "step": 55200 }, { "entropy": 1.8819702237844467, "epoch": 0.1711461693993687, "grad_norm": 8.056477546691895, "learning_rate": 6.1152331702577514e-06, "loss": 0.5244, "mean_token_accuracy": 0.8342429891228675, "num_tokens": 66382365.0, "step": 55210 }, { "entropy": 1.898891557753086, "epoch": 0.1711771685244184, "grad_norm": 4.703845977783203, "learning_rate": 6.114679419731841e-06, "loss": 0.5002, "mean_token_accuracy": 0.8460740655660629, "num_tokens": 66393344.0, "step": 55220 }, { "entropy": 1.957570144534111, "epoch": 0.17120816764946808, "grad_norm": 7.900186061859131, "learning_rate": 6.114125819609411e-06, "loss": 0.5556, "mean_token_accuracy": 0.8310376718640328, "num_tokens": 66404439.0, "step": 55230 }, { "entropy": 1.8322395712137223, "epoch": 0.17123916677451778, "grad_norm": 3.6799228191375732, "learning_rate": 6.113572369822391e-06, "loss": 0.5098, "mean_token_accuracy": 0.8300682485103608, "num_tokens": 66417136.0, "step": 55240 }, { "entropy": 1.8394572094082833, "epoch": 0.17127016589956748, "grad_norm": 4.595379829406738, "learning_rate": 6.113019070302754e-06, "loss": 0.4841, "mean_token_accuracy": 0.8420197859406471, "num_tokens": 66429039.0, "step": 55250 }, { "entropy": 1.8247755616903305, "epoch": 0.17130116502461717, "grad_norm": 7.622194290161133, "learning_rate": 6.11246592098251e-06, "loss": 0.4805, "mean_token_accuracy": 0.8401867300271988, "num_tokens": 66441004.0, "step": 55260 }, { "entropy": 1.858515314757824, "epoch": 0.17133216414966687, "grad_norm": 9.903068542480469, "learning_rate": 6.111912921793715e-06, "loss": 0.4979, "mean_token_accuracy": 0.8486079826951027, "num_tokens": 66452827.0, "step": 55270 }, { "entropy": 1.8595426589250565, "epoch": 0.17136316327471657, "grad_norm": 6.444674968719482, "learning_rate": 6.111360072668473e-06, "loss": 0.5119, "mean_token_accuracy": 0.8341106563806534, "num_tokens": 66464688.0, "step": 55280 }, { "entropy": 1.7985023483633995, "epoch": 0.17139416239976626, "grad_norm": 9.139451026916504, "learning_rate": 6.110807373538924e-06, "loss": 0.4902, "mean_token_accuracy": 0.8438055410981178, "num_tokens": 66478770.0, "step": 55290 }, { "entropy": 1.8379919603466988, "epoch": 0.17142516152481596, "grad_norm": 2.0652432441711426, "learning_rate": 6.110254824337254e-06, "loss": 0.5378, "mean_token_accuracy": 0.8349084377288818, "num_tokens": 66491604.0, "step": 55300 }, { "entropy": 1.8246984764933587, "epoch": 0.17145616064986566, "grad_norm": 4.141010284423828, "learning_rate": 6.109702424995692e-06, "loss": 0.461, "mean_token_accuracy": 0.8502539679408073, "num_tokens": 66503887.0, "step": 55310 }, { "entropy": 1.7455498218536376, "epoch": 0.17148715977491535, "grad_norm": 7.454645156860352, "learning_rate": 6.1091501754465084e-06, "loss": 0.4684, "mean_token_accuracy": 0.8398867219686508, "num_tokens": 66516981.0, "step": 55320 }, { "entropy": 1.8381555840373038, "epoch": 0.17151815889996505, "grad_norm": 8.063959121704102, "learning_rate": 6.10859807562202e-06, "loss": 0.5018, "mean_token_accuracy": 0.8499990120530129, "num_tokens": 66528982.0, "step": 55330 }, { "entropy": 1.9049826204776763, "epoch": 0.17154915802501475, "grad_norm": 8.288491249084473, "learning_rate": 6.108046125454582e-06, "loss": 0.5242, "mean_token_accuracy": 0.8419345527887344, "num_tokens": 66540768.0, "step": 55340 }, { "entropy": 1.8578782469034194, "epoch": 0.17158015715006444, "grad_norm": 7.979318618774414, "learning_rate": 6.107494324876594e-06, "loss": 0.5466, "mean_token_accuracy": 0.8317851856350899, "num_tokens": 66552308.0, "step": 55350 }, { "entropy": 1.8647135615348815, "epoch": 0.17161115627511414, "grad_norm": 4.911669731140137, "learning_rate": 6.1069426738205e-06, "loss": 0.5339, "mean_token_accuracy": 0.8424048766493797, "num_tokens": 66564209.0, "step": 55360 }, { "entropy": 1.9092617228627204, "epoch": 0.17164215540016384, "grad_norm": 9.604966163635254, "learning_rate": 6.106391172218784e-06, "loss": 0.5279, "mean_token_accuracy": 0.8417677640914917, "num_tokens": 66575424.0, "step": 55370 }, { "entropy": 1.8798686414957047, "epoch": 0.17167315452521353, "grad_norm": 9.60474681854248, "learning_rate": 6.105839820003976e-06, "loss": 0.4949, "mean_token_accuracy": 0.8326731339097023, "num_tokens": 66588461.0, "step": 55380 }, { "entropy": 1.787381762266159, "epoch": 0.17170415365026323, "grad_norm": 8.320876121520996, "learning_rate": 6.105288617108646e-06, "loss": 0.4899, "mean_token_accuracy": 0.8443598464131356, "num_tokens": 66601689.0, "step": 55390 }, { "entropy": 1.8882825881242753, "epoch": 0.17173515277531293, "grad_norm": 8.86194896697998, "learning_rate": 6.104737563465406e-06, "loss": 0.5774, "mean_token_accuracy": 0.8265114605426789, "num_tokens": 66613156.0, "step": 55400 }, { "entropy": 1.9447656571865082, "epoch": 0.17176615190036262, "grad_norm": 10.87563419342041, "learning_rate": 6.104186659006913e-06, "loss": 0.5485, "mean_token_accuracy": 0.8329192861914635, "num_tokens": 66623750.0, "step": 55410 }, { "entropy": 1.862581080198288, "epoch": 0.17179715102541232, "grad_norm": 9.020846366882324, "learning_rate": 6.103635903665865e-06, "loss": 0.5222, "mean_token_accuracy": 0.8274956732988358, "num_tokens": 66636578.0, "step": 55420 }, { "entropy": 1.8903913646936417, "epoch": 0.17182815015046202, "grad_norm": 8.09145736694336, "learning_rate": 6.103085297375004e-06, "loss": 0.5167, "mean_token_accuracy": 0.8357395201921463, "num_tokens": 66648330.0, "step": 55430 }, { "entropy": 1.9172933578491211, "epoch": 0.17185914927551169, "grad_norm": 4.03477144241333, "learning_rate": 6.102534840067114e-06, "loss": 0.5521, "mean_token_accuracy": 0.8339840278029442, "num_tokens": 66660056.0, "step": 55440 }, { "entropy": 1.9062219202518462, "epoch": 0.17189014840056138, "grad_norm": 9.107460021972656, "learning_rate": 6.101984531675016e-06, "loss": 0.565, "mean_token_accuracy": 0.833404429256916, "num_tokens": 66670678.0, "step": 55450 }, { "entropy": 1.9296542406082153, "epoch": 0.17192114752561108, "grad_norm": 8.541874885559082, "learning_rate": 6.1014343721315835e-06, "loss": 0.517, "mean_token_accuracy": 0.8412581771612168, "num_tokens": 66680863.0, "step": 55460 }, { "entropy": 1.9104670375585555, "epoch": 0.17195214665066078, "grad_norm": 7.40362548828125, "learning_rate": 6.1008843613697255e-06, "loss": 0.5266, "mean_token_accuracy": 0.8393293395638466, "num_tokens": 66692889.0, "step": 55470 }, { "entropy": 1.8890238150954246, "epoch": 0.17198314577571047, "grad_norm": 9.017800331115723, "learning_rate": 6.100334499322393e-06, "loss": 0.5257, "mean_token_accuracy": 0.834419809281826, "num_tokens": 66705358.0, "step": 55480 }, { "entropy": 1.8500746071338654, "epoch": 0.17201414490076017, "grad_norm": 8.356595993041992, "learning_rate": 6.099784785922585e-06, "loss": 0.4723, "mean_token_accuracy": 0.8483300238847733, "num_tokens": 66717381.0, "step": 55490 }, { "entropy": 1.9085285305976867, "epoch": 0.17204514402580987, "grad_norm": 9.017643928527832, "learning_rate": 6.0992352211033335e-06, "loss": 0.5308, "mean_token_accuracy": 0.8379713967442513, "num_tokens": 66729410.0, "step": 55500 }, { "entropy": 1.914148934185505, "epoch": 0.17207614315085956, "grad_norm": 8.142106056213379, "learning_rate": 6.098685804797724e-06, "loss": 0.5254, "mean_token_accuracy": 0.8404997661709785, "num_tokens": 66740551.0, "step": 55510 }, { "entropy": 1.9194943860173226, "epoch": 0.17210714227590926, "grad_norm": 8.375265121459961, "learning_rate": 6.098136536938873e-06, "loss": 0.5537, "mean_token_accuracy": 0.8302345380187035, "num_tokens": 66751861.0, "step": 55520 }, { "entropy": 1.8494600921869278, "epoch": 0.17213814140095895, "grad_norm": 7.672314643859863, "learning_rate": 6.097587417459949e-06, "loss": 0.4787, "mean_token_accuracy": 0.8483859553933144, "num_tokens": 66764180.0, "step": 55530 }, { "entropy": 1.876507543027401, "epoch": 0.17216914052600865, "grad_norm": 8.333982467651367, "learning_rate": 6.097038446294156e-06, "loss": 0.4906, "mean_token_accuracy": 0.8330820754170418, "num_tokens": 66776554.0, "step": 55540 }, { "entropy": 1.8608839854598045, "epoch": 0.17220013965105835, "grad_norm": 8.950138092041016, "learning_rate": 6.096489623374742e-06, "loss": 0.5448, "mean_token_accuracy": 0.8301695078611374, "num_tokens": 66789000.0, "step": 55550 }, { "entropy": 1.871629747748375, "epoch": 0.17223113877610804, "grad_norm": 4.220654487609863, "learning_rate": 6.095940948634997e-06, "loss": 0.578, "mean_token_accuracy": 0.8214164420962333, "num_tokens": 66802167.0, "step": 55560 }, { "entropy": 1.8488945305347442, "epoch": 0.17226213790115774, "grad_norm": 4.729830265045166, "learning_rate": 6.095392422008255e-06, "loss": 0.4952, "mean_token_accuracy": 0.8359545990824699, "num_tokens": 66813727.0, "step": 55570 }, { "entropy": 1.8354024216532707, "epoch": 0.17229313702620744, "grad_norm": 8.174845695495605, "learning_rate": 6.094844043427889e-06, "loss": 0.4816, "mean_token_accuracy": 0.8510124906897545, "num_tokens": 66825754.0, "step": 55580 }, { "entropy": 1.8678600177168847, "epoch": 0.17232413615125713, "grad_norm": 11.34805679321289, "learning_rate": 6.094295812827316e-06, "loss": 0.4958, "mean_token_accuracy": 0.8442059218883514, "num_tokens": 66838613.0, "step": 55590 }, { "entropy": 1.8801333606243134, "epoch": 0.17235513527630683, "grad_norm": 8.798630714416504, "learning_rate": 6.0937477301399924e-06, "loss": 0.5115, "mean_token_accuracy": 0.8383255407214165, "num_tokens": 66850523.0, "step": 55600 }, { "entropy": 1.863646823167801, "epoch": 0.17238613440135653, "grad_norm": 8.633447647094727, "learning_rate": 6.093199795299421e-06, "loss": 0.5343, "mean_token_accuracy": 0.8378429308533668, "num_tokens": 66863104.0, "step": 55610 }, { "entropy": 1.9260516792535782, "epoch": 0.17241713352640622, "grad_norm": 6.970504283905029, "learning_rate": 6.092652008239141e-06, "loss": 0.5621, "mean_token_accuracy": 0.8314559891819954, "num_tokens": 66874671.0, "step": 55620 }, { "entropy": 1.9149534597992897, "epoch": 0.17244813265145592, "grad_norm": 5.021518230438232, "learning_rate": 6.0921043688927366e-06, "loss": 0.53, "mean_token_accuracy": 0.8325030341744423, "num_tokens": 66886294.0, "step": 55630 }, { "entropy": 1.9480685591697693, "epoch": 0.17247913177650562, "grad_norm": 8.716386795043945, "learning_rate": 6.091556877193834e-06, "loss": 0.5884, "mean_token_accuracy": 0.8272656366229058, "num_tokens": 66897341.0, "step": 55640 }, { "entropy": 1.8431043431162835, "epoch": 0.17251013090155531, "grad_norm": 8.646214485168457, "learning_rate": 6.091009533076101e-06, "loss": 0.4873, "mean_token_accuracy": 0.8376592069864273, "num_tokens": 66909553.0, "step": 55650 }, { "entropy": 1.9310444086790084, "epoch": 0.172541130026605, "grad_norm": 7.217070579528809, "learning_rate": 6.090462336473245e-06, "loss": 0.6076, "mean_token_accuracy": 0.8188234835863113, "num_tokens": 66920698.0, "step": 55660 }, { "entropy": 1.7853051990270614, "epoch": 0.1725721291516547, "grad_norm": 11.338160514831543, "learning_rate": 6.089915287319018e-06, "loss": 0.5264, "mean_token_accuracy": 0.8323128372430801, "num_tokens": 66933403.0, "step": 55670 }, { "entropy": 1.8359568476676942, "epoch": 0.17260312827670438, "grad_norm": 4.960665702819824, "learning_rate": 6.089368385547212e-06, "loss": 0.4729, "mean_token_accuracy": 0.837063156068325, "num_tokens": 66945928.0, "step": 55680 }, { "entropy": 1.7752636238932609, "epoch": 0.17263412740175407, "grad_norm": 9.67094612121582, "learning_rate": 6.088821631091659e-06, "loss": 0.4704, "mean_token_accuracy": 0.8366348952054977, "num_tokens": 66959462.0, "step": 55690 }, { "entropy": 1.7888485848903657, "epoch": 0.17266512652680377, "grad_norm": 9.076850891113281, "learning_rate": 6.088275023886237e-06, "loss": 0.4558, "mean_token_accuracy": 0.8395483061671257, "num_tokens": 66972656.0, "step": 55700 }, { "entropy": 1.8987372756004333, "epoch": 0.17269612565185347, "grad_norm": 7.72656774520874, "learning_rate": 6.087728563864862e-06, "loss": 0.5102, "mean_token_accuracy": 0.8445681780576706, "num_tokens": 66983938.0, "step": 55710 }, { "entropy": 1.894448073208332, "epoch": 0.17272712477690316, "grad_norm": 8.844822883605957, "learning_rate": 6.087182250961492e-06, "loss": 0.5572, "mean_token_accuracy": 0.8229103952646255, "num_tokens": 66996286.0, "step": 55720 }, { "entropy": 1.7768873780965806, "epoch": 0.17275812390195286, "grad_norm": 4.438620567321777, "learning_rate": 6.086636085110128e-06, "loss": 0.394, "mean_token_accuracy": 0.8516925528645516, "num_tokens": 67010213.0, "step": 55730 }, { "entropy": 1.8323894530534743, "epoch": 0.17278912302700256, "grad_norm": 9.754355430603027, "learning_rate": 6.08609006624481e-06, "loss": 0.4419, "mean_token_accuracy": 0.8407779783010483, "num_tokens": 67022907.0, "step": 55740 }, { "entropy": 1.8603756889700889, "epoch": 0.17282012215205225, "grad_norm": 10.162839889526367, "learning_rate": 6.085544194299622e-06, "loss": 0.5008, "mean_token_accuracy": 0.8378032267093658, "num_tokens": 67035045.0, "step": 55750 }, { "entropy": 1.7585888996720314, "epoch": 0.17285112127710195, "grad_norm": 9.135080337524414, "learning_rate": 6.084998469208687e-06, "loss": 0.4527, "mean_token_accuracy": 0.8512668639421463, "num_tokens": 67048645.0, "step": 55760 }, { "entropy": 1.8613043814897536, "epoch": 0.17288212040215165, "grad_norm": 9.348379135131836, "learning_rate": 6.084452890906173e-06, "loss": 0.4957, "mean_token_accuracy": 0.8385450929403305, "num_tokens": 67060973.0, "step": 55770 }, { "entropy": 1.9134966135025024, "epoch": 0.17291311952720134, "grad_norm": 9.092744827270508, "learning_rate": 6.083907459326285e-06, "loss": 0.5414, "mean_token_accuracy": 0.838169914484024, "num_tokens": 67072327.0, "step": 55780 }, { "entropy": 1.8521252527832985, "epoch": 0.17294411865225104, "grad_norm": 8.718194007873535, "learning_rate": 6.08336217440327e-06, "loss": 0.5484, "mean_token_accuracy": 0.8312934651970864, "num_tokens": 67084319.0, "step": 55790 }, { "entropy": 1.8963706970214844, "epoch": 0.17297511777730074, "grad_norm": 8.831055641174316, "learning_rate": 6.08281703607142e-06, "loss": 0.5294, "mean_token_accuracy": 0.8281921342015266, "num_tokens": 67096177.0, "step": 55800 }, { "entropy": 1.8676214978098868, "epoch": 0.17300611690235043, "grad_norm": 9.61502742767334, "learning_rate": 6.082272044265064e-06, "loss": 0.4739, "mean_token_accuracy": 0.8491611614823341, "num_tokens": 67108106.0, "step": 55810 }, { "entropy": 1.8385474801063537, "epoch": 0.17303711602740013, "grad_norm": 3.662489414215088, "learning_rate": 6.0817271989185745e-06, "loss": 0.4754, "mean_token_accuracy": 0.8533884555101394, "num_tokens": 67120164.0, "step": 55820 }, { "entropy": 1.7729324698448181, "epoch": 0.17306811515244983, "grad_norm": 4.637348175048828, "learning_rate": 6.081182499966365e-06, "loss": 0.4626, "mean_token_accuracy": 0.8501248374581337, "num_tokens": 67134497.0, "step": 55830 }, { "entropy": 1.9585619807243346, "epoch": 0.17309911427749952, "grad_norm": 7.857696056365967, "learning_rate": 6.080637947342887e-06, "loss": 0.5646, "mean_token_accuracy": 0.8382192403078079, "num_tokens": 67145397.0, "step": 55840 }, { "entropy": 1.809555734694004, "epoch": 0.17313011340254922, "grad_norm": 5.004009246826172, "learning_rate": 6.080093540982638e-06, "loss": 0.4573, "mean_token_accuracy": 0.8505311787128449, "num_tokens": 67157963.0, "step": 55850 }, { "entropy": 1.9263376638293266, "epoch": 0.17316111252759891, "grad_norm": 7.850517272949219, "learning_rate": 6.079549280820153e-06, "loss": 0.5709, "mean_token_accuracy": 0.8317690268158913, "num_tokens": 67169537.0, "step": 55860 }, { "entropy": 1.9509922355413436, "epoch": 0.1731921116526486, "grad_norm": 9.430130958557129, "learning_rate": 6.079005166790011e-06, "loss": 0.56, "mean_token_accuracy": 0.8258754447102546, "num_tokens": 67180681.0, "step": 55870 }, { "entropy": 1.9066618397831916, "epoch": 0.1732231107776983, "grad_norm": 8.74619197845459, "learning_rate": 6.078461198826828e-06, "loss": 0.5422, "mean_token_accuracy": 0.8300882831215859, "num_tokens": 67191836.0, "step": 55880 }, { "entropy": 1.7900028765201568, "epoch": 0.173254109902748, "grad_norm": 6.856855869293213, "learning_rate": 6.077917376865262e-06, "loss": 0.4932, "mean_token_accuracy": 0.8394763827323913, "num_tokens": 67206034.0, "step": 55890 }, { "entropy": 1.9227029278874397, "epoch": 0.1732851090277977, "grad_norm": 8.958833694458008, "learning_rate": 6.077373700840018e-06, "loss": 0.4931, "mean_token_accuracy": 0.8387436643242836, "num_tokens": 67217234.0, "step": 55900 }, { "entropy": 1.9202365294098853, "epoch": 0.1733161081528474, "grad_norm": 4.27255392074585, "learning_rate": 6.076830170685832e-06, "loss": 0.5532, "mean_token_accuracy": 0.827874468266964, "num_tokens": 67228530.0, "step": 55910 }, { "entropy": 1.908749097585678, "epoch": 0.1733471072778971, "grad_norm": 8.770615577697754, "learning_rate": 6.07628678633749e-06, "loss": 0.5257, "mean_token_accuracy": 0.8317655339837075, "num_tokens": 67240085.0, "step": 55920 }, { "entropy": 1.8192409992218017, "epoch": 0.17337810640294676, "grad_norm": 8.269194602966309, "learning_rate": 6.0757435477298085e-06, "loss": 0.501, "mean_token_accuracy": 0.8376537606120109, "num_tokens": 67252721.0, "step": 55930 }, { "entropy": 1.835218572616577, "epoch": 0.17340910552799646, "grad_norm": 8.810011863708496, "learning_rate": 6.075200454797657e-06, "loss": 0.4887, "mean_token_accuracy": 0.8403037443757058, "num_tokens": 67265339.0, "step": 55940 }, { "entropy": 1.9089889451861382, "epoch": 0.17344010465304616, "grad_norm": 5.602406978607178, "learning_rate": 6.0746575074759365e-06, "loss": 0.5341, "mean_token_accuracy": 0.8314460068941116, "num_tokens": 67278228.0, "step": 55950 }, { "entropy": 1.8503844618797303, "epoch": 0.17347110377809585, "grad_norm": 4.604806423187256, "learning_rate": 6.074114705699592e-06, "loss": 0.4924, "mean_token_accuracy": 0.8360475450754166, "num_tokens": 67290208.0, "step": 55960 }, { "entropy": 1.8351913020014763, "epoch": 0.17350210290314555, "grad_norm": 10.080077171325684, "learning_rate": 6.073572049403609e-06, "loss": 0.5014, "mean_token_accuracy": 0.8317245200276375, "num_tokens": 67303011.0, "step": 55970 }, { "entropy": 1.7987764164805413, "epoch": 0.17353310202819525, "grad_norm": 8.698798179626465, "learning_rate": 6.073029538523015e-06, "loss": 0.4725, "mean_token_accuracy": 0.8366831094026566, "num_tokens": 67316802.0, "step": 55980 }, { "entropy": 1.8339210391044616, "epoch": 0.17356410115324494, "grad_norm": 3.881253957748413, "learning_rate": 6.072487172992875e-06, "loss": 0.4636, "mean_token_accuracy": 0.8414662271738053, "num_tokens": 67329858.0, "step": 55990 }, { "entropy": 1.8059268981218337, "epoch": 0.17359510027829464, "grad_norm": 4.120025157928467, "learning_rate": 6.0719449527482976e-06, "loss": 0.4618, "mean_token_accuracy": 0.8513453081250191, "num_tokens": 67342567.0, "step": 56000 }, { "entropy": 1.9046938508749007, "epoch": 0.17362609940334434, "grad_norm": 4.1900787353515625, "learning_rate": 6.07140287772443e-06, "loss": 0.5465, "mean_token_accuracy": 0.8171088561415673, "num_tokens": 67354240.0, "step": 56010 }, { "entropy": 1.8154946908354759, "epoch": 0.17365709852839403, "grad_norm": 7.720127582550049, "learning_rate": 6.070860947856461e-06, "loss": 0.5129, "mean_token_accuracy": 0.841141340136528, "num_tokens": 67366512.0, "step": 56020 }, { "entropy": 1.7826613813638688, "epoch": 0.17368809765344373, "grad_norm": 8.429262161254883, "learning_rate": 6.07031916307962e-06, "loss": 0.4866, "mean_token_accuracy": 0.8355521574616432, "num_tokens": 67379705.0, "step": 56030 }, { "entropy": 1.8282306790351868, "epoch": 0.17371909677849343, "grad_norm": 8.058755874633789, "learning_rate": 6.0697775233291746e-06, "loss": 0.5237, "mean_token_accuracy": 0.8397795766592026, "num_tokens": 67392350.0, "step": 56040 }, { "entropy": 1.7306653708219528, "epoch": 0.17375009590354312, "grad_norm": 8.275370597839355, "learning_rate": 6.069236028540436e-06, "loss": 0.4619, "mean_token_accuracy": 0.8475747913122177, "num_tokens": 67406980.0, "step": 56050 }, { "entropy": 1.8713230773806573, "epoch": 0.17378109502859282, "grad_norm": 8.6502103805542, "learning_rate": 6.068694678648755e-06, "loss": 0.5318, "mean_token_accuracy": 0.8327876642346382, "num_tokens": 67419032.0, "step": 56060 }, { "entropy": 1.9657189399003983, "epoch": 0.17381209415364252, "grad_norm": 8.886541366577148, "learning_rate": 6.068153473589519e-06, "loss": 0.5763, "mean_token_accuracy": 0.8254697665572166, "num_tokens": 67429412.0, "step": 56070 }, { "entropy": 1.9012456104159354, "epoch": 0.1738430932786922, "grad_norm": 7.720726013183594, "learning_rate": 6.0676124132981626e-06, "loss": 0.5181, "mean_token_accuracy": 0.8330578565597534, "num_tokens": 67441415.0, "step": 56080 }, { "entropy": 1.8742031693458556, "epoch": 0.1738740924037419, "grad_norm": 3.867934465408325, "learning_rate": 6.067071497710155e-06, "loss": 0.5121, "mean_token_accuracy": 0.8421854302287102, "num_tokens": 67453389.0, "step": 56090 }, { "entropy": 1.9605601608753205, "epoch": 0.1739050915287916, "grad_norm": 8.339366912841797, "learning_rate": 6.066530726761009e-06, "loss": 0.5707, "mean_token_accuracy": 0.8267245456576348, "num_tokens": 67464579.0, "step": 56100 }, { "entropy": 1.9092912346124649, "epoch": 0.1739360906538413, "grad_norm": 8.375153541564941, "learning_rate": 6.065990100386274e-06, "loss": 0.5162, "mean_token_accuracy": 0.8393413156270981, "num_tokens": 67476222.0, "step": 56110 }, { "entropy": 1.805423805117607, "epoch": 0.173967089778891, "grad_norm": 7.936673641204834, "learning_rate": 6.065449618521544e-06, "loss": 0.4564, "mean_token_accuracy": 0.8524986699223518, "num_tokens": 67488739.0, "step": 56120 }, { "entropy": 1.8608886793255806, "epoch": 0.1739980889039407, "grad_norm": 9.917237281799316, "learning_rate": 6.0649092811024514e-06, "loss": 0.51, "mean_token_accuracy": 0.8281596288084984, "num_tokens": 67500883.0, "step": 56130 }, { "entropy": 1.9484786361455917, "epoch": 0.1740290880289904, "grad_norm": 11.2413969039917, "learning_rate": 6.064369088064665e-06, "loss": 0.5384, "mean_token_accuracy": 0.8397915035486221, "num_tokens": 67512036.0, "step": 56140 }, { "entropy": 1.896202652156353, "epoch": 0.1740600871540401, "grad_norm": 10.009648323059082, "learning_rate": 6.063829039343899e-06, "loss": 0.4998, "mean_token_accuracy": 0.8400693356990814, "num_tokens": 67523562.0, "step": 56150 }, { "entropy": 1.8820205710828304, "epoch": 0.17409108627908979, "grad_norm": 4.450737953186035, "learning_rate": 6.063289134875907e-06, "loss": 0.4927, "mean_token_accuracy": 0.8278883457183838, "num_tokens": 67536330.0, "step": 56160 }, { "entropy": 1.8365131139755249, "epoch": 0.17412208540413948, "grad_norm": 4.729415416717529, "learning_rate": 6.062749374596479e-06, "loss": 0.4538, "mean_token_accuracy": 0.8442412883043289, "num_tokens": 67549775.0, "step": 56170 }, { "entropy": 1.9157144859433175, "epoch": 0.17415308452918915, "grad_norm": 8.306681632995605, "learning_rate": 6.062209758441451e-06, "loss": 0.5175, "mean_token_accuracy": 0.8310726657509804, "num_tokens": 67561670.0, "step": 56180 }, { "entropy": 1.8954879730939864, "epoch": 0.17418408365423885, "grad_norm": 9.215730667114258, "learning_rate": 6.0616702863466905e-06, "loss": 0.5044, "mean_token_accuracy": 0.8425735384225845, "num_tokens": 67573051.0, "step": 56190 }, { "entropy": 1.9187992975115775, "epoch": 0.17421508277928854, "grad_norm": 7.861081600189209, "learning_rate": 6.061130958248112e-06, "loss": 0.601, "mean_token_accuracy": 0.8312497869133949, "num_tokens": 67585205.0, "step": 56200 }, { "entropy": 1.954244077205658, "epoch": 0.17424608190433824, "grad_norm": 9.027844429016113, "learning_rate": 6.060591774081669e-06, "loss": 0.5498, "mean_token_accuracy": 0.8323284685611725, "num_tokens": 67596130.0, "step": 56210 }, { "entropy": 1.917762076854706, "epoch": 0.17427708102938794, "grad_norm": 8.658714294433594, "learning_rate": 6.060052733783352e-06, "loss": 0.5318, "mean_token_accuracy": 0.8378539264202118, "num_tokens": 67607764.0, "step": 56220 }, { "entropy": 1.9130154103040695, "epoch": 0.17430808015443763, "grad_norm": 10.506196022033691, "learning_rate": 6.0595138372891934e-06, "loss": 0.5378, "mean_token_accuracy": 0.8319619536399842, "num_tokens": 67619152.0, "step": 56230 }, { "entropy": 1.8991611540317535, "epoch": 0.17433907927948733, "grad_norm": 7.824307918548584, "learning_rate": 6.0589750845352644e-06, "loss": 0.5565, "mean_token_accuracy": 0.8250974491238594, "num_tokens": 67630997.0, "step": 56240 }, { "entropy": 1.8883090645074845, "epoch": 0.17437007840453703, "grad_norm": 8.599298477172852, "learning_rate": 6.058436475457677e-06, "loss": 0.5425, "mean_token_accuracy": 0.831093080341816, "num_tokens": 67642786.0, "step": 56250 }, { "entropy": 1.8896732196211814, "epoch": 0.17440107752958672, "grad_norm": 4.400026798248291, "learning_rate": 6.057898009992582e-06, "loss": 0.53, "mean_token_accuracy": 0.8287911862134933, "num_tokens": 67654432.0, "step": 56260 }, { "entropy": 1.941761639714241, "epoch": 0.17443207665463642, "grad_norm": 7.403299808502197, "learning_rate": 6.057359688076171e-06, "loss": 0.5551, "mean_token_accuracy": 0.8296359524130821, "num_tokens": 67665128.0, "step": 56270 }, { "entropy": 1.8876193895936013, "epoch": 0.17446307577968612, "grad_norm": 8.507538795471191, "learning_rate": 6.0568215096446736e-06, "loss": 0.5421, "mean_token_accuracy": 0.8257350817322731, "num_tokens": 67676670.0, "step": 56280 }, { "entropy": 1.9427810519933701, "epoch": 0.1744940749047358, "grad_norm": 8.400871276855469, "learning_rate": 6.0562834746343615e-06, "loss": 0.5415, "mean_token_accuracy": 0.8306556642055511, "num_tokens": 67688070.0, "step": 56290 }, { "entropy": 1.8497335493564606, "epoch": 0.1745250740297855, "grad_norm": 8.613706588745117, "learning_rate": 6.0557455829815425e-06, "loss": 0.4978, "mean_token_accuracy": 0.8401633113622665, "num_tokens": 67699880.0, "step": 56300 }, { "entropy": 1.8545069240033627, "epoch": 0.1745560731548352, "grad_norm": 8.968703269958496, "learning_rate": 6.055207834622569e-06, "loss": 0.4838, "mean_token_accuracy": 0.83926263153553, "num_tokens": 67712700.0, "step": 56310 }, { "entropy": 1.878834429383278, "epoch": 0.1745870722798849, "grad_norm": 3.72434663772583, "learning_rate": 6.054670229493826e-06, "loss": 0.5328, "mean_token_accuracy": 0.837174066901207, "num_tokens": 67724809.0, "step": 56320 }, { "entropy": 1.891117848455906, "epoch": 0.1746180714049346, "grad_norm": 7.3367180824279785, "learning_rate": 6.054132767531746e-06, "loss": 0.5233, "mean_token_accuracy": 0.8528507500886917, "num_tokens": 67735990.0, "step": 56330 }, { "entropy": 1.818806654214859, "epoch": 0.1746490705299843, "grad_norm": 4.021301746368408, "learning_rate": 6.053595448672795e-06, "loss": 0.4479, "mean_token_accuracy": 0.848919802904129, "num_tokens": 67748381.0, "step": 56340 }, { "entropy": 1.9311149969697, "epoch": 0.174680069655034, "grad_norm": 9.249166488647461, "learning_rate": 6.053058272853482e-06, "loss": 0.5285, "mean_token_accuracy": 0.8428058341145516, "num_tokens": 67759883.0, "step": 56350 }, { "entropy": 1.8628065332770347, "epoch": 0.1747110687800837, "grad_norm": 5.258411884307861, "learning_rate": 6.0525212400103525e-06, "loss": 0.6164, "mean_token_accuracy": 0.8357118725776672, "num_tokens": 67772424.0, "step": 56360 }, { "entropy": 1.892784410715103, "epoch": 0.17474206790513339, "grad_norm": 3.5523910522460938, "learning_rate": 6.051984350079994e-06, "loss": 0.5414, "mean_token_accuracy": 0.83593248128891, "num_tokens": 67783804.0, "step": 56370 }, { "entropy": 1.8566182047128676, "epoch": 0.17477306703018308, "grad_norm": 10.872590065002441, "learning_rate": 6.051447602999031e-06, "loss": 0.5343, "mean_token_accuracy": 0.8340441033244133, "num_tokens": 67795593.0, "step": 56380 }, { "entropy": 1.8216275811195373, "epoch": 0.17480406615523278, "grad_norm": 8.305917739868164, "learning_rate": 6.050910998704129e-06, "loss": 0.4729, "mean_token_accuracy": 0.8441584646701813, "num_tokens": 67807848.0, "step": 56390 }, { "entropy": 1.8245621785521506, "epoch": 0.17483506528028248, "grad_norm": 4.366250514984131, "learning_rate": 6.050374537131993e-06, "loss": 0.4799, "mean_token_accuracy": 0.8390408426523208, "num_tokens": 67820873.0, "step": 56400 }, { "entropy": 1.8166996166110039, "epoch": 0.17486606440533217, "grad_norm": 8.542943954467773, "learning_rate": 6.049838218219366e-06, "loss": 0.4522, "mean_token_accuracy": 0.8468366071581841, "num_tokens": 67833571.0, "step": 56410 }, { "entropy": 1.858149343729019, "epoch": 0.17489706353038187, "grad_norm": 8.90046501159668, "learning_rate": 6.049302041903031e-06, "loss": 0.4834, "mean_token_accuracy": 0.8406156003475189, "num_tokens": 67845941.0, "step": 56420 }, { "entropy": 1.965251961350441, "epoch": 0.17492806265543154, "grad_norm": 10.929553985595703, "learning_rate": 6.048766008119811e-06, "loss": 0.5884, "mean_token_accuracy": 0.8183304697275162, "num_tokens": 67856898.0, "step": 56430 }, { "entropy": 1.9665222853422164, "epoch": 0.17495906178048123, "grad_norm": 9.303450584411621, "learning_rate": 6.048230116806566e-06, "loss": 0.5542, "mean_token_accuracy": 0.8342076927423477, "num_tokens": 67868059.0, "step": 56440 }, { "entropy": 1.916182427108288, "epoch": 0.17499006090553093, "grad_norm": 7.816822052001953, "learning_rate": 6.047694367900196e-06, "loss": 0.5245, "mean_token_accuracy": 0.8353784546256066, "num_tokens": 67880386.0, "step": 56450 }, { "entropy": 1.9688440665602684, "epoch": 0.17502106003058063, "grad_norm": 8.755724906921387, "learning_rate": 6.047158761337643e-06, "loss": 0.5154, "mean_token_accuracy": 0.8388349682092666, "num_tokens": 67892083.0, "step": 56460 }, { "entropy": 1.8669242531061172, "epoch": 0.17505205915563032, "grad_norm": 7.993595600128174, "learning_rate": 6.046623297055885e-06, "loss": 0.5223, "mean_token_accuracy": 0.8375104799866676, "num_tokens": 67904144.0, "step": 56470 }, { "entropy": 1.816992525756359, "epoch": 0.17508305828068002, "grad_norm": 4.964598655700684, "learning_rate": 6.046087974991937e-06, "loss": 0.4835, "mean_token_accuracy": 0.8406194254755974, "num_tokens": 67917537.0, "step": 56480 }, { "entropy": 1.8135570034384727, "epoch": 0.17511405740572972, "grad_norm": 8.609065055847168, "learning_rate": 6.045552795082859e-06, "loss": 0.4481, "mean_token_accuracy": 0.8388777539134026, "num_tokens": 67931322.0, "step": 56490 }, { "entropy": 1.9724404200911523, "epoch": 0.1751450565307794, "grad_norm": 9.910813331604004, "learning_rate": 6.0450177572657435e-06, "loss": 0.5724, "mean_token_accuracy": 0.829445195198059, "num_tokens": 67942971.0, "step": 56500 }, { "entropy": 1.8769863620400429, "epoch": 0.1751760556558291, "grad_norm": 10.879551887512207, "learning_rate": 6.044482861477728e-06, "loss": 0.4882, "mean_token_accuracy": 0.84668777436018, "num_tokens": 67954835.0, "step": 56510 }, { "entropy": 1.895396353304386, "epoch": 0.1752070547808788, "grad_norm": 7.5075178146362305, "learning_rate": 6.043948107655985e-06, "loss": 0.4617, "mean_token_accuracy": 0.859335508942604, "num_tokens": 67966585.0, "step": 56520 }, { "entropy": 1.845558349788189, "epoch": 0.1752380539059285, "grad_norm": 5.098723411560059, "learning_rate": 6.0434134957377275e-06, "loss": 0.4722, "mean_token_accuracy": 0.8512143403291702, "num_tokens": 67978885.0, "step": 56530 }, { "entropy": 1.8857697710394858, "epoch": 0.1752690530309782, "grad_norm": 9.507451057434082, "learning_rate": 6.042879025660207e-06, "loss": 0.4985, "mean_token_accuracy": 0.8429853811860084, "num_tokens": 67990999.0, "step": 56540 }, { "entropy": 1.8623712986707688, "epoch": 0.1753000521560279, "grad_norm": 9.963186264038086, "learning_rate": 6.042344697360713e-06, "loss": 0.4936, "mean_token_accuracy": 0.8374731317162514, "num_tokens": 68003103.0, "step": 56550 }, { "entropy": 1.8676358297467233, "epoch": 0.1753310512810776, "grad_norm": 7.353335857391357, "learning_rate": 6.041810510776573e-06, "loss": 0.4875, "mean_token_accuracy": 0.8497109815478325, "num_tokens": 68015029.0, "step": 56560 }, { "entropy": 1.9030646055936813, "epoch": 0.1753620504061273, "grad_norm": 4.22896671295166, "learning_rate": 6.041276465845158e-06, "loss": 0.5131, "mean_token_accuracy": 0.8298840820789337, "num_tokens": 68026783.0, "step": 56570 }, { "entropy": 1.9653840601444243, "epoch": 0.175393049531177, "grad_norm": 7.451119899749756, "learning_rate": 6.040742562503874e-06, "loss": 0.6031, "mean_token_accuracy": 0.8305002480745316, "num_tokens": 68037670.0, "step": 56580 }, { "entropy": 1.9089856892824173, "epoch": 0.17542404865622668, "grad_norm": 9.136287689208984, "learning_rate": 6.040208800690164e-06, "loss": 0.4826, "mean_token_accuracy": 0.8452465951442718, "num_tokens": 68049712.0, "step": 56590 }, { "entropy": 1.9428860023617744, "epoch": 0.17545504778127638, "grad_norm": 4.60407829284668, "learning_rate": 6.039675180341514e-06, "loss": 0.5676, "mean_token_accuracy": 0.8250961035490036, "num_tokens": 68061772.0, "step": 56600 }, { "entropy": 1.8673185929656029, "epoch": 0.17548604690632608, "grad_norm": 8.91592788696289, "learning_rate": 6.039141701395445e-06, "loss": 0.5159, "mean_token_accuracy": 0.831261222064495, "num_tokens": 68074176.0, "step": 56610 }, { "entropy": 1.8924739733338356, "epoch": 0.17551704603137577, "grad_norm": 4.301705360412598, "learning_rate": 6.0386083637895194e-06, "loss": 0.589, "mean_token_accuracy": 0.8193011194467544, "num_tokens": 68087007.0, "step": 56620 }, { "entropy": 1.9513219073414803, "epoch": 0.17554804515642547, "grad_norm": 8.441484451293945, "learning_rate": 6.038075167461339e-06, "loss": 0.5498, "mean_token_accuracy": 0.8327566117048264, "num_tokens": 68098862.0, "step": 56630 }, { "entropy": 1.927374078333378, "epoch": 0.17557904428147517, "grad_norm": 9.029617309570312, "learning_rate": 6.037542112348537e-06, "loss": 0.5306, "mean_token_accuracy": 0.831793662905693, "num_tokens": 68111512.0, "step": 56640 }, { "entropy": 1.9223608300089836, "epoch": 0.17561004340652486, "grad_norm": 4.942859172821045, "learning_rate": 6.0370091983887946e-06, "loss": 0.4767, "mean_token_accuracy": 0.8408562824130058, "num_tokens": 68123365.0, "step": 56650 }, { "entropy": 1.9207605987787246, "epoch": 0.17564104253157456, "grad_norm": 8.518644332885742, "learning_rate": 6.036476425519826e-06, "loss": 0.5439, "mean_token_accuracy": 0.8278803005814552, "num_tokens": 68135831.0, "step": 56660 }, { "entropy": 1.8496787279844285, "epoch": 0.17567204165662423, "grad_norm": 8.98222541809082, "learning_rate": 6.0359437936793865e-06, "loss": 0.4803, "mean_token_accuracy": 0.8472736284136773, "num_tokens": 68147503.0, "step": 56670 }, { "entropy": 1.9197917222976684, "epoch": 0.17570304078167392, "grad_norm": 8.030874252319336, "learning_rate": 6.0354113028052645e-06, "loss": 0.5854, "mean_token_accuracy": 0.8222957968711853, "num_tokens": 68159296.0, "step": 56680 }, { "entropy": 1.7842533797025681, "epoch": 0.17573403990672362, "grad_norm": 4.326035976409912, "learning_rate": 6.034878952835293e-06, "loss": 0.452, "mean_token_accuracy": 0.8420939773321152, "num_tokens": 68173065.0, "step": 56690 }, { "entropy": 1.832868304848671, "epoch": 0.17576503903177332, "grad_norm": 7.654922962188721, "learning_rate": 6.0343467437073435e-06, "loss": 0.4637, "mean_token_accuracy": 0.8458761259913444, "num_tokens": 68185736.0, "step": 56700 }, { "entropy": 1.8796532317996024, "epoch": 0.17579603815682301, "grad_norm": 8.131157875061035, "learning_rate": 6.03381467535932e-06, "loss": 0.5128, "mean_token_accuracy": 0.8416437700390815, "num_tokens": 68197051.0, "step": 56710 }, { "entropy": 1.8534429788589477, "epoch": 0.1758270372818727, "grad_norm": 10.127676963806152, "learning_rate": 6.03328274772917e-06, "loss": 0.4784, "mean_token_accuracy": 0.8437806442379951, "num_tokens": 68209732.0, "step": 56720 }, { "entropy": 1.9118575572967529, "epoch": 0.1758580364069224, "grad_norm": 8.467833518981934, "learning_rate": 6.0327509607548775e-06, "loss": 0.5385, "mean_token_accuracy": 0.8397658735513687, "num_tokens": 68220339.0, "step": 56730 }, { "entropy": 1.844566436111927, "epoch": 0.1758890355319721, "grad_norm": 9.010001182556152, "learning_rate": 6.032219314374463e-06, "loss": 0.4874, "mean_token_accuracy": 0.8425126999616623, "num_tokens": 68232942.0, "step": 56740 }, { "entropy": 1.874419206380844, "epoch": 0.1759200346570218, "grad_norm": 3.708150625228882, "learning_rate": 6.03168780852599e-06, "loss": 0.5122, "mean_token_accuracy": 0.8311620220541954, "num_tokens": 68245087.0, "step": 56750 }, { "entropy": 1.936698080599308, "epoch": 0.1759510337820715, "grad_norm": 8.663212776184082, "learning_rate": 6.0311564431475544e-06, "loss": 0.5413, "mean_token_accuracy": 0.8401820287108421, "num_tokens": 68256104.0, "step": 56760 }, { "entropy": 1.7919904850423336, "epoch": 0.1759820329071212, "grad_norm": 8.531180381774902, "learning_rate": 6.030625218177295e-06, "loss": 0.4302, "mean_token_accuracy": 0.8482810765504837, "num_tokens": 68269355.0, "step": 56770 }, { "entropy": 1.8165376424789428, "epoch": 0.1760130320321709, "grad_norm": 8.21648120880127, "learning_rate": 6.030094133553386e-06, "loss": 0.3991, "mean_token_accuracy": 0.8565915554761887, "num_tokens": 68282068.0, "step": 56780 }, { "entropy": 1.9260249942541123, "epoch": 0.1760440311572206, "grad_norm": 10.875053405761719, "learning_rate": 6.029563189214042e-06, "loss": 0.5705, "mean_token_accuracy": 0.8371611222624779, "num_tokens": 68293043.0, "step": 56790 }, { "entropy": 1.894984395802021, "epoch": 0.17607503028227028, "grad_norm": 4.284290790557861, "learning_rate": 6.0290323850975115e-06, "loss": 0.5323, "mean_token_accuracy": 0.8329278856515885, "num_tokens": 68304689.0, "step": 56800 }, { "entropy": 1.826059702038765, "epoch": 0.17610602940731998, "grad_norm": 10.755820274353027, "learning_rate": 6.028501721142086e-06, "loss": 0.4972, "mean_token_accuracy": 0.8308580055832863, "num_tokens": 68317294.0, "step": 56810 }, { "entropy": 1.8452559620141984, "epoch": 0.17613702853236968, "grad_norm": 7.7797040939331055, "learning_rate": 6.027971197286092e-06, "loss": 0.5015, "mean_token_accuracy": 0.8381189867854119, "num_tokens": 68329628.0, "step": 56820 }, { "entropy": 1.96290685236454, "epoch": 0.17616802765741937, "grad_norm": 9.668147087097168, "learning_rate": 6.027440813467895e-06, "loss": 0.5797, "mean_token_accuracy": 0.822065281867981, "num_tokens": 68341030.0, "step": 56830 }, { "entropy": 1.869129341840744, "epoch": 0.17619902678246907, "grad_norm": 8.411051750183105, "learning_rate": 6.026910569625899e-06, "loss": 0.4821, "mean_token_accuracy": 0.8431349366903305, "num_tokens": 68352755.0, "step": 56840 }, { "entropy": 1.8781275868415832, "epoch": 0.17623002590751877, "grad_norm": 8.966902732849121, "learning_rate": 6.026380465698544e-06, "loss": 0.4906, "mean_token_accuracy": 0.8468410953879356, "num_tokens": 68364276.0, "step": 56850 }, { "entropy": 1.7951830908656121, "epoch": 0.17626102503256846, "grad_norm": 7.949598789215088, "learning_rate": 6.025850501624308e-06, "loss": 0.4386, "mean_token_accuracy": 0.8530111536383629, "num_tokens": 68377483.0, "step": 56860 }, { "entropy": 1.8381464675068855, "epoch": 0.17629202415761816, "grad_norm": 8.990347862243652, "learning_rate": 6.025320677341711e-06, "loss": 0.477, "mean_token_accuracy": 0.8483751729130745, "num_tokens": 68390308.0, "step": 56870 }, { "entropy": 1.8240345925092698, "epoch": 0.17632302328266786, "grad_norm": 7.0853071212768555, "learning_rate": 6.024790992789304e-06, "loss": 0.4229, "mean_token_accuracy": 0.8524978414177895, "num_tokens": 68402956.0, "step": 56880 }, { "entropy": 1.8852536737918855, "epoch": 0.17635402240771755, "grad_norm": 7.855372905731201, "learning_rate": 6.024261447905683e-06, "loss": 0.529, "mean_token_accuracy": 0.830386458337307, "num_tokens": 68415648.0, "step": 56890 }, { "entropy": 1.903229682147503, "epoch": 0.17638502153276725, "grad_norm": 9.828873634338379, "learning_rate": 6.0237320426294755e-06, "loss": 0.5071, "mean_token_accuracy": 0.8335711434483528, "num_tokens": 68427530.0, "step": 56900 }, { "entropy": 1.9013172283768653, "epoch": 0.17641602065781695, "grad_norm": 8.85859203338623, "learning_rate": 6.023202776899353e-06, "loss": 0.5314, "mean_token_accuracy": 0.8388579234480857, "num_tokens": 68439199.0, "step": 56910 }, { "entropy": 1.7970546633005142, "epoch": 0.17644701978286662, "grad_norm": 7.830367088317871, "learning_rate": 6.0226736506540186e-06, "loss": 0.4507, "mean_token_accuracy": 0.8480477169156074, "num_tokens": 68452889.0, "step": 56920 }, { "entropy": 1.9641900137066841, "epoch": 0.1764780189079163, "grad_norm": 9.607945442199707, "learning_rate": 6.022144663832216e-06, "loss": 0.5687, "mean_token_accuracy": 0.8301128730177879, "num_tokens": 68464241.0, "step": 56930 }, { "entropy": 1.8105278745293618, "epoch": 0.176509018032966, "grad_norm": 8.978128433227539, "learning_rate": 6.0216158163727265e-06, "loss": 0.4757, "mean_token_accuracy": 0.8478241801261902, "num_tokens": 68477760.0, "step": 56940 }, { "entropy": 1.8664100661873817, "epoch": 0.1765400171580157, "grad_norm": 9.166288375854492, "learning_rate": 6.021087108214369e-06, "loss": 0.5079, "mean_token_accuracy": 0.8363597363233566, "num_tokens": 68489199.0, "step": 56950 }, { "entropy": 1.86256393045187, "epoch": 0.1765710162830654, "grad_norm": 11.094559669494629, "learning_rate": 6.020558539296e-06, "loss": 0.4993, "mean_token_accuracy": 0.8493609294295311, "num_tokens": 68500857.0, "step": 56960 }, { "entropy": 1.9616552650928498, "epoch": 0.1766020154081151, "grad_norm": 9.776663780212402, "learning_rate": 6.020030109556513e-06, "loss": 0.5729, "mean_token_accuracy": 0.8265084490180016, "num_tokens": 68512216.0, "step": 56970 }, { "entropy": 1.9011280700564384, "epoch": 0.1766330145331648, "grad_norm": 10.027840614318848, "learning_rate": 6.019501818934841e-06, "loss": 0.5192, "mean_token_accuracy": 0.83545922935009, "num_tokens": 68524348.0, "step": 56980 }, { "entropy": 1.898293286561966, "epoch": 0.1766640136582145, "grad_norm": 8.679490089416504, "learning_rate": 6.018973667369951e-06, "loss": 0.4967, "mean_token_accuracy": 0.837830375134945, "num_tokens": 68536357.0, "step": 56990 }, { "entropy": 1.8468020305037498, "epoch": 0.1766950127832642, "grad_norm": 10.170060157775879, "learning_rate": 6.01844565480085e-06, "loss": 0.4664, "mean_token_accuracy": 0.8466181620955467, "num_tokens": 68548639.0, "step": 57000 }, { "entropy": 1.8502362087368964, "epoch": 0.17672601190831388, "grad_norm": 9.918789863586426, "learning_rate": 6.017917781166582e-06, "loss": 0.4941, "mean_token_accuracy": 0.8434646666049957, "num_tokens": 68560221.0, "step": 57010 }, { "entropy": 1.847883252799511, "epoch": 0.17675701103336358, "grad_norm": 3.5449788570404053, "learning_rate": 6.017390046406228e-06, "loss": 0.5113, "mean_token_accuracy": 0.8291626140475273, "num_tokens": 68573365.0, "step": 57020 }, { "entropy": 1.8949782311916352, "epoch": 0.17678801015841328, "grad_norm": 8.271769523620605, "learning_rate": 6.016862450458908e-06, "loss": 0.5132, "mean_token_accuracy": 0.8308848381042481, "num_tokens": 68585450.0, "step": 57030 }, { "entropy": 1.8946194440126418, "epoch": 0.17681900928346297, "grad_norm": 4.019629955291748, "learning_rate": 6.016334993263777e-06, "loss": 0.5717, "mean_token_accuracy": 0.832112868130207, "num_tokens": 68597707.0, "step": 57040 }, { "entropy": 1.8897631257772445, "epoch": 0.17685000840851267, "grad_norm": 7.272465705871582, "learning_rate": 6.015807674760029e-06, "loss": 0.5713, "mean_token_accuracy": 0.8315581545233727, "num_tokens": 68610457.0, "step": 57050 }, { "entropy": 1.925081330537796, "epoch": 0.17688100753356237, "grad_norm": 3.892751932144165, "learning_rate": 6.015280494886894e-06, "loss": 0.5467, "mean_token_accuracy": 0.8374591827392578, "num_tokens": 68621838.0, "step": 57060 }, { "entropy": 1.9003553301095963, "epoch": 0.17691200665861206, "grad_norm": 8.307111740112305, "learning_rate": 6.01475345358364e-06, "loss": 0.549, "mean_token_accuracy": 0.8233644008636475, "num_tokens": 68633799.0, "step": 57070 }, { "entropy": 1.8528232917189598, "epoch": 0.17694300578366176, "grad_norm": 5.3729071617126465, "learning_rate": 6.014226550789571e-06, "loss": 0.4674, "mean_token_accuracy": 0.8408889621496201, "num_tokens": 68645858.0, "step": 57080 }, { "entropy": 1.8784978061914444, "epoch": 0.17697400490871146, "grad_norm": 9.919316291809082, "learning_rate": 6.013699786444032e-06, "loss": 0.5215, "mean_token_accuracy": 0.8381554082036018, "num_tokens": 68657900.0, "step": 57090 }, { "entropy": 1.870173905789852, "epoch": 0.17700500403376115, "grad_norm": 8.955236434936523, "learning_rate": 6.013173160486402e-06, "loss": 0.4849, "mean_token_accuracy": 0.8371668472886086, "num_tokens": 68670085.0, "step": 57100 }, { "entropy": 1.836390271782875, "epoch": 0.17703600315881085, "grad_norm": 8.182779312133789, "learning_rate": 6.012646672856096e-06, "loss": 0.4137, "mean_token_accuracy": 0.8486859187483787, "num_tokens": 68682961.0, "step": 57110 }, { "entropy": 1.8538921266794204, "epoch": 0.17706700228386055, "grad_norm": 9.032723426818848, "learning_rate": 6.012120323492569e-06, "loss": 0.4751, "mean_token_accuracy": 0.8346615865826607, "num_tokens": 68695560.0, "step": 57120 }, { "entropy": 1.7396719381213188, "epoch": 0.17709800140891024, "grad_norm": 9.137755393981934, "learning_rate": 6.0115941123353115e-06, "loss": 0.3782, "mean_token_accuracy": 0.8600768327713013, "num_tokens": 68709891.0, "step": 57130 }, { "entropy": 1.9093570321798325, "epoch": 0.17712900053395994, "grad_norm": 8.14976978302002, "learning_rate": 6.011068039323853e-06, "loss": 0.5074, "mean_token_accuracy": 0.845938429236412, "num_tokens": 68721365.0, "step": 57140 }, { "entropy": 1.9436588928103447, "epoch": 0.17715999965900964, "grad_norm": 7.569816589355469, "learning_rate": 6.010542104397757e-06, "loss": 0.5499, "mean_token_accuracy": 0.8280878692865372, "num_tokens": 68733349.0, "step": 57150 }, { "entropy": 1.8442727386951447, "epoch": 0.17719099878405933, "grad_norm": 8.786954879760742, "learning_rate": 6.0100163074966265e-06, "loss": 0.4243, "mean_token_accuracy": 0.8567957922816276, "num_tokens": 68746143.0, "step": 57160 }, { "entropy": 1.9321425527334213, "epoch": 0.177221997909109, "grad_norm": 5.256120681762695, "learning_rate": 6.009490648560099e-06, "loss": 0.5664, "mean_token_accuracy": 0.8285882025957108, "num_tokens": 68757817.0, "step": 57170 }, { "entropy": 1.8465531781315803, "epoch": 0.1772529970341587, "grad_norm": 9.743329048156738, "learning_rate": 6.008965127527853e-06, "loss": 0.4624, "mean_token_accuracy": 0.842889928817749, "num_tokens": 68771328.0, "step": 57180 }, { "entropy": 1.9369317084550857, "epoch": 0.1772839961592084, "grad_norm": 3.970804452896118, "learning_rate": 6.008439744339599e-06, "loss": 0.5458, "mean_token_accuracy": 0.8342833817005157, "num_tokens": 68782338.0, "step": 57190 }, { "entropy": 1.8963096350431443, "epoch": 0.1773149952842581, "grad_norm": 10.260050773620605, "learning_rate": 6.007914498935089e-06, "loss": 0.5204, "mean_token_accuracy": 0.8387321591377258, "num_tokens": 68793787.0, "step": 57200 }, { "entropy": 1.9125867441296578, "epoch": 0.1773459944093078, "grad_norm": 8.963105201721191, "learning_rate": 6.007389391254107e-06, "loss": 0.5465, "mean_token_accuracy": 0.8316601321101189, "num_tokens": 68805415.0, "step": 57210 }, { "entropy": 1.9211559012532233, "epoch": 0.17737699353435749, "grad_norm": 8.377967834472656, "learning_rate": 6.006864421236479e-06, "loss": 0.5096, "mean_token_accuracy": 0.84478460252285, "num_tokens": 68816320.0, "step": 57220 }, { "entropy": 1.9259533405303955, "epoch": 0.17740799265940718, "grad_norm": 8.574922561645508, "learning_rate": 6.0063395888220646e-06, "loss": 0.6228, "mean_token_accuracy": 0.8221300706267357, "num_tokens": 68828212.0, "step": 57230 }, { "entropy": 1.94825499355793, "epoch": 0.17743899178445688, "grad_norm": 7.872236251831055, "learning_rate": 6.00581489395076e-06, "loss": 0.5457, "mean_token_accuracy": 0.8307785525918007, "num_tokens": 68839479.0, "step": 57240 }, { "entropy": 1.8726135820150376, "epoch": 0.17746999090950658, "grad_norm": 9.208822250366211, "learning_rate": 6.005290336562501e-06, "loss": 0.4791, "mean_token_accuracy": 0.8378603532910347, "num_tokens": 68851565.0, "step": 57250 }, { "entropy": 1.899706956744194, "epoch": 0.17750099003455627, "grad_norm": 8.506078720092773, "learning_rate": 6.004765916597255e-06, "loss": 0.4913, "mean_token_accuracy": 0.842448279261589, "num_tokens": 68863462.0, "step": 57260 }, { "entropy": 1.8596750691533088, "epoch": 0.17753198915960597, "grad_norm": 9.010077476501465, "learning_rate": 6.004241633995031e-06, "loss": 0.4798, "mean_token_accuracy": 0.8384414002299309, "num_tokens": 68875441.0, "step": 57270 }, { "entropy": 1.9451793283224106, "epoch": 0.17756298828465567, "grad_norm": 14.787067413330078, "learning_rate": 6.0037174886958745e-06, "loss": 0.5643, "mean_token_accuracy": 0.8292500406503678, "num_tokens": 68886706.0, "step": 57280 }, { "entropy": 1.9556769192218781, "epoch": 0.17759398740970536, "grad_norm": 9.24488353729248, "learning_rate": 6.003193480639865e-06, "loss": 0.5579, "mean_token_accuracy": 0.8255641996860504, "num_tokens": 68899129.0, "step": 57290 }, { "entropy": 1.9216732487082482, "epoch": 0.17762498653475506, "grad_norm": 9.25411605834961, "learning_rate": 6.0026696097671166e-06, "loss": 0.5175, "mean_token_accuracy": 0.8260634735226631, "num_tokens": 68911411.0, "step": 57300 }, { "entropy": 1.8534406289458274, "epoch": 0.17765598565980475, "grad_norm": 9.491771697998047, "learning_rate": 6.002145876017787e-06, "loss": 0.539, "mean_token_accuracy": 0.8245239853858948, "num_tokens": 68923378.0, "step": 57310 }, { "entropy": 1.8846858441829681, "epoch": 0.17768698478485445, "grad_norm": 8.265665054321289, "learning_rate": 6.001622279332065e-06, "loss": 0.4853, "mean_token_accuracy": 0.8445610210299492, "num_tokens": 68934573.0, "step": 57320 }, { "entropy": 1.88294820189476, "epoch": 0.17771798390990415, "grad_norm": 6.753918170928955, "learning_rate": 6.001098819650178e-06, "loss": 0.4659, "mean_token_accuracy": 0.846784770488739, "num_tokens": 68946288.0, "step": 57330 }, { "entropy": 1.8599652536213398, "epoch": 0.17774898303495384, "grad_norm": 3.2606992721557617, "learning_rate": 6.000575496912389e-06, "loss": 0.5001, "mean_token_accuracy": 0.8316014409065247, "num_tokens": 68958131.0, "step": 57340 }, { "entropy": 1.8438660085201264, "epoch": 0.17777998216000354, "grad_norm": 4.111084461212158, "learning_rate": 6.000052311058995e-06, "loss": 0.5221, "mean_token_accuracy": 0.8313129052519799, "num_tokens": 68970796.0, "step": 57350 }, { "entropy": 1.95847550034523, "epoch": 0.17781098128505324, "grad_norm": 8.4618501663208, "learning_rate": 5.999529262030336e-06, "loss": 0.5624, "mean_token_accuracy": 0.8282546654343605, "num_tokens": 68982324.0, "step": 57360 }, { "entropy": 1.8982812404632567, "epoch": 0.17784198041010293, "grad_norm": 9.305404663085938, "learning_rate": 5.999006349766783e-06, "loss": 0.4753, "mean_token_accuracy": 0.8398912832140922, "num_tokens": 68993770.0, "step": 57370 }, { "entropy": 1.9295596539974214, "epoch": 0.17787297953515263, "grad_norm": 8.27092456817627, "learning_rate": 5.998483574208745e-06, "loss": 0.521, "mean_token_accuracy": 0.8346618101000786, "num_tokens": 69005186.0, "step": 57380 }, { "entropy": 1.8877547219395638, "epoch": 0.17790397866020233, "grad_norm": 9.524073600769043, "learning_rate": 5.997960935296666e-06, "loss": 0.5023, "mean_token_accuracy": 0.8391301646828652, "num_tokens": 69016767.0, "step": 57390 }, { "entropy": 1.8745584458112716, "epoch": 0.17793497778525202, "grad_norm": 9.270411491394043, "learning_rate": 5.99743843297103e-06, "loss": 0.4864, "mean_token_accuracy": 0.8369218707084656, "num_tokens": 69028704.0, "step": 57400 }, { "entropy": 1.910376462340355, "epoch": 0.1779659769103017, "grad_norm": 9.455318450927734, "learning_rate": 5.9969160671723535e-06, "loss": 0.5464, "mean_token_accuracy": 0.8310431107878685, "num_tokens": 69039809.0, "step": 57410 }, { "entropy": 1.8097993165254593, "epoch": 0.1779969760353514, "grad_norm": 6.114592552185059, "learning_rate": 5.996393837841191e-06, "loss": 0.4764, "mean_token_accuracy": 0.8315378293395043, "num_tokens": 69053275.0, "step": 57420 }, { "entropy": 1.8377384558320045, "epoch": 0.1780279751604011, "grad_norm": 10.482206344604492, "learning_rate": 5.995871744918132e-06, "loss": 0.5011, "mean_token_accuracy": 0.8455873817205429, "num_tokens": 69065797.0, "step": 57430 }, { "entropy": 1.8282571971416473, "epoch": 0.17805897428545078, "grad_norm": 8.590948104858398, "learning_rate": 5.995349788343804e-06, "loss": 0.4795, "mean_token_accuracy": 0.8498968616127968, "num_tokens": 69078035.0, "step": 57440 }, { "entropy": 1.8669711872935295, "epoch": 0.17808997341050048, "grad_norm": 8.990492820739746, "learning_rate": 5.994827968058869e-06, "loss": 0.532, "mean_token_accuracy": 0.8290546640753746, "num_tokens": 69090734.0, "step": 57450 }, { "entropy": 1.8619839981198312, "epoch": 0.17812097253555018, "grad_norm": 3.9954440593719482, "learning_rate": 5.9943062840040275e-06, "loss": 0.4637, "mean_token_accuracy": 0.8474828630685807, "num_tokens": 69103255.0, "step": 57460 }, { "entropy": 1.7928624168038367, "epoch": 0.17815197166059987, "grad_norm": 9.835958480834961, "learning_rate": 5.993784736120013e-06, "loss": 0.4709, "mean_token_accuracy": 0.8395032018423081, "num_tokens": 69115875.0, "step": 57470 }, { "entropy": 1.8707625821232796, "epoch": 0.17818297078564957, "grad_norm": 8.842391014099121, "learning_rate": 5.9932633243475954e-06, "loss": 0.4943, "mean_token_accuracy": 0.842163647711277, "num_tokens": 69127843.0, "step": 57480 }, { "entropy": 1.7798241943120956, "epoch": 0.17821396991069927, "grad_norm": 9.086782455444336, "learning_rate": 5.992742048627585e-06, "loss": 0.4477, "mean_token_accuracy": 0.8449828371405601, "num_tokens": 69140178.0, "step": 57490 }, { "entropy": 1.885751624405384, "epoch": 0.17824496903574896, "grad_norm": 7.488559722900391, "learning_rate": 5.992220908900822e-06, "loss": 0.5086, "mean_token_accuracy": 0.8432123690843583, "num_tokens": 69151411.0, "step": 57500 }, { "entropy": 1.9364232003688813, "epoch": 0.17827596816079866, "grad_norm": 7.98928165435791, "learning_rate": 5.991699905108188e-06, "loss": 0.5405, "mean_token_accuracy": 0.8352708846330643, "num_tokens": 69162326.0, "step": 57510 }, { "entropy": 1.7974600687623024, "epoch": 0.17830696728584836, "grad_norm": 8.123475074768066, "learning_rate": 5.991179037190596e-06, "loss": 0.4522, "mean_token_accuracy": 0.8448081642389298, "num_tokens": 69174698.0, "step": 57520 }, { "entropy": 1.8795274257659913, "epoch": 0.17833796641089805, "grad_norm": 5.942895412445068, "learning_rate": 5.9906583050889985e-06, "loss": 0.5132, "mean_token_accuracy": 0.847043776512146, "num_tokens": 69186725.0, "step": 57530 }, { "entropy": 1.895848709344864, "epoch": 0.17836896553594775, "grad_norm": 10.697137832641602, "learning_rate": 5.990137708744383e-06, "loss": 0.5632, "mean_token_accuracy": 0.8229943498969078, "num_tokens": 69198477.0, "step": 57540 }, { "entropy": 1.8663012593984605, "epoch": 0.17839996466099745, "grad_norm": 10.071105003356934, "learning_rate": 5.989617248097771e-06, "loss": 0.4907, "mean_token_accuracy": 0.8398059472441674, "num_tokens": 69210496.0, "step": 57550 }, { "entropy": 1.8629123076796532, "epoch": 0.17843096378604714, "grad_norm": 8.073697090148926, "learning_rate": 5.989096923090223e-06, "loss": 0.5158, "mean_token_accuracy": 0.8346897527575493, "num_tokens": 69222754.0, "step": 57560 }, { "entropy": 1.947044813632965, "epoch": 0.17846196291109684, "grad_norm": 10.283122062683105, "learning_rate": 5.988576733662831e-06, "loss": 0.5859, "mean_token_accuracy": 0.8340685039758682, "num_tokens": 69233747.0, "step": 57570 }, { "entropy": 1.8908654794096946, "epoch": 0.17849296203614654, "grad_norm": 8.264286041259766, "learning_rate": 5.988056679756728e-06, "loss": 0.573, "mean_token_accuracy": 0.8300464496016502, "num_tokens": 69244865.0, "step": 57580 }, { "entropy": 1.9280391097068788, "epoch": 0.17852396116119623, "grad_norm": 8.630804061889648, "learning_rate": 5.9875367613130775e-06, "loss": 0.5689, "mean_token_accuracy": 0.8240269988775253, "num_tokens": 69255702.0, "step": 57590 }, { "entropy": 1.935716523230076, "epoch": 0.17855496028624593, "grad_norm": 11.321623802185059, "learning_rate": 5.987016978273085e-06, "loss": 0.5363, "mean_token_accuracy": 0.8363067075610161, "num_tokens": 69266825.0, "step": 57600 }, { "entropy": 1.9014330476522445, "epoch": 0.17858595941129563, "grad_norm": 4.646116733551025, "learning_rate": 5.986497330577986e-06, "loss": 0.522, "mean_token_accuracy": 0.8390189468860626, "num_tokens": 69279081.0, "step": 57610 }, { "entropy": 1.9175360828638077, "epoch": 0.17861695853634532, "grad_norm": 9.152368545532227, "learning_rate": 5.985977818169053e-06, "loss": 0.5684, "mean_token_accuracy": 0.8228761807084084, "num_tokens": 69290443.0, "step": 57620 }, { "entropy": 1.8552556172013284, "epoch": 0.17864795766139502, "grad_norm": 7.568161487579346, "learning_rate": 5.985458440987597e-06, "loss": 0.4541, "mean_token_accuracy": 0.8391215577721596, "num_tokens": 69303861.0, "step": 57630 }, { "entropy": 1.7614154547452927, "epoch": 0.17867895678644471, "grad_norm": 3.9971959590911865, "learning_rate": 5.984939198974961e-06, "loss": 0.4513, "mean_token_accuracy": 0.8515268057584763, "num_tokens": 69317877.0, "step": 57640 }, { "entropy": 1.9185373350977897, "epoch": 0.1787099559114944, "grad_norm": 11.149886131286621, "learning_rate": 5.984420092072528e-06, "loss": 0.5383, "mean_token_accuracy": 0.8318190723657608, "num_tokens": 69329893.0, "step": 57650 }, { "entropy": 1.9363491863012314, "epoch": 0.17874095503654408, "grad_norm": 9.132575035095215, "learning_rate": 5.983901120221711e-06, "loss": 0.5636, "mean_token_accuracy": 0.8298118308186531, "num_tokens": 69341278.0, "step": 57660 }, { "entropy": 1.833200243115425, "epoch": 0.17877195416159378, "grad_norm": 4.589186668395996, "learning_rate": 5.983382283363963e-06, "loss": 0.4279, "mean_token_accuracy": 0.8535523906350135, "num_tokens": 69354018.0, "step": 57670 }, { "entropy": 1.952676109969616, "epoch": 0.17880295328664347, "grad_norm": 8.8577241897583, "learning_rate": 5.9828635814407695e-06, "loss": 0.5592, "mean_token_accuracy": 0.8286620289087295, "num_tokens": 69365463.0, "step": 57680 }, { "entropy": 1.901480646431446, "epoch": 0.17883395241169317, "grad_norm": 8.063117027282715, "learning_rate": 5.9823450143936555e-06, "loss": 0.4938, "mean_token_accuracy": 0.8349184259772301, "num_tokens": 69377683.0, "step": 57690 }, { "entropy": 1.8743933662772179, "epoch": 0.17886495153674287, "grad_norm": 8.365898132324219, "learning_rate": 5.981826582164176e-06, "loss": 0.5108, "mean_token_accuracy": 0.8368162304162979, "num_tokens": 69390015.0, "step": 57700 }, { "entropy": 1.776149820536375, "epoch": 0.17889595066179256, "grad_norm": 9.224735260009766, "learning_rate": 5.9813082846939264e-06, "loss": 0.4258, "mean_token_accuracy": 0.8448210716247558, "num_tokens": 69402894.0, "step": 57710 }, { "entropy": 1.7384593583643437, "epoch": 0.17892694978684226, "grad_norm": 8.464068412780762, "learning_rate": 5.980790121924534e-06, "loss": 0.3974, "mean_token_accuracy": 0.8530717685818672, "num_tokens": 69416819.0, "step": 57720 }, { "entropy": 1.8666021645069122, "epoch": 0.17895794891189196, "grad_norm": 10.071202278137207, "learning_rate": 5.980272093797667e-06, "loss": 0.5063, "mean_token_accuracy": 0.8388993337750434, "num_tokens": 69428622.0, "step": 57730 }, { "entropy": 1.9105965510010718, "epoch": 0.17898894803694165, "grad_norm": 7.725157737731934, "learning_rate": 5.979754200255019e-06, "loss": 0.5431, "mean_token_accuracy": 0.8272575065493584, "num_tokens": 69439573.0, "step": 57740 }, { "entropy": 1.880422416329384, "epoch": 0.17901994716199135, "grad_norm": 4.6876630783081055, "learning_rate": 5.979236441238329e-06, "loss": 0.5393, "mean_token_accuracy": 0.8352604731917381, "num_tokens": 69451040.0, "step": 57750 }, { "entropy": 1.877747993171215, "epoch": 0.17905094628704105, "grad_norm": 3.8717169761657715, "learning_rate": 5.978718816689365e-06, "loss": 0.5186, "mean_token_accuracy": 0.8352397963404655, "num_tokens": 69462601.0, "step": 57760 }, { "entropy": 1.9101334929466247, "epoch": 0.17908194541209074, "grad_norm": 9.124340057373047, "learning_rate": 5.978201326549935e-06, "loss": 0.5731, "mean_token_accuracy": 0.8258951917290688, "num_tokens": 69474464.0, "step": 57770 }, { "entropy": 1.7846893429756165, "epoch": 0.17911294453714044, "grad_norm": 3.8451902866363525, "learning_rate": 5.977683970761876e-06, "loss": 0.465, "mean_token_accuracy": 0.8453226387500763, "num_tokens": 69487808.0, "step": 57780 }, { "entropy": 1.8912085384130477, "epoch": 0.17914394366219014, "grad_norm": 9.206360816955566, "learning_rate": 5.9771667492670675e-06, "loss": 0.5444, "mean_token_accuracy": 0.8258654251694679, "num_tokens": 69499784.0, "step": 57790 }, { "entropy": 1.8862205877900124, "epoch": 0.17917494278723983, "grad_norm": 4.152833461761475, "learning_rate": 5.97664966200742e-06, "loss": 0.5273, "mean_token_accuracy": 0.8353832781314849, "num_tokens": 69511090.0, "step": 57800 }, { "entropy": 1.8243310183286667, "epoch": 0.17920594191228953, "grad_norm": 11.02452278137207, "learning_rate": 5.9761327089248786e-06, "loss": 0.4937, "mean_token_accuracy": 0.840731629729271, "num_tokens": 69523788.0, "step": 57810 }, { "entropy": 1.9363866940140724, "epoch": 0.17923694103733923, "grad_norm": 4.2471923828125, "learning_rate": 5.975615889961425e-06, "loss": 0.5913, "mean_token_accuracy": 0.8229147881269455, "num_tokens": 69534659.0, "step": 57820 }, { "entropy": 1.8675603151321412, "epoch": 0.17926794016238892, "grad_norm": 8.263809204101562, "learning_rate": 5.9750992050590765e-06, "loss": 0.5291, "mean_token_accuracy": 0.8418093547224998, "num_tokens": 69546361.0, "step": 57830 }, { "entropy": 1.8654843851923943, "epoch": 0.17929893928743862, "grad_norm": 9.333740234375, "learning_rate": 5.974582654159884e-06, "loss": 0.4803, "mean_token_accuracy": 0.8490580499172211, "num_tokens": 69558369.0, "step": 57840 }, { "entropy": 1.7780871927738189, "epoch": 0.17932993841248832, "grad_norm": 3.7377822399139404, "learning_rate": 5.974066237205935e-06, "loss": 0.4321, "mean_token_accuracy": 0.8493701010942459, "num_tokens": 69571586.0, "step": 57850 }, { "entropy": 1.8709719255566597, "epoch": 0.179360937537538, "grad_norm": 9.209651947021484, "learning_rate": 5.9735499541393515e-06, "loss": 0.5304, "mean_token_accuracy": 0.8328555643558502, "num_tokens": 69584134.0, "step": 57860 }, { "entropy": 1.9305126339197158, "epoch": 0.1793919366625877, "grad_norm": 8.458016395568848, "learning_rate": 5.9730338049022905e-06, "loss": 0.5505, "mean_token_accuracy": 0.8329010114073754, "num_tokens": 69595889.0, "step": 57870 }, { "entropy": 1.9079819098114967, "epoch": 0.1794229357876374, "grad_norm": 8.438733100891113, "learning_rate": 5.972517789436941e-06, "loss": 0.5549, "mean_token_accuracy": 0.8301453992724419, "num_tokens": 69607872.0, "step": 57880 }, { "entropy": 1.8880163133144379, "epoch": 0.1794539349126871, "grad_norm": 7.341186046600342, "learning_rate": 5.972001907685534e-06, "loss": 0.4612, "mean_token_accuracy": 0.8447798684239387, "num_tokens": 69619608.0, "step": 57890 }, { "entropy": 1.9878419309854507, "epoch": 0.1794849340377368, "grad_norm": 8.431941032409668, "learning_rate": 5.9714861595903275e-06, "loss": 0.5709, "mean_token_accuracy": 0.8319448336958886, "num_tokens": 69630342.0, "step": 57900 }, { "entropy": 1.9403752774000167, "epoch": 0.17951593316278647, "grad_norm": 8.231103897094727, "learning_rate": 5.9709705450936195e-06, "loss": 0.5535, "mean_token_accuracy": 0.8287773564457893, "num_tokens": 69641589.0, "step": 57910 }, { "entropy": 1.8349755868315696, "epoch": 0.17954693228783616, "grad_norm": 8.210265159606934, "learning_rate": 5.9704550641377414e-06, "loss": 0.5199, "mean_token_accuracy": 0.8326278194785118, "num_tokens": 69654695.0, "step": 57920 }, { "entropy": 1.8748487085103989, "epoch": 0.17957793141288586, "grad_norm": 4.568990230560303, "learning_rate": 5.96993971666506e-06, "loss": 0.516, "mean_token_accuracy": 0.8239820554852486, "num_tokens": 69667546.0, "step": 57930 }, { "entropy": 1.957274827361107, "epoch": 0.17960893053793556, "grad_norm": 8.795912742614746, "learning_rate": 5.969424502617975e-06, "loss": 0.5353, "mean_token_accuracy": 0.8339650616049766, "num_tokens": 69678423.0, "step": 57940 }, { "entropy": 1.9020697817206382, "epoch": 0.17963992966298525, "grad_norm": 8.028707504272461, "learning_rate": 5.968909421938924e-06, "loss": 0.499, "mean_token_accuracy": 0.83273094445467, "num_tokens": 69690504.0, "step": 57950 }, { "entropy": 1.9501001119613648, "epoch": 0.17967092878803495, "grad_norm": 8.39437198638916, "learning_rate": 5.968394474570377e-06, "loss": 0.5423, "mean_token_accuracy": 0.8293194517493248, "num_tokens": 69701909.0, "step": 57960 }, { "entropy": 1.8961822897195817, "epoch": 0.17970192791308465, "grad_norm": 3.989229202270508, "learning_rate": 5.9678796604548385e-06, "loss": 0.5007, "mean_token_accuracy": 0.8313602998852729, "num_tokens": 69713928.0, "step": 57970 }, { "entropy": 1.9584363132715226, "epoch": 0.17973292703813434, "grad_norm": 10.755518913269043, "learning_rate": 5.967364979534849e-06, "loss": 0.5435, "mean_token_accuracy": 0.8270615398883819, "num_tokens": 69725093.0, "step": 57980 }, { "entropy": 1.883443070948124, "epoch": 0.17976392616318404, "grad_norm": 8.810373306274414, "learning_rate": 5.966850431752984e-06, "loss": 0.5011, "mean_token_accuracy": 0.8435442551970482, "num_tokens": 69736710.0, "step": 57990 }, { "entropy": 1.838029107451439, "epoch": 0.17979492528823374, "grad_norm": 7.354464530944824, "learning_rate": 5.9663360170518524e-06, "loss": 0.4682, "mean_token_accuracy": 0.8321570068597793, "num_tokens": 69749525.0, "step": 58000 }, { "entropy": 1.860961727797985, "epoch": 0.17982592441328343, "grad_norm": 8.58539867401123, "learning_rate": 5.965821735374097e-06, "loss": 0.5197, "mean_token_accuracy": 0.842749148607254, "num_tokens": 69761090.0, "step": 58010 }, { "entropy": 1.888116455078125, "epoch": 0.17985692353833313, "grad_norm": 8.116665840148926, "learning_rate": 5.965307586662398e-06, "loss": 0.4385, "mean_token_accuracy": 0.8538775756955147, "num_tokens": 69773218.0, "step": 58020 }, { "entropy": 1.88920701444149, "epoch": 0.17988792266338283, "grad_norm": 8.906111717224121, "learning_rate": 5.964793570859469e-06, "loss": 0.5258, "mean_token_accuracy": 0.8328505247831345, "num_tokens": 69785392.0, "step": 58030 }, { "entropy": 1.9361517250537872, "epoch": 0.17991892178843252, "grad_norm": 10.055381774902344, "learning_rate": 5.964279687908057e-06, "loss": 0.5385, "mean_token_accuracy": 0.8345837727189064, "num_tokens": 69796770.0, "step": 58040 }, { "entropy": 1.856403675675392, "epoch": 0.17994992091348222, "grad_norm": 2.6345648765563965, "learning_rate": 5.963765937750943e-06, "loss": 0.4652, "mean_token_accuracy": 0.8511771202087403, "num_tokens": 69808863.0, "step": 58050 }, { "entropy": 1.8975456178188324, "epoch": 0.17998092003853192, "grad_norm": 9.3054780960083, "learning_rate": 5.963252320330947e-06, "loss": 0.5478, "mean_token_accuracy": 0.827577319741249, "num_tokens": 69820718.0, "step": 58060 }, { "entropy": 1.8229490399360657, "epoch": 0.1800119191635816, "grad_norm": 8.946834564208984, "learning_rate": 5.962738835590917e-06, "loss": 0.4613, "mean_token_accuracy": 0.8495317012071609, "num_tokens": 69834215.0, "step": 58070 }, { "entropy": 1.9095330134034156, "epoch": 0.1800429182886313, "grad_norm": 8.070923805236816, "learning_rate": 5.962225483473742e-06, "loss": 0.5089, "mean_token_accuracy": 0.8391377106308937, "num_tokens": 69846084.0, "step": 58080 }, { "entropy": 1.7783651992678642, "epoch": 0.180073917413681, "grad_norm": 7.508241176605225, "learning_rate": 5.961712263922337e-06, "loss": 0.4244, "mean_token_accuracy": 0.8533773466944694, "num_tokens": 69859333.0, "step": 58090 }, { "entropy": 1.926021693646908, "epoch": 0.1801049165387307, "grad_norm": 9.783551216125488, "learning_rate": 5.961199176879661e-06, "loss": 0.5203, "mean_token_accuracy": 0.8343341812491417, "num_tokens": 69870605.0, "step": 58100 }, { "entropy": 1.9040902271866798, "epoch": 0.1801359156637804, "grad_norm": 8.443004608154297, "learning_rate": 5.960686222288703e-06, "loss": 0.5384, "mean_token_accuracy": 0.8259869500994682, "num_tokens": 69883254.0, "step": 58110 }, { "entropy": 1.887841096520424, "epoch": 0.1801669147888301, "grad_norm": 9.13963794708252, "learning_rate": 5.960173400092483e-06, "loss": 0.4781, "mean_token_accuracy": 0.8409109726548195, "num_tokens": 69895422.0, "step": 58120 }, { "entropy": 1.8949230402708053, "epoch": 0.1801979139138798, "grad_norm": 8.608329772949219, "learning_rate": 5.9596607102340605e-06, "loss": 0.5482, "mean_token_accuracy": 0.8356387764215469, "num_tokens": 69907555.0, "step": 58130 }, { "entropy": 1.9181236669421196, "epoch": 0.1802289130389295, "grad_norm": 7.1196513175964355, "learning_rate": 5.959148152656526e-06, "loss": 0.4865, "mean_token_accuracy": 0.8519378170371056, "num_tokens": 69919079.0, "step": 58140 }, { "entropy": 1.9497553408145905, "epoch": 0.18025991216397916, "grad_norm": 9.516104698181152, "learning_rate": 5.958635727303008e-06, "loss": 0.5585, "mean_token_accuracy": 0.8375312581658363, "num_tokens": 69929946.0, "step": 58150 }, { "entropy": 1.8249136477708816, "epoch": 0.18029091128902885, "grad_norm": 4.427674770355225, "learning_rate": 5.958123434116665e-06, "loss": 0.4367, "mean_token_accuracy": 0.8447049587965012, "num_tokens": 69942319.0, "step": 58160 }, { "entropy": 1.8562700033187867, "epoch": 0.18032191041407855, "grad_norm": 9.44267749786377, "learning_rate": 5.957611273040691e-06, "loss": 0.5141, "mean_token_accuracy": 0.8396990388631821, "num_tokens": 69954897.0, "step": 58170 }, { "entropy": 1.9479858607053757, "epoch": 0.18035290953912825, "grad_norm": 4.776163101196289, "learning_rate": 5.9570992440183166e-06, "loss": 0.5244, "mean_token_accuracy": 0.8349523320794106, "num_tokens": 69966494.0, "step": 58180 }, { "entropy": 1.9115686953067779, "epoch": 0.18038390866417794, "grad_norm": 8.399941444396973, "learning_rate": 5.956587346992802e-06, "loss": 0.4839, "mean_token_accuracy": 0.8438914701342582, "num_tokens": 69978598.0, "step": 58190 }, { "entropy": 1.8765361204743385, "epoch": 0.18041490778922764, "grad_norm": 9.405889511108398, "learning_rate": 5.956075581907446e-06, "loss": 0.5624, "mean_token_accuracy": 0.8236325919628144, "num_tokens": 69990680.0, "step": 58200 }, { "entropy": 1.9489685088396071, "epoch": 0.18044590691427734, "grad_norm": 8.940106391906738, "learning_rate": 5.955563948705578e-06, "loss": 0.5632, "mean_token_accuracy": 0.8357775121927261, "num_tokens": 70001208.0, "step": 58210 }, { "entropy": 1.8073627695441246, "epoch": 0.18047690603932703, "grad_norm": 7.959947109222412, "learning_rate": 5.955052447330566e-06, "loss": 0.4221, "mean_token_accuracy": 0.856120876967907, "num_tokens": 70015168.0, "step": 58220 }, { "entropy": 1.912624678015709, "epoch": 0.18050790516437673, "grad_norm": 10.096214294433594, "learning_rate": 5.954541077725806e-06, "loss": 0.5486, "mean_token_accuracy": 0.821705561876297, "num_tokens": 70027352.0, "step": 58230 }, { "entropy": 1.7947795525193215, "epoch": 0.18053890428942643, "grad_norm": 3.570014715194702, "learning_rate": 5.954029839834733e-06, "loss": 0.395, "mean_token_accuracy": 0.8589225232601165, "num_tokens": 70040539.0, "step": 58240 }, { "entropy": 1.9037404909729958, "epoch": 0.18056990341447612, "grad_norm": 7.7414655685424805, "learning_rate": 5.953518733600813e-06, "loss": 0.5514, "mean_token_accuracy": 0.828503304719925, "num_tokens": 70052279.0, "step": 58250 }, { "entropy": 1.898376226425171, "epoch": 0.18060090253952582, "grad_norm": 4.189719200134277, "learning_rate": 5.953007758967547e-06, "loss": 0.4731, "mean_token_accuracy": 0.8411821514368057, "num_tokens": 70064479.0, "step": 58260 }, { "entropy": 1.8012950256466866, "epoch": 0.18063190166457552, "grad_norm": 2.9318971633911133, "learning_rate": 5.952496915878471e-06, "loss": 0.4324, "mean_token_accuracy": 0.8473293244838714, "num_tokens": 70077477.0, "step": 58270 }, { "entropy": 1.883352592587471, "epoch": 0.1806629007896252, "grad_norm": 8.893994331359863, "learning_rate": 5.951986204277154e-06, "loss": 0.4891, "mean_token_accuracy": 0.8392975389957428, "num_tokens": 70089437.0, "step": 58280 }, { "entropy": 1.9155563935637474, "epoch": 0.1806938999146749, "grad_norm": 9.788602828979492, "learning_rate": 5.951475624107198e-06, "loss": 0.527, "mean_token_accuracy": 0.8240778625011445, "num_tokens": 70101237.0, "step": 58290 }, { "entropy": 1.8512123331427575, "epoch": 0.1807248990397246, "grad_norm": 8.938436508178711, "learning_rate": 5.950965175312241e-06, "loss": 0.4717, "mean_token_accuracy": 0.836862875521183, "num_tokens": 70114119.0, "step": 58300 }, { "entropy": 1.8680644989013673, "epoch": 0.1807558981647743, "grad_norm": 11.42545223236084, "learning_rate": 5.950454857835951e-06, "loss": 0.4764, "mean_token_accuracy": 0.8473235473036766, "num_tokens": 70125999.0, "step": 58310 }, { "entropy": 1.8300606310367584, "epoch": 0.180786897289824, "grad_norm": 6.41502046585083, "learning_rate": 5.949944671622034e-06, "loss": 0.5353, "mean_token_accuracy": 0.8300027817487716, "num_tokens": 70139069.0, "step": 58320 }, { "entropy": 1.9075913026928901, "epoch": 0.1808178964148737, "grad_norm": 3.9195168018341064, "learning_rate": 5.949434616614229e-06, "loss": 0.5197, "mean_token_accuracy": 0.8323316410183906, "num_tokens": 70151975.0, "step": 58330 }, { "entropy": 1.979878196120262, "epoch": 0.1808488955399234, "grad_norm": 8.486638069152832, "learning_rate": 5.948924692756306e-06, "loss": 0.5894, "mean_token_accuracy": 0.8257222607731819, "num_tokens": 70162362.0, "step": 58340 }, { "entropy": 1.891125389933586, "epoch": 0.1808798946649731, "grad_norm": 4.342275142669678, "learning_rate": 5.948414899992072e-06, "loss": 0.542, "mean_token_accuracy": 0.8310712307691575, "num_tokens": 70174516.0, "step": 58350 }, { "entropy": 1.8640663996338844, "epoch": 0.1809108937900228, "grad_norm": 2.7677135467529297, "learning_rate": 5.947905238265366e-06, "loss": 0.4979, "mean_token_accuracy": 0.8343622460961342, "num_tokens": 70186361.0, "step": 58360 }, { "entropy": 1.863309583067894, "epoch": 0.18094189291507248, "grad_norm": 9.252532958984375, "learning_rate": 5.947395707520059e-06, "loss": 0.5103, "mean_token_accuracy": 0.8355363115668297, "num_tokens": 70198676.0, "step": 58370 }, { "entropy": 1.9271214351058006, "epoch": 0.18097289204012218, "grad_norm": 8.357312202453613, "learning_rate": 5.946886307700062e-06, "loss": 0.535, "mean_token_accuracy": 0.8216869860887528, "num_tokens": 70210561.0, "step": 58380 }, { "entropy": 1.8037690997123719, "epoch": 0.18100389116517188, "grad_norm": 8.436007499694824, "learning_rate": 5.94637703874931e-06, "loss": 0.4989, "mean_token_accuracy": 0.8438222482800484, "num_tokens": 70224444.0, "step": 58390 }, { "entropy": 1.9045485824346542, "epoch": 0.18103489029022155, "grad_norm": 10.353751182556152, "learning_rate": 5.9458679006117815e-06, "loss": 0.5498, "mean_token_accuracy": 0.8378313854336739, "num_tokens": 70236240.0, "step": 58400 }, { "entropy": 1.9282925948500633, "epoch": 0.18106588941527124, "grad_norm": 9.726922035217285, "learning_rate": 5.94535889323148e-06, "loss": 0.4797, "mean_token_accuracy": 0.8426464855670929, "num_tokens": 70247776.0, "step": 58410 }, { "entropy": 1.9179017692804337, "epoch": 0.18109688854032094, "grad_norm": 8.368406295776367, "learning_rate": 5.944850016552449e-06, "loss": 0.5916, "mean_token_accuracy": 0.820852018892765, "num_tokens": 70259710.0, "step": 58420 }, { "entropy": 1.9104688361287117, "epoch": 0.18112788766537063, "grad_norm": 9.314393997192383, "learning_rate": 5.944341270518763e-06, "loss": 0.5022, "mean_token_accuracy": 0.8437484547495842, "num_tokens": 70271807.0, "step": 58430 }, { "entropy": 1.9487433806061745, "epoch": 0.18115888679042033, "grad_norm": 9.065957069396973, "learning_rate": 5.943832655074528e-06, "loss": 0.5531, "mean_token_accuracy": 0.822340051829815, "num_tokens": 70283681.0, "step": 58440 }, { "entropy": 1.8573688492178917, "epoch": 0.18118988591547003, "grad_norm": 8.39069652557373, "learning_rate": 5.943324170163888e-06, "loss": 0.5027, "mean_token_accuracy": 0.8431822940707207, "num_tokens": 70296587.0, "step": 58450 }, { "entropy": 1.8324483245611192, "epoch": 0.18122088504051972, "grad_norm": 4.025021076202393, "learning_rate": 5.942815815731015e-06, "loss": 0.4318, "mean_token_accuracy": 0.8505256190896034, "num_tokens": 70308977.0, "step": 58460 }, { "entropy": 1.9166489720344544, "epoch": 0.18125188416556942, "grad_norm": 7.720203876495361, "learning_rate": 5.942307591720121e-06, "loss": 0.4921, "mean_token_accuracy": 0.8491469100117683, "num_tokens": 70320331.0, "step": 58470 }, { "entropy": 1.8525561004877091, "epoch": 0.18128288329061912, "grad_norm": 8.706543922424316, "learning_rate": 5.941799498075445e-06, "loss": 0.4696, "mean_token_accuracy": 0.8432158127427101, "num_tokens": 70331972.0, "step": 58480 }, { "entropy": 1.734443087875843, "epoch": 0.18131388241566881, "grad_norm": 4.216269493103027, "learning_rate": 5.9412915347412624e-06, "loss": 0.4215, "mean_token_accuracy": 0.8574797987937928, "num_tokens": 70345958.0, "step": 58490 }, { "entropy": 1.885965469479561, "epoch": 0.1813448815407185, "grad_norm": 8.010455131530762, "learning_rate": 5.940783701661882e-06, "loss": 0.4823, "mean_token_accuracy": 0.8380753323435783, "num_tokens": 70357917.0, "step": 58500 }, { "entropy": 1.9255839720368386, "epoch": 0.1813758806657682, "grad_norm": 8.077774047851562, "learning_rate": 5.940275998781646e-06, "loss": 0.5291, "mean_token_accuracy": 0.8398918956518173, "num_tokens": 70369190.0, "step": 58510 }, { "entropy": 1.8902644366025925, "epoch": 0.1814068797908179, "grad_norm": 9.07833480834961, "learning_rate": 5.939768426044928e-06, "loss": 0.5209, "mean_token_accuracy": 0.8410121381282807, "num_tokens": 70381045.0, "step": 58520 }, { "entropy": 1.8733600035309792, "epoch": 0.1814378789158676, "grad_norm": 7.863108158111572, "learning_rate": 5.939260983396139e-06, "loss": 0.5245, "mean_token_accuracy": 0.8383708387613297, "num_tokens": 70394169.0, "step": 58530 }, { "entropy": 1.9640895150601865, "epoch": 0.1814688780409173, "grad_norm": 9.28256607055664, "learning_rate": 5.938753670779716e-06, "loss": 0.5632, "mean_token_accuracy": 0.8161120370030404, "num_tokens": 70405772.0, "step": 58540 }, { "entropy": 1.8138402692973614, "epoch": 0.181499877165967, "grad_norm": 9.445598602294922, "learning_rate": 5.938246488140139e-06, "loss": 0.4378, "mean_token_accuracy": 0.8572772964835167, "num_tokens": 70418813.0, "step": 58550 }, { "entropy": 1.9521236777305604, "epoch": 0.1815308762910167, "grad_norm": 6.890706539154053, "learning_rate": 5.937739435421913e-06, "loss": 0.5465, "mean_token_accuracy": 0.8317166283726692, "num_tokens": 70430085.0, "step": 58560 }, { "entropy": 1.8753592044115066, "epoch": 0.1815618754160664, "grad_norm": Infinity, "learning_rate": 5.9372325125695796e-06, "loss": 0.513, "mean_token_accuracy": 0.8267385110259056, "num_tokens": 70442432.0, "step": 58570 }, { "entropy": 1.951469287276268, "epoch": 0.18159287454111608, "grad_norm": 10.176626205444336, "learning_rate": 5.936725719527712e-06, "loss": 0.5294, "mean_token_accuracy": 0.8351786851882934, "num_tokens": 70453518.0, "step": 58580 }, { "entropy": 2.006668972969055, "epoch": 0.18162387366616578, "grad_norm": 10.691234588623047, "learning_rate": 5.9362190562409196e-06, "loss": 0.6187, "mean_token_accuracy": 0.8179316386580467, "num_tokens": 70464169.0, "step": 58590 }, { "entropy": 1.9340445652604104, "epoch": 0.18165487279121548, "grad_norm": 8.558653831481934, "learning_rate": 5.93571252265384e-06, "loss": 0.5382, "mean_token_accuracy": 0.8301935106515884, "num_tokens": 70476295.0, "step": 58600 }, { "entropy": 2.03122977912426, "epoch": 0.18168587191626517, "grad_norm": 8.362186431884766, "learning_rate": 5.935206118711151e-06, "loss": 0.6264, "mean_token_accuracy": 0.8174452885985375, "num_tokens": 70487260.0, "step": 58610 }, { "entropy": 1.9458183541893959, "epoch": 0.18171687104131487, "grad_norm": 4.3328633308410645, "learning_rate": 5.934699844357555e-06, "loss": 0.5, "mean_token_accuracy": 0.834861546754837, "num_tokens": 70498573.0, "step": 58620 }, { "entropy": 1.9818433836102485, "epoch": 0.18174787016636457, "grad_norm": 9.87380313873291, "learning_rate": 5.934193699537794e-06, "loss": 0.5678, "mean_token_accuracy": 0.8219478592276573, "num_tokens": 70510126.0, "step": 58630 }, { "entropy": 1.9035624399781228, "epoch": 0.18177886929141426, "grad_norm": 7.710408687591553, "learning_rate": 5.933687684196638e-06, "loss": 0.5346, "mean_token_accuracy": 0.8381109595298767, "num_tokens": 70522421.0, "step": 58640 }, { "entropy": 1.9757555976510048, "epoch": 0.18180986841646393, "grad_norm": 8.179011344909668, "learning_rate": 5.933181798278895e-06, "loss": 0.5466, "mean_token_accuracy": 0.8351363927125931, "num_tokens": 70534084.0, "step": 58650 }, { "entropy": 1.9270098477602005, "epoch": 0.18184086754151363, "grad_norm": 4.406381130218506, "learning_rate": 5.9326760417294036e-06, "loss": 0.5338, "mean_token_accuracy": 0.8375926703214646, "num_tokens": 70545476.0, "step": 58660 }, { "entropy": 1.9433436125516892, "epoch": 0.18187186666656333, "grad_norm": 8.257979393005371, "learning_rate": 5.9321704144930335e-06, "loss": 0.6162, "mean_token_accuracy": 0.8256099238991738, "num_tokens": 70557302.0, "step": 58670 }, { "entropy": 1.8725889652967453, "epoch": 0.18190286579161302, "grad_norm": 8.043081283569336, "learning_rate": 5.931664916514689e-06, "loss": 0.476, "mean_token_accuracy": 0.8415656134486198, "num_tokens": 70569142.0, "step": 58680 }, { "entropy": 1.922675184905529, "epoch": 0.18193386491666272, "grad_norm": 9.857270240783691, "learning_rate": 5.931159547739309e-06, "loss": 0.5054, "mean_token_accuracy": 0.8421023860573769, "num_tokens": 70580427.0, "step": 58690 }, { "entropy": 1.8865813314914703, "epoch": 0.18196486404171242, "grad_norm": 8.099855422973633, "learning_rate": 5.9306543081118605e-06, "loss": 0.5244, "mean_token_accuracy": 0.8428427502512932, "num_tokens": 70591829.0, "step": 58700 }, { "entropy": 1.7871639668941497, "epoch": 0.1819958631667621, "grad_norm": 8.50505256652832, "learning_rate": 5.9301491975773485e-06, "loss": 0.4363, "mean_token_accuracy": 0.8578494325280189, "num_tokens": 70605530.0, "step": 58710 }, { "entropy": 1.9226329445838928, "epoch": 0.1820268622918118, "grad_norm": 9.982844352722168, "learning_rate": 5.929644216080808e-06, "loss": 0.5019, "mean_token_accuracy": 0.8366420567035675, "num_tokens": 70616806.0, "step": 58720 }, { "entropy": 1.8149694345891476, "epoch": 0.1820578614168615, "grad_norm": 8.859916687011719, "learning_rate": 5.9291393635673065e-06, "loss": 0.4231, "mean_token_accuracy": 0.8594744309782982, "num_tokens": 70629601.0, "step": 58730 }, { "entropy": 1.8165232956409454, "epoch": 0.1820888605419112, "grad_norm": 8.397076606750488, "learning_rate": 5.928634639981946e-06, "loss": 0.5598, "mean_token_accuracy": 0.8303966209292412, "num_tokens": 70642458.0, "step": 58740 }, { "entropy": 1.8342901065945625, "epoch": 0.1821198596669609, "grad_norm": 4.7208123207092285, "learning_rate": 5.9281300452698584e-06, "loss": 0.4633, "mean_token_accuracy": 0.850199481844902, "num_tokens": 70655561.0, "step": 58750 }, { "entropy": 1.959836632013321, "epoch": 0.1821508587920106, "grad_norm": 7.517179489135742, "learning_rate": 5.927625579376213e-06, "loss": 0.5588, "mean_token_accuracy": 0.8339650839567184, "num_tokens": 70666015.0, "step": 58760 }, { "entropy": 1.8640755087137222, "epoch": 0.1821818579170603, "grad_norm": 8.22607135772705, "learning_rate": 5.927121242246206e-06, "loss": 0.4876, "mean_token_accuracy": 0.8419701069593429, "num_tokens": 70678336.0, "step": 58770 }, { "entropy": 1.8295077085494995, "epoch": 0.18221285704211, "grad_norm": 7.90572452545166, "learning_rate": 5.9266170338250715e-06, "loss": 0.4507, "mean_token_accuracy": 0.8469627618789672, "num_tokens": 70691062.0, "step": 58780 }, { "entropy": 1.8726282447576523, "epoch": 0.18224385616715968, "grad_norm": 9.512833595275879, "learning_rate": 5.926112954058072e-06, "loss": 0.4848, "mean_token_accuracy": 0.840190976858139, "num_tokens": 70702498.0, "step": 58790 }, { "entropy": 1.8806493178009986, "epoch": 0.18227485529220938, "grad_norm": 9.236995697021484, "learning_rate": 5.925609002890504e-06, "loss": 0.5467, "mean_token_accuracy": 0.8324322685599327, "num_tokens": 70713997.0, "step": 58800 }, { "entropy": 1.9413601398468017, "epoch": 0.18230585441725908, "grad_norm": 8.431058883666992, "learning_rate": 5.9251051802677e-06, "loss": 0.5951, "mean_token_accuracy": 0.8179917827248573, "num_tokens": 70725865.0, "step": 58810 }, { "entropy": 1.8975177273154258, "epoch": 0.18233685354230877, "grad_norm": 9.826079368591309, "learning_rate": 5.9246014861350176e-06, "loss": 0.5408, "mean_token_accuracy": 0.8294125840067863, "num_tokens": 70737735.0, "step": 58820 }, { "entropy": 1.8288067810237407, "epoch": 0.18236785266735847, "grad_norm": 8.987369537353516, "learning_rate": 5.924097920437855e-06, "loss": 0.5116, "mean_token_accuracy": 0.8347997352480888, "num_tokens": 70750964.0, "step": 58830 }, { "entropy": 1.9133558467030525, "epoch": 0.18239885179240817, "grad_norm": 8.020203590393066, "learning_rate": 5.923594483121636e-06, "loss": 0.553, "mean_token_accuracy": 0.8251760572195053, "num_tokens": 70762325.0, "step": 58840 }, { "entropy": 1.9130362540483474, "epoch": 0.18242985091745786, "grad_norm": 4.7962446212768555, "learning_rate": 5.923091174131822e-06, "loss": 0.5566, "mean_token_accuracy": 0.8287200018763542, "num_tokens": 70773880.0, "step": 58850 }, { "entropy": 1.8333944439888001, "epoch": 0.18246085004250756, "grad_norm": 8.215493202209473, "learning_rate": 5.922587993413905e-06, "loss": 0.4459, "mean_token_accuracy": 0.8558073043823242, "num_tokens": 70786236.0, "step": 58860 }, { "entropy": 1.8580266639590264, "epoch": 0.18249184916755726, "grad_norm": 8.1721830368042, "learning_rate": 5.922084940913409e-06, "loss": 0.4505, "mean_token_accuracy": 0.8527326479554176, "num_tokens": 70798107.0, "step": 58870 }, { "entropy": 1.8850371599197389, "epoch": 0.18252284829260695, "grad_norm": 9.118043899536133, "learning_rate": 5.921582016575889e-06, "loss": 0.5387, "mean_token_accuracy": 0.8236063331365585, "num_tokens": 70809614.0, "step": 58880 }, { "entropy": 1.819055077433586, "epoch": 0.18255384741765662, "grad_norm": 6.715834617614746, "learning_rate": 5.921079220346936e-06, "loss": 0.4664, "mean_token_accuracy": 0.8425466433167458, "num_tokens": 70821709.0, "step": 58890 }, { "entropy": 1.9392906963825225, "epoch": 0.18258484654270632, "grad_norm": 9.14821720123291, "learning_rate": 5.920576552172171e-06, "loss": 0.5465, "mean_token_accuracy": 0.8290189146995545, "num_tokens": 70833465.0, "step": 58900 }, { "entropy": 1.8775691345334053, "epoch": 0.18261584566775602, "grad_norm": 9.709592819213867, "learning_rate": 5.920074011997246e-06, "loss": 0.5464, "mean_token_accuracy": 0.8243531733751297, "num_tokens": 70845723.0, "step": 58910 }, { "entropy": 1.8897256642580031, "epoch": 0.1826468447928057, "grad_norm": 8.386919975280762, "learning_rate": 5.919571599767849e-06, "loss": 0.5157, "mean_token_accuracy": 0.8361333101987839, "num_tokens": 70858171.0, "step": 58920 }, { "entropy": 1.810855358839035, "epoch": 0.1826778439178554, "grad_norm": 7.87431001663208, "learning_rate": 5.919069315429698e-06, "loss": 0.4808, "mean_token_accuracy": 0.8387002602219582, "num_tokens": 70870960.0, "step": 58930 }, { "entropy": 1.812534037232399, "epoch": 0.1827088430429051, "grad_norm": 9.197452545166016, "learning_rate": 5.91856715892854e-06, "loss": 0.411, "mean_token_accuracy": 0.8409740015864372, "num_tokens": 70884301.0, "step": 58940 }, { "entropy": 1.8245099663734436, "epoch": 0.1827398421679548, "grad_norm": 3.566840410232544, "learning_rate": 5.918065130210162e-06, "loss": 0.435, "mean_token_accuracy": 0.8494891449809074, "num_tokens": 70897472.0, "step": 58950 }, { "entropy": 1.8804219841957093, "epoch": 0.1827708412930045, "grad_norm": 9.882930755615234, "learning_rate": 5.917563229220378e-06, "loss": 0.4904, "mean_token_accuracy": 0.8421027392148972, "num_tokens": 70909937.0, "step": 58960 }, { "entropy": 1.8350219413638116, "epoch": 0.1828018404180542, "grad_norm": 5.378752708435059, "learning_rate": 5.917061455905032e-06, "loss": 0.5055, "mean_token_accuracy": 0.8302841424942017, "num_tokens": 70922611.0, "step": 58970 }, { "entropy": 1.9114596903324128, "epoch": 0.1828328395431039, "grad_norm": 9.584024429321289, "learning_rate": 5.9165598102100065e-06, "loss": 0.5156, "mean_token_accuracy": 0.8317157745361328, "num_tokens": 70934342.0, "step": 58980 }, { "entropy": 1.9586676806211472, "epoch": 0.1828638386681536, "grad_norm": 9.2459135055542, "learning_rate": 5.916058292081212e-06, "loss": 0.5584, "mean_token_accuracy": 0.8311464220285416, "num_tokens": 70945014.0, "step": 58990 }, { "entropy": 1.8728288680315017, "epoch": 0.18289483779320329, "grad_norm": 3.3124911785125732, "learning_rate": 5.91555690146459e-06, "loss": 0.5009, "mean_token_accuracy": 0.8396729901432991, "num_tokens": 70957244.0, "step": 59000 }, { "entropy": 1.8486259281635284, "epoch": 0.18292583691825298, "grad_norm": 7.671779155731201, "learning_rate": 5.9150556383061166e-06, "loss": 0.5108, "mean_token_accuracy": 0.8414855524897575, "num_tokens": 70969181.0, "step": 59010 }, { "entropy": 1.8501243054866792, "epoch": 0.18295683604330268, "grad_norm": 7.360293865203857, "learning_rate": 5.914554502551802e-06, "loss": 0.4727, "mean_token_accuracy": 0.8381767392158508, "num_tokens": 70981480.0, "step": 59020 }, { "entropy": 1.9539805203676224, "epoch": 0.18298783516835238, "grad_norm": 8.94968032836914, "learning_rate": 5.91405349414768e-06, "loss": 0.6436, "mean_token_accuracy": 0.8138018161058426, "num_tokens": 70992634.0, "step": 59030 }, { "entropy": 1.7947911590337753, "epoch": 0.18301883429340207, "grad_norm": 8.198612213134766, "learning_rate": 5.913552613039827e-06, "loss": 0.4641, "mean_token_accuracy": 0.8452375203371048, "num_tokens": 71006127.0, "step": 59040 }, { "entropy": 1.837024737894535, "epoch": 0.18304983341845177, "grad_norm": 3.7017059326171875, "learning_rate": 5.913051859174345e-06, "loss": 0.5035, "mean_token_accuracy": 0.8444261506199837, "num_tokens": 71018041.0, "step": 59050 }, { "entropy": 1.7747617959976196, "epoch": 0.18308083254350146, "grad_norm": 7.044408321380615, "learning_rate": 5.9125512324973685e-06, "loss": 0.4534, "mean_token_accuracy": 0.8478129550814628, "num_tokens": 71031917.0, "step": 59060 }, { "entropy": 1.8644176304340363, "epoch": 0.18311183166855116, "grad_norm": 9.228703498840332, "learning_rate": 5.9120507329550645e-06, "loss": 0.4961, "mean_token_accuracy": 0.8401506051421166, "num_tokens": 71044161.0, "step": 59070 }, { "entropy": 1.8438007518649102, "epoch": 0.18314283079360086, "grad_norm": 4.598083019256592, "learning_rate": 5.9115503604936345e-06, "loss": 0.4624, "mean_token_accuracy": 0.8480850771069527, "num_tokens": 71057367.0, "step": 59080 }, { "entropy": 1.9214376494288445, "epoch": 0.18317382991865055, "grad_norm": 11.023812294006348, "learning_rate": 5.911050115059307e-06, "loss": 0.5445, "mean_token_accuracy": 0.8322188884019852, "num_tokens": 71068645.0, "step": 59090 }, { "entropy": 1.9257762372493743, "epoch": 0.18320482904370025, "grad_norm": 7.3912129402160645, "learning_rate": 5.910549996598346e-06, "loss": 0.5122, "mean_token_accuracy": 0.8420733541250229, "num_tokens": 71079883.0, "step": 59100 }, { "entropy": 1.9468917578458786, "epoch": 0.18323582816874995, "grad_norm": 8.324677467346191, "learning_rate": 5.910050005057045e-06, "loss": 0.5494, "mean_token_accuracy": 0.8290826484560967, "num_tokens": 71090907.0, "step": 59110 }, { "entropy": 1.930706176161766, "epoch": 0.18326682729379964, "grad_norm": 9.13598918914795, "learning_rate": 5.909550140381733e-06, "loss": 0.5705, "mean_token_accuracy": 0.8350569799542427, "num_tokens": 71101884.0, "step": 59120 }, { "entropy": 1.8789242595434188, "epoch": 0.18329782641884934, "grad_norm": 10.5730619430542, "learning_rate": 5.9090504025187655e-06, "loss": 0.4738, "mean_token_accuracy": 0.8466743901371956, "num_tokens": 71113590.0, "step": 59130 }, { "entropy": 1.901440866291523, "epoch": 0.183328825543899, "grad_norm": 12.086167335510254, "learning_rate": 5.908550791414533e-06, "loss": 0.5615, "mean_token_accuracy": 0.8337482139468193, "num_tokens": 71125988.0, "step": 59140 }, { "entropy": 1.8326066508889198, "epoch": 0.1833598246689487, "grad_norm": 2.8919966220855713, "learning_rate": 5.908051307015459e-06, "loss": 0.4791, "mean_token_accuracy": 0.844254644215107, "num_tokens": 71139896.0, "step": 59150 }, { "entropy": 1.9314706429839135, "epoch": 0.1833908237939984, "grad_norm": 8.493422508239746, "learning_rate": 5.907551949267995e-06, "loss": 0.5188, "mean_token_accuracy": 0.8385980620980262, "num_tokens": 71151553.0, "step": 59160 }, { "entropy": 1.8806381478905678, "epoch": 0.1834218229190481, "grad_norm": 8.074383735656738, "learning_rate": 5.907052718118627e-06, "loss": 0.4823, "mean_token_accuracy": 0.8440838649868965, "num_tokens": 71163731.0, "step": 59170 }, { "entropy": 1.845897839963436, "epoch": 0.1834528220440978, "grad_norm": 10.374253273010254, "learning_rate": 5.9065536135138725e-06, "loss": 0.4877, "mean_token_accuracy": 0.8441713094711304, "num_tokens": 71175735.0, "step": 59180 }, { "entropy": 1.9331976994872093, "epoch": 0.1834838211691475, "grad_norm": 8.653754234313965, "learning_rate": 5.906054635400278e-06, "loss": 0.5448, "mean_token_accuracy": 0.834554848074913, "num_tokens": 71187616.0, "step": 59190 }, { "entropy": 1.8752469643950462, "epoch": 0.1835148202941972, "grad_norm": 9.133798599243164, "learning_rate": 5.905555783724424e-06, "loss": 0.4555, "mean_token_accuracy": 0.8436867401003838, "num_tokens": 71199959.0, "step": 59200 }, { "entropy": 1.8521633878350259, "epoch": 0.1835458194192469, "grad_norm": 8.649680137634277, "learning_rate": 5.905057058432922e-06, "loss": 0.4741, "mean_token_accuracy": 0.842129784822464, "num_tokens": 71212099.0, "step": 59210 }, { "entropy": 1.8513922914862633, "epoch": 0.18357681854429658, "grad_norm": 9.206934928894043, "learning_rate": 5.904558459472417e-06, "loss": 0.5184, "mean_token_accuracy": 0.8332608088850975, "num_tokens": 71224564.0, "step": 59220 }, { "entropy": 1.8147864386439323, "epoch": 0.18360781766934628, "grad_norm": 7.711428165435791, "learning_rate": 5.904059986789582e-06, "loss": 0.5106, "mean_token_accuracy": 0.8427559122443199, "num_tokens": 71237203.0, "step": 59230 }, { "entropy": 1.8857770830392837, "epoch": 0.18363881679439598, "grad_norm": 8.450910568237305, "learning_rate": 5.903561640331122e-06, "loss": 0.5048, "mean_token_accuracy": 0.8380788192152977, "num_tokens": 71249147.0, "step": 59240 }, { "entropy": 1.8396921649575233, "epoch": 0.18366981591944567, "grad_norm": 2.8776183128356934, "learning_rate": 5.9030634200437765e-06, "loss": 0.4636, "mean_token_accuracy": 0.8463966697454453, "num_tokens": 71260897.0, "step": 59250 }, { "entropy": 1.8520937889814377, "epoch": 0.18370081504449537, "grad_norm": 6.755105018615723, "learning_rate": 5.902565325874313e-06, "loss": 0.5058, "mean_token_accuracy": 0.8342436105012894, "num_tokens": 71273021.0, "step": 59260 }, { "entropy": 1.8717355713248254, "epoch": 0.18373181416954507, "grad_norm": 8.250826835632324, "learning_rate": 5.902067357769535e-06, "loss": 0.5023, "mean_token_accuracy": 0.8344789668917656, "num_tokens": 71285282.0, "step": 59270 }, { "entropy": 1.911249254643917, "epoch": 0.18376281329459476, "grad_norm": 11.614958763122559, "learning_rate": 5.901569515676272e-06, "loss": 0.5307, "mean_token_accuracy": 0.839310847222805, "num_tokens": 71297503.0, "step": 59280 }, { "entropy": 1.9195268914103507, "epoch": 0.18379381241964446, "grad_norm": 8.41645336151123, "learning_rate": 5.901071799541385e-06, "loss": 0.5092, "mean_token_accuracy": 0.8358474299311638, "num_tokens": 71309385.0, "step": 59290 }, { "entropy": 1.90712169110775, "epoch": 0.18382481154469416, "grad_norm": 8.294679641723633, "learning_rate": 5.900574209311775e-06, "loss": 0.5158, "mean_token_accuracy": 0.8497764050960541, "num_tokens": 71320129.0, "step": 59300 }, { "entropy": 1.935383716225624, "epoch": 0.18385581066974385, "grad_norm": 9.08409309387207, "learning_rate": 5.900076744934362e-06, "loss": 0.5022, "mean_token_accuracy": 0.8406318157911301, "num_tokens": 71331528.0, "step": 59310 }, { "entropy": 1.9089769005775452, "epoch": 0.18388680979479355, "grad_norm": 12.19558048248291, "learning_rate": 5.899579406356107e-06, "loss": 0.5055, "mean_token_accuracy": 0.8409374266862869, "num_tokens": 71342190.0, "step": 59320 }, { "entropy": 1.9273348927497864, "epoch": 0.18391780891984325, "grad_norm": 10.214935302734375, "learning_rate": 5.8990821935239975e-06, "loss": 0.5118, "mean_token_accuracy": 0.8436912566423416, "num_tokens": 71353668.0, "step": 59330 }, { "entropy": 1.9410814076662064, "epoch": 0.18394880804489294, "grad_norm": 9.362590789794922, "learning_rate": 5.898585106385053e-06, "loss": 0.5704, "mean_token_accuracy": 0.8309288114309311, "num_tokens": 71364044.0, "step": 59340 }, { "entropy": 1.9330678835511208, "epoch": 0.18397980716994264, "grad_norm": 7.387380599975586, "learning_rate": 5.898088144886326e-06, "loss": 0.5147, "mean_token_accuracy": 0.8400380194187165, "num_tokens": 71375184.0, "step": 59350 }, { "entropy": 1.9176151275634765, "epoch": 0.18401080629499234, "grad_norm": 8.243695259094238, "learning_rate": 5.897591308974896e-06, "loss": 0.5476, "mean_token_accuracy": 0.8329030573368073, "num_tokens": 71386183.0, "step": 59360 }, { "entropy": 1.7748358353972435, "epoch": 0.18404180542004203, "grad_norm": 4.477342128753662, "learning_rate": 5.897094598597879e-06, "loss": 0.4067, "mean_token_accuracy": 0.8530562177300454, "num_tokens": 71399368.0, "step": 59370 }, { "entropy": 1.8144743472337723, "epoch": 0.18407280454509173, "grad_norm": 9.520360946655273, "learning_rate": 5.896598013702419e-06, "loss": 0.4925, "mean_token_accuracy": 0.8422923222184181, "num_tokens": 71412727.0, "step": 59380 }, { "entropy": 1.8014607191085816, "epoch": 0.1841038036701414, "grad_norm": 6.801212787628174, "learning_rate": 5.8961015542356925e-06, "loss": 0.3999, "mean_token_accuracy": 0.8570068180561066, "num_tokens": 71425857.0, "step": 59390 }, { "entropy": 1.927448120713234, "epoch": 0.1841348027951911, "grad_norm": 8.77550220489502, "learning_rate": 5.895605220144907e-06, "loss": 0.5676, "mean_token_accuracy": 0.8222044169902801, "num_tokens": 71436727.0, "step": 59400 }, { "entropy": 1.9893068850040436, "epoch": 0.1841658019202408, "grad_norm": 9.02739143371582, "learning_rate": 5.8951090113772976e-06, "loss": 0.5979, "mean_token_accuracy": 0.8225637242197991, "num_tokens": 71447373.0, "step": 59410 }, { "entropy": 1.8202601961791516, "epoch": 0.1841968010452905, "grad_norm": 8.421029090881348, "learning_rate": 5.894612927880137e-06, "loss": 0.4609, "mean_token_accuracy": 0.8546611994504929, "num_tokens": 71460105.0, "step": 59420 }, { "entropy": 1.8888828486204148, "epoch": 0.18422780017034018, "grad_norm": 9.308785438537598, "learning_rate": 5.8941169696007235e-06, "loss": 0.53, "mean_token_accuracy": 0.8378401637077332, "num_tokens": 71472520.0, "step": 59430 }, { "entropy": 1.8498116791248322, "epoch": 0.18425879929538988, "grad_norm": 8.532123565673828, "learning_rate": 5.893621136486389e-06, "loss": 0.4873, "mean_token_accuracy": 0.844611656665802, "num_tokens": 71484511.0, "step": 59440 }, { "entropy": 1.8172200858592986, "epoch": 0.18428979842043958, "grad_norm": 4.122251033782959, "learning_rate": 5.893125428484495e-06, "loss": 0.4447, "mean_token_accuracy": 0.8459887281060219, "num_tokens": 71496637.0, "step": 59450 }, { "entropy": 1.8752017796039582, "epoch": 0.18432079754548927, "grad_norm": 9.73798942565918, "learning_rate": 5.892629845542437e-06, "loss": 0.4696, "mean_token_accuracy": 0.8463060513138772, "num_tokens": 71508739.0, "step": 59460 }, { "entropy": 1.8498885072767735, "epoch": 0.18435179667053897, "grad_norm": 8.824564933776855, "learning_rate": 5.892134387607636e-06, "loss": 0.4777, "mean_token_accuracy": 0.835616897046566, "num_tokens": 71521174.0, "step": 59470 }, { "entropy": 1.863853606581688, "epoch": 0.18438279579558867, "grad_norm": 9.248029708862305, "learning_rate": 5.891639054627551e-06, "loss": 0.5117, "mean_token_accuracy": 0.8358003750443459, "num_tokens": 71533111.0, "step": 59480 }, { "entropy": 1.9517934620380402, "epoch": 0.18441379492063836, "grad_norm": 9.871654510498047, "learning_rate": 5.891143846549664e-06, "loss": 0.5908, "mean_token_accuracy": 0.8294711738824845, "num_tokens": 71544244.0, "step": 59490 }, { "entropy": 1.8825423762202262, "epoch": 0.18444479404568806, "grad_norm": 7.173405170440674, "learning_rate": 5.890648763321495e-06, "loss": 0.486, "mean_token_accuracy": 0.8478507623076439, "num_tokens": 71555732.0, "step": 59500 }, { "entropy": 1.9470236003398895, "epoch": 0.18447579317073776, "grad_norm": 10.64505672454834, "learning_rate": 5.8901538048905904e-06, "loss": 0.5637, "mean_token_accuracy": 0.8279279932379723, "num_tokens": 71566717.0, "step": 59510 }, { "entropy": 1.8414243504405021, "epoch": 0.18450679229578745, "grad_norm": 8.195703506469727, "learning_rate": 5.8896589712045306e-06, "loss": 0.4448, "mean_token_accuracy": 0.8501287072896957, "num_tokens": 71578437.0, "step": 59520 }, { "entropy": 1.8852709889411927, "epoch": 0.18453779142083715, "grad_norm": 10.737709045410156, "learning_rate": 5.8891642622109235e-06, "loss": 0.4645, "mean_token_accuracy": 0.831559669971466, "num_tokens": 71590564.0, "step": 59530 }, { "entropy": 1.8008535578846931, "epoch": 0.18456879054588685, "grad_norm": 12.071378707885742, "learning_rate": 5.888669677857409e-06, "loss": 0.4656, "mean_token_accuracy": 0.8527644023299217, "num_tokens": 71604258.0, "step": 59540 }, { "entropy": 1.8511207103729248, "epoch": 0.18459978967093654, "grad_norm": 4.7530012130737305, "learning_rate": 5.88817521809166e-06, "loss": 0.4668, "mean_token_accuracy": 0.8385688096284867, "num_tokens": 71616859.0, "step": 59550 }, { "entropy": 1.9494419798254967, "epoch": 0.18463078879598624, "grad_norm": 10.362981796264648, "learning_rate": 5.887680882861378e-06, "loss": 0.537, "mean_token_accuracy": 0.833903931081295, "num_tokens": 71628020.0, "step": 59560 }, { "entropy": 1.9751142218708992, "epoch": 0.18466178792103594, "grad_norm": 10.168465614318848, "learning_rate": 5.887186672114294e-06, "loss": 0.5409, "mean_token_accuracy": 0.8342203125357628, "num_tokens": 71639405.0, "step": 59570 }, { "entropy": 1.9470524042844772, "epoch": 0.18469278704608563, "grad_norm": 7.566704273223877, "learning_rate": 5.886692585798173e-06, "loss": 0.5758, "mean_token_accuracy": 0.8385176613926888, "num_tokens": 71650529.0, "step": 59580 }, { "entropy": 1.8561659947037696, "epoch": 0.18472378617113533, "grad_norm": 8.916204452514648, "learning_rate": 5.886198623860807e-06, "loss": 0.4514, "mean_token_accuracy": 0.8419315591454506, "num_tokens": 71663274.0, "step": 59590 }, { "entropy": 1.9323371350765228, "epoch": 0.18475478529618503, "grad_norm": 10.571834564208984, "learning_rate": 5.8857047862500226e-06, "loss": 0.5731, "mean_token_accuracy": 0.8282590746879578, "num_tokens": 71674739.0, "step": 59600 }, { "entropy": 1.9620111271739007, "epoch": 0.18478578442123472, "grad_norm": 8.114259719848633, "learning_rate": 5.885211072913676e-06, "loss": 0.5633, "mean_token_accuracy": 0.8262233078479767, "num_tokens": 71685925.0, "step": 59610 }, { "entropy": 1.8970046997070313, "epoch": 0.18481678354628442, "grad_norm": 3.8900442123413086, "learning_rate": 5.884717483799649e-06, "loss": 0.4988, "mean_token_accuracy": 0.8467764779925346, "num_tokens": 71697604.0, "step": 59620 }, { "entropy": 1.8912265598773956, "epoch": 0.18484778267133412, "grad_norm": 9.921353340148926, "learning_rate": 5.884224018855862e-06, "loss": 0.5736, "mean_token_accuracy": 0.8240482151508332, "num_tokens": 71708748.0, "step": 59630 }, { "entropy": 1.873260524868965, "epoch": 0.18487878179638378, "grad_norm": 7.441627025604248, "learning_rate": 5.883730678030261e-06, "loss": 0.5304, "mean_token_accuracy": 0.831610731780529, "num_tokens": 71720631.0, "step": 59640 }, { "entropy": 1.8897306516766548, "epoch": 0.18490978092143348, "grad_norm": 8.014190673828125, "learning_rate": 5.883237461270822e-06, "loss": 0.5162, "mean_token_accuracy": 0.8387016162276268, "num_tokens": 71732006.0, "step": 59650 }, { "entropy": 1.8416223376989365, "epoch": 0.18494078004648318, "grad_norm": 6.837216854095459, "learning_rate": 5.882744368525556e-06, "loss": 0.5324, "mean_token_accuracy": 0.832354225218296, "num_tokens": 71744288.0, "step": 59660 }, { "entropy": 1.7982579827308656, "epoch": 0.18497177917153287, "grad_norm": 9.943193435668945, "learning_rate": 5.882251399742499e-06, "loss": 0.4099, "mean_token_accuracy": 0.8525391161441803, "num_tokens": 71757879.0, "step": 59670 }, { "entropy": 1.7894408702850342, "epoch": 0.18500277829658257, "grad_norm": 8.48753833770752, "learning_rate": 5.881758554869721e-06, "loss": 0.4682, "mean_token_accuracy": 0.8433696299791336, "num_tokens": 71772009.0, "step": 59680 }, { "entropy": 1.8790398433804512, "epoch": 0.18503377742163227, "grad_norm": 8.779352188110352, "learning_rate": 5.881265833855321e-06, "loss": 0.5417, "mean_token_accuracy": 0.8341058731079102, "num_tokens": 71784020.0, "step": 59690 }, { "entropy": 1.8383540600538253, "epoch": 0.18506477654668196, "grad_norm": 4.482400894165039, "learning_rate": 5.880773236647431e-06, "loss": 0.4772, "mean_token_accuracy": 0.8363133147358894, "num_tokens": 71795570.0, "step": 59700 }, { "entropy": 1.807381245493889, "epoch": 0.18509577567173166, "grad_norm": 7.861814498901367, "learning_rate": 5.8802807631942095e-06, "loss": 0.4422, "mean_token_accuracy": 0.8491622850298881, "num_tokens": 71808715.0, "step": 59710 }, { "entropy": 1.9171263113617898, "epoch": 0.18512677479678136, "grad_norm": 9.038065910339355, "learning_rate": 5.879788413443846e-06, "loss": 0.5153, "mean_token_accuracy": 0.8278371721506119, "num_tokens": 71820652.0, "step": 59720 }, { "entropy": 1.9229667693376542, "epoch": 0.18515777392183105, "grad_norm": 7.926364898681641, "learning_rate": 5.879296187344564e-06, "loss": 0.5683, "mean_token_accuracy": 0.8299353420734406, "num_tokens": 71832006.0, "step": 59730 }, { "entropy": 1.8270016878843307, "epoch": 0.18518877304688075, "grad_norm": 4.967191219329834, "learning_rate": 5.878804084844616e-06, "loss": 0.4648, "mean_token_accuracy": 0.8319668889045715, "num_tokens": 71845433.0, "step": 59740 }, { "entropy": 1.9323490008711814, "epoch": 0.18521977217193045, "grad_norm": 7.835849285125732, "learning_rate": 5.878312105892281e-06, "loss": 0.5384, "mean_token_accuracy": 0.8331502959132194, "num_tokens": 71856737.0, "step": 59750 }, { "entropy": 1.7635822594165802, "epoch": 0.18525077129698014, "grad_norm": 10.747203826904297, "learning_rate": 5.8778202504358716e-06, "loss": 0.4358, "mean_token_accuracy": 0.8548522099852562, "num_tokens": 71870471.0, "step": 59760 }, { "entropy": 1.7823803335428239, "epoch": 0.18528177042202984, "grad_norm": 3.434664249420166, "learning_rate": 5.8773285184237316e-06, "loss": 0.426, "mean_token_accuracy": 0.846599979698658, "num_tokens": 71884046.0, "step": 59770 }, { "entropy": 1.8083126276731492, "epoch": 0.18531276954707954, "grad_norm": 4.187022686004639, "learning_rate": 5.876836909804231e-06, "loss": 0.4878, "mean_token_accuracy": 0.8312985822558403, "num_tokens": 71897333.0, "step": 59780 }, { "entropy": 1.9598889589309691, "epoch": 0.18534376867212923, "grad_norm": 8.239096641540527, "learning_rate": 5.876345424525776e-06, "loss": 0.6611, "mean_token_accuracy": 0.8133495360612869, "num_tokens": 71908395.0, "step": 59790 }, { "entropy": 1.9233538269996644, "epoch": 0.18537476779717893, "grad_norm": 3.7987241744995117, "learning_rate": 5.8758540625367965e-06, "loss": 0.5428, "mean_token_accuracy": 0.8402821734547615, "num_tokens": 71920345.0, "step": 59800 }, { "entropy": 1.9380876675248147, "epoch": 0.18540576692222863, "grad_norm": 7.9834208488464355, "learning_rate": 5.875362823785758e-06, "loss": 0.5111, "mean_token_accuracy": 0.8361145481467247, "num_tokens": 71932372.0, "step": 59810 }, { "entropy": 1.9172768041491508, "epoch": 0.18543676604727832, "grad_norm": 10.48983097076416, "learning_rate": 5.8748717082211516e-06, "loss": 0.5123, "mean_token_accuracy": 0.8384231805801392, "num_tokens": 71944733.0, "step": 59820 }, { "entropy": 1.8952128738164902, "epoch": 0.18546776517232802, "grad_norm": 6.812406539916992, "learning_rate": 5.874380715791502e-06, "loss": 0.4774, "mean_token_accuracy": 0.846713088452816, "num_tokens": 71956856.0, "step": 59830 }, { "entropy": 1.780382940173149, "epoch": 0.18549876429737772, "grad_norm": 4.57772159576416, "learning_rate": 5.8738898464453644e-06, "loss": 0.4001, "mean_token_accuracy": 0.8531007096171379, "num_tokens": 71970503.0, "step": 59840 }, { "entropy": 1.910321943461895, "epoch": 0.1855297634224274, "grad_norm": 10.066534042358398, "learning_rate": 5.87339910013132e-06, "loss": 0.5479, "mean_token_accuracy": 0.8341235533356667, "num_tokens": 71982378.0, "step": 59850 }, { "entropy": 1.927718922495842, "epoch": 0.1855607625474771, "grad_norm": 9.32041072845459, "learning_rate": 5.872908476797983e-06, "loss": 0.5619, "mean_token_accuracy": 0.8251764044165611, "num_tokens": 71993478.0, "step": 59860 }, { "entropy": 1.9356975913047791, "epoch": 0.1855917616725268, "grad_norm": 4.789025783538818, "learning_rate": 5.872417976393997e-06, "loss": 0.5355, "mean_token_accuracy": 0.8277438759803772, "num_tokens": 72005113.0, "step": 59870 }, { "entropy": 1.8369565188884736, "epoch": 0.18562276079757647, "grad_norm": 7.20404577255249, "learning_rate": 5.871927598868036e-06, "loss": 0.5349, "mean_token_accuracy": 0.8378855675458908, "num_tokens": 72017827.0, "step": 59880 }, { "entropy": 1.8494772240519524, "epoch": 0.18565375992262617, "grad_norm": 8.194475173950195, "learning_rate": 5.871437344168805e-06, "loss": 0.5451, "mean_token_accuracy": 0.8221006825566292, "num_tokens": 72030285.0, "step": 59890 }, { "entropy": 1.9028642885386944, "epoch": 0.18568475904767587, "grad_norm": 8.854440689086914, "learning_rate": 5.870947212245036e-06, "loss": 0.5293, "mean_token_accuracy": 0.8278263434767723, "num_tokens": 72043015.0, "step": 59900 }, { "entropy": 1.8917630180716514, "epoch": 0.18571575817272556, "grad_norm": 8.142960548400879, "learning_rate": 5.870457203045496e-06, "loss": 0.5053, "mean_token_accuracy": 0.8398408219218254, "num_tokens": 72054537.0, "step": 59910 }, { "entropy": 1.8536349281668663, "epoch": 0.18574675729777526, "grad_norm": 8.590570449829102, "learning_rate": 5.869967316518973e-06, "loss": 0.4352, "mean_token_accuracy": 0.8522145226597786, "num_tokens": 72067016.0, "step": 59920 }, { "entropy": 1.9314019471406936, "epoch": 0.18577775642282496, "grad_norm": 4.44788932800293, "learning_rate": 5.869477552614296e-06, "loss": 0.5156, "mean_token_accuracy": 0.8373445898294449, "num_tokens": 72078617.0, "step": 59930 }, { "entropy": 1.8379703059792518, "epoch": 0.18580875554787465, "grad_norm": 8.737577438354492, "learning_rate": 5.868987911280315e-06, "loss": 0.4859, "mean_token_accuracy": 0.8353394463658332, "num_tokens": 72090576.0, "step": 59940 }, { "entropy": 1.9440933406352996, "epoch": 0.18583975467292435, "grad_norm": 8.472302436828613, "learning_rate": 5.868498392465915e-06, "loss": 0.5734, "mean_token_accuracy": 0.8333403542637825, "num_tokens": 72101344.0, "step": 59950 }, { "entropy": 1.8688756361603738, "epoch": 0.18587075379797405, "grad_norm": 9.316754341125488, "learning_rate": 5.868008996120008e-06, "loss": 0.5235, "mean_token_accuracy": 0.8337069198489189, "num_tokens": 72113227.0, "step": 59960 }, { "entropy": 1.834533803164959, "epoch": 0.18590175292302374, "grad_norm": 8.468538284301758, "learning_rate": 5.867519722191538e-06, "loss": 0.4466, "mean_token_accuracy": 0.8497491136193276, "num_tokens": 72125271.0, "step": 59970 }, { "entropy": 1.7592488691210746, "epoch": 0.18593275204807344, "grad_norm": 8.312585830688477, "learning_rate": 5.867030570629477e-06, "loss": 0.396, "mean_token_accuracy": 0.8604243487119675, "num_tokens": 72139001.0, "step": 59980 }, { "entropy": 1.9161117985844611, "epoch": 0.18596375117312314, "grad_norm": 4.209480285644531, "learning_rate": 5.866541541382829e-06, "loss": 0.5155, "mean_token_accuracy": 0.8296124190092087, "num_tokens": 72151289.0, "step": 59990 }, { "entropy": 1.9577097177505494, "epoch": 0.18599475029817283, "grad_norm": 9.150534629821777, "learning_rate": 5.866052634400624e-06, "loss": 0.58, "mean_token_accuracy": 0.8270249828696251, "num_tokens": 72161958.0, "step": 60000 }, { "entropy": 1.896253764629364, "epoch": 0.18602574942322253, "grad_norm": 5.07069206237793, "learning_rate": 5.865563849631925e-06, "loss": 0.5408, "mean_token_accuracy": 0.8247905030846596, "num_tokens": 72175097.0, "step": 60010 }, { "entropy": 1.7840982712805271, "epoch": 0.18605674854827223, "grad_norm": 8.719794273376465, "learning_rate": 5.865075187025824e-06, "loss": 0.4362, "mean_token_accuracy": 0.8515463337302208, "num_tokens": 72188721.0, "step": 60020 }, { "entropy": 1.994665226340294, "epoch": 0.18608774767332192, "grad_norm": 8.010587692260742, "learning_rate": 5.864586646531443e-06, "loss": 0.5875, "mean_token_accuracy": 0.8243452772498131, "num_tokens": 72199455.0, "step": 60030 }, { "entropy": 1.869076819717884, "epoch": 0.18611874679837162, "grad_norm": 8.577561378479004, "learning_rate": 5.864098228097931e-06, "loss": 0.4793, "mean_token_accuracy": 0.843721067905426, "num_tokens": 72211035.0, "step": 60040 }, { "entropy": 1.9081630438566208, "epoch": 0.18614974592342132, "grad_norm": 8.139861106872559, "learning_rate": 5.863609931674471e-06, "loss": 0.4883, "mean_token_accuracy": 0.847606098651886, "num_tokens": 72222707.0, "step": 60050 }, { "entropy": 1.9403681397438048, "epoch": 0.186180745048471, "grad_norm": 9.84714412689209, "learning_rate": 5.8631217572102716e-06, "loss": 0.5301, "mean_token_accuracy": 0.8385643243789673, "num_tokens": 72232917.0, "step": 60060 }, { "entropy": 1.9238607689738274, "epoch": 0.1862117441735207, "grad_norm": 8.912215232849121, "learning_rate": 5.862633704654573e-06, "loss": 0.5572, "mean_token_accuracy": 0.8306085541844368, "num_tokens": 72244516.0, "step": 60070 }, { "entropy": 1.9544622465968131, "epoch": 0.1862427432985704, "grad_norm": 8.447044372558594, "learning_rate": 5.862145773956647e-06, "loss": 0.5516, "mean_token_accuracy": 0.8342535004019738, "num_tokens": 72255606.0, "step": 60080 }, { "entropy": 1.8915927335619926, "epoch": 0.1862737424236201, "grad_norm": 9.219244003295898, "learning_rate": 5.861657965065788e-06, "loss": 0.5361, "mean_token_accuracy": 0.8286745131015778, "num_tokens": 72267179.0, "step": 60090 }, { "entropy": 1.9081023514270783, "epoch": 0.1863047415486698, "grad_norm": 3.8178694248199463, "learning_rate": 5.861170277931328e-06, "loss": 0.5429, "mean_token_accuracy": 0.8304624617099762, "num_tokens": 72279385.0, "step": 60100 }, { "entropy": 1.8515421822667122, "epoch": 0.1863357406737195, "grad_norm": 6.892448902130127, "learning_rate": 5.8606827125026256e-06, "loss": 0.4714, "mean_token_accuracy": 0.8440054371953011, "num_tokens": 72292367.0, "step": 60110 }, { "entropy": 1.941430465877056, "epoch": 0.1863667397987692, "grad_norm": 9.434157371520996, "learning_rate": 5.860195268729066e-06, "loss": 0.5499, "mean_token_accuracy": 0.8339191779494286, "num_tokens": 72303613.0, "step": 60120 }, { "entropy": 1.9135318726301194, "epoch": 0.18639773892381886, "grad_norm": 11.788899421691895, "learning_rate": 5.859707946560067e-06, "loss": 0.5565, "mean_token_accuracy": 0.8330492198467254, "num_tokens": 72314613.0, "step": 60130 }, { "entropy": 1.8537031307816505, "epoch": 0.18642873804886856, "grad_norm": 9.07436466217041, "learning_rate": 5.859220745945075e-06, "loss": 0.4944, "mean_token_accuracy": 0.8319351330399514, "num_tokens": 72326917.0, "step": 60140 }, { "entropy": 2.000509098172188, "epoch": 0.18645973717391826, "grad_norm": 11.218595504760742, "learning_rate": 5.858733666833567e-06, "loss": 0.5732, "mean_token_accuracy": 0.8274472177028656, "num_tokens": 72338272.0, "step": 60150 }, { "entropy": 1.9531633704900742, "epoch": 0.18649073629896795, "grad_norm": 8.319474220275879, "learning_rate": 5.858246709175047e-06, "loss": 0.5806, "mean_token_accuracy": 0.8167789533734322, "num_tokens": 72349717.0, "step": 60160 }, { "entropy": 1.8039245098829269, "epoch": 0.18652173542401765, "grad_norm": 5.849046230316162, "learning_rate": 5.8577598729190496e-06, "loss": 0.4335, "mean_token_accuracy": 0.8554789200425148, "num_tokens": 72362915.0, "step": 60170 }, { "entropy": 1.87906823605299, "epoch": 0.18655273454906734, "grad_norm": 8.501765251159668, "learning_rate": 5.85727315801514e-06, "loss": 0.4956, "mean_token_accuracy": 0.8379226326942444, "num_tokens": 72374922.0, "step": 60180 }, { "entropy": 1.8856094121932983, "epoch": 0.18658373367411704, "grad_norm": 8.660506248474121, "learning_rate": 5.8567865644129095e-06, "loss": 0.5027, "mean_token_accuracy": 0.8400798097252846, "num_tokens": 72387230.0, "step": 60190 }, { "entropy": 1.843792949616909, "epoch": 0.18661473279916674, "grad_norm": 7.968148231506348, "learning_rate": 5.856300092061984e-06, "loss": 0.5075, "mean_token_accuracy": 0.8412106573581696, "num_tokens": 72399496.0, "step": 60200 }, { "entropy": 1.863570548593998, "epoch": 0.18664573192421643, "grad_norm": 8.22105598449707, "learning_rate": 5.855813740912011e-06, "loss": 0.4864, "mean_token_accuracy": 0.8438234731554985, "num_tokens": 72412130.0, "step": 60210 }, { "entropy": 1.903498760610819, "epoch": 0.18667673104926613, "grad_norm": 8.098082542419434, "learning_rate": 5.855327510912675e-06, "loss": 0.487, "mean_token_accuracy": 0.8345488205552101, "num_tokens": 72425418.0, "step": 60220 }, { "entropy": 1.8605897799134254, "epoch": 0.18670773017431583, "grad_norm": 8.592905044555664, "learning_rate": 5.854841402013685e-06, "loss": 0.5436, "mean_token_accuracy": 0.8296092748641968, "num_tokens": 72437861.0, "step": 60230 }, { "entropy": 1.8651499822735786, "epoch": 0.18673872929936552, "grad_norm": 3.829653739929199, "learning_rate": 5.8543554141647814e-06, "loss": 0.4924, "mean_token_accuracy": 0.837422613799572, "num_tokens": 72450429.0, "step": 60240 }, { "entropy": 1.8985691770911217, "epoch": 0.18676972842441522, "grad_norm": 8.132059097290039, "learning_rate": 5.853869547315731e-06, "loss": 0.4996, "mean_token_accuracy": 0.8453919142484665, "num_tokens": 72462168.0, "step": 60250 }, { "entropy": 1.8822806507349015, "epoch": 0.18680072754946492, "grad_norm": 8.20763111114502, "learning_rate": 5.853383801416336e-06, "loss": 0.5387, "mean_token_accuracy": 0.8344938531517982, "num_tokens": 72473992.0, "step": 60260 }, { "entropy": 1.9301311939954757, "epoch": 0.18683172667451461, "grad_norm": 8.083572387695312, "learning_rate": 5.8528981764164205e-06, "loss": 0.5355, "mean_token_accuracy": 0.841622045636177, "num_tokens": 72484567.0, "step": 60270 }, { "entropy": 1.8997415751218796, "epoch": 0.1868627257995643, "grad_norm": 7.791327953338623, "learning_rate": 5.85241267226584e-06, "loss": 0.5359, "mean_token_accuracy": 0.8344151824712753, "num_tokens": 72495039.0, "step": 60280 }, { "entropy": 1.8864543735980988, "epoch": 0.186893724924614, "grad_norm": 7.940762042999268, "learning_rate": 5.851927288914482e-06, "loss": 0.5043, "mean_token_accuracy": 0.8393915817141533, "num_tokens": 72507012.0, "step": 60290 }, { "entropy": 1.8426005780696868, "epoch": 0.1869247240496637, "grad_norm": 8.484749794006348, "learning_rate": 5.85144202631226e-06, "loss": 0.4907, "mean_token_accuracy": 0.8417939618229866, "num_tokens": 72519213.0, "step": 60300 }, { "entropy": 1.860968105494976, "epoch": 0.1869557231747134, "grad_norm": 8.830284118652344, "learning_rate": 5.850956884409118e-06, "loss": 0.5251, "mean_token_accuracy": 0.848306556046009, "num_tokens": 72531214.0, "step": 60310 }, { "entropy": 1.921893371641636, "epoch": 0.1869867222997631, "grad_norm": 4.533517837524414, "learning_rate": 5.8504718631550285e-06, "loss": 0.4962, "mean_token_accuracy": 0.8360114693641663, "num_tokens": 72543240.0, "step": 60320 }, { "entropy": 1.93612492531538, "epoch": 0.1870177214248128, "grad_norm": 8.422826766967773, "learning_rate": 5.849986962499992e-06, "loss": 0.5496, "mean_token_accuracy": 0.8314146876335144, "num_tokens": 72554502.0, "step": 60330 }, { "entropy": 1.887687975168228, "epoch": 0.1870487205498625, "grad_norm": 8.538383483886719, "learning_rate": 5.8495021823940416e-06, "loss": 0.5353, "mean_token_accuracy": 0.8397644698619843, "num_tokens": 72565656.0, "step": 60340 }, { "entropy": 1.744446012377739, "epoch": 0.1870797196749122, "grad_norm": 8.830626487731934, "learning_rate": 5.849017522787233e-06, "loss": 0.512, "mean_token_accuracy": 0.8492047414183617, "num_tokens": 72579158.0, "step": 60350 }, { "entropy": 1.8573518849909305, "epoch": 0.18711071879996188, "grad_norm": 8.745538711547852, "learning_rate": 5.848532983629661e-06, "loss": 0.5339, "mean_token_accuracy": 0.8321371108293534, "num_tokens": 72591425.0, "step": 60360 }, { "entropy": 1.7826851338148118, "epoch": 0.18714171792501158, "grad_norm": 8.527620315551758, "learning_rate": 5.848048564871437e-06, "loss": 0.4543, "mean_token_accuracy": 0.8485151454806328, "num_tokens": 72604518.0, "step": 60370 }, { "entropy": 1.8439786598086356, "epoch": 0.18717271705006125, "grad_norm": 8.76579475402832, "learning_rate": 5.84756426646271e-06, "loss": 0.4614, "mean_token_accuracy": 0.8471570834517479, "num_tokens": 72616133.0, "step": 60380 }, { "entropy": 1.9413674965500831, "epoch": 0.18720371617511095, "grad_norm": 9.924832344055176, "learning_rate": 5.847080088353656e-06, "loss": 0.5343, "mean_token_accuracy": 0.8391248330473899, "num_tokens": 72627275.0, "step": 60390 }, { "entropy": 1.803322871029377, "epoch": 0.18723471530016064, "grad_norm": 7.1156816482543945, "learning_rate": 5.846596030494477e-06, "loss": 0.5098, "mean_token_accuracy": 0.831206327676773, "num_tokens": 72640255.0, "step": 60400 }, { "entropy": 1.9137111112475396, "epoch": 0.18726571442521034, "grad_norm": 9.406431198120117, "learning_rate": 5.8461120928354075e-06, "loss": 0.5411, "mean_token_accuracy": 0.8288335934281349, "num_tokens": 72651695.0, "step": 60410 }, { "entropy": 1.988877174258232, "epoch": 0.18729671355026004, "grad_norm": 8.406907081604004, "learning_rate": 5.845628275326711e-06, "loss": 0.5578, "mean_token_accuracy": 0.8310790181159973, "num_tokens": 72662701.0, "step": 60420 }, { "entropy": 1.844719012081623, "epoch": 0.18732771267530973, "grad_norm": 4.9071431159973145, "learning_rate": 5.845144577918675e-06, "loss": 0.4843, "mean_token_accuracy": 0.8463888078927994, "num_tokens": 72675062.0, "step": 60430 }, { "entropy": 1.9066783770918847, "epoch": 0.18735871180035943, "grad_norm": 9.20982551574707, "learning_rate": 5.844661000561621e-06, "loss": 0.4689, "mean_token_accuracy": 0.8450051352381707, "num_tokens": 72687250.0, "step": 60440 }, { "entropy": 1.8505268752574922, "epoch": 0.18738971092540913, "grad_norm": 12.914068222045898, "learning_rate": 5.844177543205897e-06, "loss": 0.4712, "mean_token_accuracy": 0.8434465497732162, "num_tokens": 72699919.0, "step": 60450 }, { "entropy": 1.7514374867081641, "epoch": 0.18742071005045882, "grad_norm": 8.30848503112793, "learning_rate": 5.843694205801879e-06, "loss": 0.3801, "mean_token_accuracy": 0.8658733040094375, "num_tokens": 72713733.0, "step": 60460 }, { "entropy": 1.89697934538126, "epoch": 0.18745170917550852, "grad_norm": 9.627775192260742, "learning_rate": 5.843210988299973e-06, "loss": 0.5452, "mean_token_accuracy": 0.8306502267718315, "num_tokens": 72726134.0, "step": 60470 }, { "entropy": 1.9438308894634246, "epoch": 0.18748270830055822, "grad_norm": 10.055462837219238, "learning_rate": 5.842727890650615e-06, "loss": 0.5728, "mean_token_accuracy": 0.8255834862589836, "num_tokens": 72736983.0, "step": 60480 }, { "entropy": 1.8450178682804108, "epoch": 0.1875137074256079, "grad_norm": 9.322096824645996, "learning_rate": 5.8422449128042654e-06, "loss": 0.4454, "mean_token_accuracy": 0.845012141764164, "num_tokens": 72749747.0, "step": 60490 }, { "entropy": 1.8929164052009582, "epoch": 0.1875447065506576, "grad_norm": 3.979761838912964, "learning_rate": 5.84176205471142e-06, "loss": 0.5459, "mean_token_accuracy": 0.8384419724345207, "num_tokens": 72761568.0, "step": 60500 }, { "entropy": 1.8335536420345306, "epoch": 0.1875757056757073, "grad_norm": 7.59586238861084, "learning_rate": 5.841279316322594e-06, "loss": 0.4567, "mean_token_accuracy": 0.8467094838619232, "num_tokens": 72774261.0, "step": 60510 }, { "entropy": 1.882728861272335, "epoch": 0.187606704800757, "grad_norm": 8.7779541015625, "learning_rate": 5.840796697588341e-06, "loss": 0.4918, "mean_token_accuracy": 0.8436086297035217, "num_tokens": 72786714.0, "step": 60520 }, { "entropy": 1.8310616135597229, "epoch": 0.1876377039258067, "grad_norm": 5.086270809173584, "learning_rate": 5.840314198459236e-06, "loss": 0.4538, "mean_token_accuracy": 0.8549797222018242, "num_tokens": 72799986.0, "step": 60530 }, { "entropy": 1.9134382233023643, "epoch": 0.1876687030508564, "grad_norm": 8.873359680175781, "learning_rate": 5.839831818885886e-06, "loss": 0.5306, "mean_token_accuracy": 0.8400856927037239, "num_tokens": 72811382.0, "step": 60540 }, { "entropy": 1.809613211452961, "epoch": 0.1876997021759061, "grad_norm": 4.331747055053711, "learning_rate": 5.839349558818926e-06, "loss": 0.4504, "mean_token_accuracy": 0.8486540347337723, "num_tokens": 72824270.0, "step": 60550 }, { "entropy": 1.908320277929306, "epoch": 0.1877307013009558, "grad_norm": 8.491447448730469, "learning_rate": 5.838867418209019e-06, "loss": 0.5636, "mean_token_accuracy": 0.8320151507854462, "num_tokens": 72836395.0, "step": 60560 }, { "entropy": 1.8660949409008025, "epoch": 0.18776170042600548, "grad_norm": 4.656119346618652, "learning_rate": 5.838385397006855e-06, "loss": 0.4516, "mean_token_accuracy": 0.8481364041566849, "num_tokens": 72849113.0, "step": 60570 }, { "entropy": 1.8910485938191415, "epoch": 0.18779269955105518, "grad_norm": 6.919294834136963, "learning_rate": 5.837903495163157e-06, "loss": 0.489, "mean_token_accuracy": 0.8406860768795014, "num_tokens": 72860770.0, "step": 60580 }, { "entropy": 1.9625378847122192, "epoch": 0.18782369867610488, "grad_norm": 10.646431922912598, "learning_rate": 5.837421712628675e-06, "loss": 0.5639, "mean_token_accuracy": 0.8330793261528016, "num_tokens": 72871585.0, "step": 60590 }, { "entropy": 1.8914886653423308, "epoch": 0.18785469780115457, "grad_norm": 7.903103828430176, "learning_rate": 5.836940049354182e-06, "loss": 0.5078, "mean_token_accuracy": 0.8413411438465118, "num_tokens": 72883535.0, "step": 60600 }, { "entropy": 1.8991345196962357, "epoch": 0.18788569692620427, "grad_norm": 4.898101329803467, "learning_rate": 5.8364585052904845e-06, "loss": 0.5244, "mean_token_accuracy": 0.8374688118696213, "num_tokens": 72895721.0, "step": 60610 }, { "entropy": 1.8644976392388344, "epoch": 0.18791669605125394, "grad_norm": 4.19233512878418, "learning_rate": 5.8359770803884175e-06, "loss": 0.505, "mean_token_accuracy": 0.8408412501215935, "num_tokens": 72907421.0, "step": 60620 }, { "entropy": 1.891925212740898, "epoch": 0.18794769517630364, "grad_norm": 7.999198913574219, "learning_rate": 5.835495774598844e-06, "loss": 0.4944, "mean_token_accuracy": 0.8336058259010315, "num_tokens": 72919575.0, "step": 60630 }, { "entropy": 1.8591307654976845, "epoch": 0.18797869430135333, "grad_norm": 8.912029266357422, "learning_rate": 5.8350145878726545e-06, "loss": 0.5389, "mean_token_accuracy": 0.8295843943953514, "num_tokens": 72932425.0, "step": 60640 }, { "entropy": 1.957919180393219, "epoch": 0.18800969342640303, "grad_norm": 8.599974632263184, "learning_rate": 5.834533520160769e-06, "loss": 0.5634, "mean_token_accuracy": 0.8301822498440743, "num_tokens": 72943322.0, "step": 60650 }, { "entropy": 1.9159081429243088, "epoch": 0.18804069255145273, "grad_norm": 7.521796226501465, "learning_rate": 5.834052571414132e-06, "loss": 0.5475, "mean_token_accuracy": 0.8309228405356407, "num_tokens": 72954379.0, "step": 60660 }, { "entropy": 1.8881125912070273, "epoch": 0.18807169167650242, "grad_norm": 8.066596984863281, "learning_rate": 5.833571741583721e-06, "loss": 0.4948, "mean_token_accuracy": 0.8445821896195411, "num_tokens": 72965758.0, "step": 60670 }, { "entropy": 1.8443811774253844, "epoch": 0.18810269080155212, "grad_norm": 9.628620147705078, "learning_rate": 5.8330910306205405e-06, "loss": 0.4416, "mean_token_accuracy": 0.8472337678074837, "num_tokens": 72977927.0, "step": 60680 }, { "entropy": 1.869968481361866, "epoch": 0.18813368992660182, "grad_norm": 8.748429298400879, "learning_rate": 5.832610438475622e-06, "loss": 0.4627, "mean_token_accuracy": 0.8413064509630204, "num_tokens": 72989566.0, "step": 60690 }, { "entropy": 1.881579375267029, "epoch": 0.1881646890516515, "grad_norm": 8.547125816345215, "learning_rate": 5.832129965100026e-06, "loss": 0.5417, "mean_token_accuracy": 0.8311194002628326, "num_tokens": 73000552.0, "step": 60700 }, { "entropy": 1.9194360464811324, "epoch": 0.1881956881767012, "grad_norm": 10.575980186462402, "learning_rate": 5.831649610444842e-06, "loss": 0.5357, "mean_token_accuracy": 0.8376916989684104, "num_tokens": 73011919.0, "step": 60710 }, { "entropy": 1.936641664803028, "epoch": 0.1882266873017509, "grad_norm": 7.608135223388672, "learning_rate": 5.831169374461185e-06, "loss": 0.5898, "mean_token_accuracy": 0.821084663271904, "num_tokens": 73023762.0, "step": 60720 }, { "entropy": 1.9660951375961304, "epoch": 0.1882576864268006, "grad_norm": 8.96996784210205, "learning_rate": 5.8306892571002025e-06, "loss": 0.5683, "mean_token_accuracy": 0.835235033929348, "num_tokens": 73034510.0, "step": 60730 }, { "entropy": 1.8755255818367005, "epoch": 0.1882886855518503, "grad_norm": 8.350106239318848, "learning_rate": 5.830209258313067e-06, "loss": 0.4726, "mean_token_accuracy": 0.8453781142830848, "num_tokens": 73046102.0, "step": 60740 }, { "entropy": 1.901422207057476, "epoch": 0.1883196846769, "grad_norm": 8.605900764465332, "learning_rate": 5.829729378050978e-06, "loss": 0.5604, "mean_token_accuracy": 0.8291002050042152, "num_tokens": 73057508.0, "step": 60750 }, { "entropy": 1.9229997634887694, "epoch": 0.1883506838019497, "grad_norm": 11.104288101196289, "learning_rate": 5.829249616265167e-06, "loss": 0.5378, "mean_token_accuracy": 0.8307598143815994, "num_tokens": 73068665.0, "step": 60760 }, { "entropy": 1.8038654774427414, "epoch": 0.1883816829269994, "grad_norm": 9.37712287902832, "learning_rate": 5.828769972906891e-06, "loss": 0.4427, "mean_token_accuracy": 0.8444581896066665, "num_tokens": 73081347.0, "step": 60770 }, { "entropy": 1.8795530632138253, "epoch": 0.18841268205204909, "grad_norm": 4.818871021270752, "learning_rate": 5.828290447927437e-06, "loss": 0.5366, "mean_token_accuracy": 0.8303244650363922, "num_tokens": 73093545.0, "step": 60780 }, { "entropy": 1.8115891009569167, "epoch": 0.18844368117709878, "grad_norm": 8.035147666931152, "learning_rate": 5.827811041278115e-06, "loss": 0.426, "mean_token_accuracy": 0.8465625256299972, "num_tokens": 73106680.0, "step": 60790 }, { "entropy": 1.9304178357124329, "epoch": 0.18847468030214848, "grad_norm": 9.54056167602539, "learning_rate": 5.827331752910272e-06, "loss": 0.5425, "mean_token_accuracy": 0.839380769431591, "num_tokens": 73117528.0, "step": 60800 }, { "entropy": 1.9495259881019593, "epoch": 0.18850567942719818, "grad_norm": 9.372381210327148, "learning_rate": 5.826852582775273e-06, "loss": 0.6048, "mean_token_accuracy": 0.8166373401880265, "num_tokens": 73128356.0, "step": 60810 }, { "entropy": 1.8412368908524512, "epoch": 0.18853667855224787, "grad_norm": 7.743039608001709, "learning_rate": 5.826373530824517e-06, "loss": 0.4486, "mean_token_accuracy": 0.8510013222694397, "num_tokens": 73140590.0, "step": 60820 }, { "entropy": 1.7964115411043167, "epoch": 0.18856767767729757, "grad_norm": 8.170472145080566, "learning_rate": 5.825894597009432e-06, "loss": 0.468, "mean_token_accuracy": 0.8488139227032662, "num_tokens": 73153546.0, "step": 60830 }, { "entropy": 1.755212776362896, "epoch": 0.18859867680234726, "grad_norm": 7.8742451667785645, "learning_rate": 5.825415781281467e-06, "loss": 0.4312, "mean_token_accuracy": 0.8522952437400818, "num_tokens": 73166595.0, "step": 60840 }, { "entropy": 1.8175699785351753, "epoch": 0.18862967592739696, "grad_norm": 8.804338455200195, "learning_rate": 5.824937083592109e-06, "loss": 0.499, "mean_token_accuracy": 0.8330580577254295, "num_tokens": 73178927.0, "step": 60850 }, { "entropy": 1.8207954451441766, "epoch": 0.18866067505244666, "grad_norm": 8.167973518371582, "learning_rate": 5.824458503892864e-06, "loss": 0.4969, "mean_token_accuracy": 0.8418833702802658, "num_tokens": 73190925.0, "step": 60860 }, { "entropy": 1.8893669456243516, "epoch": 0.18869167417749633, "grad_norm": 8.711833953857422, "learning_rate": 5.82398004213527e-06, "loss": 0.5558, "mean_token_accuracy": 0.8303925588726997, "num_tokens": 73202125.0, "step": 60870 }, { "entropy": 1.8512268111109733, "epoch": 0.18872267330254602, "grad_norm": 7.86924409866333, "learning_rate": 5.823501698270892e-06, "loss": 0.5392, "mean_token_accuracy": 0.8379854589700699, "num_tokens": 73214492.0, "step": 60880 }, { "entropy": 1.8381731614470482, "epoch": 0.18875367242759572, "grad_norm": 6.331354141235352, "learning_rate": 5.8230234722513236e-06, "loss": 0.518, "mean_token_accuracy": 0.8383203312754631, "num_tokens": 73226919.0, "step": 60890 }, { "entropy": 1.8610471919178964, "epoch": 0.18878467155264542, "grad_norm": 8.457162857055664, "learning_rate": 5.822545364028186e-06, "loss": 0.4755, "mean_token_accuracy": 0.8471423760056496, "num_tokens": 73238963.0, "step": 60900 }, { "entropy": 1.8293138653039933, "epoch": 0.1888156706776951, "grad_norm": 4.284148216247559, "learning_rate": 5.822067373553127e-06, "loss": 0.4498, "mean_token_accuracy": 0.8517989009618759, "num_tokens": 73251832.0, "step": 60910 }, { "entropy": 1.856334713101387, "epoch": 0.1888466698027448, "grad_norm": 8.815077781677246, "learning_rate": 5.8215895007778235e-06, "loss": 0.5533, "mean_token_accuracy": 0.8325720369815827, "num_tokens": 73264796.0, "step": 60920 }, { "entropy": 1.6921759322285652, "epoch": 0.1888776689277945, "grad_norm": 7.586294174194336, "learning_rate": 5.821111745653979e-06, "loss": 0.4335, "mean_token_accuracy": 0.8488885432481765, "num_tokens": 73279601.0, "step": 60930 }, { "entropy": 1.9192057803273201, "epoch": 0.1889086680528442, "grad_norm": 8.917458534240723, "learning_rate": 5.820634108133325e-06, "loss": 0.5522, "mean_token_accuracy": 0.825607393682003, "num_tokens": 73291372.0, "step": 60940 }, { "entropy": 1.8995835900306701, "epoch": 0.1889396671778939, "grad_norm": 9.669869422912598, "learning_rate": 5.820156588167624e-06, "loss": 0.5177, "mean_token_accuracy": 0.8387888163328171, "num_tokens": 73302760.0, "step": 60950 }, { "entropy": 1.9761246114969253, "epoch": 0.1889706663029436, "grad_norm": 9.00790786743164, "learning_rate": 5.819679185708661e-06, "loss": 0.5676, "mean_token_accuracy": 0.8249676078557968, "num_tokens": 73313701.0, "step": 60960 }, { "entropy": 1.9618436962366104, "epoch": 0.1890016654279933, "grad_norm": 9.204923629760742, "learning_rate": 5.8192019007082515e-06, "loss": 0.5978, "mean_token_accuracy": 0.8198722168803215, "num_tokens": 73324576.0, "step": 60970 }, { "entropy": 1.8996351152658462, "epoch": 0.189032664553043, "grad_norm": 8.046465873718262, "learning_rate": 5.818724733118237e-06, "loss": 0.5235, "mean_token_accuracy": 0.8394160747528077, "num_tokens": 73335871.0, "step": 60980 }, { "entropy": 1.8664459705352783, "epoch": 0.18906366367809269, "grad_norm": 9.74695873260498, "learning_rate": 5.8182476828904896e-06, "loss": 0.5478, "mean_token_accuracy": 0.8346919432282448, "num_tokens": 73347256.0, "step": 60990 }, { "entropy": 1.9113014698028565, "epoch": 0.18909466280314238, "grad_norm": 9.092202186584473, "learning_rate": 5.817770749976909e-06, "loss": 0.5356, "mean_token_accuracy": 0.8376618474721909, "num_tokens": 73358623.0, "step": 61000 }, { "entropy": 1.8848463252186776, "epoch": 0.18912566192819208, "grad_norm": 4.147286891937256, "learning_rate": 5.817293934329417e-06, "loss": 0.5323, "mean_token_accuracy": 0.8313426792621612, "num_tokens": 73370417.0, "step": 61010 }, { "entropy": 1.9214195176959037, "epoch": 0.18915666105324178, "grad_norm": 7.7611403465271, "learning_rate": 5.816817235899967e-06, "loss": 0.5371, "mean_token_accuracy": 0.83161641061306, "num_tokens": 73382133.0, "step": 61020 }, { "entropy": 1.9057343572378158, "epoch": 0.18918766017829147, "grad_norm": 9.498258590698242, "learning_rate": 5.816340654640542e-06, "loss": 0.5504, "mean_token_accuracy": 0.8384771049022675, "num_tokens": 73393440.0, "step": 61030 }, { "entropy": 1.834838719666004, "epoch": 0.18921865930334117, "grad_norm": 4.797052383422852, "learning_rate": 5.815864190503149e-06, "loss": 0.4313, "mean_token_accuracy": 0.8543123677372932, "num_tokens": 73406284.0, "step": 61040 }, { "entropy": 1.8589958116412162, "epoch": 0.18924965842839087, "grad_norm": 8.2996826171875, "learning_rate": 5.815387843439824e-06, "loss": 0.5076, "mean_token_accuracy": 0.8410072878003121, "num_tokens": 73419184.0, "step": 61050 }, { "entropy": 1.9609655499458314, "epoch": 0.18928065755344056, "grad_norm": 8.885133743286133, "learning_rate": 5.814911613402629e-06, "loss": 0.5731, "mean_token_accuracy": 0.8247525811195373, "num_tokens": 73430755.0, "step": 61060 }, { "entropy": 1.8497852653265, "epoch": 0.18931165667849026, "grad_norm": 8.99361801147461, "learning_rate": 5.814435500343657e-06, "loss": 0.467, "mean_token_accuracy": 0.8407298862934113, "num_tokens": 73443198.0, "step": 61070 }, { "entropy": 1.920815998315811, "epoch": 0.18934265580353996, "grad_norm": 8.852625846862793, "learning_rate": 5.813959504215025e-06, "loss": 0.5761, "mean_token_accuracy": 0.8310668468475342, "num_tokens": 73453909.0, "step": 61080 }, { "entropy": 1.8594837307929992, "epoch": 0.18937365492858965, "grad_norm": 5.037121772766113, "learning_rate": 5.813483624968877e-06, "loss": 0.5775, "mean_token_accuracy": 0.82839694917202, "num_tokens": 73466187.0, "step": 61090 }, { "entropy": 1.816783571243286, "epoch": 0.18940465405363935, "grad_norm": 2.801055669784546, "learning_rate": 5.813007862557388e-06, "loss": 0.4086, "mean_token_accuracy": 0.8508325532078743, "num_tokens": 73479306.0, "step": 61100 }, { "entropy": 1.9225141122937202, "epoch": 0.18943565317868905, "grad_norm": 8.705913543701172, "learning_rate": 5.812532216932759e-06, "loss": 0.6119, "mean_token_accuracy": 0.8328218296170234, "num_tokens": 73491113.0, "step": 61110 }, { "entropy": 1.8322004929184914, "epoch": 0.18946665230373871, "grad_norm": 3.7541184425354004, "learning_rate": 5.8120566880472155e-06, "loss": 0.4586, "mean_token_accuracy": 0.848419138789177, "num_tokens": 73503751.0, "step": 61120 }, { "entropy": 1.910209448635578, "epoch": 0.1894976514287884, "grad_norm": 9.48783016204834, "learning_rate": 5.811581275853014e-06, "loss": 0.5389, "mean_token_accuracy": 0.8345631062984467, "num_tokens": 73515104.0, "step": 61130 }, { "entropy": 1.8471285864710807, "epoch": 0.1895286505538381, "grad_norm": 3.989353656768799, "learning_rate": 5.811105980302438e-06, "loss": 0.497, "mean_token_accuracy": 0.8420290291309357, "num_tokens": 73527180.0, "step": 61140 }, { "entropy": 1.8412006407976151, "epoch": 0.1895596496788878, "grad_norm": 9.421629905700684, "learning_rate": 5.810630801347794e-06, "loss": 0.5035, "mean_token_accuracy": 0.843493039906025, "num_tokens": 73540379.0, "step": 61150 }, { "entropy": 1.89210002720356, "epoch": 0.1895906488039375, "grad_norm": 4.73581075668335, "learning_rate": 5.810155738941422e-06, "loss": 0.4675, "mean_token_accuracy": 0.8357082083821297, "num_tokens": 73552901.0, "step": 61160 }, { "entropy": 1.899315556883812, "epoch": 0.1896216479289872, "grad_norm": 7.832334518432617, "learning_rate": 5.809680793035686e-06, "loss": 0.4787, "mean_token_accuracy": 0.8473002672195434, "num_tokens": 73564938.0, "step": 61170 }, { "entropy": 1.9637476325035095, "epoch": 0.1896526470540369, "grad_norm": 10.968403816223145, "learning_rate": 5.8092059635829754e-06, "loss": 0.5948, "mean_token_accuracy": 0.8195749655365944, "num_tokens": 73576160.0, "step": 61180 }, { "entropy": 1.9289003670215608, "epoch": 0.1896836461790866, "grad_norm": 9.405878067016602, "learning_rate": 5.8087312505357115e-06, "loss": 0.5501, "mean_token_accuracy": 0.8394549310207366, "num_tokens": 73588394.0, "step": 61190 }, { "entropy": 1.9212181255221368, "epoch": 0.1897146453041363, "grad_norm": 9.955124855041504, "learning_rate": 5.80825665384634e-06, "loss": 0.5592, "mean_token_accuracy": 0.8281317040324211, "num_tokens": 73600253.0, "step": 61200 }, { "entropy": 1.8745829716324807, "epoch": 0.18974564442918598, "grad_norm": 8.92733097076416, "learning_rate": 5.807782173467334e-06, "loss": 0.4997, "mean_token_accuracy": 0.8472078263759613, "num_tokens": 73613086.0, "step": 61210 }, { "entropy": 1.931848169863224, "epoch": 0.18977664355423568, "grad_norm": 7.844409465789795, "learning_rate": 5.807307809351192e-06, "loss": 0.5246, "mean_token_accuracy": 0.8336080491542817, "num_tokens": 73624754.0, "step": 61220 }, { "entropy": 1.915302050113678, "epoch": 0.18980764267928538, "grad_norm": 7.905981540679932, "learning_rate": 5.806833561450444e-06, "loss": 0.5309, "mean_token_accuracy": 0.8398157224059105, "num_tokens": 73636264.0, "step": 61230 }, { "entropy": 1.980039805173874, "epoch": 0.18983864180433507, "grad_norm": 7.911728858947754, "learning_rate": 5.806359429717643e-06, "loss": 0.5563, "mean_token_accuracy": 0.8273883283138275, "num_tokens": 73647130.0, "step": 61240 }, { "entropy": 1.9102006241679192, "epoch": 0.18986964092938477, "grad_norm": 8.068841934204102, "learning_rate": 5.805885414105372e-06, "loss": 0.5615, "mean_token_accuracy": 0.8346290215849876, "num_tokens": 73658915.0, "step": 61250 }, { "entropy": 1.8921155214309693, "epoch": 0.18990064005443447, "grad_norm": 9.479058265686035, "learning_rate": 5.805411514566239e-06, "loss": 0.5147, "mean_token_accuracy": 0.8326509952545166, "num_tokens": 73670760.0, "step": 61260 }, { "entropy": 1.8476255759596825, "epoch": 0.18993163917948416, "grad_norm": 3.8252689838409424, "learning_rate": 5.804937731052881e-06, "loss": 0.4446, "mean_token_accuracy": 0.8441173121333122, "num_tokens": 73683779.0, "step": 61270 }, { "entropy": 1.8536283493041992, "epoch": 0.18996263830453386, "grad_norm": 9.392457008361816, "learning_rate": 5.80446406351796e-06, "loss": 0.446, "mean_token_accuracy": 0.8531845271587372, "num_tokens": 73695924.0, "step": 61280 }, { "entropy": 2.0113365948200226, "epoch": 0.18999363742958356, "grad_norm": 9.121764183044434, "learning_rate": 5.803990511914166e-06, "loss": 0.6117, "mean_token_accuracy": 0.825063693523407, "num_tokens": 73706995.0, "step": 61290 }, { "entropy": 1.8952318519353866, "epoch": 0.19002463655463325, "grad_norm": 8.177303314208984, "learning_rate": 5.803517076194217e-06, "loss": 0.5076, "mean_token_accuracy": 0.8383290365338325, "num_tokens": 73718874.0, "step": 61300 }, { "entropy": 1.9754302859306336, "epoch": 0.19005563567968295, "grad_norm": 9.628982543945312, "learning_rate": 5.803043756310858e-06, "loss": 0.6201, "mean_token_accuracy": 0.8155344128608704, "num_tokens": 73729826.0, "step": 61310 }, { "entropy": 1.8714706540107726, "epoch": 0.19008663480473265, "grad_norm": 8.074376106262207, "learning_rate": 5.802570552216857e-06, "loss": 0.4571, "mean_token_accuracy": 0.8505250707268714, "num_tokens": 73741893.0, "step": 61320 }, { "entropy": 1.9628629803657531, "epoch": 0.19011763392978234, "grad_norm": 4.110509872436523, "learning_rate": 5.802097463865013e-06, "loss": 0.5459, "mean_token_accuracy": 0.8337266921997071, "num_tokens": 73753334.0, "step": 61330 }, { "entropy": 1.9020420521497727, "epoch": 0.19014863305483204, "grad_norm": 8.163873672485352, "learning_rate": 5.801624491208153e-06, "loss": 0.4906, "mean_token_accuracy": 0.8378794327378273, "num_tokens": 73765817.0, "step": 61340 }, { "entropy": 1.8595923513174057, "epoch": 0.19017963217988174, "grad_norm": 9.472771644592285, "learning_rate": 5.8011516341991266e-06, "loss": 0.4882, "mean_token_accuracy": 0.8379010885953904, "num_tokens": 73778450.0, "step": 61350 }, { "entropy": 1.9552810430526733, "epoch": 0.1902106313049314, "grad_norm": 9.82465648651123, "learning_rate": 5.800678892790814e-06, "loss": 0.531, "mean_token_accuracy": 0.8336144149303436, "num_tokens": 73789111.0, "step": 61360 }, { "entropy": 1.8182827576994895, "epoch": 0.1902416304299811, "grad_norm": 7.453169822692871, "learning_rate": 5.80020626693612e-06, "loss": 0.4876, "mean_token_accuracy": 0.8479637071490288, "num_tokens": 73801615.0, "step": 61370 }, { "entropy": 1.791535222530365, "epoch": 0.1902726295550308, "grad_norm": 3.9200282096862793, "learning_rate": 5.799733756587978e-06, "loss": 0.4514, "mean_token_accuracy": 0.8588753417134285, "num_tokens": 73815428.0, "step": 61380 }, { "entropy": 1.8905212104320526, "epoch": 0.1903036286800805, "grad_norm": 6.597655773162842, "learning_rate": 5.799261361699344e-06, "loss": 0.4899, "mean_token_accuracy": 0.8383428543806076, "num_tokens": 73826951.0, "step": 61390 }, { "entropy": 1.8670665681362153, "epoch": 0.1903346278051302, "grad_norm": 8.293522834777832, "learning_rate": 5.798789082223209e-06, "loss": 0.4997, "mean_token_accuracy": 0.8391018345952034, "num_tokens": 73839103.0, "step": 61400 }, { "entropy": 1.7990123346447944, "epoch": 0.1903656269301799, "grad_norm": 2.1400680541992188, "learning_rate": 5.798316918112582e-06, "loss": 0.4596, "mean_token_accuracy": 0.8424833819270134, "num_tokens": 73852403.0, "step": 61410 }, { "entropy": 1.777857731282711, "epoch": 0.19039662605522958, "grad_norm": 4.169025897979736, "learning_rate": 5.797844869320504e-06, "loss": 0.4243, "mean_token_accuracy": 0.8537265419960022, "num_tokens": 73866168.0, "step": 61420 }, { "entropy": 1.9535469591617585, "epoch": 0.19042762518027928, "grad_norm": 9.52925968170166, "learning_rate": 5.797372935800044e-06, "loss": 0.5682, "mean_token_accuracy": 0.8247589409351349, "num_tokens": 73876934.0, "step": 61430 }, { "entropy": 1.8025505855679511, "epoch": 0.19045862430532898, "grad_norm": 3.67460036277771, "learning_rate": 5.796901117504291e-06, "loss": 0.4403, "mean_token_accuracy": 0.8523954421281814, "num_tokens": 73889905.0, "step": 61440 }, { "entropy": 1.879899947345257, "epoch": 0.19048962343037867, "grad_norm": 8.983809471130371, "learning_rate": 5.796429414386368e-06, "loss": 0.5415, "mean_token_accuracy": 0.823103578388691, "num_tokens": 73901347.0, "step": 61450 }, { "entropy": 1.9289403408765793, "epoch": 0.19052062255542837, "grad_norm": 8.888519287109375, "learning_rate": 5.7959578263994186e-06, "loss": 0.5555, "mean_token_accuracy": 0.8268095463514328, "num_tokens": 73912721.0, "step": 61460 }, { "entropy": 1.7729464322328568, "epoch": 0.19055162168047807, "grad_norm": 9.862566947937012, "learning_rate": 5.79548635349662e-06, "loss": 0.4513, "mean_token_accuracy": 0.8498085349798202, "num_tokens": 73926277.0, "step": 61470 }, { "entropy": 1.8908159032464027, "epoch": 0.19058262080552776, "grad_norm": 9.228188514709473, "learning_rate": 5.795014995631168e-06, "loss": 0.5077, "mean_token_accuracy": 0.8412437319755555, "num_tokens": 73937675.0, "step": 61480 }, { "entropy": 1.8851223319768906, "epoch": 0.19061361993057746, "grad_norm": 8.568156242370605, "learning_rate": 5.794543752756292e-06, "loss": 0.5504, "mean_token_accuracy": 0.82836285084486, "num_tokens": 73949212.0, "step": 61490 }, { "entropy": 1.9255726218223572, "epoch": 0.19064461905562716, "grad_norm": 7.397071838378906, "learning_rate": 5.794072624825245e-06, "loss": 0.5881, "mean_token_accuracy": 0.8346537619829177, "num_tokens": 73960504.0, "step": 61500 }, { "entropy": 1.9002389639616013, "epoch": 0.19067561818067685, "grad_norm": 9.786722183227539, "learning_rate": 5.793601611791305e-06, "loss": 0.4892, "mean_token_accuracy": 0.8421397104859352, "num_tokens": 73972536.0, "step": 61510 }, { "entropy": 1.7949311509728432, "epoch": 0.19070661730572655, "grad_norm": 9.29088306427002, "learning_rate": 5.7931307136077804e-06, "loss": 0.438, "mean_token_accuracy": 0.8478746846318245, "num_tokens": 73985554.0, "step": 61520 }, { "entropy": 1.8513918846845627, "epoch": 0.19073761643077625, "grad_norm": 7.442810535430908, "learning_rate": 5.792659930228004e-06, "loss": 0.5175, "mean_token_accuracy": 0.829763513803482, "num_tokens": 73997021.0, "step": 61530 }, { "entropy": 1.8692805111408233, "epoch": 0.19076861555582594, "grad_norm": 6.96608304977417, "learning_rate": 5.792189261605333e-06, "loss": 0.4901, "mean_token_accuracy": 0.8437046900391578, "num_tokens": 74009028.0, "step": 61540 }, { "entropy": 1.9384054720401764, "epoch": 0.19079961468087564, "grad_norm": 8.266419410705566, "learning_rate": 5.791718707693156e-06, "loss": 0.5769, "mean_token_accuracy": 0.830648484826088, "num_tokens": 74020561.0, "step": 61550 }, { "entropy": 1.869347333908081, "epoch": 0.19083061380592534, "grad_norm": 4.296153545379639, "learning_rate": 5.7912482684448845e-06, "loss": 0.5328, "mean_token_accuracy": 0.8308837920427322, "num_tokens": 74032486.0, "step": 61560 }, { "entropy": 1.8627609625458716, "epoch": 0.19086161293097503, "grad_norm": 8.509102821350098, "learning_rate": 5.790777943813958e-06, "loss": 0.5362, "mean_token_accuracy": 0.8389568880200386, "num_tokens": 74044672.0, "step": 61570 }, { "entropy": 1.8437671825289725, "epoch": 0.19089261205602473, "grad_norm": 8.853963851928711, "learning_rate": 5.79030773375384e-06, "loss": 0.5127, "mean_token_accuracy": 0.8353106841444969, "num_tokens": 74057007.0, "step": 61580 }, { "entropy": 1.9143078982830048, "epoch": 0.19092361118107443, "grad_norm": 7.705349445343018, "learning_rate": 5.789837638218024e-06, "loss": 0.52, "mean_token_accuracy": 0.8365962967276573, "num_tokens": 74068293.0, "step": 61590 }, { "entropy": 1.8211023643612863, "epoch": 0.19095461030612412, "grad_norm": 8.679984092712402, "learning_rate": 5.78936765716003e-06, "loss": 0.4909, "mean_token_accuracy": 0.8368862703442573, "num_tokens": 74080738.0, "step": 61600 }, { "entropy": 1.9360493808984756, "epoch": 0.1909856094311738, "grad_norm": 8.391457557678223, "learning_rate": 5.788897790533401e-06, "loss": 0.583, "mean_token_accuracy": 0.8265061900019646, "num_tokens": 74091611.0, "step": 61610 }, { "entropy": 1.8157276138663292, "epoch": 0.1910166085562235, "grad_norm": 8.302699089050293, "learning_rate": 5.788428038291707e-06, "loss": 0.4698, "mean_token_accuracy": 0.8514835327863693, "num_tokens": 74104091.0, "step": 61620 }, { "entropy": 1.9097129538655282, "epoch": 0.19104760768127318, "grad_norm": 10.248120307922363, "learning_rate": 5.787958400388546e-06, "loss": 0.5394, "mean_token_accuracy": 0.8318827226758003, "num_tokens": 74115663.0, "step": 61630 }, { "entropy": 1.805907167494297, "epoch": 0.19107860680632288, "grad_norm": 9.584233283996582, "learning_rate": 5.787488876777544e-06, "loss": 0.4833, "mean_token_accuracy": 0.8393665820360183, "num_tokens": 74127987.0, "step": 61640 }, { "entropy": 1.772979559749365, "epoch": 0.19110960593137258, "grad_norm": 7.392131805419922, "learning_rate": 5.78701946741235e-06, "loss": 0.4289, "mean_token_accuracy": 0.8417062953114509, "num_tokens": 74141723.0, "step": 61650 }, { "entropy": 1.9116898834705354, "epoch": 0.19114060505642227, "grad_norm": 9.686699867248535, "learning_rate": 5.786550172246639e-06, "loss": 0.5491, "mean_token_accuracy": 0.8334579989314079, "num_tokens": 74152615.0, "step": 61660 }, { "entropy": 1.8346798852086068, "epoch": 0.19117160418147197, "grad_norm": 7.035947799682617, "learning_rate": 5.786080991234115e-06, "loss": 0.49, "mean_token_accuracy": 0.8480042293667793, "num_tokens": 74164838.0, "step": 61670 }, { "entropy": 1.822737891972065, "epoch": 0.19120260330652167, "grad_norm": 9.475711822509766, "learning_rate": 5.785611924328507e-06, "loss": 0.4779, "mean_token_accuracy": 0.8468611136078834, "num_tokens": 74177191.0, "step": 61680 }, { "entropy": 1.8375527679920196, "epoch": 0.19123360243157136, "grad_norm": 6.906650543212891, "learning_rate": 5.785142971483572e-06, "loss": 0.4749, "mean_token_accuracy": 0.8478382468223572, "num_tokens": 74189816.0, "step": 61690 }, { "entropy": 1.7865667924284936, "epoch": 0.19126460155662106, "grad_norm": 4.482153415679932, "learning_rate": 5.784674132653088e-06, "loss": 0.4917, "mean_token_accuracy": 0.8439031273126603, "num_tokens": 74202820.0, "step": 61700 }, { "entropy": 1.8899090319871903, "epoch": 0.19129560068167076, "grad_norm": 8.343757629394531, "learning_rate": 5.784205407790866e-06, "loss": 0.481, "mean_token_accuracy": 0.8452587991952896, "num_tokens": 74214375.0, "step": 61710 }, { "entropy": 1.83342125415802, "epoch": 0.19132659980672045, "grad_norm": 8.29771614074707, "learning_rate": 5.783736796850737e-06, "loss": 0.5425, "mean_token_accuracy": 0.8335444629192352, "num_tokens": 74227751.0, "step": 61720 }, { "entropy": 1.961284738779068, "epoch": 0.19135759893177015, "grad_norm": 9.149755477905273, "learning_rate": 5.783268299786564e-06, "loss": 0.544, "mean_token_accuracy": 0.833545659482479, "num_tokens": 74238590.0, "step": 61730 }, { "entropy": 1.8501825496554374, "epoch": 0.19138859805681985, "grad_norm": 10.697173118591309, "learning_rate": 5.782799916552232e-06, "loss": 0.5199, "mean_token_accuracy": 0.8350521355867386, "num_tokens": 74250667.0, "step": 61740 }, { "entropy": 1.9134044259786607, "epoch": 0.19141959718186954, "grad_norm": 6.616395950317383, "learning_rate": 5.782331647101653e-06, "loss": 0.5406, "mean_token_accuracy": 0.8395699933171272, "num_tokens": 74261958.0, "step": 61750 }, { "entropy": 1.9204273253679276, "epoch": 0.19145059630691924, "grad_norm": 10.518773078918457, "learning_rate": 5.781863491388767e-06, "loss": 0.5411, "mean_token_accuracy": 0.8439541757106781, "num_tokens": 74272746.0, "step": 61760 }, { "entropy": 1.8314485549926758, "epoch": 0.19148159543196894, "grad_norm": 8.12490463256836, "learning_rate": 5.781395449367536e-06, "loss": 0.4703, "mean_token_accuracy": 0.8424828752875329, "num_tokens": 74285787.0, "step": 61770 }, { "entropy": 1.894325715303421, "epoch": 0.19151259455701863, "grad_norm": 10.066198348999023, "learning_rate": 5.780927520991953e-06, "loss": 0.5017, "mean_token_accuracy": 0.8336468815803528, "num_tokens": 74298311.0, "step": 61780 }, { "entropy": 1.9529366761445999, "epoch": 0.19154359368206833, "grad_norm": 9.384862899780273, "learning_rate": 5.780459706216036e-06, "loss": 0.5579, "mean_token_accuracy": 0.8310287207365036, "num_tokens": 74309228.0, "step": 61790 }, { "entropy": 1.962406338751316, "epoch": 0.19157459280711803, "grad_norm": 7.662502288818359, "learning_rate": 5.779992004993824e-06, "loss": 0.5665, "mean_token_accuracy": 0.8225008681416511, "num_tokens": 74320341.0, "step": 61800 }, { "entropy": 1.9238436996936799, "epoch": 0.19160559193216772, "grad_norm": 9.47153091430664, "learning_rate": 5.779524417279388e-06, "loss": 0.526, "mean_token_accuracy": 0.836972689628601, "num_tokens": 74332180.0, "step": 61810 }, { "entropy": 1.933986322581768, "epoch": 0.19163659105721742, "grad_norm": 7.185315132141113, "learning_rate": 5.779056943026824e-06, "loss": 0.4926, "mean_token_accuracy": 0.8494284927845002, "num_tokens": 74343712.0, "step": 61820 }, { "entropy": 1.920379176735878, "epoch": 0.19166759018226712, "grad_norm": 4.159258842468262, "learning_rate": 5.778589582190252e-06, "loss": 0.489, "mean_token_accuracy": 0.836681205034256, "num_tokens": 74356004.0, "step": 61830 }, { "entropy": 1.8799566522240638, "epoch": 0.1916985893073168, "grad_norm": 8.231870651245117, "learning_rate": 5.778122334723817e-06, "loss": 0.5039, "mean_token_accuracy": 0.8473984330892563, "num_tokens": 74368305.0, "step": 61840 }, { "entropy": 1.893815641105175, "epoch": 0.1917295884323665, "grad_norm": 4.571141719818115, "learning_rate": 5.777655200581693e-06, "loss": 0.4869, "mean_token_accuracy": 0.8461870953440667, "num_tokens": 74380533.0, "step": 61850 }, { "entropy": 1.8599497005343437, "epoch": 0.19176058755741618, "grad_norm": 4.433168411254883, "learning_rate": 5.77718817971808e-06, "loss": 0.4654, "mean_token_accuracy": 0.8428743481636047, "num_tokens": 74393564.0, "step": 61860 }, { "entropy": 1.9357849955558777, "epoch": 0.19179158668246588, "grad_norm": 8.343416213989258, "learning_rate": 5.776721272087201e-06, "loss": 0.5484, "mean_token_accuracy": 0.8426366299390793, "num_tokens": 74405577.0, "step": 61870 }, { "entropy": 2.0002988666296004, "epoch": 0.19182258580751557, "grad_norm": 8.28388500213623, "learning_rate": 5.776254477643307e-06, "loss": 0.5856, "mean_token_accuracy": 0.8197537645697593, "num_tokens": 74416662.0, "step": 61880 }, { "entropy": 1.8667000889778138, "epoch": 0.19185358493256527, "grad_norm": 10.070082664489746, "learning_rate": 5.775787796340675e-06, "loss": 0.5541, "mean_token_accuracy": 0.8392659932374954, "num_tokens": 74428842.0, "step": 61890 }, { "entropy": 1.8394625827670097, "epoch": 0.19188458405761497, "grad_norm": 2.4154021739959717, "learning_rate": 5.775321228133606e-06, "loss": 0.4126, "mean_token_accuracy": 0.8550085946917534, "num_tokens": 74441442.0, "step": 61900 }, { "entropy": 1.8814760237932204, "epoch": 0.19191558318266466, "grad_norm": 8.660712242126465, "learning_rate": 5.77485477297643e-06, "loss": 0.486, "mean_token_accuracy": 0.8466302067041397, "num_tokens": 74453829.0, "step": 61910 }, { "entropy": 1.8994119971990586, "epoch": 0.19194658230771436, "grad_norm": 8.826970100402832, "learning_rate": 5.774388430823499e-06, "loss": 0.5172, "mean_token_accuracy": 0.8399457424879074, "num_tokens": 74464456.0, "step": 61920 }, { "entropy": 1.9710954904556275, "epoch": 0.19197758143276406, "grad_norm": 7.8489155769348145, "learning_rate": 5.773922201629193e-06, "loss": 0.5579, "mean_token_accuracy": 0.8248024433851242, "num_tokens": 74475138.0, "step": 61930 }, { "entropy": 1.8401651561260224, "epoch": 0.19200858055781375, "grad_norm": 9.654853820800781, "learning_rate": 5.7734560853479185e-06, "loss": 0.4738, "mean_token_accuracy": 0.8436680495738983, "num_tokens": 74488049.0, "step": 61940 }, { "entropy": 1.839670716226101, "epoch": 0.19203957968286345, "grad_norm": 8.641029357910156, "learning_rate": 5.772990081934104e-06, "loss": 0.4826, "mean_token_accuracy": 0.8443516567349434, "num_tokens": 74500733.0, "step": 61950 }, { "entropy": 1.8809733077883721, "epoch": 0.19207057880791314, "grad_norm": 7.769062519073486, "learning_rate": 5.772524191342211e-06, "loss": 0.4726, "mean_token_accuracy": 0.8486372217535972, "num_tokens": 74512926.0, "step": 61960 }, { "entropy": 1.8726215928792953, "epoch": 0.19210157793296284, "grad_norm": 9.620684623718262, "learning_rate": 5.77205841352672e-06, "loss": 0.5019, "mean_token_accuracy": 0.8346716165542603, "num_tokens": 74524851.0, "step": 61970 }, { "entropy": 1.8348112776875496, "epoch": 0.19213257705801254, "grad_norm": 9.443613052368164, "learning_rate": 5.771592748442137e-06, "loss": 0.4938, "mean_token_accuracy": 0.8425130292773246, "num_tokens": 74537835.0, "step": 61980 }, { "entropy": 1.871398164331913, "epoch": 0.19216357618306223, "grad_norm": 4.7039875984191895, "learning_rate": 5.771127196042999e-06, "loss": 0.4768, "mean_token_accuracy": 0.8458441883325577, "num_tokens": 74549780.0, "step": 61990 }, { "entropy": 1.8827469125390053, "epoch": 0.19219457530811193, "grad_norm": 10.308496475219727, "learning_rate": 5.770661756283866e-06, "loss": 0.5176, "mean_token_accuracy": 0.8351684272289276, "num_tokens": 74561738.0, "step": 62000 }, { "entropy": 1.9043960571289062, "epoch": 0.19222557443316163, "grad_norm": 7.712822914123535, "learning_rate": 5.7701964291193214e-06, "loss": 0.5082, "mean_token_accuracy": 0.8395509481430053, "num_tokens": 74573112.0, "step": 62010 }, { "entropy": 1.9056378319859504, "epoch": 0.19225657355821132, "grad_norm": 9.439181327819824, "learning_rate": 5.769731214503978e-06, "loss": 0.5181, "mean_token_accuracy": 0.8385317623615265, "num_tokens": 74583987.0, "step": 62020 }, { "entropy": 1.8788242667913437, "epoch": 0.19228757268326102, "grad_norm": 3.4620392322540283, "learning_rate": 5.76926611239247e-06, "loss": 0.4964, "mean_token_accuracy": 0.8428028956055641, "num_tokens": 74595794.0, "step": 62030 }, { "entropy": 1.8820303320884704, "epoch": 0.19231857180831072, "grad_norm": 10.090309143066406, "learning_rate": 5.7688011227394625e-06, "loss": 0.4822, "mean_token_accuracy": 0.8451215922832489, "num_tokens": 74607298.0, "step": 62040 }, { "entropy": 1.886990125477314, "epoch": 0.19234957093336041, "grad_norm": 8.511155128479004, "learning_rate": 5.768336245499641e-06, "loss": 0.5045, "mean_token_accuracy": 0.8422086775302887, "num_tokens": 74619848.0, "step": 62050 }, { "entropy": 1.9199146822094917, "epoch": 0.1923805700584101, "grad_norm": 8.11715316772461, "learning_rate": 5.767871480627723e-06, "loss": 0.5289, "mean_token_accuracy": 0.8385195031762123, "num_tokens": 74631619.0, "step": 62060 }, { "entropy": 1.8858480796217918, "epoch": 0.1924115691834598, "grad_norm": 7.366301536560059, "learning_rate": 5.767406828078441e-06, "loss": 0.489, "mean_token_accuracy": 0.8323479920625687, "num_tokens": 74643673.0, "step": 62070 }, { "entropy": 1.8879368424415588, "epoch": 0.1924425683085095, "grad_norm": 5.003079414367676, "learning_rate": 5.766942287806564e-06, "loss": 0.5219, "mean_token_accuracy": 0.8332347095012664, "num_tokens": 74656285.0, "step": 62080 }, { "entropy": 1.8710455060005189, "epoch": 0.1924735674335592, "grad_norm": 3.315375804901123, "learning_rate": 5.766477859766882e-06, "loss": 0.5307, "mean_token_accuracy": 0.829591977596283, "num_tokens": 74668414.0, "step": 62090 }, { "entropy": 1.8381486520171166, "epoch": 0.19250456655860887, "grad_norm": 7.80744743347168, "learning_rate": 5.766013543914207e-06, "loss": 0.4593, "mean_token_accuracy": 0.8402635097503662, "num_tokens": 74680230.0, "step": 62100 }, { "entropy": 1.9462435692548752, "epoch": 0.19253556568365857, "grad_norm": 8.435437202453613, "learning_rate": 5.7655493402033836e-06, "loss": 0.5916, "mean_token_accuracy": 0.8307736337184906, "num_tokens": 74692003.0, "step": 62110 }, { "entropy": 1.903138768672943, "epoch": 0.19256656480870826, "grad_norm": 8.948243141174316, "learning_rate": 5.7650852485892765e-06, "loss": 0.5137, "mean_token_accuracy": 0.8443427816033363, "num_tokens": 74703411.0, "step": 62120 }, { "entropy": 1.8868961855769157, "epoch": 0.19259756393375796, "grad_norm": 11.374164581298828, "learning_rate": 5.764621269026775e-06, "loss": 0.4984, "mean_token_accuracy": 0.8403722077608109, "num_tokens": 74715521.0, "step": 62130 }, { "entropy": 1.892562797665596, "epoch": 0.19262856305880766, "grad_norm": 7.841557025909424, "learning_rate": 5.764157401470803e-06, "loss": 0.5442, "mean_token_accuracy": 0.8378325685858726, "num_tokens": 74727068.0, "step": 62140 }, { "entropy": 1.7926482424139976, "epoch": 0.19265956218385735, "grad_norm": 3.997764825820923, "learning_rate": 5.763693645876296e-06, "loss": 0.4083, "mean_token_accuracy": 0.8492718085646629, "num_tokens": 74740269.0, "step": 62150 }, { "entropy": 1.866290318965912, "epoch": 0.19269056130890705, "grad_norm": 4.6976494789123535, "learning_rate": 5.763230002198225e-06, "loss": 0.4993, "mean_token_accuracy": 0.8362017720937729, "num_tokens": 74751597.0, "step": 62160 }, { "entropy": 1.8112548857927322, "epoch": 0.19272156043395675, "grad_norm": 8.123526573181152, "learning_rate": 5.762766470391583e-06, "loss": 0.4779, "mean_token_accuracy": 0.8409424394369125, "num_tokens": 74764989.0, "step": 62170 }, { "entropy": 1.753453615307808, "epoch": 0.19275255955900644, "grad_norm": 4.4287848472595215, "learning_rate": 5.762303050411388e-06, "loss": 0.397, "mean_token_accuracy": 0.8451619282364845, "num_tokens": 74778762.0, "step": 62180 }, { "entropy": 1.929175502061844, "epoch": 0.19278355868405614, "grad_norm": 9.11049747467041, "learning_rate": 5.761839742212686e-06, "loss": 0.5584, "mean_token_accuracy": 0.841756422817707, "num_tokens": 74789840.0, "step": 62190 }, { "entropy": 1.9000461488962173, "epoch": 0.19281455780910584, "grad_norm": 4.639410972595215, "learning_rate": 5.761376545750544e-06, "loss": 0.493, "mean_token_accuracy": 0.8386220842599869, "num_tokens": 74801445.0, "step": 62200 }, { "entropy": 1.8612179577350616, "epoch": 0.19284555693415553, "grad_norm": 9.239517211914062, "learning_rate": 5.760913460980057e-06, "loss": 0.4894, "mean_token_accuracy": 0.8468897223472596, "num_tokens": 74813988.0, "step": 62210 }, { "entropy": 1.7820834949612618, "epoch": 0.19287655605920523, "grad_norm": 3.959775447845459, "learning_rate": 5.760450487856346e-06, "loss": 0.4573, "mean_token_accuracy": 0.8533436298370362, "num_tokens": 74827651.0, "step": 62220 }, { "entropy": 1.8692509412765503, "epoch": 0.19290755518425493, "grad_norm": 9.524608612060547, "learning_rate": 5.759987626334555e-06, "loss": 0.498, "mean_token_accuracy": 0.8503947392106056, "num_tokens": 74840095.0, "step": 62230 }, { "entropy": 1.9380140736699105, "epoch": 0.19293855430930462, "grad_norm": 8.502176284790039, "learning_rate": 5.759524876369853e-06, "loss": 0.5736, "mean_token_accuracy": 0.8298733696341515, "num_tokens": 74851878.0, "step": 62240 }, { "entropy": 1.928398758172989, "epoch": 0.19296955343435432, "grad_norm": 9.923539161682129, "learning_rate": 5.759062237917436e-06, "loss": 0.523, "mean_token_accuracy": 0.8408984065055847, "num_tokens": 74862822.0, "step": 62250 }, { "entropy": 1.8122706308960914, "epoch": 0.19300055255940401, "grad_norm": 8.388425827026367, "learning_rate": 5.758599710932528e-06, "loss": 0.4573, "mean_token_accuracy": 0.8447504699230194, "num_tokens": 74875649.0, "step": 62260 }, { "entropy": 1.8911856144666672, "epoch": 0.1930315516844537, "grad_norm": 8.741740226745605, "learning_rate": 5.75813729537037e-06, "loss": 0.5357, "mean_token_accuracy": 0.8428622677922248, "num_tokens": 74887627.0, "step": 62270 }, { "entropy": 1.86591185182333, "epoch": 0.1930625508095034, "grad_norm": 8.541808128356934, "learning_rate": 5.757674991186235e-06, "loss": 0.4936, "mean_token_accuracy": 0.8314554423093796, "num_tokens": 74900605.0, "step": 62280 }, { "entropy": 1.926130199432373, "epoch": 0.1930935499345531, "grad_norm": 7.839807510375977, "learning_rate": 5.75721279833542e-06, "loss": 0.5659, "mean_token_accuracy": 0.8310964539647102, "num_tokens": 74912462.0, "step": 62290 }, { "entropy": 1.8626934379339217, "epoch": 0.1931245490596028, "grad_norm": 11.213129997253418, "learning_rate": 5.756750716773244e-06, "loss": 0.4776, "mean_token_accuracy": 0.8312896370887757, "num_tokens": 74924678.0, "step": 62300 }, { "entropy": 1.9561289519071579, "epoch": 0.1931555481846525, "grad_norm": 9.022578239440918, "learning_rate": 5.7562887464550565e-06, "loss": 0.5684, "mean_token_accuracy": 0.83260398209095, "num_tokens": 74935522.0, "step": 62310 }, { "entropy": 1.935487399995327, "epoch": 0.1931865473097022, "grad_norm": 8.986858367919922, "learning_rate": 5.755826887336227e-06, "loss": 0.5735, "mean_token_accuracy": 0.8362299337983131, "num_tokens": 74947411.0, "step": 62320 }, { "entropy": 1.8337993949651719, "epoch": 0.1932175464347519, "grad_norm": 8.892833709716797, "learning_rate": 5.75536513937215e-06, "loss": 0.5033, "mean_token_accuracy": 0.840664692223072, "num_tokens": 74961058.0, "step": 62330 }, { "entropy": 1.9785827055573464, "epoch": 0.1932485455598016, "grad_norm": 8.412293434143066, "learning_rate": 5.7549035025182494e-06, "loss": 0.5438, "mean_token_accuracy": 0.8272167652845382, "num_tokens": 74973029.0, "step": 62340 }, { "entropy": 1.8726650208234787, "epoch": 0.19327954468485126, "grad_norm": 3.809394598007202, "learning_rate": 5.754441976729972e-06, "loss": 0.4827, "mean_token_accuracy": 0.846262401342392, "num_tokens": 74985904.0, "step": 62350 }, { "entropy": 1.8960716605186463, "epoch": 0.19331054380990095, "grad_norm": 4.894566535949707, "learning_rate": 5.753980561962787e-06, "loss": 0.5005, "mean_token_accuracy": 0.846526712179184, "num_tokens": 74997658.0, "step": 62360 }, { "entropy": 1.9618462428450585, "epoch": 0.19334154293495065, "grad_norm": 7.828151226043701, "learning_rate": 5.753519258172194e-06, "loss": 0.5315, "mean_token_accuracy": 0.8398016721010209, "num_tokens": 75009078.0, "step": 62370 }, { "entropy": 1.8819516450166702, "epoch": 0.19337254206000035, "grad_norm": 8.260143280029297, "learning_rate": 5.753058065313714e-06, "loss": 0.5095, "mean_token_accuracy": 0.8390106439590455, "num_tokens": 75021094.0, "step": 62380 }, { "entropy": 1.8420429840683936, "epoch": 0.19340354118505004, "grad_norm": 7.1242475509643555, "learning_rate": 5.7525969833428895e-06, "loss": 0.5266, "mean_token_accuracy": 0.834058178961277, "num_tokens": 75033633.0, "step": 62390 }, { "entropy": 1.9346123069524765, "epoch": 0.19343454031009974, "grad_norm": 7.526602268218994, "learning_rate": 5.752136012215297e-06, "loss": 0.5595, "mean_token_accuracy": 0.8317095413804054, "num_tokens": 75045262.0, "step": 62400 }, { "entropy": 1.9385161980986596, "epoch": 0.19346553943514944, "grad_norm": 7.26128625869751, "learning_rate": 5.751675151886529e-06, "loss": 0.492, "mean_token_accuracy": 0.8422947779297829, "num_tokens": 75057520.0, "step": 62410 }, { "entropy": 1.8083595961332322, "epoch": 0.19349653856019913, "grad_norm": 8.095551490783691, "learning_rate": 5.751214402312208e-06, "loss": 0.4523, "mean_token_accuracy": 0.8498411178588867, "num_tokens": 75070750.0, "step": 62420 }, { "entropy": 1.8618351459503173, "epoch": 0.19352753768524883, "grad_norm": 9.302799224853516, "learning_rate": 5.750753763447981e-06, "loss": 0.4894, "mean_token_accuracy": 0.8428505852818489, "num_tokens": 75083142.0, "step": 62430 }, { "entropy": 1.874634511768818, "epoch": 0.19355853681029853, "grad_norm": 8.7201566696167, "learning_rate": 5.750293235249518e-06, "loss": 0.4875, "mean_token_accuracy": 0.8446508884429932, "num_tokens": 75094939.0, "step": 62440 }, { "entropy": 1.9367821574211121, "epoch": 0.19358953593534822, "grad_norm": 8.614350318908691, "learning_rate": 5.749832817672515e-06, "loss": 0.5362, "mean_token_accuracy": 0.8392156153917313, "num_tokens": 75106355.0, "step": 62450 }, { "entropy": 1.873591238260269, "epoch": 0.19362053506039792, "grad_norm": 9.169925689697266, "learning_rate": 5.749372510672692e-06, "loss": 0.4925, "mean_token_accuracy": 0.8400036081671715, "num_tokens": 75117711.0, "step": 62460 }, { "entropy": 1.8877021759748458, "epoch": 0.19365153418544762, "grad_norm": 4.023455619812012, "learning_rate": 5.748912314205795e-06, "loss": 0.4643, "mean_token_accuracy": 0.8442784741520881, "num_tokens": 75130177.0, "step": 62470 }, { "entropy": 1.9278222426772118, "epoch": 0.1936825333104973, "grad_norm": 8.48756217956543, "learning_rate": 5.748452228227594e-06, "loss": 0.5062, "mean_token_accuracy": 0.8467386439442635, "num_tokens": 75141376.0, "step": 62480 }, { "entropy": 1.8919538147747517, "epoch": 0.193713532435547, "grad_norm": 8.26032829284668, "learning_rate": 5.7479922526938844e-06, "loss": 0.4982, "mean_token_accuracy": 0.8378899380564689, "num_tokens": 75153872.0, "step": 62490 }, { "entropy": 1.975018060207367, "epoch": 0.1937445315605967, "grad_norm": 12.015787124633789, "learning_rate": 5.747532387560486e-06, "loss": 0.5763, "mean_token_accuracy": 0.8265995383262634, "num_tokens": 75164449.0, "step": 62500 }, { "entropy": 1.9173302993178367, "epoch": 0.1937755306856464, "grad_norm": 9.031746864318848, "learning_rate": 5.747072632783242e-06, "loss": 0.499, "mean_token_accuracy": 0.8385538429021835, "num_tokens": 75176299.0, "step": 62510 }, { "entropy": 1.9593890875577926, "epoch": 0.1938065298106961, "grad_norm": 9.437821388244629, "learning_rate": 5.746612988318023e-06, "loss": 0.5156, "mean_token_accuracy": 0.8430341646075249, "num_tokens": 75187861.0, "step": 62520 }, { "entropy": 1.8832410350441933, "epoch": 0.1938375289357458, "grad_norm": 13.652206420898438, "learning_rate": 5.7461534541207234e-06, "loss": 0.4873, "mean_token_accuracy": 0.8412627220153809, "num_tokens": 75200163.0, "step": 62530 }, { "entropy": 1.9353638172149659, "epoch": 0.1938685280607955, "grad_norm": 7.707444667816162, "learning_rate": 5.745694030147259e-06, "loss": 0.5079, "mean_token_accuracy": 0.839554651081562, "num_tokens": 75211123.0, "step": 62540 }, { "entropy": 1.8284053675830365, "epoch": 0.1938995271858452, "grad_norm": 8.445575714111328, "learning_rate": 5.745234716353575e-06, "loss": 0.4667, "mean_token_accuracy": 0.84172955006361, "num_tokens": 75223852.0, "step": 62550 }, { "entropy": 1.9116384595632554, "epoch": 0.19393052631089489, "grad_norm": 8.664936065673828, "learning_rate": 5.744775512695639e-06, "loss": 0.5437, "mean_token_accuracy": 0.8343887582421303, "num_tokens": 75235614.0, "step": 62560 }, { "entropy": 1.8100930273532867, "epoch": 0.19396152543594458, "grad_norm": 7.181854724884033, "learning_rate": 5.744316419129445e-06, "loss": 0.4493, "mean_token_accuracy": 0.8528478279709816, "num_tokens": 75248586.0, "step": 62570 }, { "entropy": 1.9202060773968697, "epoch": 0.19399252456099428, "grad_norm": 3.0128285884857178, "learning_rate": 5.743857435611008e-06, "loss": 0.5123, "mean_token_accuracy": 0.8501577556133271, "num_tokens": 75259638.0, "step": 62580 }, { "entropy": 1.782825130224228, "epoch": 0.19402352368604397, "grad_norm": 9.068933486938477, "learning_rate": 5.743398562096369e-06, "loss": 0.4205, "mean_token_accuracy": 0.8459088578820229, "num_tokens": 75273019.0, "step": 62590 }, { "entropy": 1.912159901857376, "epoch": 0.19405452281109364, "grad_norm": 8.978429794311523, "learning_rate": 5.742939798541598e-06, "loss": 0.5199, "mean_token_accuracy": 0.8394264042377472, "num_tokens": 75284329.0, "step": 62600 }, { "entropy": 1.9709818661212921, "epoch": 0.19408552193614334, "grad_norm": 8.043622970581055, "learning_rate": 5.742481144902782e-06, "loss": 0.5889, "mean_token_accuracy": 0.8323390766978264, "num_tokens": 75295155.0, "step": 62610 }, { "entropy": 1.8821393936872481, "epoch": 0.19411652106119304, "grad_norm": 7.908102989196777, "learning_rate": 5.742022601136038e-06, "loss": 0.4848, "mean_token_accuracy": 0.8384183317422866, "num_tokens": 75306535.0, "step": 62620 }, { "entropy": 1.8917057454586028, "epoch": 0.19414752018624273, "grad_norm": 4.215620994567871, "learning_rate": 5.741564167197507e-06, "loss": 0.514, "mean_token_accuracy": 0.8402154177427292, "num_tokens": 75318253.0, "step": 62630 }, { "entropy": 1.9241933241486548, "epoch": 0.19417851931129243, "grad_norm": 9.891353607177734, "learning_rate": 5.741105843043353e-06, "loss": 0.5965, "mean_token_accuracy": 0.818701134622097, "num_tokens": 75329353.0, "step": 62640 }, { "entropy": 1.9373375624418259, "epoch": 0.19420951843634213, "grad_norm": 9.279706954956055, "learning_rate": 5.740647628629763e-06, "loss": 0.525, "mean_token_accuracy": 0.8327772691845894, "num_tokens": 75340932.0, "step": 62650 }, { "entropy": 1.8756248265504838, "epoch": 0.19424051756139182, "grad_norm": 5.068475723266602, "learning_rate": 5.740189523912952e-06, "loss": 0.5173, "mean_token_accuracy": 0.8361865937709808, "num_tokens": 75353705.0, "step": 62660 }, { "entropy": 1.9200247406959534, "epoch": 0.19427151668644152, "grad_norm": 8.971196174621582, "learning_rate": 5.7397315288491585e-06, "loss": 0.553, "mean_token_accuracy": 0.8186924889683723, "num_tokens": 75365245.0, "step": 62670 }, { "entropy": 1.87558753490448, "epoch": 0.19430251581149122, "grad_norm": 8.281537055969238, "learning_rate": 5.7392736433946424e-06, "loss": 0.4695, "mean_token_accuracy": 0.8481769561767578, "num_tokens": 75377140.0, "step": 62680 }, { "entropy": 1.904216541349888, "epoch": 0.1943335149365409, "grad_norm": 8.079618453979492, "learning_rate": 5.738815867505695e-06, "loss": 0.5325, "mean_token_accuracy": 0.8357766851782799, "num_tokens": 75389067.0, "step": 62690 }, { "entropy": 1.8466911643743515, "epoch": 0.1943645140615906, "grad_norm": 8.569628715515137, "learning_rate": 5.7383582011386215e-06, "loss": 0.4744, "mean_token_accuracy": 0.8424016252160073, "num_tokens": 75401290.0, "step": 62700 }, { "entropy": 1.8429038256406785, "epoch": 0.1943955131866403, "grad_norm": 4.195886135101318, "learning_rate": 5.737900644249762e-06, "loss": 0.4951, "mean_token_accuracy": 0.8370422974228859, "num_tokens": 75414571.0, "step": 62710 }, { "entropy": 1.870844414830208, "epoch": 0.19442651231169, "grad_norm": 10.276758193969727, "learning_rate": 5.737443196795474e-06, "loss": 0.5647, "mean_token_accuracy": 0.836325615644455, "num_tokens": 75426778.0, "step": 62720 }, { "entropy": 1.8725099056959151, "epoch": 0.1944575114367397, "grad_norm": 7.546666622161865, "learning_rate": 5.736985858732143e-06, "loss": 0.4965, "mean_token_accuracy": 0.8451737076044082, "num_tokens": 75438333.0, "step": 62730 }, { "entropy": 1.9273310750722885, "epoch": 0.1944885105617894, "grad_norm": 9.644882202148438, "learning_rate": 5.736528630016177e-06, "loss": 0.5543, "mean_token_accuracy": 0.8291450396180153, "num_tokens": 75450773.0, "step": 62740 }, { "entropy": 1.9267635852098466, "epoch": 0.1945195096868391, "grad_norm": 3.730067729949951, "learning_rate": 5.7360715106040076e-06, "loss": 0.5217, "mean_token_accuracy": 0.8472494944930077, "num_tokens": 75462100.0, "step": 62750 }, { "entropy": 1.8266283124685287, "epoch": 0.1945505088118888, "grad_norm": 7.616328239440918, "learning_rate": 5.735614500452095e-06, "loss": 0.437, "mean_token_accuracy": 0.8459511280059815, "num_tokens": 75475416.0, "step": 62760 }, { "entropy": 1.9178658738732337, "epoch": 0.19458150793693849, "grad_norm": 9.340888977050781, "learning_rate": 5.7351575995169186e-06, "loss": 0.4936, "mean_token_accuracy": 0.835166348516941, "num_tokens": 75487885.0, "step": 62770 }, { "entropy": 1.8748321622610091, "epoch": 0.19461250706198818, "grad_norm": 3.991840124130249, "learning_rate": 5.734700807754984e-06, "loss": 0.478, "mean_token_accuracy": 0.8418037429451942, "num_tokens": 75499947.0, "step": 62780 }, { "entropy": 1.8018044173717498, "epoch": 0.19464350618703788, "grad_norm": 8.51059341430664, "learning_rate": 5.734244125122822e-06, "loss": 0.4169, "mean_token_accuracy": 0.8524055883288384, "num_tokens": 75513475.0, "step": 62790 }, { "entropy": 1.9655559062957764, "epoch": 0.19467450531208758, "grad_norm": 10.379049301147461, "learning_rate": 5.733787551576987e-06, "loss": 0.5602, "mean_token_accuracy": 0.8376842170953751, "num_tokens": 75524934.0, "step": 62800 }, { "entropy": 1.880786569416523, "epoch": 0.19470550443713727, "grad_norm": 8.091976165771484, "learning_rate": 5.7333310870740565e-06, "loss": 0.4844, "mean_token_accuracy": 0.8456220507621766, "num_tokens": 75537225.0, "step": 62810 }, { "entropy": 1.9510624960064888, "epoch": 0.19473650356218697, "grad_norm": 9.744572639465332, "learning_rate": 5.732874731570633e-06, "loss": 0.4754, "mean_token_accuracy": 0.8450943067669868, "num_tokens": 75548663.0, "step": 62820 }, { "entropy": 1.9875933229923248, "epoch": 0.19476750268723667, "grad_norm": 8.468138694763184, "learning_rate": 5.732418485023345e-06, "loss": 0.5636, "mean_token_accuracy": 0.8282896682620049, "num_tokens": 75559697.0, "step": 62830 }, { "entropy": 1.8670821204781531, "epoch": 0.19479850181228636, "grad_norm": 9.108405113220215, "learning_rate": 5.731962347388841e-06, "loss": 0.5391, "mean_token_accuracy": 0.8396436840295791, "num_tokens": 75571652.0, "step": 62840 }, { "entropy": 1.8874829977750778, "epoch": 0.19482950093733603, "grad_norm": 7.285294055938721, "learning_rate": 5.731506318623798e-06, "loss": 0.5263, "mean_token_accuracy": 0.836583924293518, "num_tokens": 75583725.0, "step": 62850 }, { "entropy": 1.8510995037853717, "epoch": 0.19486050006238573, "grad_norm": 8.491140365600586, "learning_rate": 5.731050398684913e-06, "loss": 0.4372, "mean_token_accuracy": 0.8445952758193016, "num_tokens": 75596812.0, "step": 62860 }, { "entropy": 1.9246593803167342, "epoch": 0.19489149918743542, "grad_norm": 8.031885147094727, "learning_rate": 5.7305945875289126e-06, "loss": 0.5581, "mean_token_accuracy": 0.8263408780097962, "num_tokens": 75609185.0, "step": 62870 }, { "entropy": 1.9077463194727897, "epoch": 0.19492249831248512, "grad_norm": 8.196245193481445, "learning_rate": 5.730138885112542e-06, "loss": 0.4949, "mean_token_accuracy": 0.837520606815815, "num_tokens": 75621376.0, "step": 62880 }, { "entropy": 1.9066799104213714, "epoch": 0.19495349743753482, "grad_norm": 8.017034530639648, "learning_rate": 5.729683291392573e-06, "loss": 0.522, "mean_token_accuracy": 0.8396532908082008, "num_tokens": 75634042.0, "step": 62890 }, { "entropy": 1.8118005082011224, "epoch": 0.19498449656258451, "grad_norm": 7.996087551116943, "learning_rate": 5.7292278063258015e-06, "loss": 0.4121, "mean_token_accuracy": 0.8566027864813804, "num_tokens": 75647509.0, "step": 62900 }, { "entropy": 1.9176707819104195, "epoch": 0.1950154956876342, "grad_norm": 3.844882011413574, "learning_rate": 5.728772429869045e-06, "loss": 0.5002, "mean_token_accuracy": 0.842312179505825, "num_tokens": 75659954.0, "step": 62910 }, { "entropy": 1.8584643453359604, "epoch": 0.1950464948126839, "grad_norm": 4.170616149902344, "learning_rate": 5.72831716197915e-06, "loss": 0.422, "mean_token_accuracy": 0.8405888646841049, "num_tokens": 75672753.0, "step": 62920 }, { "entropy": 1.8920593187212944, "epoch": 0.1950774939377336, "grad_norm": 8.466096878051758, "learning_rate": 5.727862002612982e-06, "loss": 0.4855, "mean_token_accuracy": 0.8370663553476334, "num_tokens": 75685124.0, "step": 62930 }, { "entropy": 1.9039631336927414, "epoch": 0.1951084930627833, "grad_norm": 10.970901489257812, "learning_rate": 5.727406951727435e-06, "loss": 0.5118, "mean_token_accuracy": 0.837016536295414, "num_tokens": 75696871.0, "step": 62940 }, { "entropy": 1.8986446857452393, "epoch": 0.195139492187833, "grad_norm": 8.603690147399902, "learning_rate": 5.726952009279424e-06, "loss": 0.4586, "mean_token_accuracy": 0.845548364520073, "num_tokens": 75708850.0, "step": 62950 }, { "entropy": 1.879063467681408, "epoch": 0.1951704913128827, "grad_norm": 8.833934783935547, "learning_rate": 5.726497175225886e-06, "loss": 0.5206, "mean_token_accuracy": 0.8337208881974221, "num_tokens": 75721064.0, "step": 62960 }, { "entropy": 1.9283981889486312, "epoch": 0.1952014904379324, "grad_norm": 4.0578155517578125, "learning_rate": 5.726042449523786e-06, "loss": 0.5236, "mean_token_accuracy": 0.8367056354880333, "num_tokens": 75732835.0, "step": 62970 }, { "entropy": 1.9276798009872436, "epoch": 0.1952324895629821, "grad_norm": 7.493443489074707, "learning_rate": 5.725587832130112e-06, "loss": 0.515, "mean_token_accuracy": 0.841886368393898, "num_tokens": 75743432.0, "step": 62980 }, { "entropy": 1.7954487293958663, "epoch": 0.19526348868803178, "grad_norm": 8.081300735473633, "learning_rate": 5.725133323001873e-06, "loss": 0.4176, "mean_token_accuracy": 0.8556428179144859, "num_tokens": 75757107.0, "step": 62990 }, { "entropy": 1.8096677586436272, "epoch": 0.19529448781308148, "grad_norm": 9.376117706298828, "learning_rate": 5.724678922096108e-06, "loss": 0.4369, "mean_token_accuracy": 0.8452500954270363, "num_tokens": 75769351.0, "step": 63000 }, { "entropy": 1.8731877133250237, "epoch": 0.19532548693813118, "grad_norm": 9.466216087341309, "learning_rate": 5.724224629369872e-06, "loss": 0.4606, "mean_token_accuracy": 0.8412355482578278, "num_tokens": 75781771.0, "step": 63010 }, { "entropy": 1.8507128790020944, "epoch": 0.19535648606318087, "grad_norm": 10.853748321533203, "learning_rate": 5.72377044478025e-06, "loss": 0.498, "mean_token_accuracy": 0.8409447997808457, "num_tokens": 75794706.0, "step": 63020 }, { "entropy": 1.9449727565050126, "epoch": 0.19538748518823057, "grad_norm": 6.4505085945129395, "learning_rate": 5.723316368284348e-06, "loss": 0.473, "mean_token_accuracy": 0.8548792108893395, "num_tokens": 75805711.0, "step": 63030 }, { "entropy": 1.8775279074907303, "epoch": 0.19541848431328027, "grad_norm": 7.87474250793457, "learning_rate": 5.722862399839298e-06, "loss": 0.4712, "mean_token_accuracy": 0.8526403203606605, "num_tokens": 75817522.0, "step": 63040 }, { "entropy": 1.856291452050209, "epoch": 0.19544948343832996, "grad_norm": 4.191627025604248, "learning_rate": 5.722408539402254e-06, "loss": 0.4899, "mean_token_accuracy": 0.8563273102045059, "num_tokens": 75829497.0, "step": 63050 }, { "entropy": 1.9053906798362732, "epoch": 0.19548048256337966, "grad_norm": 8.759142875671387, "learning_rate": 5.72195478693039e-06, "loss": 0.5129, "mean_token_accuracy": 0.8432946696877479, "num_tokens": 75840759.0, "step": 63060 }, { "entropy": 1.907120332121849, "epoch": 0.19551148168842936, "grad_norm": 9.412018775939941, "learning_rate": 5.721501142380913e-06, "loss": 0.4955, "mean_token_accuracy": 0.8373003304004669, "num_tokens": 75852503.0, "step": 63070 }, { "entropy": 2.0056669265031815, "epoch": 0.19554248081347905, "grad_norm": 7.475878715515137, "learning_rate": 5.721047605711047e-06, "loss": 0.57, "mean_token_accuracy": 0.8297986865043641, "num_tokens": 75863521.0, "step": 63080 }, { "entropy": 1.922423042356968, "epoch": 0.19557347993852872, "grad_norm": 4.381282329559326, "learning_rate": 5.720594176878039e-06, "loss": 0.4947, "mean_token_accuracy": 0.8460247412323951, "num_tokens": 75874691.0, "step": 63090 }, { "entropy": 1.9274744465947151, "epoch": 0.19560447906357842, "grad_norm": 10.508957862854004, "learning_rate": 5.720140855839166e-06, "loss": 0.5335, "mean_token_accuracy": 0.8317728966474534, "num_tokens": 75887760.0, "step": 63100 }, { "entropy": 1.9512497708201408, "epoch": 0.19563547818862811, "grad_norm": 9.120226860046387, "learning_rate": 5.719687642551722e-06, "loss": 0.5086, "mean_token_accuracy": 0.8364842370152473, "num_tokens": 75899765.0, "step": 63110 }, { "entropy": 1.925400537252426, "epoch": 0.1956664773136778, "grad_norm": 5.570336818695068, "learning_rate": 5.719234536973028e-06, "loss": 0.5209, "mean_token_accuracy": 0.8405450001358986, "num_tokens": 75911921.0, "step": 63120 }, { "entropy": 1.836472137272358, "epoch": 0.1956974764387275, "grad_norm": 2.734543561935425, "learning_rate": 5.718781539060429e-06, "loss": 0.4246, "mean_token_accuracy": 0.851534178853035, "num_tokens": 75925915.0, "step": 63130 }, { "entropy": 1.8467722043395043, "epoch": 0.1957284755637772, "grad_norm": 8.128966331481934, "learning_rate": 5.718328648771291e-06, "loss": 0.4669, "mean_token_accuracy": 0.851094801723957, "num_tokens": 75938929.0, "step": 63140 }, { "entropy": 1.9085650324821473, "epoch": 0.1957594746888269, "grad_norm": 8.325636863708496, "learning_rate": 5.717875866063008e-06, "loss": 0.4853, "mean_token_accuracy": 0.8383592694997788, "num_tokens": 75949645.0, "step": 63150 }, { "entropy": 1.8661453172564506, "epoch": 0.1957904738138766, "grad_norm": 4.130495548248291, "learning_rate": 5.717423190892991e-06, "loss": 0.4474, "mean_token_accuracy": 0.8449689701199532, "num_tokens": 75962507.0, "step": 63160 }, { "entropy": 1.9401205703616142, "epoch": 0.1958214729389263, "grad_norm": 10.611771583557129, "learning_rate": 5.716970623218681e-06, "loss": 0.5661, "mean_token_accuracy": 0.8341074407100677, "num_tokens": 75974430.0, "step": 63170 }, { "entropy": 1.834306775033474, "epoch": 0.195852472063976, "grad_norm": 8.28214168548584, "learning_rate": 5.716518162997542e-06, "loss": 0.4521, "mean_token_accuracy": 0.8515095382928848, "num_tokens": 75987170.0, "step": 63180 }, { "entropy": 1.8667117178440094, "epoch": 0.1958834711890257, "grad_norm": 4.622128963470459, "learning_rate": 5.716065810187056e-06, "loss": 0.4767, "mean_token_accuracy": 0.8383040621876716, "num_tokens": 75999364.0, "step": 63190 }, { "entropy": 1.9117874279618263, "epoch": 0.19591447031407538, "grad_norm": 7.746886730194092, "learning_rate": 5.715613564744734e-06, "loss": 0.5304, "mean_token_accuracy": 0.8401778340339661, "num_tokens": 76010533.0, "step": 63200 }, { "entropy": 1.860894750058651, "epoch": 0.19594546943912508, "grad_norm": 3.8454928398132324, "learning_rate": 5.715161426628111e-06, "loss": 0.4673, "mean_token_accuracy": 0.847791762650013, "num_tokens": 76023243.0, "step": 63210 }, { "entropy": 1.8731217697262763, "epoch": 0.19597646856417478, "grad_norm": 4.6532745361328125, "learning_rate": 5.71470939579474e-06, "loss": 0.4952, "mean_token_accuracy": 0.8273180708289146, "num_tokens": 76035128.0, "step": 63220 }, { "entropy": 1.892588709294796, "epoch": 0.19600746768922447, "grad_norm": 10.033417701721191, "learning_rate": 5.714257472202201e-06, "loss": 0.4887, "mean_token_accuracy": 0.8375523954629898, "num_tokens": 76047088.0, "step": 63230 }, { "entropy": 1.8054717764258386, "epoch": 0.19603846681427417, "grad_norm": 4.947404384613037, "learning_rate": 5.7138056558080975e-06, "loss": 0.4124, "mean_token_accuracy": 0.8533313110470772, "num_tokens": 76060229.0, "step": 63240 }, { "entropy": 1.9267044529318809, "epoch": 0.19606946593932387, "grad_norm": 8.534299850463867, "learning_rate": 5.713353946570057e-06, "loss": 0.5239, "mean_token_accuracy": 0.8383896484971046, "num_tokens": 76072124.0, "step": 63250 }, { "entropy": 1.9226173907518387, "epoch": 0.19610046506437356, "grad_norm": 8.385056495666504, "learning_rate": 5.712902344445731e-06, "loss": 0.5193, "mean_token_accuracy": 0.8362686946988106, "num_tokens": 76083932.0, "step": 63260 }, { "entropy": 1.8576195508241653, "epoch": 0.19613146418942326, "grad_norm": 9.054718017578125, "learning_rate": 5.712450849392791e-06, "loss": 0.4886, "mean_token_accuracy": 0.8375715285539627, "num_tokens": 76096111.0, "step": 63270 }, { "entropy": 1.9397896856069565, "epoch": 0.19616246331447296, "grad_norm": 10.121234893798828, "learning_rate": 5.711999461368935e-06, "loss": 0.5309, "mean_token_accuracy": 0.834457078576088, "num_tokens": 76107744.0, "step": 63280 }, { "entropy": 1.8755070850253106, "epoch": 0.19619346243952265, "grad_norm": 9.817523956298828, "learning_rate": 5.711548180331882e-06, "loss": 0.4992, "mean_token_accuracy": 0.8419964432716369, "num_tokens": 76119347.0, "step": 63290 }, { "entropy": 1.915390780568123, "epoch": 0.19622446156457235, "grad_norm": 7.848161697387695, "learning_rate": 5.711097006239378e-06, "loss": 0.56, "mean_token_accuracy": 0.8367161169648171, "num_tokens": 76130270.0, "step": 63300 }, { "entropy": 1.9267538145184517, "epoch": 0.19625546068962205, "grad_norm": 8.433647155761719, "learning_rate": 5.710645939049189e-06, "loss": 0.5233, "mean_token_accuracy": 0.8365942299365997, "num_tokens": 76141625.0, "step": 63310 }, { "entropy": 1.9020789548754693, "epoch": 0.19628645981467174, "grad_norm": 7.910606861114502, "learning_rate": 5.710194978719105e-06, "loss": 0.499, "mean_token_accuracy": 0.838070061802864, "num_tokens": 76153615.0, "step": 63320 }, { "entropy": 1.8399833947420121, "epoch": 0.19631745893972144, "grad_norm": 9.023702621459961, "learning_rate": 5.70974412520694e-06, "loss": 0.4621, "mean_token_accuracy": 0.8374461650848388, "num_tokens": 76166808.0, "step": 63330 }, { "entropy": 1.8543090134859086, "epoch": 0.1963484580647711, "grad_norm": 4.479185581207275, "learning_rate": 5.709293378470532e-06, "loss": 0.5471, "mean_token_accuracy": 0.8332849636673927, "num_tokens": 76180101.0, "step": 63340 }, { "entropy": 1.9013725534081458, "epoch": 0.1963794571898208, "grad_norm": 9.714118003845215, "learning_rate": 5.70884273846774e-06, "loss": 0.5029, "mean_token_accuracy": 0.8363057851791382, "num_tokens": 76191945.0, "step": 63350 }, { "entropy": 1.764454497396946, "epoch": 0.1964104563148705, "grad_norm": 9.242158889770508, "learning_rate": 5.7083922051564485e-06, "loss": 0.4131, "mean_token_accuracy": 0.8551331639289856, "num_tokens": 76205433.0, "step": 63360 }, { "entropy": 1.8029524132609367, "epoch": 0.1964414554399202, "grad_norm": 9.247615814208984, "learning_rate": 5.707941778494567e-06, "loss": 0.4469, "mean_token_accuracy": 0.8530771881341934, "num_tokens": 76218558.0, "step": 63370 }, { "entropy": 1.8929936558008194, "epoch": 0.1964724545649699, "grad_norm": 8.096500396728516, "learning_rate": 5.707491458440021e-06, "loss": 0.538, "mean_token_accuracy": 0.8339035794138908, "num_tokens": 76229934.0, "step": 63380 }, { "entropy": 1.92166768014431, "epoch": 0.1965034536900196, "grad_norm": 5.686055660247803, "learning_rate": 5.707041244950767e-06, "loss": 0.5108, "mean_token_accuracy": 0.8357293531298637, "num_tokens": 76241582.0, "step": 63390 }, { "entropy": 1.9319213822484016, "epoch": 0.1965344528150693, "grad_norm": 9.794820785522461, "learning_rate": 5.706591137984782e-06, "loss": 0.5742, "mean_token_accuracy": 0.8362261816859246, "num_tokens": 76253353.0, "step": 63400 }, { "entropy": 1.8578762322664262, "epoch": 0.19656545194011898, "grad_norm": 7.473123550415039, "learning_rate": 5.706141137500063e-06, "loss": 0.4557, "mean_token_accuracy": 0.8495231464505195, "num_tokens": 76266223.0, "step": 63410 }, { "entropy": 1.897558230161667, "epoch": 0.19659645106516868, "grad_norm": 8.56474494934082, "learning_rate": 5.705691243454636e-06, "loss": 0.4978, "mean_token_accuracy": 0.8442750841379165, "num_tokens": 76278256.0, "step": 63420 }, { "entropy": 1.8700624421238898, "epoch": 0.19662745019021838, "grad_norm": 4.685993194580078, "learning_rate": 5.705241455806546e-06, "loss": 0.5335, "mean_token_accuracy": 0.8440624356269837, "num_tokens": 76290947.0, "step": 63430 }, { "entropy": 1.9605732291936875, "epoch": 0.19665844931526807, "grad_norm": 9.56114387512207, "learning_rate": 5.704791774513863e-06, "loss": 0.4958, "mean_token_accuracy": 0.8460546612739563, "num_tokens": 76302224.0, "step": 63440 }, { "entropy": 1.9243500038981438, "epoch": 0.19668944844031777, "grad_norm": 4.1643195152282715, "learning_rate": 5.704342199534677e-06, "loss": 0.5014, "mean_token_accuracy": 0.8379947647452355, "num_tokens": 76314456.0, "step": 63450 }, { "entropy": 1.9549641892313958, "epoch": 0.19672044756536747, "grad_norm": 10.69619083404541, "learning_rate": 5.703892730827107e-06, "loss": 0.5515, "mean_token_accuracy": 0.8345342621207237, "num_tokens": 76325957.0, "step": 63460 }, { "entropy": 1.9176968559622765, "epoch": 0.19675144669041716, "grad_norm": 12.198366165161133, "learning_rate": 5.703443368349289e-06, "loss": 0.5492, "mean_token_accuracy": 0.8376664772629738, "num_tokens": 76337257.0, "step": 63470 }, { "entropy": 1.8076652333140373, "epoch": 0.19678244581546686, "grad_norm": 9.941756248474121, "learning_rate": 5.7029941120593864e-06, "loss": 0.4809, "mean_token_accuracy": 0.8417710468173027, "num_tokens": 76350461.0, "step": 63480 }, { "entropy": 1.8378579676151277, "epoch": 0.19681344494051656, "grad_norm": 8.623409271240234, "learning_rate": 5.702544961915585e-06, "loss": 0.464, "mean_token_accuracy": 0.8429977357387543, "num_tokens": 76363284.0, "step": 63490 }, { "entropy": 1.9412892490625382, "epoch": 0.19684444406556625, "grad_norm": 10.340338706970215, "learning_rate": 5.702095917876089e-06, "loss": 0.5472, "mean_token_accuracy": 0.8276393041014671, "num_tokens": 76374739.0, "step": 63500 }, { "entropy": 1.9820778489112854, "epoch": 0.19687544319061595, "grad_norm": 9.503996849060059, "learning_rate": 5.701646979899134e-06, "loss": 0.5663, "mean_token_accuracy": 0.8296515002846718, "num_tokens": 76386166.0, "step": 63510 }, { "entropy": 1.8864339843392373, "epoch": 0.19690644231566565, "grad_norm": 8.490914344787598, "learning_rate": 5.70119814794297e-06, "loss": 0.4971, "mean_token_accuracy": 0.8448032855987548, "num_tokens": 76399048.0, "step": 63520 }, { "entropy": 1.9335085853934288, "epoch": 0.19693744144071534, "grad_norm": 11.991941452026367, "learning_rate": 5.700749421965877e-06, "loss": 0.5494, "mean_token_accuracy": 0.8254455342888832, "num_tokens": 76410513.0, "step": 63530 }, { "entropy": 1.9954243630170823, "epoch": 0.19696844056576504, "grad_norm": 9.641355514526367, "learning_rate": 5.700300801926151e-06, "loss": 0.5682, "mean_token_accuracy": 0.820628535747528, "num_tokens": 76422104.0, "step": 63540 }, { "entropy": 1.879958561062813, "epoch": 0.19699943969081474, "grad_norm": 4.384090423583984, "learning_rate": 5.6998522877821185e-06, "loss": 0.5171, "mean_token_accuracy": 0.8393704131245613, "num_tokens": 76434124.0, "step": 63550 }, { "entropy": 1.8802961066365242, "epoch": 0.19703043881586443, "grad_norm": 6.248841762542725, "learning_rate": 5.699403879492125e-06, "loss": 0.5114, "mean_token_accuracy": 0.8394171461462975, "num_tokens": 76446867.0, "step": 63560 }, { "entropy": 1.9431464537978171, "epoch": 0.19706143794091413, "grad_norm": 9.1453857421875, "learning_rate": 5.6989555770145386e-06, "loss": 0.5201, "mean_token_accuracy": 0.8342863008379936, "num_tokens": 76458179.0, "step": 63570 }, { "entropy": 1.8758263066411018, "epoch": 0.19709243706596383, "grad_norm": 8.520499229431152, "learning_rate": 5.69850738030775e-06, "loss": 0.4718, "mean_token_accuracy": 0.8425302848219871, "num_tokens": 76469611.0, "step": 63580 }, { "entropy": 1.9540351748466491, "epoch": 0.1971234361910135, "grad_norm": 8.32021713256836, "learning_rate": 5.698059289330175e-06, "loss": 0.5643, "mean_token_accuracy": 0.8359291762113571, "num_tokens": 76481274.0, "step": 63590 }, { "entropy": 1.8006914272904395, "epoch": 0.1971544353160632, "grad_norm": 4.276872634887695, "learning_rate": 5.69761130404025e-06, "loss": 0.4121, "mean_token_accuracy": 0.8568015277385712, "num_tokens": 76494249.0, "step": 63600 }, { "entropy": 1.9028367415070533, "epoch": 0.1971854344411129, "grad_norm": 9.324864387512207, "learning_rate": 5.6971634243964366e-06, "loss": 0.5104, "mean_token_accuracy": 0.8430081695318222, "num_tokens": 76506213.0, "step": 63610 }, { "entropy": 1.8858824104070664, "epoch": 0.19721643356616259, "grad_norm": 10.004111289978027, "learning_rate": 5.696715650357217e-06, "loss": 0.5222, "mean_token_accuracy": 0.8416607335209847, "num_tokens": 76519093.0, "step": 63620 }, { "entropy": 1.8651671513915062, "epoch": 0.19724743269121228, "grad_norm": 8.351085662841797, "learning_rate": 5.696267981881099e-06, "loss": 0.4968, "mean_token_accuracy": 0.8429739147424697, "num_tokens": 76531349.0, "step": 63630 }, { "entropy": 1.9040121346712113, "epoch": 0.19727843181626198, "grad_norm": 8.7483549118042, "learning_rate": 5.69582041892661e-06, "loss": 0.4998, "mean_token_accuracy": 0.8458465442061425, "num_tokens": 76542921.0, "step": 63640 }, { "entropy": 1.8771297112107277, "epoch": 0.19730943094131168, "grad_norm": 7.3089141845703125, "learning_rate": 5.695372961452301e-06, "loss": 0.5093, "mean_token_accuracy": 0.8363812834024429, "num_tokens": 76555655.0, "step": 63650 }, { "entropy": 1.8038627937436105, "epoch": 0.19734043006636137, "grad_norm": 7.573980808258057, "learning_rate": 5.694925609416748e-06, "loss": 0.4854, "mean_token_accuracy": 0.8495232254266739, "num_tokens": 76569165.0, "step": 63660 }, { "entropy": 1.8558023154735566, "epoch": 0.19737142919141107, "grad_norm": 8.455429077148438, "learning_rate": 5.694478362778547e-06, "loss": 0.4853, "mean_token_accuracy": 0.8412219509482384, "num_tokens": 76581380.0, "step": 63670 }, { "entropy": 1.8561557665467263, "epoch": 0.19740242831646077, "grad_norm": 3.842336416244507, "learning_rate": 5.69403122149632e-06, "loss": 0.4631, "mean_token_accuracy": 0.8478426292538643, "num_tokens": 76594529.0, "step": 63680 }, { "entropy": 1.85400443226099, "epoch": 0.19743342744151046, "grad_norm": 4.739361763000488, "learning_rate": 5.693584185528707e-06, "loss": 0.5293, "mean_token_accuracy": 0.8424937948584557, "num_tokens": 76607350.0, "step": 63690 }, { "entropy": 1.8113030225038529, "epoch": 0.19746442656656016, "grad_norm": 8.30618953704834, "learning_rate": 5.693137254834375e-06, "loss": 0.398, "mean_token_accuracy": 0.8603976845741272, "num_tokens": 76620524.0, "step": 63700 }, { "entropy": 1.853132924437523, "epoch": 0.19749542569160985, "grad_norm": 9.655352592468262, "learning_rate": 5.692690429372012e-06, "loss": 0.5033, "mean_token_accuracy": 0.8338514223694802, "num_tokens": 76633444.0, "step": 63710 }, { "entropy": 1.898238630592823, "epoch": 0.19752642481665955, "grad_norm": 4.058250904083252, "learning_rate": 5.692243709100329e-06, "loss": 0.5007, "mean_token_accuracy": 0.8403716444969177, "num_tokens": 76645664.0, "step": 63720 }, { "entropy": 1.882179079949856, "epoch": 0.19755742394170925, "grad_norm": 5.784068584442139, "learning_rate": 5.691797093978061e-06, "loss": 0.4826, "mean_token_accuracy": 0.8447945788502693, "num_tokens": 76657600.0, "step": 63730 }, { "entropy": 1.8364885538816451, "epoch": 0.19758842306675894, "grad_norm": 5.507530689239502, "learning_rate": 5.69135058396396e-06, "loss": 0.4522, "mean_token_accuracy": 0.8419953018426896, "num_tokens": 76671201.0, "step": 63740 }, { "entropy": 1.8077232390642166, "epoch": 0.19761942219180864, "grad_norm": 8.423868179321289, "learning_rate": 5.690904179016809e-06, "loss": 0.4529, "mean_token_accuracy": 0.8521018475294113, "num_tokens": 76684611.0, "step": 63750 }, { "entropy": 1.8499022141098975, "epoch": 0.19765042131685834, "grad_norm": 5.460327625274658, "learning_rate": 5.690457879095407e-06, "loss": 0.5245, "mean_token_accuracy": 0.8433385342359543, "num_tokens": 76697424.0, "step": 63760 }, { "entropy": 1.8565186977386474, "epoch": 0.19768142044190803, "grad_norm": 10.535595893859863, "learning_rate": 5.690011684158579e-06, "loss": 0.4767, "mean_token_accuracy": 0.8477386757731438, "num_tokens": 76710657.0, "step": 63770 }, { "entropy": 1.9264386996626854, "epoch": 0.19771241956695773, "grad_norm": 8.825230598449707, "learning_rate": 5.689565594165174e-06, "loss": 0.5299, "mean_token_accuracy": 0.8357585594058037, "num_tokens": 76721904.0, "step": 63780 }, { "entropy": 1.9114050999283791, "epoch": 0.19774341869200743, "grad_norm": 10.269311904907227, "learning_rate": 5.689119609074057e-06, "loss": 0.5089, "mean_token_accuracy": 0.8319689840078354, "num_tokens": 76734403.0, "step": 63790 }, { "entropy": 1.8821352809667586, "epoch": 0.19777441781705712, "grad_norm": 8.149957656860352, "learning_rate": 5.688673728844121e-06, "loss": 0.483, "mean_token_accuracy": 0.8380433395504951, "num_tokens": 76745913.0, "step": 63800 }, { "entropy": 1.9270009264349937, "epoch": 0.19780541694210682, "grad_norm": 8.53276252746582, "learning_rate": 5.688227953434282e-06, "loss": 0.5314, "mean_token_accuracy": 0.8341132417321205, "num_tokens": 76758099.0, "step": 63810 }, { "entropy": 1.8181735575199127, "epoch": 0.19783641606715652, "grad_norm": 7.680285930633545, "learning_rate": 5.687782282803477e-06, "loss": 0.4461, "mean_token_accuracy": 0.8435733437538147, "num_tokens": 76771266.0, "step": 63820 }, { "entropy": 1.8427308067679404, "epoch": 0.1978674151922062, "grad_norm": 3.6180124282836914, "learning_rate": 5.687336716910663e-06, "loss": 0.4316, "mean_token_accuracy": 0.8437779262661934, "num_tokens": 76783158.0, "step": 63830 }, { "entropy": 1.9142847254872322, "epoch": 0.19789841431725588, "grad_norm": 4.747416019439697, "learning_rate": 5.6868912557148245e-06, "loss": 0.5068, "mean_token_accuracy": 0.8309885829687118, "num_tokens": 76795826.0, "step": 63840 }, { "entropy": 1.8509532332420349, "epoch": 0.19792941344230558, "grad_norm": 8.225378036499023, "learning_rate": 5.686445899174965e-06, "loss": 0.5019, "mean_token_accuracy": 0.8449263677001, "num_tokens": 76808003.0, "step": 63850 }, { "entropy": 1.9634561657905578, "epoch": 0.19796041256735528, "grad_norm": 8.663080215454102, "learning_rate": 5.686000647250109e-06, "loss": 0.583, "mean_token_accuracy": 0.8314743667840958, "num_tokens": 76819491.0, "step": 63860 }, { "entropy": 1.9227616682648658, "epoch": 0.19799141169240497, "grad_norm": 7.894753932952881, "learning_rate": 5.685555499899311e-06, "loss": 0.5717, "mean_token_accuracy": 0.831498672068119, "num_tokens": 76831514.0, "step": 63870 }, { "entropy": 1.969376365840435, "epoch": 0.19802241081745467, "grad_norm": 9.801876068115234, "learning_rate": 5.685110457081639e-06, "loss": 0.5539, "mean_token_accuracy": 0.8390384584665298, "num_tokens": 76842781.0, "step": 63880 }, { "entropy": 1.9508319780230523, "epoch": 0.19805340994250437, "grad_norm": 11.13720989227295, "learning_rate": 5.6846655187561874e-06, "loss": 0.5257, "mean_token_accuracy": 0.835049818456173, "num_tokens": 76854513.0, "step": 63890 }, { "entropy": 1.9810258507728578, "epoch": 0.19808440906755406, "grad_norm": 9.226847648620605, "learning_rate": 5.684220684882074e-06, "loss": 0.5617, "mean_token_accuracy": 0.8322012394666671, "num_tokens": 76865518.0, "step": 63900 }, { "entropy": 1.9172653019428254, "epoch": 0.19811540819260376, "grad_norm": 4.333598613739014, "learning_rate": 5.683775955418437e-06, "loss": 0.5478, "mean_token_accuracy": 0.8240251198410988, "num_tokens": 76877792.0, "step": 63910 }, { "entropy": 1.843831568211317, "epoch": 0.19814640731765346, "grad_norm": 9.46835708618164, "learning_rate": 5.683331330324438e-06, "loss": 0.4919, "mean_token_accuracy": 0.8351884678006172, "num_tokens": 76890123.0, "step": 63920 }, { "entropy": 1.7964137971401215, "epoch": 0.19817740644270315, "grad_norm": 8.710184097290039, "learning_rate": 5.682886809559261e-06, "loss": 0.5308, "mean_token_accuracy": 0.8368506744503975, "num_tokens": 76902945.0, "step": 63930 }, { "entropy": 1.9574948653578759, "epoch": 0.19820840556775285, "grad_norm": 8.956100463867188, "learning_rate": 5.682442393082113e-06, "loss": 0.528, "mean_token_accuracy": 0.8361043930053711, "num_tokens": 76914195.0, "step": 63940 }, { "entropy": 1.8992867067456245, "epoch": 0.19823940469280255, "grad_norm": 5.019556045532227, "learning_rate": 5.681998080852219e-06, "loss": 0.5534, "mean_token_accuracy": 0.8368826553225517, "num_tokens": 76925863.0, "step": 63950 }, { "entropy": 1.9133247062563896, "epoch": 0.19827040381785224, "grad_norm": 4.297521591186523, "learning_rate": 5.681553872828835e-06, "loss": 0.5043, "mean_token_accuracy": 0.8392513215541839, "num_tokens": 76937635.0, "step": 63960 }, { "entropy": 1.9356270998716354, "epoch": 0.19830140294290194, "grad_norm": 7.358262062072754, "learning_rate": 5.681109768971228e-06, "loss": 0.5431, "mean_token_accuracy": 0.8359444841742516, "num_tokens": 76948918.0, "step": 63970 }, { "entropy": 1.807618674635887, "epoch": 0.19833240206795164, "grad_norm": 8.462199211120605, "learning_rate": 5.680665769238698e-06, "loss": 0.4808, "mean_token_accuracy": 0.844212980568409, "num_tokens": 76962029.0, "step": 63980 }, { "entropy": 1.8870945870876312, "epoch": 0.19836340119300133, "grad_norm": 9.050925254821777, "learning_rate": 5.68022187359056e-06, "loss": 0.4935, "mean_token_accuracy": 0.836103081703186, "num_tokens": 76973718.0, "step": 63990 }, { "entropy": 1.8385724887251853, "epoch": 0.19839440031805103, "grad_norm": 9.02990436553955, "learning_rate": 5.6797780819861545e-06, "loss": 0.4713, "mean_token_accuracy": 0.8344948351383209, "num_tokens": 76986094.0, "step": 64000 }, { "entropy": 1.8950312197208405, "epoch": 0.19842539944310073, "grad_norm": 17.118181228637695, "learning_rate": 5.679334394384845e-06, "loss": 0.4611, "mean_token_accuracy": 0.846867561340332, "num_tokens": 76998477.0, "step": 64010 }, { "entropy": 1.9193229526281357, "epoch": 0.19845639856815042, "grad_norm": 10.047476768493652, "learning_rate": 5.6788908107460135e-06, "loss": 0.5418, "mean_token_accuracy": 0.8332524642348289, "num_tokens": 77009850.0, "step": 64020 }, { "entropy": 1.8773248583078384, "epoch": 0.19848739769320012, "grad_norm": 8.258344650268555, "learning_rate": 5.678447331029068e-06, "loss": 0.4643, "mean_token_accuracy": 0.8483355283737183, "num_tokens": 77022417.0, "step": 64030 }, { "entropy": 1.9686390221118928, "epoch": 0.19851839681824981, "grad_norm": 9.272161483764648, "learning_rate": 5.678003955193437e-06, "loss": 0.5287, "mean_token_accuracy": 0.8388251766562462, "num_tokens": 77033370.0, "step": 64040 }, { "entropy": 1.8505666583776474, "epoch": 0.1985493959432995, "grad_norm": 8.902565956115723, "learning_rate": 5.677560683198569e-06, "loss": 0.5142, "mean_token_accuracy": 0.840177808701992, "num_tokens": 77046098.0, "step": 64050 }, { "entropy": 1.9229148238897324, "epoch": 0.1985803950683492, "grad_norm": 7.94978141784668, "learning_rate": 5.677117515003942e-06, "loss": 0.5188, "mean_token_accuracy": 0.8431189730763435, "num_tokens": 77058145.0, "step": 64060 }, { "entropy": 1.8256406679749488, "epoch": 0.1986113941933989, "grad_norm": 5.319147109985352, "learning_rate": 5.676674450569045e-06, "loss": 0.4836, "mean_token_accuracy": 0.8440526649355888, "num_tokens": 77071645.0, "step": 64070 }, { "entropy": 1.8413712576031684, "epoch": 0.19864239331844857, "grad_norm": 7.502344131469727, "learning_rate": 5.6762314898534e-06, "loss": 0.4358, "mean_token_accuracy": 0.8615258172154426, "num_tokens": 77083570.0, "step": 64080 }, { "entropy": 1.8211113825440406, "epoch": 0.19867339244349827, "grad_norm": 4.350229740142822, "learning_rate": 5.675788632816544e-06, "loss": 0.4523, "mean_token_accuracy": 0.8473315939307213, "num_tokens": 77096128.0, "step": 64090 }, { "entropy": 1.7832973524928093, "epoch": 0.19870439156854797, "grad_norm": 3.9118053913116455, "learning_rate": 5.67534587941804e-06, "loss": 0.4505, "mean_token_accuracy": 0.8490145295858383, "num_tokens": 77109619.0, "step": 64100 }, { "entropy": 1.8864678606390952, "epoch": 0.19873539069359766, "grad_norm": 4.381364345550537, "learning_rate": 5.674903229617469e-06, "loss": 0.5189, "mean_token_accuracy": 0.8414922654628754, "num_tokens": 77121495.0, "step": 64110 }, { "entropy": 1.8282597333192825, "epoch": 0.19876638981864736, "grad_norm": 8.134101867675781, "learning_rate": 5.67446068337444e-06, "loss": 0.4934, "mean_token_accuracy": 0.8438005477190018, "num_tokens": 77133762.0, "step": 64120 }, { "entropy": 1.891626113653183, "epoch": 0.19879738894369706, "grad_norm": 8.915387153625488, "learning_rate": 5.674018240648578e-06, "loss": 0.4952, "mean_token_accuracy": 0.8532876297831535, "num_tokens": 77145322.0, "step": 64130 }, { "entropy": 1.8996186777949333, "epoch": 0.19882838806874675, "grad_norm": 9.293252944946289, "learning_rate": 5.673575901399533e-06, "loss": 0.5231, "mean_token_accuracy": 0.8366784289479255, "num_tokens": 77157517.0, "step": 64140 }, { "entropy": 1.8965847790241241, "epoch": 0.19885938719379645, "grad_norm": 4.745813846588135, "learning_rate": 5.673133665586977e-06, "loss": 0.52, "mean_token_accuracy": 0.8319694399833679, "num_tokens": 77169583.0, "step": 64150 }, { "entropy": 1.8250271826982498, "epoch": 0.19889038631884615, "grad_norm": 9.300322532653809, "learning_rate": 5.672691533170605e-06, "loss": 0.4958, "mean_token_accuracy": 0.8459231272339821, "num_tokens": 77182323.0, "step": 64160 }, { "entropy": 1.7872568264603614, "epoch": 0.19892138544389584, "grad_norm": 9.027938842773438, "learning_rate": 5.672249504110131e-06, "loss": 0.4101, "mean_token_accuracy": 0.8547685459256172, "num_tokens": 77196100.0, "step": 64170 }, { "entropy": 1.814260269701481, "epoch": 0.19895238456894554, "grad_norm": 8.810227394104004, "learning_rate": 5.671807578365294e-06, "loss": 0.4367, "mean_token_accuracy": 0.849295774102211, "num_tokens": 77208500.0, "step": 64180 }, { "entropy": 1.8780076310038567, "epoch": 0.19898338369399524, "grad_norm": 9.624731063842773, "learning_rate": 5.671365755895851e-06, "loss": 0.4757, "mean_token_accuracy": 0.8416895166039466, "num_tokens": 77220028.0, "step": 64190 }, { "entropy": 1.9330232828855514, "epoch": 0.19901438281904493, "grad_norm": 9.217514038085938, "learning_rate": 5.670924036661586e-06, "loss": 0.5643, "mean_token_accuracy": 0.8271647378802299, "num_tokens": 77231617.0, "step": 64200 }, { "entropy": 1.9576894700527192, "epoch": 0.19904538194409463, "grad_norm": 7.6577534675598145, "learning_rate": 5.670482420622302e-06, "loss": 0.5551, "mean_token_accuracy": 0.8377937138080597, "num_tokens": 77243109.0, "step": 64210 }, { "entropy": 1.9337700963020326, "epoch": 0.19907638106914433, "grad_norm": 9.766473770141602, "learning_rate": 5.6700409077378235e-06, "loss": 0.5428, "mean_token_accuracy": 0.8324208498001099, "num_tokens": 77253457.0, "step": 64220 }, { "entropy": 1.888157394528389, "epoch": 0.19910738019419402, "grad_norm": 8.229586601257324, "learning_rate": 5.6695994979679995e-06, "loss": 0.4829, "mean_token_accuracy": 0.8483126178383827, "num_tokens": 77265767.0, "step": 64230 }, { "entropy": 1.795005053281784, "epoch": 0.19913837931924372, "grad_norm": 8.953361511230469, "learning_rate": 5.669158191272697e-06, "loss": 0.4682, "mean_token_accuracy": 0.8460436001420021, "num_tokens": 77278828.0, "step": 64240 }, { "entropy": 1.8841128557920457, "epoch": 0.19916937844429342, "grad_norm": 9.218796730041504, "learning_rate": 5.668716987611807e-06, "loss": 0.504, "mean_token_accuracy": 0.8368963778018952, "num_tokens": 77291261.0, "step": 64250 }, { "entropy": 1.816078770160675, "epoch": 0.1992003775693431, "grad_norm": 7.860946178436279, "learning_rate": 5.668275886945246e-06, "loss": 0.4316, "mean_token_accuracy": 0.8497811511158944, "num_tokens": 77303890.0, "step": 64260 }, { "entropy": 1.9705949038267137, "epoch": 0.1992313766943928, "grad_norm": 7.762573719024658, "learning_rate": 5.667834889232945e-06, "loss": 0.5301, "mean_token_accuracy": 0.8459170028567314, "num_tokens": 77315352.0, "step": 64270 }, { "entropy": 1.8340299040079118, "epoch": 0.1992623758194425, "grad_norm": 5.21130895614624, "learning_rate": 5.667393994434861e-06, "loss": 0.49, "mean_token_accuracy": 0.8440465152263641, "num_tokens": 77328106.0, "step": 64280 }, { "entropy": 1.872594805061817, "epoch": 0.1992933749444922, "grad_norm": 7.447837829589844, "learning_rate": 5.666953202510973e-06, "loss": 0.4397, "mean_token_accuracy": 0.8528081148862838, "num_tokens": 77340307.0, "step": 64290 }, { "entropy": 1.9547280743718147, "epoch": 0.1993243740695419, "grad_norm": 8.335238456726074, "learning_rate": 5.666512513421281e-06, "loss": 0.5324, "mean_token_accuracy": 0.8396565094590187, "num_tokens": 77351792.0, "step": 64300 }, { "entropy": 1.8686283484101296, "epoch": 0.1993553731945916, "grad_norm": 10.21964168548584, "learning_rate": 5.666071927125806e-06, "loss": 0.4666, "mean_token_accuracy": 0.8395275220274925, "num_tokens": 77364455.0, "step": 64310 }, { "entropy": 1.9559142783284187, "epoch": 0.1993863723196413, "grad_norm": 6.589775085449219, "learning_rate": 5.665631443584593e-06, "loss": 0.5251, "mean_token_accuracy": 0.8342868536710739, "num_tokens": 77376312.0, "step": 64320 }, { "entropy": 1.8677933678030967, "epoch": 0.19941737144469096, "grad_norm": 4.30719518661499, "learning_rate": 5.665191062757705e-06, "loss": 0.4805, "mean_token_accuracy": 0.8398402318358421, "num_tokens": 77388805.0, "step": 64330 }, { "entropy": 2.0073965549468995, "epoch": 0.19944837056974066, "grad_norm": 8.787602424621582, "learning_rate": 5.664750784605233e-06, "loss": 0.5647, "mean_token_accuracy": 0.8316746145486832, "num_tokens": 77399105.0, "step": 64340 }, { "entropy": 1.9256497889757156, "epoch": 0.19947936969479035, "grad_norm": 4.221436023712158, "learning_rate": 5.664310609087283e-06, "loss": 0.5362, "mean_token_accuracy": 0.8293071076273918, "num_tokens": 77411168.0, "step": 64350 }, { "entropy": 1.8341520741581916, "epoch": 0.19951036881984005, "grad_norm": 7.341567516326904, "learning_rate": 5.663870536163986e-06, "loss": 0.5056, "mean_token_accuracy": 0.835312868654728, "num_tokens": 77424558.0, "step": 64360 }, { "entropy": 1.9391889229416848, "epoch": 0.19954136794488975, "grad_norm": 10.012337684631348, "learning_rate": 5.663430565795495e-06, "loss": 0.5365, "mean_token_accuracy": 0.8299921050667762, "num_tokens": 77436576.0, "step": 64370 }, { "entropy": 1.9387200742959976, "epoch": 0.19957236706993944, "grad_norm": 8.041949272155762, "learning_rate": 5.6629906979419805e-06, "loss": 0.5124, "mean_token_accuracy": 0.8453411236405373, "num_tokens": 77448293.0, "step": 64380 }, { "entropy": 1.845164766907692, "epoch": 0.19960336619498914, "grad_norm": 9.091232299804688, "learning_rate": 5.662550932563643e-06, "loss": 0.4484, "mean_token_accuracy": 0.8558371141552925, "num_tokens": 77461006.0, "step": 64390 }, { "entropy": 1.9543328523635863, "epoch": 0.19963436532003884, "grad_norm": 8.760300636291504, "learning_rate": 5.662111269620696e-06, "loss": 0.546, "mean_token_accuracy": 0.8322904482483864, "num_tokens": 77472575.0, "step": 64400 }, { "entropy": 1.8849376797676087, "epoch": 0.19966536444508853, "grad_norm": 8.26484489440918, "learning_rate": 5.661671709073379e-06, "loss": 0.4755, "mean_token_accuracy": 0.8436549201607704, "num_tokens": 77484546.0, "step": 64410 }, { "entropy": 1.9597482338547707, "epoch": 0.19969636357013823, "grad_norm": 7.358891487121582, "learning_rate": 5.661232250881952e-06, "loss": 0.5204, "mean_token_accuracy": 0.8410298019647598, "num_tokens": 77496212.0, "step": 64420 }, { "entropy": 1.9519696533679962, "epoch": 0.19972736269518793, "grad_norm": 8.28138542175293, "learning_rate": 5.660792895006698e-06, "loss": 0.527, "mean_token_accuracy": 0.8325450897216797, "num_tokens": 77507649.0, "step": 64430 }, { "entropy": 1.9554157480597496, "epoch": 0.19975836182023762, "grad_norm": 8.739425659179688, "learning_rate": 5.660353641407921e-06, "loss": 0.5152, "mean_token_accuracy": 0.8404718577861786, "num_tokens": 77518910.0, "step": 64440 }, { "entropy": 1.9898914635181426, "epoch": 0.19978936094528732, "grad_norm": 11.325966835021973, "learning_rate": 5.659914490045944e-06, "loss": 0.5837, "mean_token_accuracy": 0.8286274582147598, "num_tokens": 77529561.0, "step": 64450 }, { "entropy": 1.8334274768829346, "epoch": 0.19982036007033702, "grad_norm": 7.993432998657227, "learning_rate": 5.659475440881115e-06, "loss": 0.4159, "mean_token_accuracy": 0.8571745663881302, "num_tokens": 77542142.0, "step": 64460 }, { "entropy": 1.864787982404232, "epoch": 0.1998513591953867, "grad_norm": 8.196157455444336, "learning_rate": 5.659036493873802e-06, "loss": 0.472, "mean_token_accuracy": 0.8399261072278023, "num_tokens": 77554999.0, "step": 64470 }, { "entropy": 1.9031756028532982, "epoch": 0.1998823583204364, "grad_norm": 7.0196404457092285, "learning_rate": 5.658597648984394e-06, "loss": 0.5216, "mean_token_accuracy": 0.8386317491531372, "num_tokens": 77567455.0, "step": 64480 }, { "entropy": 1.8681576654314995, "epoch": 0.1999133574454861, "grad_norm": 8.093351364135742, "learning_rate": 5.658158906173302e-06, "loss": 0.406, "mean_token_accuracy": 0.8515056669712067, "num_tokens": 77580142.0, "step": 64490 }, { "entropy": 1.9180188179016113, "epoch": 0.1999443565705358, "grad_norm": 9.29157543182373, "learning_rate": 5.657720265400961e-06, "loss": 0.5155, "mean_token_accuracy": 0.8398342624306678, "num_tokens": 77591543.0, "step": 64500 }, { "entropy": 1.7902460798621178, "epoch": 0.1999753556955855, "grad_norm": 3.0435149669647217, "learning_rate": 5.657281726627822e-06, "loss": 0.4415, "mean_token_accuracy": 0.8577824845910073, "num_tokens": 77604945.0, "step": 64510 }, { "entropy": 1.9697556897997857, "epoch": 0.2000063548206352, "grad_norm": 8.26711654663086, "learning_rate": 5.656843289814361e-06, "loss": 0.6193, "mean_token_accuracy": 0.8271805793046951, "num_tokens": 77616932.0, "step": 64520 }, { "entropy": 1.90864490121603, "epoch": 0.2000373539456849, "grad_norm": 9.074986457824707, "learning_rate": 5.656404954921076e-06, "loss": 0.4704, "mean_token_accuracy": 0.8418331027030945, "num_tokens": 77629419.0, "step": 64530 }, { "entropy": 1.9975577175617218, "epoch": 0.2000683530707346, "grad_norm": 9.11250114440918, "learning_rate": 5.655966721908486e-06, "loss": 0.5851, "mean_token_accuracy": 0.8237589776515961, "num_tokens": 77640305.0, "step": 64540 }, { "entropy": 1.9538366571068764, "epoch": 0.20009935219578429, "grad_norm": 8.259376525878906, "learning_rate": 5.65552859073713e-06, "loss": 0.5225, "mean_token_accuracy": 0.8349790692329406, "num_tokens": 77651850.0, "step": 64550 }, { "entropy": 1.8756514206528663, "epoch": 0.20013035132083398, "grad_norm": 9.364764213562012, "learning_rate": 5.655090561367568e-06, "loss": 0.4591, "mean_token_accuracy": 0.8493126779794693, "num_tokens": 77664419.0, "step": 64560 }, { "entropy": 1.908227115869522, "epoch": 0.20016135044588365, "grad_norm": 8.66217041015625, "learning_rate": 5.654652633760384e-06, "loss": 0.5649, "mean_token_accuracy": 0.8257618814706802, "num_tokens": 77676594.0, "step": 64570 }, { "entropy": 1.9556605026125908, "epoch": 0.20019234957093335, "grad_norm": 3.5400590896606445, "learning_rate": 5.654214807876182e-06, "loss": 0.504, "mean_token_accuracy": 0.8475314304232597, "num_tokens": 77687943.0, "step": 64580 }, { "entropy": 1.883988819271326, "epoch": 0.20022334869598304, "grad_norm": 9.24106216430664, "learning_rate": 5.653777083675587e-06, "loss": 0.4301, "mean_token_accuracy": 0.8436937823891639, "num_tokens": 77700896.0, "step": 64590 }, { "entropy": 1.857344676554203, "epoch": 0.20025434782103274, "grad_norm": 9.264991760253906, "learning_rate": 5.653339461119245e-06, "loss": 0.4813, "mean_token_accuracy": 0.8411024749279022, "num_tokens": 77713541.0, "step": 64600 }, { "entropy": 1.9974335208535194, "epoch": 0.20028534694608244, "grad_norm": 9.765063285827637, "learning_rate": 5.6529019401678256e-06, "loss": 0.5565, "mean_token_accuracy": 0.8282542005181313, "num_tokens": 77724948.0, "step": 64610 }, { "entropy": 1.9897297486662864, "epoch": 0.20031634607113213, "grad_norm": 10.479876518249512, "learning_rate": 5.652464520782016e-06, "loss": 0.5124, "mean_token_accuracy": 0.8397626534104348, "num_tokens": 77736336.0, "step": 64620 }, { "entropy": 1.8929444566369056, "epoch": 0.20034734519618183, "grad_norm": 9.795084953308105, "learning_rate": 5.652027202922528e-06, "loss": 0.4723, "mean_token_accuracy": 0.8592626452445984, "num_tokens": 77748048.0, "step": 64630 }, { "entropy": 1.814023308455944, "epoch": 0.20037834432123153, "grad_norm": 7.460301399230957, "learning_rate": 5.651589986550092e-06, "loss": 0.4049, "mean_token_accuracy": 0.8527435094118119, "num_tokens": 77761289.0, "step": 64640 }, { "entropy": 1.8586131647229194, "epoch": 0.20040934344628122, "grad_norm": 8.590618133544922, "learning_rate": 5.6511528716254636e-06, "loss": 0.4138, "mean_token_accuracy": 0.8501928374171257, "num_tokens": 77773602.0, "step": 64650 }, { "entropy": 1.8472480833530427, "epoch": 0.20044034257133092, "grad_norm": 3.6471304893493652, "learning_rate": 5.650715858109416e-06, "loss": 0.4407, "mean_token_accuracy": 0.8560655087232589, "num_tokens": 77785559.0, "step": 64660 }, { "entropy": 2.008794406056404, "epoch": 0.20047134169638062, "grad_norm": 10.51722240447998, "learning_rate": 5.650278945962744e-06, "loss": 0.5934, "mean_token_accuracy": 0.8334228664636611, "num_tokens": 77796525.0, "step": 64670 }, { "entropy": 1.9870349198579789, "epoch": 0.2005023408214303, "grad_norm": 8.583792686462402, "learning_rate": 5.649842135146264e-06, "loss": 0.6073, "mean_token_accuracy": 0.8145452126860618, "num_tokens": 77807477.0, "step": 64680 }, { "entropy": 1.8656486958265304, "epoch": 0.20053333994648, "grad_norm": 8.072990417480469, "learning_rate": 5.649405425620815e-06, "loss": 0.4531, "mean_token_accuracy": 0.8582053184509277, "num_tokens": 77819825.0, "step": 64690 }, { "entropy": 1.8668094590306281, "epoch": 0.2005643390715297, "grad_norm": 9.32240104675293, "learning_rate": 5.648968817347257e-06, "loss": 0.4919, "mean_token_accuracy": 0.8417691692709923, "num_tokens": 77832238.0, "step": 64700 }, { "entropy": 1.9135265216231345, "epoch": 0.2005953381965794, "grad_norm": 8.36419677734375, "learning_rate": 5.648532310286469e-06, "loss": 0.507, "mean_token_accuracy": 0.8405087172985077, "num_tokens": 77844237.0, "step": 64710 }, { "entropy": 2.0349139839410784, "epoch": 0.2006263373216291, "grad_norm": 8.855172157287598, "learning_rate": 5.648095904399352e-06, "loss": 0.6111, "mean_token_accuracy": 0.8124338254332543, "num_tokens": 77854626.0, "step": 64720 }, { "entropy": 1.838395781815052, "epoch": 0.2006573364466788, "grad_norm": 5.492077350616455, "learning_rate": 5.64765959964683e-06, "loss": 0.5007, "mean_token_accuracy": 0.8443232774734497, "num_tokens": 77867844.0, "step": 64730 }, { "entropy": 1.9272115871310234, "epoch": 0.2006883355717285, "grad_norm": 8.975436210632324, "learning_rate": 5.647223395989846e-06, "loss": 0.5197, "mean_token_accuracy": 0.8435587778687477, "num_tokens": 77880414.0, "step": 64740 }, { "entropy": 1.8668215185403825, "epoch": 0.2007193346967782, "grad_norm": 8.991979598999023, "learning_rate": 5.646787293389365e-06, "loss": 0.5237, "mean_token_accuracy": 0.8380271717905998, "num_tokens": 77893230.0, "step": 64750 }, { "entropy": 1.9775980859994888, "epoch": 0.2007503338218279, "grad_norm": 9.368738174438477, "learning_rate": 5.646351291806372e-06, "loss": 0.5447, "mean_token_accuracy": 0.8423718631267547, "num_tokens": 77904698.0, "step": 64760 }, { "entropy": 1.880776160955429, "epoch": 0.20078133294687758, "grad_norm": 4.232335090637207, "learning_rate": 5.645915391201876e-06, "loss": 0.4911, "mean_token_accuracy": 0.8433832973241806, "num_tokens": 77918354.0, "step": 64770 }, { "entropy": 1.9753024756908417, "epoch": 0.20081233207192728, "grad_norm": 9.014158248901367, "learning_rate": 5.645479591536904e-06, "loss": 0.5052, "mean_token_accuracy": 0.8433095306158066, "num_tokens": 77929796.0, "step": 64780 }, { "entropy": 1.9146323308348656, "epoch": 0.20084333119697698, "grad_norm": 8.354053497314453, "learning_rate": 5.645043892772506e-06, "loss": 0.4854, "mean_token_accuracy": 0.8488031610846519, "num_tokens": 77941791.0, "step": 64790 }, { "entropy": 1.9045788452029229, "epoch": 0.20087433032202667, "grad_norm": 9.207497596740723, "learning_rate": 5.64460829486975e-06, "loss": 0.4815, "mean_token_accuracy": 0.8390837252140045, "num_tokens": 77953907.0, "step": 64800 }, { "entropy": 1.9096369460225104, "epoch": 0.20090532944707637, "grad_norm": 3.9759466648101807, "learning_rate": 5.64417279778973e-06, "loss": 0.5009, "mean_token_accuracy": 0.834852209687233, "num_tokens": 77966476.0, "step": 64810 }, { "entropy": 1.971180261671543, "epoch": 0.20093632857212604, "grad_norm": 7.077384948730469, "learning_rate": 5.643737401493556e-06, "loss": 0.5186, "mean_token_accuracy": 0.8305558815598488, "num_tokens": 77978231.0, "step": 64820 }, { "entropy": 1.910263580083847, "epoch": 0.20096732769717573, "grad_norm": 5.62261438369751, "learning_rate": 5.643302105942363e-06, "loss": 0.4388, "mean_token_accuracy": 0.8370845153927803, "num_tokens": 77991195.0, "step": 64830 }, { "entropy": 1.8971102446317674, "epoch": 0.20099832682222543, "grad_norm": 14.074230194091797, "learning_rate": 5.6428669110973035e-06, "loss": 0.4748, "mean_token_accuracy": 0.8393953680992127, "num_tokens": 78003545.0, "step": 64840 }, { "entropy": 1.8818523928523063, "epoch": 0.20102932594727513, "grad_norm": 8.744156837463379, "learning_rate": 5.642431816919555e-06, "loss": 0.4876, "mean_token_accuracy": 0.8389109507203102, "num_tokens": 78015484.0, "step": 64850 }, { "entropy": 1.8921169385313987, "epoch": 0.20106032507232482, "grad_norm": 6.810762882232666, "learning_rate": 5.6419968233703105e-06, "loss": 0.4518, "mean_token_accuracy": 0.8551272869110107, "num_tokens": 78027522.0, "step": 64860 }, { "entropy": 2.0100679606199265, "epoch": 0.20109132419737452, "grad_norm": 7.5746378898620605, "learning_rate": 5.641561930410791e-06, "loss": 0.5471, "mean_token_accuracy": 0.8342107847332955, "num_tokens": 78038669.0, "step": 64870 }, { "entropy": 1.886759166419506, "epoch": 0.20112232332242422, "grad_norm": 8.69468879699707, "learning_rate": 5.64112713800223e-06, "loss": 0.4933, "mean_token_accuracy": 0.8427166402339935, "num_tokens": 78051321.0, "step": 64880 }, { "entropy": 1.980288290977478, "epoch": 0.20115332244747391, "grad_norm": 9.738602638244629, "learning_rate": 5.64069244610589e-06, "loss": 0.5282, "mean_token_accuracy": 0.8373025566339493, "num_tokens": 78062300.0, "step": 64890 }, { "entropy": 1.9037230789661408, "epoch": 0.2011843215725236, "grad_norm": 7.894903659820557, "learning_rate": 5.640257854683049e-06, "loss": 0.4886, "mean_token_accuracy": 0.8482183083891869, "num_tokens": 78074143.0, "step": 64900 }, { "entropy": 1.965438848733902, "epoch": 0.2012153206975733, "grad_norm": 10.353565216064453, "learning_rate": 5.639823363695008e-06, "loss": 0.5162, "mean_token_accuracy": 0.8452044859528541, "num_tokens": 78084809.0, "step": 64910 }, { "entropy": 1.8660222113132476, "epoch": 0.201246319822623, "grad_norm": 8.646041870117188, "learning_rate": 5.639388973103089e-06, "loss": 0.4265, "mean_token_accuracy": 0.8471137225627899, "num_tokens": 78097733.0, "step": 64920 }, { "entropy": 1.92643845975399, "epoch": 0.2012773189476727, "grad_norm": 8.62920093536377, "learning_rate": 5.638954682868635e-06, "loss": 0.5631, "mean_token_accuracy": 0.8208620086312294, "num_tokens": 78110514.0, "step": 64930 }, { "entropy": 1.8348462983965874, "epoch": 0.2013083180727224, "grad_norm": 7.881258964538574, "learning_rate": 5.638520492953008e-06, "loss": 0.4422, "mean_token_accuracy": 0.8508792489767074, "num_tokens": 78123634.0, "step": 64940 }, { "entropy": 1.9531269282102586, "epoch": 0.2013393171977721, "grad_norm": 3.806800603866577, "learning_rate": 5.638086403317592e-06, "loss": 0.5033, "mean_token_accuracy": 0.8420859977602959, "num_tokens": 78134835.0, "step": 64950 }, { "entropy": 1.9173472926020623, "epoch": 0.2013703163228218, "grad_norm": 9.862812995910645, "learning_rate": 5.637652413923792e-06, "loss": 0.5778, "mean_token_accuracy": 0.817606084048748, "num_tokens": 78146512.0, "step": 64960 }, { "entropy": 1.9328098177909852, "epoch": 0.2014013154478715, "grad_norm": 8.525472640991211, "learning_rate": 5.637218524733035e-06, "loss": 0.5115, "mean_token_accuracy": 0.8353380098938942, "num_tokens": 78158404.0, "step": 64970 }, { "entropy": 1.9535952508449554, "epoch": 0.20143231457292118, "grad_norm": 7.10647439956665, "learning_rate": 5.636784735706765e-06, "loss": 0.5739, "mean_token_accuracy": 0.8305754721164703, "num_tokens": 78169749.0, "step": 64980 }, { "entropy": 1.946779978275299, "epoch": 0.20146331369797088, "grad_norm": 8.665600776672363, "learning_rate": 5.636351046806451e-06, "loss": 0.5525, "mean_token_accuracy": 0.835592320561409, "num_tokens": 78180620.0, "step": 64990 }, { "entropy": 1.8874027088284493, "epoch": 0.20149431282302058, "grad_norm": 10.456498146057129, "learning_rate": 5.6359174579935805e-06, "loss": 0.5393, "mean_token_accuracy": 0.8345045700669289, "num_tokens": 78192494.0, "step": 65000 }, { "entropy": 1.9049117237329483, "epoch": 0.20152531194807027, "grad_norm": 8.67405891418457, "learning_rate": 5.635483969229662e-06, "loss": 0.5042, "mean_token_accuracy": 0.8426391571760178, "num_tokens": 78204503.0, "step": 65010 }, { "entropy": 1.937725681066513, "epoch": 0.20155631107311997, "grad_norm": 8.417729377746582, "learning_rate": 5.635050580476227e-06, "loss": 0.5326, "mean_token_accuracy": 0.8371980518102646, "num_tokens": 78215731.0, "step": 65020 }, { "entropy": 1.881471572816372, "epoch": 0.20158731019816967, "grad_norm": 8.803342819213867, "learning_rate": 5.6346172916948215e-06, "loss": 0.4486, "mean_token_accuracy": 0.8541508734226226, "num_tokens": 78228595.0, "step": 65030 }, { "entropy": 1.997860112786293, "epoch": 0.20161830932321936, "grad_norm": 8.877419471740723, "learning_rate": 5.634184102847018e-06, "loss": 0.5516, "mean_token_accuracy": 0.8311259895563126, "num_tokens": 78239815.0, "step": 65040 }, { "entropy": 1.9192116037011147, "epoch": 0.20164930844826906, "grad_norm": 8.195626258850098, "learning_rate": 5.6337510138944094e-06, "loss": 0.5028, "mean_token_accuracy": 0.8414540275931358, "num_tokens": 78250962.0, "step": 65050 }, { "entropy": 1.9783461928367614, "epoch": 0.20168030757331876, "grad_norm": 9.960369110107422, "learning_rate": 5.633318024798608e-06, "loss": 0.5766, "mean_token_accuracy": 0.8294192060828209, "num_tokens": 78261583.0, "step": 65060 }, { "entropy": 1.999306383728981, "epoch": 0.20171130669836843, "grad_norm": 8.405972480773926, "learning_rate": 5.6328851355212445e-06, "loss": 0.5814, "mean_token_accuracy": 0.8255703672766685, "num_tokens": 78272701.0, "step": 65070 }, { "entropy": 1.9220364853739738, "epoch": 0.20174230582341812, "grad_norm": 8.912109375, "learning_rate": 5.632452346023972e-06, "loss": 0.5051, "mean_token_accuracy": 0.8397694423794746, "num_tokens": 78284100.0, "step": 65080 }, { "entropy": 1.8529915317893029, "epoch": 0.20177330494846782, "grad_norm": 4.4439697265625, "learning_rate": 5.6320196562684685e-06, "loss": 0.4849, "mean_token_accuracy": 0.8452705577015877, "num_tokens": 78296972.0, "step": 65090 }, { "entropy": 1.8957914143800736, "epoch": 0.20180430407351752, "grad_norm": 21.00503921508789, "learning_rate": 5.6315870662164244e-06, "loss": 0.4532, "mean_token_accuracy": 0.8400771632790566, "num_tokens": 78309482.0, "step": 65100 }, { "entropy": 1.9614157870411872, "epoch": 0.2018353031985672, "grad_norm": 9.07087230682373, "learning_rate": 5.631154575829556e-06, "loss": 0.4778, "mean_token_accuracy": 0.8453190490603447, "num_tokens": 78321178.0, "step": 65110 }, { "entropy": 1.8932098597288132, "epoch": 0.2018663023236169, "grad_norm": 2.6463136672973633, "learning_rate": 5.6307221850696e-06, "loss": 0.5016, "mean_token_accuracy": 0.8434045687317848, "num_tokens": 78333684.0, "step": 65120 }, { "entropy": 1.8674306973814965, "epoch": 0.2018973014486666, "grad_norm": 8.476088523864746, "learning_rate": 5.630289893898313e-06, "loss": 0.4328, "mean_token_accuracy": 0.8461329773068428, "num_tokens": 78346646.0, "step": 65130 }, { "entropy": 1.9643313199281693, "epoch": 0.2019283005737163, "grad_norm": 9.779008865356445, "learning_rate": 5.629857702277471e-06, "loss": 0.5428, "mean_token_accuracy": 0.8256931126117706, "num_tokens": 78358044.0, "step": 65140 }, { "entropy": 1.9168717786669731, "epoch": 0.201959299698766, "grad_norm": 8.545971870422363, "learning_rate": 5.629425610168872e-06, "loss": 0.5267, "mean_token_accuracy": 0.8337204769253731, "num_tokens": 78370065.0, "step": 65150 }, { "entropy": 1.8033166334033013, "epoch": 0.2019902988238157, "grad_norm": 4.0642991065979, "learning_rate": 5.628993617534335e-06, "loss": 0.3731, "mean_token_accuracy": 0.8547035023570061, "num_tokens": 78384325.0, "step": 65160 }, { "entropy": 1.8826155215501785, "epoch": 0.2020212979488654, "grad_norm": 4.962791442871094, "learning_rate": 5.628561724335695e-06, "loss": 0.4747, "mean_token_accuracy": 0.8410342499613762, "num_tokens": 78396987.0, "step": 65170 }, { "entropy": 1.8694611176848412, "epoch": 0.2020522970739151, "grad_norm": 3.838369131088257, "learning_rate": 5.628129930534814e-06, "loss": 0.4678, "mean_token_accuracy": 0.846546882390976, "num_tokens": 78410583.0, "step": 65180 }, { "entropy": 1.8311977073550225, "epoch": 0.20208329619896478, "grad_norm": 3.510143518447876, "learning_rate": 5.627698236093573e-06, "loss": 0.4623, "mean_token_accuracy": 0.8387721896171569, "num_tokens": 78423729.0, "step": 65190 }, { "entropy": 1.9377717927098275, "epoch": 0.20211429532401448, "grad_norm": 8.662220001220703, "learning_rate": 5.627266640973867e-06, "loss": 0.5164, "mean_token_accuracy": 0.8386742398142815, "num_tokens": 78435466.0, "step": 65200 }, { "entropy": 1.9007938578724861, "epoch": 0.20214529444906418, "grad_norm": 4.089953899383545, "learning_rate": 5.626835145137622e-06, "loss": 0.4383, "mean_token_accuracy": 0.8495527386665345, "num_tokens": 78447613.0, "step": 65210 }, { "entropy": 1.9075167581439019, "epoch": 0.20217629357411387, "grad_norm": 8.904956817626953, "learning_rate": 5.626403748546773e-06, "loss": 0.5229, "mean_token_accuracy": 0.8407324820756912, "num_tokens": 78459460.0, "step": 65220 }, { "entropy": 1.829827456176281, "epoch": 0.20220729269916357, "grad_norm": 8.076254844665527, "learning_rate": 5.625972451163285e-06, "loss": 0.4076, "mean_token_accuracy": 0.8538295239210129, "num_tokens": 78472309.0, "step": 65230 }, { "entropy": 1.964961776137352, "epoch": 0.20223829182421327, "grad_norm": 4.608421325683594, "learning_rate": 5.625541252949139e-06, "loss": 0.5394, "mean_token_accuracy": 0.8294176504015922, "num_tokens": 78483546.0, "step": 65240 }, { "entropy": 1.9537744015455245, "epoch": 0.20226929094926296, "grad_norm": 8.989550590515137, "learning_rate": 5.6251101538663364e-06, "loss": 0.5179, "mean_token_accuracy": 0.8399158343672752, "num_tokens": 78493937.0, "step": 65250 }, { "entropy": 1.8486858293414117, "epoch": 0.20230029007431266, "grad_norm": 8.618539810180664, "learning_rate": 5.624679153876901e-06, "loss": 0.5205, "mean_token_accuracy": 0.8347037017345429, "num_tokens": 78506495.0, "step": 65260 }, { "entropy": 1.8448414601385594, "epoch": 0.20233128919936236, "grad_norm": 1.566992998123169, "learning_rate": 5.624248252942874e-06, "loss": 0.5063, "mean_token_accuracy": 0.8431580245494843, "num_tokens": 78519640.0, "step": 65270 }, { "entropy": 1.9085783809423447, "epoch": 0.20236228832441205, "grad_norm": 8.527741432189941, "learning_rate": 5.623817451026321e-06, "loss": 0.5242, "mean_token_accuracy": 0.8311206966638565, "num_tokens": 78530969.0, "step": 65280 }, { "entropy": 1.9010276600718499, "epoch": 0.20239328744946175, "grad_norm": 8.006790161132812, "learning_rate": 5.623386748089322e-06, "loss": 0.4927, "mean_token_accuracy": 0.8460407719016075, "num_tokens": 78542383.0, "step": 65290 }, { "entropy": 1.9630857422947883, "epoch": 0.20242428657451145, "grad_norm": 10.483144760131836, "learning_rate": 5.622956144093983e-06, "loss": 0.5581, "mean_token_accuracy": 0.8222698003053666, "num_tokens": 78553514.0, "step": 65300 }, { "entropy": 1.9169240906834601, "epoch": 0.20245528569956112, "grad_norm": 7.206988334655762, "learning_rate": 5.622525639002427e-06, "loss": 0.5551, "mean_token_accuracy": 0.8317204505205155, "num_tokens": 78565113.0, "step": 65310 }, { "entropy": 2.0341182202100754, "epoch": 0.2024862848246108, "grad_norm": 9.495631217956543, "learning_rate": 5.6220952327768e-06, "loss": 0.5879, "mean_token_accuracy": 0.8261968463659286, "num_tokens": 78576309.0, "step": 65320 }, { "entropy": 1.8233429193496704, "epoch": 0.2025172839496605, "grad_norm": 7.392541885375977, "learning_rate": 5.6216649253792645e-06, "loss": 0.485, "mean_token_accuracy": 0.8485372483730316, "num_tokens": 78589699.0, "step": 65330 }, { "entropy": 1.9578388512134552, "epoch": 0.2025482830747102, "grad_norm": 9.04162883758545, "learning_rate": 5.6212347167720085e-06, "loss": 0.575, "mean_token_accuracy": 0.8360649108886719, "num_tokens": 78601015.0, "step": 65340 }, { "entropy": 1.9982189297676087, "epoch": 0.2025792821997599, "grad_norm": 8.363797187805176, "learning_rate": 5.620804606917233e-06, "loss": 0.5966, "mean_token_accuracy": 0.8217172041535378, "num_tokens": 78611845.0, "step": 65350 }, { "entropy": 1.954882425069809, "epoch": 0.2026102813248096, "grad_norm": 3.7723734378814697, "learning_rate": 5.620374595777169e-06, "loss": 0.5392, "mean_token_accuracy": 0.8342722833156586, "num_tokens": 78622857.0, "step": 65360 }, { "entropy": 1.9132612109184266, "epoch": 0.2026412804498593, "grad_norm": 8.916566848754883, "learning_rate": 5.619944683314056e-06, "loss": 0.545, "mean_token_accuracy": 0.8364124834537506, "num_tokens": 78635064.0, "step": 65370 }, { "entropy": 1.9422496438026429, "epoch": 0.202672279574909, "grad_norm": 9.716279983520508, "learning_rate": 5.619514869490165e-06, "loss": 0.5646, "mean_token_accuracy": 0.8320507362484932, "num_tokens": 78646048.0, "step": 65380 }, { "entropy": 1.8317863315343856, "epoch": 0.2027032786999587, "grad_norm": 9.566564559936523, "learning_rate": 5.619085154267778e-06, "loss": 0.4479, "mean_token_accuracy": 0.8540365427732468, "num_tokens": 78658841.0, "step": 65390 }, { "entropy": 1.8700613364577294, "epoch": 0.20273427782500839, "grad_norm": 8.530824661254883, "learning_rate": 5.618655537609205e-06, "loss": 0.4934, "mean_token_accuracy": 0.8470278188586235, "num_tokens": 78671464.0, "step": 65400 }, { "entropy": 1.9383513778448105, "epoch": 0.20276527695005808, "grad_norm": 11.235833168029785, "learning_rate": 5.61822601947677e-06, "loss": 0.5219, "mean_token_accuracy": 0.8409254685044288, "num_tokens": 78683244.0, "step": 65410 }, { "entropy": 1.8878853350877762, "epoch": 0.20279627607510778, "grad_norm": 4.765620231628418, "learning_rate": 5.617796599832821e-06, "loss": 0.4732, "mean_token_accuracy": 0.8444775015115737, "num_tokens": 78695252.0, "step": 65420 }, { "entropy": 1.9394584164023398, "epoch": 0.20282727520015748, "grad_norm": 9.20919132232666, "learning_rate": 5.617367278639724e-06, "loss": 0.5591, "mean_token_accuracy": 0.8318523272871972, "num_tokens": 78707014.0, "step": 65430 }, { "entropy": 1.856240051984787, "epoch": 0.20285827432520717, "grad_norm": 8.130507469177246, "learning_rate": 5.6169380558598655e-06, "loss": 0.4699, "mean_token_accuracy": 0.8484870880842209, "num_tokens": 78719124.0, "step": 65440 }, { "entropy": 1.9641065716743469, "epoch": 0.20288927345025687, "grad_norm": 11.00446891784668, "learning_rate": 5.616508931455653e-06, "loss": 0.6064, "mean_token_accuracy": 0.8264747887849808, "num_tokens": 78730740.0, "step": 65450 }, { "entropy": 1.906115210056305, "epoch": 0.20292027257530657, "grad_norm": 9.167335510253906, "learning_rate": 5.616079905389513e-06, "loss": 0.5045, "mean_token_accuracy": 0.8373008102178574, "num_tokens": 78742187.0, "step": 65460 }, { "entropy": 1.8616785183548927, "epoch": 0.20295127170035626, "grad_norm": 4.638375282287598, "learning_rate": 5.6156509776238955e-06, "loss": 0.4359, "mean_token_accuracy": 0.851144377887249, "num_tokens": 78754915.0, "step": 65470 }, { "entropy": 1.9143847838044166, "epoch": 0.20298227082540596, "grad_norm": 9.230887413024902, "learning_rate": 5.615222148121263e-06, "loss": 0.5505, "mean_token_accuracy": 0.8377783179283143, "num_tokens": 78767603.0, "step": 65480 }, { "entropy": 1.9339503601193428, "epoch": 0.20301326995045565, "grad_norm": 7.70396614074707, "learning_rate": 5.614793416844106e-06, "loss": 0.5429, "mean_token_accuracy": 0.8370099574327469, "num_tokens": 78779568.0, "step": 65490 }, { "entropy": 1.9379954680800437, "epoch": 0.20304426907550535, "grad_norm": 9.033950805664062, "learning_rate": 5.614364783754932e-06, "loss": 0.5227, "mean_token_accuracy": 0.8270322412252427, "num_tokens": 78791706.0, "step": 65500 }, { "entropy": 1.8664806455373764, "epoch": 0.20307526820055505, "grad_norm": 2.266970634460449, "learning_rate": 5.613936248816266e-06, "loss": 0.5006, "mean_token_accuracy": 0.8406695753335953, "num_tokens": 78804244.0, "step": 65510 }, { "entropy": 1.8240263767540454, "epoch": 0.20310626732560474, "grad_norm": 9.05511474609375, "learning_rate": 5.613507811990659e-06, "loss": 0.4567, "mean_token_accuracy": 0.8478545337915421, "num_tokens": 78817721.0, "step": 65520 }, { "entropy": 1.8720962792634963, "epoch": 0.20313726645065444, "grad_norm": 4.106721878051758, "learning_rate": 5.613079473240674e-06, "loss": 0.4869, "mean_token_accuracy": 0.8406256809830666, "num_tokens": 78829445.0, "step": 65530 }, { "entropy": 1.9249654412269592, "epoch": 0.20316826557570414, "grad_norm": 7.825181007385254, "learning_rate": 5.612651232528903e-06, "loss": 0.5033, "mean_token_accuracy": 0.8406661361455917, "num_tokens": 78841287.0, "step": 65540 }, { "entropy": 1.9155500084161758, "epoch": 0.20319926470075383, "grad_norm": 8.039549827575684, "learning_rate": 5.61222308981795e-06, "loss": 0.5308, "mean_token_accuracy": 0.8323755100369453, "num_tokens": 78853504.0, "step": 65550 }, { "entropy": 1.9643807247281075, "epoch": 0.2032302638258035, "grad_norm": 7.621034622192383, "learning_rate": 5.611795045070444e-06, "loss": 0.5137, "mean_token_accuracy": 0.8504145249724389, "num_tokens": 78864902.0, "step": 65560 }, { "entropy": 1.8879443630576134, "epoch": 0.2032612629508532, "grad_norm": 10.218156814575195, "learning_rate": 5.611367098249031e-06, "loss": 0.5091, "mean_token_accuracy": 0.8361433282494545, "num_tokens": 78877179.0, "step": 65570 }, { "entropy": 1.863333511352539, "epoch": 0.2032922620759029, "grad_norm": 8.165308952331543, "learning_rate": 5.61093924931638e-06, "loss": 0.4681, "mean_token_accuracy": 0.836507086455822, "num_tokens": 78889655.0, "step": 65580 }, { "entropy": 1.8848236933350564, "epoch": 0.2033232612009526, "grad_norm": 4.619635581970215, "learning_rate": 5.610511498235176e-06, "loss": 0.4952, "mean_token_accuracy": 0.8372624054551124, "num_tokens": 78901982.0, "step": 65590 }, { "entropy": 1.9587459236383438, "epoch": 0.2033542603260023, "grad_norm": 9.958028793334961, "learning_rate": 5.610083844968128e-06, "loss": 0.5397, "mean_token_accuracy": 0.8413488537073135, "num_tokens": 78912795.0, "step": 65600 }, { "entropy": 1.973938637971878, "epoch": 0.203385259451052, "grad_norm": 8.87924861907959, "learning_rate": 5.609656289477961e-06, "loss": 0.5612, "mean_token_accuracy": 0.828423522412777, "num_tokens": 78923643.0, "step": 65610 }, { "entropy": 1.9414395466446877, "epoch": 0.20341625857610168, "grad_norm": 8.88390064239502, "learning_rate": 5.609228831727426e-06, "loss": 0.5457, "mean_token_accuracy": 0.8232395872473717, "num_tokens": 78934550.0, "step": 65620 }, { "entropy": 1.9254730343818665, "epoch": 0.20344725770115138, "grad_norm": 9.035659790039062, "learning_rate": 5.6088014716792835e-06, "loss": 0.5016, "mean_token_accuracy": 0.8413706630468368, "num_tokens": 78946328.0, "step": 65630 }, { "entropy": 1.8993248373270035, "epoch": 0.20347825682620108, "grad_norm": 8.599936485290527, "learning_rate": 5.6083742092963255e-06, "loss": 0.4786, "mean_token_accuracy": 0.8389776006340981, "num_tokens": 78958262.0, "step": 65640 }, { "entropy": 1.8646668374538422, "epoch": 0.20350925595125077, "grad_norm": 9.56519603729248, "learning_rate": 5.607947044541355e-06, "loss": 0.4687, "mean_token_accuracy": 0.8369742676615715, "num_tokens": 78971613.0, "step": 65650 }, { "entropy": 1.9807878568768502, "epoch": 0.20354025507630047, "grad_norm": 7.997987747192383, "learning_rate": 5.607519977377199e-06, "loss": 0.535, "mean_token_accuracy": 0.8315387591719627, "num_tokens": 78983244.0, "step": 65660 }, { "entropy": 1.8589694291353225, "epoch": 0.20357125420135017, "grad_norm": 8.80921459197998, "learning_rate": 5.607093007766705e-06, "loss": 0.4687, "mean_token_accuracy": 0.8467966377735138, "num_tokens": 78997102.0, "step": 65670 }, { "entropy": 1.8324778914451598, "epoch": 0.20360225332639986, "grad_norm": 4.456134796142578, "learning_rate": 5.606666135672738e-06, "loss": 0.3992, "mean_token_accuracy": 0.8528119862079621, "num_tokens": 79011102.0, "step": 65680 }, { "entropy": 1.94354690015316, "epoch": 0.20363325245144956, "grad_norm": 8.482820510864258, "learning_rate": 5.6062393610581824e-06, "loss": 0.4913, "mean_token_accuracy": 0.8464238449931145, "num_tokens": 79022653.0, "step": 65690 }, { "entropy": 1.8365743920207023, "epoch": 0.20366425157649926, "grad_norm": 3.94012188911438, "learning_rate": 5.605812683885945e-06, "loss": 0.4387, "mean_token_accuracy": 0.8460432440042496, "num_tokens": 79035953.0, "step": 65700 }, { "entropy": 1.9237422615289688, "epoch": 0.20369525070154895, "grad_norm": 8.296760559082031, "learning_rate": 5.6053861041189515e-06, "loss": 0.5001, "mean_token_accuracy": 0.8385673075914383, "num_tokens": 79049233.0, "step": 65710 }, { "entropy": 1.9227632522583007, "epoch": 0.20372624982659865, "grad_norm": 4.054368019104004, "learning_rate": 5.604959621720145e-06, "loss": 0.5091, "mean_token_accuracy": 0.8334377571940422, "num_tokens": 79061362.0, "step": 65720 }, { "entropy": 1.9160942152142524, "epoch": 0.20375724895164835, "grad_norm": 3.290419340133667, "learning_rate": 5.604533236652492e-06, "loss": 0.4808, "mean_token_accuracy": 0.840754111111164, "num_tokens": 79073746.0, "step": 65730 }, { "entropy": 1.9141247898340226, "epoch": 0.20378824807669804, "grad_norm": 8.813379287719727, "learning_rate": 5.604106948878974e-06, "loss": 0.4842, "mean_token_accuracy": 0.846279302239418, "num_tokens": 79086069.0, "step": 65740 }, { "entropy": 1.8690850347280503, "epoch": 0.20381924720174774, "grad_norm": 3.4831085205078125, "learning_rate": 5.603680758362599e-06, "loss": 0.4491, "mean_token_accuracy": 0.844321160018444, "num_tokens": 79098143.0, "step": 65750 }, { "entropy": 1.9780056357383728, "epoch": 0.20385024632679744, "grad_norm": 8.82219409942627, "learning_rate": 5.603254665066387e-06, "loss": 0.5811, "mean_token_accuracy": 0.8181054502725601, "num_tokens": 79109063.0, "step": 65760 }, { "entropy": 1.9268645226955414, "epoch": 0.20388124545184713, "grad_norm": 7.107331275939941, "learning_rate": 5.602828668953384e-06, "loss": 0.4698, "mean_token_accuracy": 0.8484486505389214, "num_tokens": 79120352.0, "step": 65770 }, { "entropy": 1.9007486268877982, "epoch": 0.20391224457689683, "grad_norm": 8.633251190185547, "learning_rate": 5.602402769986652e-06, "loss": 0.4971, "mean_token_accuracy": 0.8409877792000771, "num_tokens": 79131530.0, "step": 65780 }, { "entropy": 1.873155763745308, "epoch": 0.20394324370194652, "grad_norm": 4.75627326965332, "learning_rate": 5.601976968129274e-06, "loss": 0.4344, "mean_token_accuracy": 0.8587671235203743, "num_tokens": 79143461.0, "step": 65790 }, { "entropy": 1.895801869034767, "epoch": 0.20397424282699622, "grad_norm": 6.479430675506592, "learning_rate": 5.6015512633443526e-06, "loss": 0.5122, "mean_token_accuracy": 0.8410785019397735, "num_tokens": 79155115.0, "step": 65800 }, { "entropy": 1.8817868903279305, "epoch": 0.2040052419520459, "grad_norm": 7.321788787841797, "learning_rate": 5.60112565559501e-06, "loss": 0.48, "mean_token_accuracy": 0.8473842695355416, "num_tokens": 79168062.0, "step": 65810 }, { "entropy": 1.9196514919400216, "epoch": 0.2040362410770956, "grad_norm": 4.99709415435791, "learning_rate": 5.600700144844387e-06, "loss": 0.5157, "mean_token_accuracy": 0.8285318657755851, "num_tokens": 79180238.0, "step": 65820 }, { "entropy": 1.880447769165039, "epoch": 0.20406724020214528, "grad_norm": 10.029952049255371, "learning_rate": 5.600274731055645e-06, "loss": 0.5018, "mean_token_accuracy": 0.8361893489956855, "num_tokens": 79193071.0, "step": 65830 }, { "entropy": 1.9641305297613143, "epoch": 0.20409823932719498, "grad_norm": 8.848322868347168, "learning_rate": 5.599849414191965e-06, "loss": 0.5383, "mean_token_accuracy": 0.8405619546771049, "num_tokens": 79204235.0, "step": 65840 }, { "entropy": 1.896867436170578, "epoch": 0.20412923845224468, "grad_norm": 9.602326393127441, "learning_rate": 5.599424194216547e-06, "loss": 0.492, "mean_token_accuracy": 0.8406928956508637, "num_tokens": 79216424.0, "step": 65850 }, { "entropy": 1.9198857069015502, "epoch": 0.20416023757729437, "grad_norm": 10.032066345214844, "learning_rate": 5.598999071092613e-06, "loss": 0.4794, "mean_token_accuracy": 0.8465869203209877, "num_tokens": 79227957.0, "step": 65860 }, { "entropy": 1.8796373263001442, "epoch": 0.20419123670234407, "grad_norm": 4.095681190490723, "learning_rate": 5.598574044783399e-06, "loss": 0.4646, "mean_token_accuracy": 0.8457603216171264, "num_tokens": 79241020.0, "step": 65870 }, { "entropy": 1.890330995619297, "epoch": 0.20422223582739377, "grad_norm": 8.238532066345215, "learning_rate": 5.598149115252166e-06, "loss": 0.4564, "mean_token_accuracy": 0.8491918399930001, "num_tokens": 79254022.0, "step": 65880 }, { "entropy": 1.960501140356064, "epoch": 0.20425323495244346, "grad_norm": 9.54404354095459, "learning_rate": 5.597724282462193e-06, "loss": 0.5882, "mean_token_accuracy": 0.8301891297101974, "num_tokens": 79265375.0, "step": 65890 }, { "entropy": 1.8832382291555405, "epoch": 0.20428423407749316, "grad_norm": 10.813145637512207, "learning_rate": 5.597299546376778e-06, "loss": 0.5184, "mean_token_accuracy": 0.8315230071544647, "num_tokens": 79277983.0, "step": 65900 }, { "entropy": 1.9647821724414825, "epoch": 0.20431523320254286, "grad_norm": 8.538866996765137, "learning_rate": 5.596874906959238e-06, "loss": 0.5566, "mean_token_accuracy": 0.8316359147429466, "num_tokens": 79289116.0, "step": 65910 }, { "entropy": 1.8251458272337913, "epoch": 0.20434623232759255, "grad_norm": 8.377923965454102, "learning_rate": 5.596450364172909e-06, "loss": 0.3959, "mean_token_accuracy": 0.86036896109581, "num_tokens": 79301925.0, "step": 65920 }, { "entropy": 1.9117643371224404, "epoch": 0.20437723145264225, "grad_norm": 8.313860893249512, "learning_rate": 5.596025917981147e-06, "loss": 0.5422, "mean_token_accuracy": 0.8350561752915382, "num_tokens": 79313308.0, "step": 65930 }, { "entropy": 1.9248536437749864, "epoch": 0.20440823057769195, "grad_norm": 7.247808933258057, "learning_rate": 5.595601568347332e-06, "loss": 0.5714, "mean_token_accuracy": 0.8272919088602066, "num_tokens": 79325093.0, "step": 65940 }, { "entropy": 1.9249488562345505, "epoch": 0.20443922970274164, "grad_norm": 4.61583137512207, "learning_rate": 5.5951773152348545e-06, "loss": 0.4612, "mean_token_accuracy": 0.8503524556756019, "num_tokens": 79336001.0, "step": 65950 }, { "entropy": 1.8732093065977096, "epoch": 0.20447022882779134, "grad_norm": 8.46263313293457, "learning_rate": 5.594753158607133e-06, "loss": 0.4757, "mean_token_accuracy": 0.8438627734780312, "num_tokens": 79347710.0, "step": 65960 }, { "entropy": 1.8675259336829186, "epoch": 0.20450122795284104, "grad_norm": 9.65857219696045, "learning_rate": 5.594329098427599e-06, "loss": 0.4691, "mean_token_accuracy": 0.8433469668030739, "num_tokens": 79359872.0, "step": 65970 }, { "entropy": 1.8642318069934845, "epoch": 0.20453222707789073, "grad_norm": 3.6823036670684814, "learning_rate": 5.5939051346597075e-06, "loss": 0.4386, "mean_token_accuracy": 0.853775417804718, "num_tokens": 79372088.0, "step": 65980 }, { "entropy": 1.7361238494515419, "epoch": 0.20456322620294043, "grad_norm": 3.253519296646118, "learning_rate": 5.59348126726693e-06, "loss": 0.4458, "mean_token_accuracy": 0.8488474145531655, "num_tokens": 79386133.0, "step": 65990 }, { "entropy": 1.9137817591428756, "epoch": 0.20459422532799013, "grad_norm": 9.194711685180664, "learning_rate": 5.593057496212762e-06, "loss": 0.5272, "mean_token_accuracy": 0.8341950923204422, "num_tokens": 79397436.0, "step": 66000 }, { "entropy": 1.8905989840626716, "epoch": 0.20462522445303982, "grad_norm": 9.221059799194336, "learning_rate": 5.592633821460712e-06, "loss": 0.5225, "mean_token_accuracy": 0.8302080318331718, "num_tokens": 79409957.0, "step": 66010 }, { "entropy": 1.9771162524819375, "epoch": 0.20465622357808952, "grad_norm": 8.793451309204102, "learning_rate": 5.592210242974312e-06, "loss": 0.5649, "mean_token_accuracy": 0.8244690522551537, "num_tokens": 79421218.0, "step": 66020 }, { "entropy": 1.9063599698245526, "epoch": 0.20468722270313922, "grad_norm": 8.209443092346191, "learning_rate": 5.5917867607171115e-06, "loss": 0.5213, "mean_token_accuracy": 0.8356138646602631, "num_tokens": 79432683.0, "step": 66030 }, { "entropy": 1.9731371477246284, "epoch": 0.2047182218281889, "grad_norm": 8.800919532775879, "learning_rate": 5.5913633746526845e-06, "loss": 0.5424, "mean_token_accuracy": 0.8384016111493111, "num_tokens": 79443817.0, "step": 66040 }, { "entropy": 1.9695898175239563, "epoch": 0.2047492209532386, "grad_norm": 5.338846206665039, "learning_rate": 5.590940084744614e-06, "loss": 0.556, "mean_token_accuracy": 0.8361503854393959, "num_tokens": 79455128.0, "step": 66050 }, { "entropy": 1.9504301056265831, "epoch": 0.20478022007828828, "grad_norm": 6.454843997955322, "learning_rate": 5.590516890956512e-06, "loss": 0.4941, "mean_token_accuracy": 0.8300734832882881, "num_tokens": 79466545.0, "step": 66060 }, { "entropy": 1.916282233595848, "epoch": 0.20481121920333797, "grad_norm": 10.762775421142578, "learning_rate": 5.590093793252005e-06, "loss": 0.5018, "mean_token_accuracy": 0.850112085044384, "num_tokens": 79477559.0, "step": 66070 }, { "entropy": 1.932596181333065, "epoch": 0.20484221832838767, "grad_norm": 8.139491081237793, "learning_rate": 5.5896707915947404e-06, "loss": 0.5233, "mean_token_accuracy": 0.8297857508063317, "num_tokens": 79489215.0, "step": 66080 }, { "entropy": 1.7636524528264999, "epoch": 0.20487321745343737, "grad_norm": 3.8502655029296875, "learning_rate": 5.5892478859483836e-06, "loss": 0.4123, "mean_token_accuracy": 0.8482681661844254, "num_tokens": 79503087.0, "step": 66090 }, { "entropy": 1.9150663495063782, "epoch": 0.20490421657848706, "grad_norm": 8.005017280578613, "learning_rate": 5.588825076276619e-06, "loss": 0.5327, "mean_token_accuracy": 0.8414066791534424, "num_tokens": 79514977.0, "step": 66100 }, { "entropy": 1.884339025616646, "epoch": 0.20493521570353676, "grad_norm": 12.069209098815918, "learning_rate": 5.5884023625431536e-06, "loss": 0.4926, "mean_token_accuracy": 0.8346866846084595, "num_tokens": 79527912.0, "step": 66110 }, { "entropy": 1.9233988001942635, "epoch": 0.20496621482858646, "grad_norm": 7.263315200805664, "learning_rate": 5.58797974471171e-06, "loss": 0.5187, "mean_token_accuracy": 0.8440830901265144, "num_tokens": 79539486.0, "step": 66120 }, { "entropy": 1.9001603171229362, "epoch": 0.20499721395363615, "grad_norm": 8.054461479187012, "learning_rate": 5.587557222746031e-06, "loss": 0.4695, "mean_token_accuracy": 0.8448784217238426, "num_tokens": 79551307.0, "step": 66130 }, { "entropy": 1.966906487941742, "epoch": 0.20502821307868585, "grad_norm": 7.843715190887451, "learning_rate": 5.587134796609878e-06, "loss": 0.534, "mean_token_accuracy": 0.8330809384584427, "num_tokens": 79563106.0, "step": 66140 }, { "entropy": 1.9580473832786083, "epoch": 0.20505921220373555, "grad_norm": 8.565571784973145, "learning_rate": 5.586712466267033e-06, "loss": 0.4751, "mean_token_accuracy": 0.8391212552785874, "num_tokens": 79574679.0, "step": 66150 }, { "entropy": 1.9535856321454048, "epoch": 0.20509021132878524, "grad_norm": 7.142053604125977, "learning_rate": 5.586290231681297e-06, "loss": 0.5079, "mean_token_accuracy": 0.8499238818883896, "num_tokens": 79586243.0, "step": 66160 }, { "entropy": 1.832335925102234, "epoch": 0.20512121045383494, "grad_norm": 4.120788097381592, "learning_rate": 5.58586809281649e-06, "loss": 0.4294, "mean_token_accuracy": 0.8358307898044586, "num_tokens": 79599760.0, "step": 66170 }, { "entropy": 2.0018374592065813, "epoch": 0.20515220957888464, "grad_norm": 9.204635620117188, "learning_rate": 5.585446049636449e-06, "loss": 0.5362, "mean_token_accuracy": 0.8410599693655968, "num_tokens": 79610302.0, "step": 66180 }, { "entropy": 1.914768175780773, "epoch": 0.20518320870393433, "grad_norm": 9.314388275146484, "learning_rate": 5.585024102105034e-06, "loss": 0.5387, "mean_token_accuracy": 0.8398109778761864, "num_tokens": 79621486.0, "step": 66190 }, { "entropy": 1.8635256588459015, "epoch": 0.20521420782898403, "grad_norm": 8.0902738571167, "learning_rate": 5.5846022501861204e-06, "loss": 0.461, "mean_token_accuracy": 0.8411575838923454, "num_tokens": 79634109.0, "step": 66200 }, { "entropy": 1.9430198609828948, "epoch": 0.20524520695403373, "grad_norm": 9.754541397094727, "learning_rate": 5.584180493843605e-06, "loss": 0.5164, "mean_token_accuracy": 0.8356720983982087, "num_tokens": 79645040.0, "step": 66210 }, { "entropy": 1.9141881585121154, "epoch": 0.20527620607908342, "grad_norm": 4.26265287399292, "learning_rate": 5.583758833041404e-06, "loss": 0.5076, "mean_token_accuracy": 0.8484968930482865, "num_tokens": 79656732.0, "step": 66220 }, { "entropy": 1.9716264665126801, "epoch": 0.20530720520413312, "grad_norm": 8.999820709228516, "learning_rate": 5.583337267743449e-06, "loss": 0.5258, "mean_token_accuracy": 0.8347686797380447, "num_tokens": 79667904.0, "step": 66230 }, { "entropy": 1.9396614864468575, "epoch": 0.20533820432918282, "grad_norm": 8.992365837097168, "learning_rate": 5.582915797913695e-06, "loss": 0.5204, "mean_token_accuracy": 0.8403194323182106, "num_tokens": 79679499.0, "step": 66240 }, { "entropy": 1.9468914091587066, "epoch": 0.2053692034542325, "grad_norm": 3.764883279800415, "learning_rate": 5.582494423516115e-06, "loss": 0.5039, "mean_token_accuracy": 0.8384948015213013, "num_tokens": 79690852.0, "step": 66250 }, { "entropy": 1.9295331597328187, "epoch": 0.2054002025792822, "grad_norm": 7.999744415283203, "learning_rate": 5.582073144514698e-06, "loss": 0.5274, "mean_token_accuracy": 0.8378969594836235, "num_tokens": 79701855.0, "step": 66260 }, { "entropy": 1.9138659819960595, "epoch": 0.2054312017043319, "grad_norm": 7.380083084106445, "learning_rate": 5.5816519608734575e-06, "loss": 0.5216, "mean_token_accuracy": 0.8312686800956726, "num_tokens": 79713471.0, "step": 66270 }, { "entropy": 1.8375635713338851, "epoch": 0.2054622008293816, "grad_norm": 6.883208751678467, "learning_rate": 5.58123087255642e-06, "loss": 0.4478, "mean_token_accuracy": 0.8480865985155106, "num_tokens": 79726767.0, "step": 66280 }, { "entropy": 1.8265572801232337, "epoch": 0.2054931999544313, "grad_norm": 10.417244911193848, "learning_rate": 5.580809879527636e-06, "loss": 0.4507, "mean_token_accuracy": 0.8491293832659721, "num_tokens": 79739638.0, "step": 66290 }, { "entropy": 1.8894486725330353, "epoch": 0.20552419907948097, "grad_norm": 6.471380710601807, "learning_rate": 5.580388981751174e-06, "loss": 0.5406, "mean_token_accuracy": 0.8371762230992317, "num_tokens": 79752223.0, "step": 66300 }, { "entropy": 1.7594513550400734, "epoch": 0.20555519820453066, "grad_norm": 3.132899522781372, "learning_rate": 5.579968179191117e-06, "loss": 0.4459, "mean_token_accuracy": 0.8502689436078071, "num_tokens": 79766341.0, "step": 66310 }, { "entropy": 1.7812666043639183, "epoch": 0.20558619732958036, "grad_norm": 3.3629016876220703, "learning_rate": 5.579547471811571e-06, "loss": 0.4518, "mean_token_accuracy": 0.8472658976912498, "num_tokens": 79779942.0, "step": 66320 }, { "entropy": 1.908031238615513, "epoch": 0.20561719645463006, "grad_norm": 9.27261734008789, "learning_rate": 5.579126859576662e-06, "loss": 0.4839, "mean_token_accuracy": 0.8606662392616272, "num_tokens": 79791530.0, "step": 66330 }, { "entropy": 1.912331785261631, "epoch": 0.20564819557967975, "grad_norm": 6.716431140899658, "learning_rate": 5.578706342450532e-06, "loss": 0.5208, "mean_token_accuracy": 0.8352584257721901, "num_tokens": 79804036.0, "step": 66340 }, { "entropy": 1.886544433236122, "epoch": 0.20567919470472945, "grad_norm": 7.406538963317871, "learning_rate": 5.578285920397344e-06, "loss": 0.469, "mean_token_accuracy": 0.839650048315525, "num_tokens": 79816050.0, "step": 66350 }, { "entropy": 1.9246870696544647, "epoch": 0.20571019382977915, "grad_norm": 9.786649703979492, "learning_rate": 5.577865593381278e-06, "loss": 0.4991, "mean_token_accuracy": 0.8357265755534172, "num_tokens": 79827947.0, "step": 66360 }, { "entropy": 1.9001474544405936, "epoch": 0.20574119295482884, "grad_norm": 5.080446243286133, "learning_rate": 5.577445361366534e-06, "loss": 0.461, "mean_token_accuracy": 0.8487439841032028, "num_tokens": 79839935.0, "step": 66370 }, { "entropy": 1.9511905193328858, "epoch": 0.20577219207987854, "grad_norm": 9.604957580566406, "learning_rate": 5.577025224317332e-06, "loss": 0.536, "mean_token_accuracy": 0.8385162249207496, "num_tokens": 79851151.0, "step": 66380 }, { "entropy": 1.9839515179395675, "epoch": 0.20580319120492824, "grad_norm": 8.998725891113281, "learning_rate": 5.576605182197907e-06, "loss": 0.5523, "mean_token_accuracy": 0.8333553358912468, "num_tokens": 79862474.0, "step": 66390 }, { "entropy": 2.031079703569412, "epoch": 0.20583419032997793, "grad_norm": 9.277356147766113, "learning_rate": 5.57618523497252e-06, "loss": 0.5982, "mean_token_accuracy": 0.822320033609867, "num_tokens": 79873184.0, "step": 66400 }, { "entropy": 1.959704938530922, "epoch": 0.20586518945502763, "grad_norm": 9.140758514404297, "learning_rate": 5.575765382605441e-06, "loss": 0.5058, "mean_token_accuracy": 0.8415651440620422, "num_tokens": 79884713.0, "step": 66410 }, { "entropy": 1.9794253259897232, "epoch": 0.20589618858007733, "grad_norm": 10.414609909057617, "learning_rate": 5.575345625060967e-06, "loss": 0.5002, "mean_token_accuracy": 0.844949309527874, "num_tokens": 79895411.0, "step": 66420 }, { "entropy": 1.9424155369400977, "epoch": 0.20592718770512702, "grad_norm": 7.093960762023926, "learning_rate": 5.574925962303411e-06, "loss": 0.4719, "mean_token_accuracy": 0.8516281858086586, "num_tokens": 79906896.0, "step": 66430 }, { "entropy": 1.966086308658123, "epoch": 0.20595818683017672, "grad_norm": 9.383204460144043, "learning_rate": 5.574506394297104e-06, "loss": 0.5155, "mean_token_accuracy": 0.8342742830514908, "num_tokens": 79918533.0, "step": 66440 }, { "entropy": 1.9549981564283372, "epoch": 0.20598918595522642, "grad_norm": 7.823967456817627, "learning_rate": 5.574086921006398e-06, "loss": 0.5041, "mean_token_accuracy": 0.8452614173293114, "num_tokens": 79930153.0, "step": 66450 }, { "entropy": 1.9196519732475281, "epoch": 0.2060201850802761, "grad_norm": 10.632098197937012, "learning_rate": 5.57366754239566e-06, "loss": 0.4804, "mean_token_accuracy": 0.8417575985193253, "num_tokens": 79941908.0, "step": 66460 }, { "entropy": 1.9567938655614854, "epoch": 0.2060511842053258, "grad_norm": 7.4412150382995605, "learning_rate": 5.57324825842928e-06, "loss": 0.5404, "mean_token_accuracy": 0.8311595633625984, "num_tokens": 79953034.0, "step": 66470 }, { "entropy": 1.9112473011016846, "epoch": 0.2060821833303755, "grad_norm": 10.366971015930176, "learning_rate": 5.572829069071665e-06, "loss": 0.4727, "mean_token_accuracy": 0.8476077273488045, "num_tokens": 79964657.0, "step": 66480 }, { "entropy": 1.9210296481847764, "epoch": 0.2061131824554252, "grad_norm": 9.77088451385498, "learning_rate": 5.572409974287238e-06, "loss": 0.525, "mean_token_accuracy": 0.8393374785780907, "num_tokens": 79976836.0, "step": 66490 }, { "entropy": 1.9088515147566796, "epoch": 0.2061441815804749, "grad_norm": 8.623429298400879, "learning_rate": 5.5719909740404465e-06, "loss": 0.5084, "mean_token_accuracy": 0.8390779867768288, "num_tokens": 79988995.0, "step": 66500 }, { "entropy": 1.8673071622848512, "epoch": 0.2061751807055246, "grad_norm": 4.687022686004639, "learning_rate": 5.571572068295751e-06, "loss": 0.451, "mean_token_accuracy": 0.8348147094249725, "num_tokens": 80001572.0, "step": 66510 }, { "entropy": 1.9063550010323524, "epoch": 0.2062061798305743, "grad_norm": 7.440145015716553, "learning_rate": 5.571153257017634e-06, "loss": 0.5065, "mean_token_accuracy": 0.8437089160084724, "num_tokens": 80013203.0, "step": 66520 }, { "entropy": 1.8806561693549155, "epoch": 0.206237178955624, "grad_norm": 3.6028239727020264, "learning_rate": 5.570734540170597e-06, "loss": 0.4346, "mean_token_accuracy": 0.8492167726159096, "num_tokens": 80025389.0, "step": 66530 }, { "entropy": 2.0046221882104875, "epoch": 0.2062681780806737, "grad_norm": 10.231752395629883, "learning_rate": 5.570315917719158e-06, "loss": 0.5636, "mean_token_accuracy": 0.8313092350959778, "num_tokens": 80036574.0, "step": 66540 }, { "entropy": 1.9903879404067992, "epoch": 0.20629917720572336, "grad_norm": 9.108248710632324, "learning_rate": 5.569897389627855e-06, "loss": 0.6043, "mean_token_accuracy": 0.8205847591161728, "num_tokens": 80048154.0, "step": 66550 }, { "entropy": 1.9472715437412262, "epoch": 0.20633017633077305, "grad_norm": 8.83901596069336, "learning_rate": 5.569478955861244e-06, "loss": 0.5332, "mean_token_accuracy": 0.8277617216110229, "num_tokens": 80060061.0, "step": 66560 }, { "entropy": 1.9148873046040535, "epoch": 0.20636117545582275, "grad_norm": 8.391643524169922, "learning_rate": 5.5690606163839e-06, "loss": 0.5329, "mean_token_accuracy": 0.8314318493008613, "num_tokens": 80071978.0, "step": 66570 }, { "entropy": 1.9878364622592926, "epoch": 0.20639217458087245, "grad_norm": 9.095196723937988, "learning_rate": 5.568642371160418e-06, "loss": 0.5044, "mean_token_accuracy": 0.8388659507036209, "num_tokens": 80083500.0, "step": 66580 }, { "entropy": 1.941962979733944, "epoch": 0.20642317370592214, "grad_norm": 10.697854995727539, "learning_rate": 5.568224220155408e-06, "loss": 0.5167, "mean_token_accuracy": 0.833092825114727, "num_tokens": 80095609.0, "step": 66590 }, { "entropy": 1.8618491291999817, "epoch": 0.20645417283097184, "grad_norm": 3.9840543270111084, "learning_rate": 5.567806163333503e-06, "loss": 0.4091, "mean_token_accuracy": 0.8554950803518295, "num_tokens": 80108181.0, "step": 66600 }, { "entropy": 1.8243538379669189, "epoch": 0.20648517195602153, "grad_norm": 3.919654130935669, "learning_rate": 5.5673882006593514e-06, "loss": 0.4579, "mean_token_accuracy": 0.8437583655118942, "num_tokens": 80121901.0, "step": 66610 }, { "entropy": 1.950028759241104, "epoch": 0.20651617108107123, "grad_norm": 9.593600273132324, "learning_rate": 5.566970332097621e-06, "loss": 0.5336, "mean_token_accuracy": 0.8381760567426682, "num_tokens": 80132818.0, "step": 66620 }, { "entropy": 1.9005480587482453, "epoch": 0.20654717020612093, "grad_norm": 8.083364486694336, "learning_rate": 5.5665525576129985e-06, "loss": 0.5259, "mean_token_accuracy": 0.837388077378273, "num_tokens": 80145296.0, "step": 66630 }, { "entropy": 1.9510286539793014, "epoch": 0.20657816933117062, "grad_norm": 9.924178123474121, "learning_rate": 5.566134877170189e-06, "loss": 0.5598, "mean_token_accuracy": 0.8308894857764244, "num_tokens": 80156187.0, "step": 66640 }, { "entropy": 1.9527115762233733, "epoch": 0.20660916845622032, "grad_norm": 8.096648216247559, "learning_rate": 5.565717290733918e-06, "loss": 0.5243, "mean_token_accuracy": 0.8337409555912018, "num_tokens": 80167814.0, "step": 66650 }, { "entropy": 2.028332456946373, "epoch": 0.20664016758127002, "grad_norm": 8.106879234313965, "learning_rate": 5.565299798268925e-06, "loss": 0.5553, "mean_token_accuracy": 0.8302205935120582, "num_tokens": 80178500.0, "step": 66660 }, { "entropy": 2.0136495500802996, "epoch": 0.20667116670631971, "grad_norm": 10.762166023254395, "learning_rate": 5.5648823997399714e-06, "loss": 0.5889, "mean_token_accuracy": 0.8241096153855324, "num_tokens": 80189113.0, "step": 66670 }, { "entropy": 1.8538544356822968, "epoch": 0.2067021658313694, "grad_norm": 9.980530738830566, "learning_rate": 5.564465095111836e-06, "loss": 0.4405, "mean_token_accuracy": 0.854385307431221, "num_tokens": 80201832.0, "step": 66680 }, { "entropy": 1.9106570437550545, "epoch": 0.2067331649564191, "grad_norm": 8.356368064880371, "learning_rate": 5.564047884349318e-06, "loss": 0.4916, "mean_token_accuracy": 0.8357516199350357, "num_tokens": 80213809.0, "step": 66690 }, { "entropy": 1.9118531972169877, "epoch": 0.2067641640814688, "grad_norm": 7.672004699707031, "learning_rate": 5.563630767417233e-06, "loss": 0.4976, "mean_token_accuracy": 0.8388531014323235, "num_tokens": 80226123.0, "step": 66700 }, { "entropy": 1.8637908115983008, "epoch": 0.2067951632065185, "grad_norm": 4.457185745239258, "learning_rate": 5.563213744280416e-06, "loss": 0.4235, "mean_token_accuracy": 0.849481725692749, "num_tokens": 80238328.0, "step": 66710 }, { "entropy": 2.0010482162237166, "epoch": 0.2068261623315682, "grad_norm": 10.969298362731934, "learning_rate": 5.562796814903717e-06, "loss": 0.5952, "mean_token_accuracy": 0.8272617772221565, "num_tokens": 80249249.0, "step": 66720 }, { "entropy": 1.8944606065750123, "epoch": 0.2068571614566179, "grad_norm": 8.652742385864258, "learning_rate": 5.562379979252011e-06, "loss": 0.5298, "mean_token_accuracy": 0.8307804465293884, "num_tokens": 80262484.0, "step": 66730 }, { "entropy": 1.8551345482468604, "epoch": 0.2068881605816676, "grad_norm": 9.101856231689453, "learning_rate": 5.5619632372901865e-06, "loss": 0.4624, "mean_token_accuracy": 0.8466040581464768, "num_tokens": 80275457.0, "step": 66740 }, { "entropy": 1.8982827827334403, "epoch": 0.2069191597067173, "grad_norm": 7.065288543701172, "learning_rate": 5.561546588983153e-06, "loss": 0.4591, "mean_token_accuracy": 0.851127202808857, "num_tokens": 80287822.0, "step": 66750 }, { "entropy": 1.9204147845506667, "epoch": 0.20695015883176698, "grad_norm": 3.726187229156494, "learning_rate": 5.561130034295834e-06, "loss": 0.5046, "mean_token_accuracy": 0.8382663875818253, "num_tokens": 80299151.0, "step": 66760 }, { "entropy": 1.8413376584649086, "epoch": 0.20698115795681668, "grad_norm": 3.96382737159729, "learning_rate": 5.560713573193179e-06, "loss": 0.4529, "mean_token_accuracy": 0.8472777932882309, "num_tokens": 80311811.0, "step": 66770 }, { "entropy": 1.8229566425085069, "epoch": 0.20701215708186638, "grad_norm": 9.275156021118164, "learning_rate": 5.560297205640148e-06, "loss": 0.4624, "mean_token_accuracy": 0.8521854087710381, "num_tokens": 80324515.0, "step": 66780 }, { "entropy": 1.9010821655392647, "epoch": 0.20704315620691607, "grad_norm": 3.5451276302337646, "learning_rate": 5.559880931601726e-06, "loss": 0.4801, "mean_token_accuracy": 0.8351982876658439, "num_tokens": 80337231.0, "step": 66790 }, { "entropy": 1.9652031093835831, "epoch": 0.20707415533196574, "grad_norm": 9.789170265197754, "learning_rate": 5.559464751042909e-06, "loss": 0.5863, "mean_token_accuracy": 0.8287955492734909, "num_tokens": 80348415.0, "step": 66800 }, { "entropy": 1.9459122210741042, "epoch": 0.20710515445701544, "grad_norm": 8.680326461791992, "learning_rate": 5.559048663928719e-06, "loss": 0.5252, "mean_token_accuracy": 0.837482500076294, "num_tokens": 80360317.0, "step": 66810 }, { "entropy": 1.882257905602455, "epoch": 0.20713615358206514, "grad_norm": 9.22433090209961, "learning_rate": 5.558632670224192e-06, "loss": 0.4569, "mean_token_accuracy": 0.8451365128159523, "num_tokens": 80372302.0, "step": 66820 }, { "entropy": 1.9027293175458908, "epoch": 0.20716715270711483, "grad_norm": 10.963581085205078, "learning_rate": 5.558216769894383e-06, "loss": 0.5326, "mean_token_accuracy": 0.8382361069321632, "num_tokens": 80384407.0, "step": 66830 }, { "entropy": 1.9402805969119072, "epoch": 0.20719815183216453, "grad_norm": 10.329010009765625, "learning_rate": 5.557800962904364e-06, "loss": 0.5305, "mean_token_accuracy": 0.8304101303219795, "num_tokens": 80396463.0, "step": 66840 }, { "entropy": 1.9431497290730477, "epoch": 0.20722915095721423, "grad_norm": 6.116969108581543, "learning_rate": 5.557385249219228e-06, "loss": 0.5488, "mean_token_accuracy": 0.8325564071536065, "num_tokens": 80407622.0, "step": 66850 }, { "entropy": 1.9404701754450797, "epoch": 0.20726015008226392, "grad_norm": 8.709739685058594, "learning_rate": 5.5569696288040865e-06, "loss": 0.5052, "mean_token_accuracy": 0.8362139865756035, "num_tokens": 80418746.0, "step": 66860 }, { "entropy": 1.833868359029293, "epoch": 0.20729114920731362, "grad_norm": 3.70329213142395, "learning_rate": 5.5565541016240665e-06, "loss": 0.4345, "mean_token_accuracy": 0.8538075044751168, "num_tokens": 80431962.0, "step": 66870 }, { "entropy": 1.9629965022206306, "epoch": 0.20732214833236332, "grad_norm": 7.854671478271484, "learning_rate": 5.556138667644313e-06, "loss": 0.5174, "mean_token_accuracy": 0.8370448827743531, "num_tokens": 80443654.0, "step": 66880 }, { "entropy": 1.8818984359502793, "epoch": 0.207353147457413, "grad_norm": 9.3052396774292, "learning_rate": 5.5557233268299925e-06, "loss": 0.4549, "mean_token_accuracy": 0.8465585172176361, "num_tokens": 80456312.0, "step": 66890 }, { "entropy": 1.925614383816719, "epoch": 0.2073841465824627, "grad_norm": 8.081002235412598, "learning_rate": 5.555308079146288e-06, "loss": 0.5944, "mean_token_accuracy": 0.8237112745642662, "num_tokens": 80468324.0, "step": 66900 }, { "entropy": 1.9605086162686347, "epoch": 0.2074151457075124, "grad_norm": 9.929429054260254, "learning_rate": 5.554892924558401e-06, "loss": 0.5574, "mean_token_accuracy": 0.838320504128933, "num_tokens": 80480134.0, "step": 66910 }, { "entropy": 1.9550656154751778, "epoch": 0.2074461448325621, "grad_norm": 9.695679664611816, "learning_rate": 5.554477863031548e-06, "loss": 0.5097, "mean_token_accuracy": 0.8479965060949326, "num_tokens": 80491369.0, "step": 66920 }, { "entropy": 1.7863646060228349, "epoch": 0.2074771439576118, "grad_norm": 8.903475761413574, "learning_rate": 5.55406289453097e-06, "loss": 0.4342, "mean_token_accuracy": 0.8561421141028405, "num_tokens": 80504627.0, "step": 66930 }, { "entropy": 1.9312346950173378, "epoch": 0.2075081430826615, "grad_norm": 8.315851211547852, "learning_rate": 5.553648019021922e-06, "loss": 0.5116, "mean_token_accuracy": 0.8424394950270653, "num_tokens": 80515858.0, "step": 66940 }, { "entropy": 1.9585540309548377, "epoch": 0.2075391422077112, "grad_norm": 8.037333488464355, "learning_rate": 5.553233236469678e-06, "loss": 0.5528, "mean_token_accuracy": 0.8312124446034431, "num_tokens": 80527650.0, "step": 66950 }, { "entropy": 1.885070489346981, "epoch": 0.2075701413327609, "grad_norm": 3.955148458480835, "learning_rate": 5.552818546839529e-06, "loss": 0.4427, "mean_token_accuracy": 0.8447909742593765, "num_tokens": 80539892.0, "step": 66960 }, { "entropy": 1.9014841109514236, "epoch": 0.20760114045781058, "grad_norm": 8.854087829589844, "learning_rate": 5.552403950096787e-06, "loss": 0.4729, "mean_token_accuracy": 0.8488608747720718, "num_tokens": 80551351.0, "step": 66970 }, { "entropy": 1.9420479103922843, "epoch": 0.20763213958286028, "grad_norm": 8.27338981628418, "learning_rate": 5.551989446206778e-06, "loss": 0.5369, "mean_token_accuracy": 0.8347111612558364, "num_tokens": 80563303.0, "step": 66980 }, { "entropy": 1.830499567091465, "epoch": 0.20766313870790998, "grad_norm": 8.519128799438477, "learning_rate": 5.55157503513485e-06, "loss": 0.4133, "mean_token_accuracy": 0.8499986290931701, "num_tokens": 80575761.0, "step": 66990 }, { "entropy": 1.8168928176164627, "epoch": 0.20769413783295967, "grad_norm": 3.4133358001708984, "learning_rate": 5.551160716846368e-06, "loss": 0.4532, "mean_token_accuracy": 0.8549173071980476, "num_tokens": 80588988.0, "step": 67000 }, { "entropy": 1.7608814522624017, "epoch": 0.20772513695800937, "grad_norm": 9.151144981384277, "learning_rate": 5.550746491306713e-06, "loss": 0.4139, "mean_token_accuracy": 0.8523845434188843, "num_tokens": 80602468.0, "step": 67010 }, { "entropy": 1.8375635772943497, "epoch": 0.20775613608305907, "grad_norm": 9.777922630310059, "learning_rate": 5.5503323584812866e-06, "loss": 0.4555, "mean_token_accuracy": 0.8414626881480217, "num_tokens": 80615406.0, "step": 67020 }, { "entropy": 1.7990097239613534, "epoch": 0.20778713520810876, "grad_norm": 5.260679244995117, "learning_rate": 5.549918318335509e-06, "loss": 0.4096, "mean_token_accuracy": 0.8451185315847397, "num_tokens": 80628306.0, "step": 67030 }, { "entropy": 1.8393166303634643, "epoch": 0.20781813433315843, "grad_norm": 9.089051246643066, "learning_rate": 5.549504370834814e-06, "loss": 0.4342, "mean_token_accuracy": 0.8503473341464997, "num_tokens": 80640534.0, "step": 67040 }, { "entropy": 1.9743037328124047, "epoch": 0.20784913345820813, "grad_norm": 7.8871684074401855, "learning_rate": 5.54909051594466e-06, "loss": 0.5803, "mean_token_accuracy": 0.8254914477467536, "num_tokens": 80652287.0, "step": 67050 }, { "entropy": 1.9053780257701873, "epoch": 0.20788013258325783, "grad_norm": 9.635024070739746, "learning_rate": 5.5486767536305175e-06, "loss": 0.4633, "mean_token_accuracy": 0.8367680624127388, "num_tokens": 80664802.0, "step": 67060 }, { "entropy": 1.9582928448915482, "epoch": 0.20791113170830752, "grad_norm": 9.71713924407959, "learning_rate": 5.548263083857879e-06, "loss": 0.5104, "mean_token_accuracy": 0.8391944229602813, "num_tokens": 80676644.0, "step": 67070 }, { "entropy": 1.914687879383564, "epoch": 0.20794213083335722, "grad_norm": 9.875877380371094, "learning_rate": 5.547849506592251e-06, "loss": 0.4863, "mean_token_accuracy": 0.8417564824223518, "num_tokens": 80688640.0, "step": 67080 }, { "entropy": 1.883899413049221, "epoch": 0.20797312995840692, "grad_norm": 4.134374618530273, "learning_rate": 5.547436021799163e-06, "loss": 0.4928, "mean_token_accuracy": 0.8355391338467598, "num_tokens": 80701658.0, "step": 67090 }, { "entropy": 1.892640021443367, "epoch": 0.2080041290834566, "grad_norm": 7.394375801086426, "learning_rate": 5.54702262944416e-06, "loss": 0.5235, "mean_token_accuracy": 0.8335898667573929, "num_tokens": 80713327.0, "step": 67100 }, { "entropy": 1.8861460164189339, "epoch": 0.2080351282085063, "grad_norm": 7.844824314117432, "learning_rate": 5.546609329492804e-06, "loss": 0.4888, "mean_token_accuracy": 0.8451134487986565, "num_tokens": 80724788.0, "step": 67110 }, { "entropy": 1.9601202994585036, "epoch": 0.208066127333556, "grad_norm": 9.103307723999023, "learning_rate": 5.546196121910674e-06, "loss": 0.5232, "mean_token_accuracy": 0.8401788577437401, "num_tokens": 80735373.0, "step": 67120 }, { "entropy": 1.9305698484182359, "epoch": 0.2080971264586057, "grad_norm": 10.515917778015137, "learning_rate": 5.545783006663372e-06, "loss": 0.506, "mean_token_accuracy": 0.8503312930464745, "num_tokens": 80745599.0, "step": 67130 }, { "entropy": 1.8946173369884491, "epoch": 0.2081281255836554, "grad_norm": 8.005970001220703, "learning_rate": 5.545369983716514e-06, "loss": 0.5053, "mean_token_accuracy": 0.8377565070986748, "num_tokens": 80757413.0, "step": 67140 }, { "entropy": 1.9489420622587204, "epoch": 0.2081591247087051, "grad_norm": 9.35682487487793, "learning_rate": 5.544957053035733e-06, "loss": 0.5294, "mean_token_accuracy": 0.8386912405490875, "num_tokens": 80767952.0, "step": 67150 }, { "entropy": 1.8574773401021958, "epoch": 0.2081901238337548, "grad_norm": 4.570268154144287, "learning_rate": 5.5445442145866835e-06, "loss": 0.4491, "mean_token_accuracy": 0.8455320358276367, "num_tokens": 80780046.0, "step": 67160 }, { "entropy": 1.871639384329319, "epoch": 0.2082211229588045, "grad_norm": 7.216861724853516, "learning_rate": 5.544131468335036e-06, "loss": 0.453, "mean_token_accuracy": 0.846157830953598, "num_tokens": 80792621.0, "step": 67170 }, { "entropy": 1.84230377972126, "epoch": 0.20825212208385419, "grad_norm": 3.1845858097076416, "learning_rate": 5.543718814246477e-06, "loss": 0.4188, "mean_token_accuracy": 0.8477763459086418, "num_tokens": 80805344.0, "step": 67180 }, { "entropy": 1.8911578252911567, "epoch": 0.20828312120890388, "grad_norm": 10.065200805664062, "learning_rate": 5.543306252286714e-06, "loss": 0.5073, "mean_token_accuracy": 0.8389113351702691, "num_tokens": 80817758.0, "step": 67190 }, { "entropy": 1.9382609099149704, "epoch": 0.20831412033395358, "grad_norm": 9.62885570526123, "learning_rate": 5.542893782421471e-06, "loss": 0.5609, "mean_token_accuracy": 0.8365870863199234, "num_tokens": 80828548.0, "step": 67200 }, { "entropy": 1.9892377644777297, "epoch": 0.20834511945900328, "grad_norm": 7.691858768463135, "learning_rate": 5.54248140461649e-06, "loss": 0.5937, "mean_token_accuracy": 0.8110725492238998, "num_tokens": 80840477.0, "step": 67210 }, { "entropy": 1.9411058470606803, "epoch": 0.20837611858405297, "grad_norm": 12.743127822875977, "learning_rate": 5.54206911883753e-06, "loss": 0.5645, "mean_token_accuracy": 0.8270158648490906, "num_tokens": 80851801.0, "step": 67220 }, { "entropy": 1.9056400299072265, "epoch": 0.20840711770910267, "grad_norm": 8.924535751342773, "learning_rate": 5.541656925050371e-06, "loss": 0.4843, "mean_token_accuracy": 0.8432095557451248, "num_tokens": 80864251.0, "step": 67230 }, { "entropy": 1.8577035881578923, "epoch": 0.20843811683415236, "grad_norm": 6.489585876464844, "learning_rate": 5.541244823220805e-06, "loss": 0.4592, "mean_token_accuracy": 0.8534018874168396, "num_tokens": 80877496.0, "step": 67240 }, { "entropy": 1.7855018101632596, "epoch": 0.20846911595920206, "grad_norm": 2.9368503093719482, "learning_rate": 5.540832813314648e-06, "loss": 0.3625, "mean_token_accuracy": 0.8677582457661629, "num_tokens": 80891030.0, "step": 67250 }, { "entropy": 1.9163383916020393, "epoch": 0.20850011508425176, "grad_norm": 8.91242504119873, "learning_rate": 5.54042089529773e-06, "loss": 0.5228, "mean_token_accuracy": 0.8421859487891197, "num_tokens": 80902336.0, "step": 67260 }, { "entropy": 1.9107780367136002, "epoch": 0.20853111420930145, "grad_norm": 8.577934265136719, "learning_rate": 5.5400090691359e-06, "loss": 0.499, "mean_token_accuracy": 0.837343692779541, "num_tokens": 80912858.0, "step": 67270 }, { "entropy": 1.868037761747837, "epoch": 0.20856211333435115, "grad_norm": 9.165842056274414, "learning_rate": 5.539597334795024e-06, "loss": 0.4754, "mean_token_accuracy": 0.8567280262708664, "num_tokens": 80924871.0, "step": 67280 }, { "entropy": 1.897133542597294, "epoch": 0.20859311245940082, "grad_norm": 3.6761436462402344, "learning_rate": 5.539185692240987e-06, "loss": 0.4565, "mean_token_accuracy": 0.8450244545936585, "num_tokens": 80936908.0, "step": 67290 }, { "entropy": 1.852649959921837, "epoch": 0.20862411158445052, "grad_norm": 5.1975483894348145, "learning_rate": 5.538774141439691e-06, "loss": 0.4465, "mean_token_accuracy": 0.8447863146662712, "num_tokens": 80949637.0, "step": 67300 }, { "entropy": 1.9235988169908524, "epoch": 0.2086551107095002, "grad_norm": 9.723345756530762, "learning_rate": 5.538362682357055e-06, "loss": 0.5537, "mean_token_accuracy": 0.8258920639753342, "num_tokens": 80960775.0, "step": 67310 }, { "entropy": 1.9941146105527878, "epoch": 0.2086861098345499, "grad_norm": 7.907135963439941, "learning_rate": 5.537951314959018e-06, "loss": 0.5285, "mean_token_accuracy": 0.8398841202259064, "num_tokens": 80971994.0, "step": 67320 }, { "entropy": 1.8793514996767045, "epoch": 0.2087171089595996, "grad_norm": 8.207256317138672, "learning_rate": 5.537540039211534e-06, "loss": 0.4522, "mean_token_accuracy": 0.8463095262646675, "num_tokens": 80984309.0, "step": 67330 }, { "entropy": 1.798450830578804, "epoch": 0.2087481080846493, "grad_norm": 2.7568373680114746, "learning_rate": 5.537128855080577e-06, "loss": 0.4334, "mean_token_accuracy": 0.8558945968747139, "num_tokens": 80997276.0, "step": 67340 }, { "entropy": 1.8368099942803382, "epoch": 0.208779107209699, "grad_norm": 8.600449562072754, "learning_rate": 5.5367177625321355e-06, "loss": 0.4832, "mean_token_accuracy": 0.8459367111325264, "num_tokens": 81010351.0, "step": 67350 }, { "entropy": 1.7894717290997506, "epoch": 0.2088101063347487, "grad_norm": 2.465794086456299, "learning_rate": 5.5363067615322206e-06, "loss": 0.4354, "mean_token_accuracy": 0.8549363747239113, "num_tokens": 81024645.0, "step": 67360 }, { "entropy": 1.80710818618536, "epoch": 0.2088411054597984, "grad_norm": 5.430171012878418, "learning_rate": 5.535895852046857e-06, "loss": 0.4382, "mean_token_accuracy": 0.8437926679849624, "num_tokens": 81038048.0, "step": 67370 }, { "entropy": 1.9083049342036247, "epoch": 0.2088721045848481, "grad_norm": 9.347772598266602, "learning_rate": 5.535485034042086e-06, "loss": 0.5258, "mean_token_accuracy": 0.833581855893135, "num_tokens": 81050463.0, "step": 67380 }, { "entropy": 1.9086160019040108, "epoch": 0.20890310370989779, "grad_norm": 8.73170280456543, "learning_rate": 5.535074307483974e-06, "loss": 0.4727, "mean_token_accuracy": 0.8540397524833679, "num_tokens": 81062467.0, "step": 67390 }, { "entropy": 1.8362308278679849, "epoch": 0.20893410283494748, "grad_norm": 6.898866176605225, "learning_rate": 5.534663672338595e-06, "loss": 0.4177, "mean_token_accuracy": 0.8507218867540359, "num_tokens": 81075477.0, "step": 67400 }, { "entropy": 1.964953315258026, "epoch": 0.20896510195999718, "grad_norm": 10.953664779663086, "learning_rate": 5.534253128572048e-06, "loss": 0.515, "mean_token_accuracy": 0.845398873090744, "num_tokens": 81086245.0, "step": 67410 }, { "entropy": 1.8964574694633485, "epoch": 0.20899610108504688, "grad_norm": 10.055764198303223, "learning_rate": 5.533842676150446e-06, "loss": 0.473, "mean_token_accuracy": 0.8452073886990548, "num_tokens": 81098248.0, "step": 67420 }, { "entropy": 1.9074363961815834, "epoch": 0.20902710021009657, "grad_norm": 8.339896202087402, "learning_rate": 5.533432315039921e-06, "loss": 0.481, "mean_token_accuracy": 0.8502192839980125, "num_tokens": 81109121.0, "step": 67430 }, { "entropy": 1.9613432750105857, "epoch": 0.20905809933514627, "grad_norm": 10.081904411315918, "learning_rate": 5.533022045206623e-06, "loss": 0.4932, "mean_token_accuracy": 0.8488780468702316, "num_tokens": 81120269.0, "step": 67440 }, { "entropy": 1.951788181066513, "epoch": 0.20908909846019597, "grad_norm": 8.335789680480957, "learning_rate": 5.532611866616719e-06, "loss": 0.5574, "mean_token_accuracy": 0.8319823101162911, "num_tokens": 81131197.0, "step": 67450 }, { "entropy": 1.8895439878106117, "epoch": 0.20912009758524566, "grad_norm": 9.064699172973633, "learning_rate": 5.53220177923639e-06, "loss": 0.5051, "mean_token_accuracy": 0.8434992015361786, "num_tokens": 81142524.0, "step": 67460 }, { "entropy": 1.8257933855056763, "epoch": 0.20915109671029536, "grad_norm": 8.583352088928223, "learning_rate": 5.531791783031842e-06, "loss": 0.4277, "mean_token_accuracy": 0.850779265165329, "num_tokens": 81154674.0, "step": 67470 }, { "entropy": 1.7744839206337928, "epoch": 0.20918209583534506, "grad_norm": 7.863668918609619, "learning_rate": 5.5313818779692915e-06, "loss": 0.4427, "mean_token_accuracy": 0.8497899547219276, "num_tokens": 81168137.0, "step": 67480 }, { "entropy": 1.9520211279392243, "epoch": 0.20921309496039475, "grad_norm": 12.411447525024414, "learning_rate": 5.5309720640149785e-06, "loss": 0.5196, "mean_token_accuracy": 0.8483351454138756, "num_tokens": 81178936.0, "step": 67490 }, { "entropy": 1.8984515577554704, "epoch": 0.20924409408544445, "grad_norm": 7.144548416137695, "learning_rate": 5.530562341135155e-06, "loss": 0.5423, "mean_token_accuracy": 0.837235240638256, "num_tokens": 81191279.0, "step": 67500 }, { "entropy": 1.8845224693417548, "epoch": 0.20927509321049415, "grad_norm": 9.875595092773438, "learning_rate": 5.5301527092960925e-06, "loss": 0.4426, "mean_token_accuracy": 0.8476980268955231, "num_tokens": 81203433.0, "step": 67510 }, { "entropy": 1.9049016535282135, "epoch": 0.20930609233554384, "grad_norm": 7.915565490722656, "learning_rate": 5.529743168464083e-06, "loss": 0.4639, "mean_token_accuracy": 0.8503067553043365, "num_tokens": 81215534.0, "step": 67520 }, { "entropy": 1.8581149563193322, "epoch": 0.20933709146059354, "grad_norm": 3.4274792671203613, "learning_rate": 5.5293337186054315e-06, "loss": 0.4919, "mean_token_accuracy": 0.8372601106762886, "num_tokens": 81227450.0, "step": 67530 }, { "entropy": 1.8535752549767495, "epoch": 0.2093680905856432, "grad_norm": 10.018072128295898, "learning_rate": 5.528924359686464e-06, "loss": 0.4527, "mean_token_accuracy": 0.845684327185154, "num_tokens": 81239765.0, "step": 67540 }, { "entropy": 1.8562732204794883, "epoch": 0.2093990897106929, "grad_norm": 7.851308822631836, "learning_rate": 5.528515091673519e-06, "loss": 0.4674, "mean_token_accuracy": 0.8401390001177788, "num_tokens": 81252593.0, "step": 67550 }, { "entropy": 1.9241983875632287, "epoch": 0.2094300888357426, "grad_norm": 9.050371170043945, "learning_rate": 5.5281059145329605e-06, "loss": 0.4962, "mean_token_accuracy": 0.8380835622549057, "num_tokens": 81264640.0, "step": 67560 }, { "entropy": 1.9892011910676957, "epoch": 0.2094610879607923, "grad_norm": 7.706739902496338, "learning_rate": 5.527696828231161e-06, "loss": 0.5774, "mean_token_accuracy": 0.8230189695954323, "num_tokens": 81275373.0, "step": 67570 }, { "entropy": 1.9130545258522034, "epoch": 0.209492087085842, "grad_norm": 8.775412559509277, "learning_rate": 5.527287832734517e-06, "loss": 0.4685, "mean_token_accuracy": 0.8499484866857528, "num_tokens": 81287424.0, "step": 67580 }, { "entropy": 1.9063115805387496, "epoch": 0.2095230862108917, "grad_norm": 9.373629570007324, "learning_rate": 5.526878928009438e-06, "loss": 0.4993, "mean_token_accuracy": 0.8434836745262146, "num_tokens": 81300644.0, "step": 67590 }, { "entropy": 2.0209019780158997, "epoch": 0.2095540853359414, "grad_norm": 8.37852954864502, "learning_rate": 5.526470114022357e-06, "loss": 0.5427, "mean_token_accuracy": 0.830120287835598, "num_tokens": 81311779.0, "step": 67600 }, { "entropy": 1.944344201683998, "epoch": 0.20958508446099108, "grad_norm": 8.276811599731445, "learning_rate": 5.526061390739714e-06, "loss": 0.5441, "mean_token_accuracy": 0.8304149687290192, "num_tokens": 81323352.0, "step": 67610 }, { "entropy": 1.9719750136137009, "epoch": 0.20961608358604078, "grad_norm": 7.648212909698486, "learning_rate": 5.5256527581279785e-06, "loss": 0.5287, "mean_token_accuracy": 0.8441421076655388, "num_tokens": 81335024.0, "step": 67620 }, { "entropy": 1.9229772925376891, "epoch": 0.20964708271109048, "grad_norm": 4.550539970397949, "learning_rate": 5.5252442161536276e-06, "loss": 0.4864, "mean_token_accuracy": 0.8464221879839897, "num_tokens": 81346392.0, "step": 67630 }, { "entropy": 1.9195913657546044, "epoch": 0.20967808183614017, "grad_norm": 5.1315999031066895, "learning_rate": 5.524835764783162e-06, "loss": 0.5051, "mean_token_accuracy": 0.8396693095564842, "num_tokens": 81358537.0, "step": 67640 }, { "entropy": 1.7798305720090866, "epoch": 0.20970908096118987, "grad_norm": 8.729194641113281, "learning_rate": 5.524427403983096e-06, "loss": 0.4147, "mean_token_accuracy": 0.8598864421248436, "num_tokens": 81372176.0, "step": 67650 }, { "entropy": 1.8792505249381066, "epoch": 0.20974008008623957, "grad_norm": 2.838498592376709, "learning_rate": 5.524019133719963e-06, "loss": 0.4509, "mean_token_accuracy": 0.8509874641895294, "num_tokens": 81384748.0, "step": 67660 }, { "entropy": 1.9109538584947585, "epoch": 0.20977107921128926, "grad_norm": 9.723373413085938, "learning_rate": 5.5236109539603145e-06, "loss": 0.502, "mean_token_accuracy": 0.8392190888524056, "num_tokens": 81396571.0, "step": 67670 }, { "entropy": 1.9938452377915383, "epoch": 0.20980207833633896, "grad_norm": 9.84363079071045, "learning_rate": 5.523202864670717e-06, "loss": 0.5627, "mean_token_accuracy": 0.8240502581000329, "num_tokens": 81408124.0, "step": 67680 }, { "entropy": 1.944431021809578, "epoch": 0.20983307746138866, "grad_norm": 7.82313871383667, "learning_rate": 5.522794865817755e-06, "loss": 0.4808, "mean_token_accuracy": 0.8497413724660874, "num_tokens": 81418940.0, "step": 67690 }, { "entropy": 1.866161908209324, "epoch": 0.20986407658643835, "grad_norm": 4.241913318634033, "learning_rate": 5.52238695736803e-06, "loss": 0.5066, "mean_token_accuracy": 0.838020247220993, "num_tokens": 81431920.0, "step": 67700 }, { "entropy": 1.9558416873216629, "epoch": 0.20989507571148805, "grad_norm": 6.742011547088623, "learning_rate": 5.521979139288163e-06, "loss": 0.5383, "mean_token_accuracy": 0.8325184598565102, "num_tokens": 81443316.0, "step": 67710 }, { "entropy": 1.9829859271645547, "epoch": 0.20992607483653775, "grad_norm": 7.284268856048584, "learning_rate": 5.521571411544792e-06, "loss": 0.5344, "mean_token_accuracy": 0.8363759860396385, "num_tokens": 81454728.0, "step": 67720 }, { "entropy": 1.9214450731873511, "epoch": 0.20995707396158744, "grad_norm": 8.096468925476074, "learning_rate": 5.521163774104567e-06, "loss": 0.4913, "mean_token_accuracy": 0.83761525452137, "num_tokens": 81467284.0, "step": 67730 }, { "entropy": 1.8891347169876098, "epoch": 0.20998807308663714, "grad_norm": 8.036181449890137, "learning_rate": 5.520756226934162e-06, "loss": 0.4661, "mean_token_accuracy": 0.8447176307439804, "num_tokens": 81480629.0, "step": 67740 }, { "entropy": 1.8999253660440445, "epoch": 0.21001907221168684, "grad_norm": 10.522799491882324, "learning_rate": 5.520348770000264e-06, "loss": 0.5095, "mean_token_accuracy": 0.8358193069696427, "num_tokens": 81492287.0, "step": 67750 }, { "entropy": 1.8602621525526046, "epoch": 0.21005007133673653, "grad_norm": 3.4755728244781494, "learning_rate": 5.519941403269581e-06, "loss": 0.4179, "mean_token_accuracy": 0.8537157669663429, "num_tokens": 81505125.0, "step": 67760 }, { "entropy": 1.9202613174915313, "epoch": 0.21008107046178623, "grad_norm": 8.444540977478027, "learning_rate": 5.519534126708833e-06, "loss": 0.4978, "mean_token_accuracy": 0.842594002187252, "num_tokens": 81517173.0, "step": 67770 }, { "entropy": 1.8747940637171268, "epoch": 0.2101120695868359, "grad_norm": 2.7482457160949707, "learning_rate": 5.519126940284762e-06, "loss": 0.4996, "mean_token_accuracy": 0.847461374104023, "num_tokens": 81530328.0, "step": 67780 }, { "entropy": 1.904065564274788, "epoch": 0.2101430687118856, "grad_norm": 4.310403347015381, "learning_rate": 5.518719843964123e-06, "loss": 0.4791, "mean_token_accuracy": 0.8386267200112343, "num_tokens": 81542313.0, "step": 67790 }, { "entropy": 1.9362899020314217, "epoch": 0.2101740678369353, "grad_norm": 8.252490043640137, "learning_rate": 5.518312837713692e-06, "loss": 0.4957, "mean_token_accuracy": 0.8409799665212632, "num_tokens": 81554270.0, "step": 67800 }, { "entropy": 1.9585260882973672, "epoch": 0.210205066961985, "grad_norm": 9.089271545410156, "learning_rate": 5.517905921500259e-06, "loss": 0.4919, "mean_token_accuracy": 0.8473243802785874, "num_tokens": 81565971.0, "step": 67810 }, { "entropy": 1.9076432511210442, "epoch": 0.21023606608703468, "grad_norm": 8.114571571350098, "learning_rate": 5.517499095290636e-06, "loss": 0.4766, "mean_token_accuracy": 0.8429806470870972, "num_tokens": 81578195.0, "step": 67820 }, { "entropy": 1.876829606294632, "epoch": 0.21026706521208438, "grad_norm": 10.027071952819824, "learning_rate": 5.5170923590516444e-06, "loss": 0.5153, "mean_token_accuracy": 0.8376459792256356, "num_tokens": 81591671.0, "step": 67830 }, { "entropy": 1.9631253123283385, "epoch": 0.21029806433713408, "grad_norm": 6.608702659606934, "learning_rate": 5.51668571275013e-06, "loss": 0.5715, "mean_token_accuracy": 0.8352919310331345, "num_tokens": 81603027.0, "step": 67840 }, { "entropy": 1.9352620527148248, "epoch": 0.21032906346218377, "grad_norm": 3.764373302459717, "learning_rate": 5.51627915635295e-06, "loss": 0.4655, "mean_token_accuracy": 0.8549694180488586, "num_tokens": 81615161.0, "step": 67850 }, { "entropy": 1.9944947317242623, "epoch": 0.21036006258723347, "grad_norm": 9.673376083374023, "learning_rate": 5.515872689826984e-06, "loss": 0.5357, "mean_token_accuracy": 0.8339143067598342, "num_tokens": 81627143.0, "step": 67860 }, { "entropy": 2.035477635264397, "epoch": 0.21039106171228317, "grad_norm": 9.949082374572754, "learning_rate": 5.515466313139126e-06, "loss": 0.5937, "mean_token_accuracy": 0.8266064122319221, "num_tokens": 81637833.0, "step": 67870 }, { "entropy": 1.9482318013906479, "epoch": 0.21042206083733286, "grad_norm": 8.834207534790039, "learning_rate": 5.5150600262562855e-06, "loss": 0.4966, "mean_token_accuracy": 0.8446396380662918, "num_tokens": 81649773.0, "step": 67880 }, { "entropy": 2.023282551765442, "epoch": 0.21045305996238256, "grad_norm": 8.027750015258789, "learning_rate": 5.514653829145392e-06, "loss": 0.5464, "mean_token_accuracy": 0.8356869459152222, "num_tokens": 81660767.0, "step": 67890 }, { "entropy": 1.8931564077734948, "epoch": 0.21048405908743226, "grad_norm": 4.419806003570557, "learning_rate": 5.5142477217733905e-06, "loss": 0.4572, "mean_token_accuracy": 0.8490284129977226, "num_tokens": 81673309.0, "step": 67900 }, { "entropy": 1.9538974300026895, "epoch": 0.21051505821248195, "grad_norm": 9.901854515075684, "learning_rate": 5.513841704107242e-06, "loss": 0.4815, "mean_token_accuracy": 0.8569442689418793, "num_tokens": 81684556.0, "step": 67910 }, { "entropy": 1.9160713866353034, "epoch": 0.21054605733753165, "grad_norm": 7.857386112213135, "learning_rate": 5.513435776113929e-06, "loss": 0.4742, "mean_token_accuracy": 0.8500816449522972, "num_tokens": 81696905.0, "step": 67920 }, { "entropy": 1.864570914208889, "epoch": 0.21057705646258135, "grad_norm": 9.086345672607422, "learning_rate": 5.513029937760446e-06, "loss": 0.4621, "mean_token_accuracy": 0.8444380149245262, "num_tokens": 81709959.0, "step": 67930 }, { "entropy": 2.0148605525493624, "epoch": 0.21060805558763104, "grad_norm": 8.19638442993164, "learning_rate": 5.512624189013806e-06, "loss": 0.549, "mean_token_accuracy": 0.8306930497288704, "num_tokens": 81721588.0, "step": 67940 }, { "entropy": 1.9208003774285316, "epoch": 0.21063905471268074, "grad_norm": 8.312312126159668, "learning_rate": 5.512218529841038e-06, "loss": 0.4823, "mean_token_accuracy": 0.8352445542812348, "num_tokens": 81733933.0, "step": 67950 }, { "entropy": 1.8529336631298066, "epoch": 0.21067005383773044, "grad_norm": 3.6399710178375244, "learning_rate": 5.511812960209193e-06, "loss": 0.4649, "mean_token_accuracy": 0.8411158919334412, "num_tokens": 81746778.0, "step": 67960 }, { "entropy": 1.9720004379749299, "epoch": 0.21070105296278013, "grad_norm": 8.460240364074707, "learning_rate": 5.511407480085334e-06, "loss": 0.5148, "mean_token_accuracy": 0.8386137381196022, "num_tokens": 81758395.0, "step": 67970 }, { "entropy": 2.007098397612572, "epoch": 0.21073205208782983, "grad_norm": 9.23631763458252, "learning_rate": 5.51100208943654e-06, "loss": 0.5553, "mean_token_accuracy": 0.835183584690094, "num_tokens": 81769530.0, "step": 67980 }, { "entropy": 1.8748371377587318, "epoch": 0.21076305121287953, "grad_norm": 8.400230407714844, "learning_rate": 5.510596788229912e-06, "loss": 0.4947, "mean_token_accuracy": 0.8293806165456772, "num_tokens": 81782455.0, "step": 67990 }, { "entropy": 1.9910854011774064, "epoch": 0.21079405033792922, "grad_norm": 8.415918350219727, "learning_rate": 5.510191576432563e-06, "loss": 0.5531, "mean_token_accuracy": 0.8300189360976219, "num_tokens": 81793212.0, "step": 68000 }, { "entropy": 1.8886552080512047, "epoch": 0.21082504946297892, "grad_norm": 7.983949661254883, "learning_rate": 5.509786454011627e-06, "loss": 0.4475, "mean_token_accuracy": 0.8474817335605621, "num_tokens": 81804680.0, "step": 68010 }, { "entropy": 1.929812017083168, "epoch": 0.21085604858802862, "grad_norm": 7.43207311630249, "learning_rate": 5.509381420934252e-06, "loss": 0.5576, "mean_token_accuracy": 0.8385087087750435, "num_tokens": 81816705.0, "step": 68020 }, { "entropy": 1.9719702288508416, "epoch": 0.21088704771307829, "grad_norm": 7.93623685836792, "learning_rate": 5.5089764771676035e-06, "loss": 0.4997, "mean_token_accuracy": 0.8436658188700676, "num_tokens": 81828323.0, "step": 68030 }, { "entropy": 1.9078006014227866, "epoch": 0.21091804683812798, "grad_norm": 10.205698013305664, "learning_rate": 5.508571622678865e-06, "loss": 0.4597, "mean_token_accuracy": 0.8465578466653824, "num_tokens": 81840696.0, "step": 68040 }, { "entropy": 1.9035088881850242, "epoch": 0.21094904596317768, "grad_norm": 8.092962265014648, "learning_rate": 5.5081668574352364e-06, "loss": 0.4854, "mean_token_accuracy": 0.8459865510463714, "num_tokens": 81853045.0, "step": 68050 }, { "entropy": 2.0051578521728515, "epoch": 0.21098004508822737, "grad_norm": 7.893548488616943, "learning_rate": 5.507762181403934e-06, "loss": 0.5796, "mean_token_accuracy": 0.8307501718401908, "num_tokens": 81863610.0, "step": 68060 }, { "entropy": 1.862001748383045, "epoch": 0.21101104421327707, "grad_norm": 9.222723960876465, "learning_rate": 5.507357594552191e-06, "loss": 0.4486, "mean_token_accuracy": 0.8515830934047699, "num_tokens": 81876722.0, "step": 68070 }, { "entropy": 1.9945030003786086, "epoch": 0.21104204333832677, "grad_norm": 9.973723411560059, "learning_rate": 5.5069530968472575e-06, "loss": 0.5548, "mean_token_accuracy": 0.830809174478054, "num_tokens": 81887391.0, "step": 68080 }, { "entropy": 1.9193598270416259, "epoch": 0.21107304246337646, "grad_norm": 8.859430313110352, "learning_rate": 5.506548688256401e-06, "loss": 0.4772, "mean_token_accuracy": 0.851340101659298, "num_tokens": 81899366.0, "step": 68090 }, { "entropy": 1.9339387387037277, "epoch": 0.21110404158842616, "grad_norm": 4.928318500518799, "learning_rate": 5.506144368746905e-06, "loss": 0.5192, "mean_token_accuracy": 0.8325812682509423, "num_tokens": 81911145.0, "step": 68100 }, { "entropy": 1.9344223082065581, "epoch": 0.21113504071347586, "grad_norm": 2.5606257915496826, "learning_rate": 5.505740138286071e-06, "loss": 0.5272, "mean_token_accuracy": 0.8293151870369911, "num_tokens": 81923550.0, "step": 68110 }, { "entropy": 1.9258287683129311, "epoch": 0.21116603983852555, "grad_norm": 7.782974720001221, "learning_rate": 5.505335996841215e-06, "loss": 0.489, "mean_token_accuracy": 0.8488619342446327, "num_tokens": 81935832.0, "step": 68120 }, { "entropy": 1.8819763243198395, "epoch": 0.21119703896357525, "grad_norm": 2.672853708267212, "learning_rate": 5.504931944379673e-06, "loss": 0.4988, "mean_token_accuracy": 0.8511589944362641, "num_tokens": 81949301.0, "step": 68130 }, { "entropy": 2.001712107658386, "epoch": 0.21122803808862495, "grad_norm": 9.238136291503906, "learning_rate": 5.504527980868795e-06, "loss": 0.5271, "mean_token_accuracy": 0.8439046397805214, "num_tokens": 81960491.0, "step": 68140 }, { "entropy": 1.9117625072598456, "epoch": 0.21125903721367464, "grad_norm": 8.495841026306152, "learning_rate": 5.504124106275948e-06, "loss": 0.5269, "mean_token_accuracy": 0.8380966350436211, "num_tokens": 81972833.0, "step": 68150 }, { "entropy": 1.9697169050574304, "epoch": 0.21129003633872434, "grad_norm": 9.759078025817871, "learning_rate": 5.5037203205685196e-06, "loss": 0.5163, "mean_token_accuracy": 0.8402854591608048, "num_tokens": 81984232.0, "step": 68160 }, { "entropy": 1.9297288402915, "epoch": 0.21132103546377404, "grad_norm": 4.200081825256348, "learning_rate": 5.503316623713908e-06, "loss": 0.5086, "mean_token_accuracy": 0.8306440845131874, "num_tokens": 81995830.0, "step": 68170 }, { "entropy": 1.8650834277272224, "epoch": 0.21135203458882373, "grad_norm": 7.745209693908691, "learning_rate": 5.502913015679533e-06, "loss": 0.4646, "mean_token_accuracy": 0.8491313457489014, "num_tokens": 82008180.0, "step": 68180 }, { "entropy": 1.9364105448126794, "epoch": 0.21138303371387343, "grad_norm": 9.66666030883789, "learning_rate": 5.502509496432829e-06, "loss": 0.5228, "mean_token_accuracy": 0.8386658191680908, "num_tokens": 82019778.0, "step": 68190 }, { "entropy": 1.9314407289028168, "epoch": 0.21141403283892313, "grad_norm": 9.512344360351562, "learning_rate": 5.502106065941247e-06, "loss": 0.5106, "mean_token_accuracy": 0.8394078284502029, "num_tokens": 82031972.0, "step": 68200 }, { "entropy": 1.9816076889634133, "epoch": 0.21144503196397282, "grad_norm": 7.756664276123047, "learning_rate": 5.501702724172256e-06, "loss": 0.5672, "mean_token_accuracy": 0.8252545759081841, "num_tokens": 82043777.0, "step": 68210 }, { "entropy": 1.9324266403913497, "epoch": 0.21147603108902252, "grad_norm": 4.227046966552734, "learning_rate": 5.501299471093341e-06, "loss": 0.4712, "mean_token_accuracy": 0.8418770223855973, "num_tokens": 82056125.0, "step": 68220 }, { "entropy": 1.8676659151911736, "epoch": 0.21150703021407222, "grad_norm": 9.944682121276855, "learning_rate": 5.500896306672003e-06, "loss": 0.448, "mean_token_accuracy": 0.8422658532857895, "num_tokens": 82068630.0, "step": 68230 }, { "entropy": 1.9347829192876815, "epoch": 0.2115380293391219, "grad_norm": 8.809101104736328, "learning_rate": 5.500493230875762e-06, "loss": 0.4802, "mean_token_accuracy": 0.8409867346286773, "num_tokens": 82080910.0, "step": 68240 }, { "entropy": 1.9162681803107262, "epoch": 0.2115690284641716, "grad_norm": 8.236538887023926, "learning_rate": 5.500090243672151e-06, "loss": 0.4667, "mean_token_accuracy": 0.8516815707087517, "num_tokens": 82092398.0, "step": 68250 }, { "entropy": 1.9335265710949898, "epoch": 0.2116000275892213, "grad_norm": 8.672457695007324, "learning_rate": 5.4996873450287205e-06, "loss": 0.5003, "mean_token_accuracy": 0.8431479528546333, "num_tokens": 82104602.0, "step": 68260 }, { "entropy": 1.8349942237138748, "epoch": 0.211631026714271, "grad_norm": 7.4427924156188965, "learning_rate": 5.4992845349130406e-06, "loss": 0.4437, "mean_token_accuracy": 0.843446071445942, "num_tokens": 82117625.0, "step": 68270 }, { "entropy": 1.9474259793758393, "epoch": 0.21166202583932067, "grad_norm": 4.666007041931152, "learning_rate": 5.498881813292697e-06, "loss": 0.4792, "mean_token_accuracy": 0.8499509572982789, "num_tokens": 82129574.0, "step": 68280 }, { "entropy": 1.8501393646001816, "epoch": 0.21169302496437037, "grad_norm": 8.344110488891602, "learning_rate": 5.498479180135289e-06, "loss": 0.4608, "mean_token_accuracy": 0.8498427078127861, "num_tokens": 82143041.0, "step": 68290 }, { "entropy": 1.8913217276334762, "epoch": 0.21172402408942007, "grad_norm": 11.081100463867188, "learning_rate": 5.498076635408436e-06, "loss": 0.48, "mean_token_accuracy": 0.8393001139163971, "num_tokens": 82155572.0, "step": 68300 }, { "entropy": 1.8870702683925629, "epoch": 0.21175502321446976, "grad_norm": 3.9212942123413086, "learning_rate": 5.497674179079771e-06, "loss": 0.5092, "mean_token_accuracy": 0.8416898652911187, "num_tokens": 82167720.0, "step": 68310 }, { "entropy": 1.8655384734272957, "epoch": 0.21178602233951946, "grad_norm": 9.342215538024902, "learning_rate": 5.497271811116948e-06, "loss": 0.4638, "mean_token_accuracy": 0.8492528066039086, "num_tokens": 82180629.0, "step": 68320 }, { "entropy": 1.8181511342525483, "epoch": 0.21181702146456916, "grad_norm": 7.617997169494629, "learning_rate": 5.496869531487634e-06, "loss": 0.426, "mean_token_accuracy": 0.8530010193586349, "num_tokens": 82194518.0, "step": 68330 }, { "entropy": 1.939797979593277, "epoch": 0.21184802058961885, "grad_norm": 6.760346412658691, "learning_rate": 5.496467340159511e-06, "loss": 0.5003, "mean_token_accuracy": 0.8504227936267853, "num_tokens": 82205732.0, "step": 68340 }, { "entropy": 1.9736777380108834, "epoch": 0.21187901971466855, "grad_norm": 10.192756652832031, "learning_rate": 5.496065237100283e-06, "loss": 0.5461, "mean_token_accuracy": 0.8265336826443672, "num_tokens": 82217338.0, "step": 68350 }, { "entropy": 1.9774869233369827, "epoch": 0.21191001883971824, "grad_norm": 8.361988067626953, "learning_rate": 5.495663222277665e-06, "loss": 0.5018, "mean_token_accuracy": 0.8427802279591561, "num_tokens": 82228528.0, "step": 68360 }, { "entropy": 1.941089576482773, "epoch": 0.21194101796476794, "grad_norm": 6.703628063201904, "learning_rate": 5.495261295659393e-06, "loss": 0.4872, "mean_token_accuracy": 0.8476432636380196, "num_tokens": 82239902.0, "step": 68370 }, { "entropy": 1.9076963618397713, "epoch": 0.21197201708981764, "grad_norm": 7.75342321395874, "learning_rate": 5.494859457213216e-06, "loss": 0.5217, "mean_token_accuracy": 0.8372241660952568, "num_tokens": 82251693.0, "step": 68380 }, { "entropy": 1.877877102792263, "epoch": 0.21200301621486733, "grad_norm": 4.743284225463867, "learning_rate": 5.494457706906901e-06, "loss": 0.4329, "mean_token_accuracy": 0.854452121257782, "num_tokens": 82264357.0, "step": 68390 }, { "entropy": 1.927871122956276, "epoch": 0.21203401533991703, "grad_norm": 8.312472343444824, "learning_rate": 5.494056044708233e-06, "loss": 0.5026, "mean_token_accuracy": 0.8458049356937408, "num_tokens": 82275361.0, "step": 68400 }, { "entropy": 1.8819461211562156, "epoch": 0.21206501446496673, "grad_norm": 8.821391105651855, "learning_rate": 5.493654470585011e-06, "loss": 0.4855, "mean_token_accuracy": 0.8411281630396843, "num_tokens": 82287079.0, "step": 68410 }, { "entropy": 1.9309892505407333, "epoch": 0.21209601359001642, "grad_norm": 7.936349391937256, "learning_rate": 5.4932529845050494e-06, "loss": 0.5459, "mean_token_accuracy": 0.833111310005188, "num_tokens": 82298592.0, "step": 68420 }, { "entropy": 1.9115046963095665, "epoch": 0.21212701271506612, "grad_norm": 7.75383996963501, "learning_rate": 5.492851586436185e-06, "loss": 0.4562, "mean_token_accuracy": 0.8484810963273048, "num_tokens": 82310414.0, "step": 68430 }, { "entropy": 1.8966820895671845, "epoch": 0.21215801184011582, "grad_norm": 9.623336791992188, "learning_rate": 5.492450276346264e-06, "loss": 0.5298, "mean_token_accuracy": 0.824579867720604, "num_tokens": 82323527.0, "step": 68440 }, { "entropy": 1.929227152466774, "epoch": 0.21218901096516551, "grad_norm": 8.437525749206543, "learning_rate": 5.4920490542031545e-06, "loss": 0.4853, "mean_token_accuracy": 0.8451452344655991, "num_tokens": 82335454.0, "step": 68450 }, { "entropy": 1.8906142652034759, "epoch": 0.2122200100902152, "grad_norm": 8.768442153930664, "learning_rate": 5.4916479199747375e-06, "loss": 0.5266, "mean_token_accuracy": 0.8401165828108788, "num_tokens": 82347531.0, "step": 68460 }, { "entropy": 1.9358887538313865, "epoch": 0.2122510092152649, "grad_norm": 8.354040145874023, "learning_rate": 5.491246873628911e-06, "loss": 0.5167, "mean_token_accuracy": 0.8335384339094162, "num_tokens": 82358918.0, "step": 68470 }, { "entropy": 1.722612802684307, "epoch": 0.2122820083403146, "grad_norm": 4.17601203918457, "learning_rate": 5.490845915133592e-06, "loss": 0.418, "mean_token_accuracy": 0.852487288415432, "num_tokens": 82373282.0, "step": 68480 }, { "entropy": 1.9600424468517303, "epoch": 0.2123130074653643, "grad_norm": 8.103271484375, "learning_rate": 5.49044504445671e-06, "loss": 0.5425, "mean_token_accuracy": 0.8350730538368225, "num_tokens": 82384254.0, "step": 68490 }, { "entropy": 1.9285910725593567, "epoch": 0.212344006590414, "grad_norm": 7.515426158905029, "learning_rate": 5.490044261566214e-06, "loss": 0.5133, "mean_token_accuracy": 0.8370998069643975, "num_tokens": 82395748.0, "step": 68500 }, { "entropy": 1.7850094467401505, "epoch": 0.2123750057154637, "grad_norm": 2.562181234359741, "learning_rate": 5.489643566430068e-06, "loss": 0.3974, "mean_token_accuracy": 0.8605840176343917, "num_tokens": 82408632.0, "step": 68510 }, { "entropy": 1.8428252264857292, "epoch": 0.21240600484051336, "grad_norm": 10.207568168640137, "learning_rate": 5.489242959016253e-06, "loss": 0.4632, "mean_token_accuracy": 0.8491317898035049, "num_tokens": 82421317.0, "step": 68520 }, { "entropy": 1.9188319817185402, "epoch": 0.21243700396556306, "grad_norm": 8.114725112915039, "learning_rate": 5.488842439292764e-06, "loss": 0.4659, "mean_token_accuracy": 0.8487970292568207, "num_tokens": 82432817.0, "step": 68530 }, { "entropy": 1.8276264518499374, "epoch": 0.21246800309061276, "grad_norm": 3.9016194343566895, "learning_rate": 5.4884420072276175e-06, "loss": 0.4068, "mean_token_accuracy": 0.8536558151245117, "num_tokens": 82445275.0, "step": 68540 }, { "entropy": 1.8365425869822503, "epoch": 0.21249900221566245, "grad_norm": 10.570717811584473, "learning_rate": 5.48804166278884e-06, "loss": 0.4222, "mean_token_accuracy": 0.8519185066223145, "num_tokens": 82457785.0, "step": 68550 }, { "entropy": 1.9167245119810103, "epoch": 0.21253000134071215, "grad_norm": 6.744617938995361, "learning_rate": 5.487641405944478e-06, "loss": 0.5499, "mean_token_accuracy": 0.8354932352900505, "num_tokens": 82469125.0, "step": 68560 }, { "entropy": 1.908539716899395, "epoch": 0.21256100046576185, "grad_norm": 8.550407409667969, "learning_rate": 5.487241236662596e-06, "loss": 0.456, "mean_token_accuracy": 0.8480324164032936, "num_tokens": 82481644.0, "step": 68570 }, { "entropy": 1.8927012383937836, "epoch": 0.21259199959081154, "grad_norm": 8.421663284301758, "learning_rate": 5.486841154911271e-06, "loss": 0.4723, "mean_token_accuracy": 0.8398829996585846, "num_tokens": 82493695.0, "step": 68580 }, { "entropy": 1.9059516102075578, "epoch": 0.21262299871586124, "grad_norm": 4.052833080291748, "learning_rate": 5.486441160658598e-06, "loss": 0.4829, "mean_token_accuracy": 0.8456622689962388, "num_tokens": 82505268.0, "step": 68590 }, { "entropy": 1.9685225948691367, "epoch": 0.21265399784091094, "grad_norm": 12.194074630737305, "learning_rate": 5.486041253872687e-06, "loss": 0.5623, "mean_token_accuracy": 0.8278161734342575, "num_tokens": 82516596.0, "step": 68600 }, { "entropy": 1.84447330981493, "epoch": 0.21268499696596063, "grad_norm": 8.463014602661133, "learning_rate": 5.485641434521665e-06, "loss": 0.4572, "mean_token_accuracy": 0.8529325991868972, "num_tokens": 82529400.0, "step": 68610 }, { "entropy": 1.8710281908512116, "epoch": 0.21271599609101033, "grad_norm": 7.990627765655518, "learning_rate": 5.48524170257368e-06, "loss": 0.4716, "mean_token_accuracy": 0.8378486022353172, "num_tokens": 82541818.0, "step": 68620 }, { "entropy": 1.8808648347854615, "epoch": 0.21274699521606003, "grad_norm": 4.200010776519775, "learning_rate": 5.484842057996887e-06, "loss": 0.4435, "mean_token_accuracy": 0.8450862690806389, "num_tokens": 82554516.0, "step": 68630 }, { "entropy": 1.8946150675415994, "epoch": 0.21277799434110972, "grad_norm": 7.040407657623291, "learning_rate": 5.484442500759464e-06, "loss": 0.4846, "mean_token_accuracy": 0.8425398871302605, "num_tokens": 82566421.0, "step": 68640 }, { "entropy": 1.9813665717840194, "epoch": 0.21280899346615942, "grad_norm": 7.405858516693115, "learning_rate": 5.4840430308296035e-06, "loss": 0.5383, "mean_token_accuracy": 0.8323499038815498, "num_tokens": 82578195.0, "step": 68650 }, { "entropy": 1.9478058412671089, "epoch": 0.21283999259120912, "grad_norm": 4.5492329597473145, "learning_rate": 5.483643648175514e-06, "loss": 0.5035, "mean_token_accuracy": 0.8383038908243179, "num_tokens": 82590022.0, "step": 68660 }, { "entropy": 1.838891714811325, "epoch": 0.2128709917162588, "grad_norm": 7.715846061706543, "learning_rate": 5.48324435276542e-06, "loss": 0.4777, "mean_token_accuracy": 0.8427498295903206, "num_tokens": 82603321.0, "step": 68670 }, { "entropy": 1.8843644648790359, "epoch": 0.2129019908413085, "grad_norm": 9.188892364501953, "learning_rate": 5.482845144567561e-06, "loss": 0.4404, "mean_token_accuracy": 0.8428171291947365, "num_tokens": 82615976.0, "step": 68680 }, { "entropy": 1.9740795180201531, "epoch": 0.2129329899663582, "grad_norm": 4.1340765953063965, "learning_rate": 5.482446023550199e-06, "loss": 0.5321, "mean_token_accuracy": 0.8349632441997528, "num_tokens": 82627312.0, "step": 68690 }, { "entropy": 1.9550517603754998, "epoch": 0.2129639890914079, "grad_norm": 8.036059379577637, "learning_rate": 5.482046989681602e-06, "loss": 0.5156, "mean_token_accuracy": 0.8462382450699806, "num_tokens": 82638817.0, "step": 68700 }, { "entropy": 1.8374310091137886, "epoch": 0.2129949882164576, "grad_norm": 9.189522743225098, "learning_rate": 5.481648042930061e-06, "loss": 0.4203, "mean_token_accuracy": 0.849276214838028, "num_tokens": 82651779.0, "step": 68710 }, { "entropy": 1.9555922120809555, "epoch": 0.2130259873415073, "grad_norm": 8.850881576538086, "learning_rate": 5.481249183263883e-06, "loss": 0.5377, "mean_token_accuracy": 0.8374005898833274, "num_tokens": 82663394.0, "step": 68720 }, { "entropy": 1.9426356315612794, "epoch": 0.213056986466557, "grad_norm": 9.274955749511719, "learning_rate": 5.480850410651389e-06, "loss": 0.5114, "mean_token_accuracy": 0.8367856085300446, "num_tokens": 82674791.0, "step": 68730 }, { "entropy": 1.815965899825096, "epoch": 0.2130879855916067, "grad_norm": 8.32228946685791, "learning_rate": 5.4804517250609165e-06, "loss": 0.4675, "mean_token_accuracy": 0.8415889486670494, "num_tokens": 82688146.0, "step": 68740 }, { "entropy": 1.8485229358077049, "epoch": 0.21311898471665638, "grad_norm": 4.761613845825195, "learning_rate": 5.4800531264608205e-06, "loss": 0.4694, "mean_token_accuracy": 0.8439338058233261, "num_tokens": 82701247.0, "step": 68750 }, { "entropy": 1.9277915149927138, "epoch": 0.21314998384170608, "grad_norm": 9.579967498779297, "learning_rate": 5.47965461481947e-06, "loss": 0.5294, "mean_token_accuracy": 0.8259765520691872, "num_tokens": 82713022.0, "step": 68760 }, { "entropy": 1.950098218023777, "epoch": 0.21318098296675575, "grad_norm": 10.044625282287598, "learning_rate": 5.47925619010525e-06, "loss": 0.5154, "mean_token_accuracy": 0.8383822947740555, "num_tokens": 82723810.0, "step": 68770 }, { "entropy": 1.9772588819265366, "epoch": 0.21321198209180545, "grad_norm": 3.088735342025757, "learning_rate": 5.478857852286567e-06, "loss": 0.5824, "mean_token_accuracy": 0.8151338204741478, "num_tokens": 82735793.0, "step": 68780 }, { "entropy": 1.8972803056240082, "epoch": 0.21324298121685514, "grad_norm": 7.994344711303711, "learning_rate": 5.478459601331835e-06, "loss": 0.4829, "mean_token_accuracy": 0.8478777214884758, "num_tokens": 82747857.0, "step": 68790 }, { "entropy": 1.9425535961985587, "epoch": 0.21327398034190484, "grad_norm": 9.188566207885742, "learning_rate": 5.478061437209491e-06, "loss": 0.5046, "mean_token_accuracy": 0.8370546743273735, "num_tokens": 82759441.0, "step": 68800 }, { "entropy": 1.9374151572585105, "epoch": 0.21330497946695454, "grad_norm": 9.73183822631836, "learning_rate": 5.477663359887986e-06, "loss": 0.5835, "mean_token_accuracy": 0.8232653871178627, "num_tokens": 82771750.0, "step": 68810 }, { "entropy": 1.8426042452454567, "epoch": 0.21333597859200423, "grad_norm": 8.94770622253418, "learning_rate": 5.4772653693357835e-06, "loss": 0.4571, "mean_token_accuracy": 0.8506119534373283, "num_tokens": 82785172.0, "step": 68820 }, { "entropy": 1.9215306863188744, "epoch": 0.21336697771705393, "grad_norm": 8.768072128295898, "learning_rate": 5.476867465521369e-06, "loss": 0.485, "mean_token_accuracy": 0.8429464474320412, "num_tokens": 82796191.0, "step": 68830 }, { "entropy": 1.864244209229946, "epoch": 0.21339797684210363, "grad_norm": 8.412409782409668, "learning_rate": 5.4764696484132394e-06, "loss": 0.4564, "mean_token_accuracy": 0.8489756211638451, "num_tokens": 82808552.0, "step": 68840 }, { "entropy": 1.942490178346634, "epoch": 0.21342897596715332, "grad_norm": 7.980142116546631, "learning_rate": 5.47607191797991e-06, "loss": 0.5294, "mean_token_accuracy": 0.8287267431616783, "num_tokens": 82820097.0, "step": 68850 }, { "entropy": 1.9282173097133637, "epoch": 0.21345997509220302, "grad_norm": 8.005396842956543, "learning_rate": 5.475674274189913e-06, "loss": 0.5122, "mean_token_accuracy": 0.8375453725457191, "num_tokens": 82831508.0, "step": 68860 }, { "entropy": 1.9350904941558837, "epoch": 0.21349097421725272, "grad_norm": 9.574058532714844, "learning_rate": 5.4752767170117924e-06, "loss": 0.5194, "mean_token_accuracy": 0.8478696927428245, "num_tokens": 82842649.0, "step": 68870 }, { "entropy": 1.9676428467035294, "epoch": 0.2135219733423024, "grad_norm": 8.351263046264648, "learning_rate": 5.474879246414112e-06, "loss": 0.526, "mean_token_accuracy": 0.8345917791128159, "num_tokens": 82854006.0, "step": 68880 }, { "entropy": 1.8507450267672538, "epoch": 0.2135529724673521, "grad_norm": 9.128543853759766, "learning_rate": 5.47448186236545e-06, "loss": 0.4949, "mean_token_accuracy": 0.843813742697239, "num_tokens": 82866178.0, "step": 68890 }, { "entropy": 1.9041711300611497, "epoch": 0.2135839715924018, "grad_norm": 8.370091438293457, "learning_rate": 5.474084564834402e-06, "loss": 0.4606, "mean_token_accuracy": 0.8432056456804276, "num_tokens": 82878638.0, "step": 68900 }, { "entropy": 1.7404463455080985, "epoch": 0.2136149707174515, "grad_norm": 9.250161170959473, "learning_rate": 5.473687353789579e-06, "loss": 0.3617, "mean_token_accuracy": 0.8636480793356895, "num_tokens": 82892733.0, "step": 68910 }, { "entropy": 1.8433457240462303, "epoch": 0.2136459698425012, "grad_norm": 8.090189933776855, "learning_rate": 5.473290229199604e-06, "loss": 0.4374, "mean_token_accuracy": 0.8539926618337631, "num_tokens": 82905596.0, "step": 68920 }, { "entropy": 1.9230886220932006, "epoch": 0.2136769689675509, "grad_norm": 9.134618759155273, "learning_rate": 5.472893191033122e-06, "loss": 0.4933, "mean_token_accuracy": 0.8437715753912925, "num_tokens": 82917687.0, "step": 68930 }, { "entropy": 1.9287979647517204, "epoch": 0.2137079680926006, "grad_norm": 9.821601867675781, "learning_rate": 5.472496239258791e-06, "loss": 0.5274, "mean_token_accuracy": 0.839262530207634, "num_tokens": 82928862.0, "step": 68940 }, { "entropy": 1.8872993856668472, "epoch": 0.2137389672176503, "grad_norm": 8.534150123596191, "learning_rate": 5.472099373845285e-06, "loss": 0.4787, "mean_token_accuracy": 0.8436684221029281, "num_tokens": 82940484.0, "step": 68950 }, { "entropy": 1.8906041860580445, "epoch": 0.21376996634269999, "grad_norm": 10.668354988098145, "learning_rate": 5.471702594761294e-06, "loss": 0.4922, "mean_token_accuracy": 0.8390226155519486, "num_tokens": 82952870.0, "step": 68960 }, { "entropy": 1.808730572462082, "epoch": 0.21380096546774968, "grad_norm": 4.311106204986572, "learning_rate": 5.471305901975526e-06, "loss": 0.4422, "mean_token_accuracy": 0.8523236215114594, "num_tokens": 82965433.0, "step": 68970 }, { "entropy": 1.9497309267520904, "epoch": 0.21383196459279938, "grad_norm": 7.436934471130371, "learning_rate": 5.470909295456699e-06, "loss": 0.515, "mean_token_accuracy": 0.843020536005497, "num_tokens": 82976459.0, "step": 68980 }, { "entropy": 1.9538449883460998, "epoch": 0.21386296371784907, "grad_norm": 7.01023530960083, "learning_rate": 5.470512775173554e-06, "loss": 0.5377, "mean_token_accuracy": 0.8403704330325127, "num_tokens": 82987717.0, "step": 68990 }, { "entropy": 1.879340572655201, "epoch": 0.21389396284289877, "grad_norm": 7.733237266540527, "learning_rate": 5.470116341094843e-06, "loss": 0.4839, "mean_token_accuracy": 0.8418584942817688, "num_tokens": 83000323.0, "step": 69000 }, { "entropy": 1.9619764864444733, "epoch": 0.21392496196794847, "grad_norm": 9.5713472366333, "learning_rate": 5.469719993189336e-06, "loss": 0.5458, "mean_token_accuracy": 0.8402963668107987, "num_tokens": 83011596.0, "step": 69010 }, { "entropy": 1.9356929183006286, "epoch": 0.21395596109299814, "grad_norm": 7.573556900024414, "learning_rate": 5.469323731425817e-06, "loss": 0.499, "mean_token_accuracy": 0.8439344793558121, "num_tokens": 83022881.0, "step": 69020 }, { "entropy": 1.8428806871175767, "epoch": 0.21398696021804783, "grad_norm": 4.41666841506958, "learning_rate": 5.46892755577309e-06, "loss": 0.4711, "mean_token_accuracy": 0.845615790784359, "num_tokens": 83035756.0, "step": 69030 }, { "entropy": 1.8916326105594634, "epoch": 0.21401795934309753, "grad_norm": 8.813542366027832, "learning_rate": 5.46853146619997e-06, "loss": 0.485, "mean_token_accuracy": 0.8480862900614738, "num_tokens": 83046777.0, "step": 69040 }, { "entropy": 1.9366579607129097, "epoch": 0.21404895846814723, "grad_norm": 10.812383651733398, "learning_rate": 5.468135462675289e-06, "loss": 0.5598, "mean_token_accuracy": 0.8317944586277009, "num_tokens": 83058798.0, "step": 69050 }, { "entropy": 1.875098566710949, "epoch": 0.21407995759319692, "grad_norm": 7.257170677185059, "learning_rate": 5.467739545167898e-06, "loss": 0.4672, "mean_token_accuracy": 0.8402672663331032, "num_tokens": 83071399.0, "step": 69060 }, { "entropy": 1.9152754202485085, "epoch": 0.21411095671824662, "grad_norm": 8.708121299743652, "learning_rate": 5.467343713646658e-06, "loss": 0.4949, "mean_token_accuracy": 0.8351624384522438, "num_tokens": 83083904.0, "step": 69070 }, { "entropy": 1.938556207716465, "epoch": 0.21414195584329632, "grad_norm": 9.21479606628418, "learning_rate": 5.466947968080452e-06, "loss": 0.493, "mean_token_accuracy": 0.8361433446407318, "num_tokens": 83095844.0, "step": 69080 }, { "entropy": 1.9366083085536956, "epoch": 0.214172954968346, "grad_norm": 8.12380313873291, "learning_rate": 5.466552308438176e-06, "loss": 0.5251, "mean_token_accuracy": 0.8430330127477645, "num_tokens": 83107044.0, "step": 69090 }, { "entropy": 1.9676083683967591, "epoch": 0.2142039540933957, "grad_norm": 8.887046813964844, "learning_rate": 5.466156734688738e-06, "loss": 0.588, "mean_token_accuracy": 0.8293441072106361, "num_tokens": 83118055.0, "step": 69100 }, { "entropy": 1.8269154354929924, "epoch": 0.2142349532184454, "grad_norm": 4.919051647186279, "learning_rate": 5.4657612468010675e-06, "loss": 0.4613, "mean_token_accuracy": 0.8531086966395378, "num_tokens": 83130970.0, "step": 69110 }, { "entropy": 2.0073640048503876, "epoch": 0.2142659523434951, "grad_norm": 9.67685317993164, "learning_rate": 5.46536584474411e-06, "loss": 0.5912, "mean_token_accuracy": 0.8317721515893937, "num_tokens": 83142153.0, "step": 69120 }, { "entropy": 1.9201458156108857, "epoch": 0.2142969514685448, "grad_norm": 11.463045120239258, "learning_rate": 5.464970528486821e-06, "loss": 0.5906, "mean_token_accuracy": 0.8264217585325241, "num_tokens": 83153550.0, "step": 69130 }, { "entropy": 2.0055788159370422, "epoch": 0.2143279505935945, "grad_norm": 9.573265075683594, "learning_rate": 5.464575297998177e-06, "loss": 0.5833, "mean_token_accuracy": 0.8264138385653496, "num_tokens": 83164548.0, "step": 69140 }, { "entropy": 1.8320914067327976, "epoch": 0.2143589497186442, "grad_norm": 7.352808952331543, "learning_rate": 5.464180153247167e-06, "loss": 0.432, "mean_token_accuracy": 0.8488354295492172, "num_tokens": 83177984.0, "step": 69150 }, { "entropy": 1.8535144045948981, "epoch": 0.2143899488436939, "grad_norm": 8.093338012695312, "learning_rate": 5.463785094202798e-06, "loss": 0.5093, "mean_token_accuracy": 0.8474163025617599, "num_tokens": 83190325.0, "step": 69160 }, { "entropy": 1.9603209257125855, "epoch": 0.21442094796874359, "grad_norm": 7.552434921264648, "learning_rate": 5.46339012083409e-06, "loss": 0.549, "mean_token_accuracy": 0.8280288457870484, "num_tokens": 83201141.0, "step": 69170 }, { "entropy": 1.9304186806082726, "epoch": 0.21445194709379328, "grad_norm": 4.334521770477295, "learning_rate": 5.462995233110082e-06, "loss": 0.5573, "mean_token_accuracy": 0.8300809517502785, "num_tokens": 83212796.0, "step": 69180 }, { "entropy": 1.9205600872635842, "epoch": 0.21448294621884298, "grad_norm": 7.847184658050537, "learning_rate": 5.462600430999826e-06, "loss": 0.5061, "mean_token_accuracy": 0.8355577915906907, "num_tokens": 83224491.0, "step": 69190 }, { "entropy": 1.9560988396406174, "epoch": 0.21451394534389268, "grad_norm": 8.206576347351074, "learning_rate": 5.462205714472391e-06, "loss": 0.5586, "mean_token_accuracy": 0.8278788521885871, "num_tokens": 83235792.0, "step": 69200 }, { "entropy": 1.8794623762369156, "epoch": 0.21454494446894237, "grad_norm": 8.750676155090332, "learning_rate": 5.4618110834968615e-06, "loss": 0.4661, "mean_token_accuracy": 0.8406390935182572, "num_tokens": 83247555.0, "step": 69210 }, { "entropy": 1.8585446387529374, "epoch": 0.21457594359399207, "grad_norm": 3.854595422744751, "learning_rate": 5.461416538042337e-06, "loss": 0.4323, "mean_token_accuracy": 0.8608892947435379, "num_tokens": 83259237.0, "step": 69220 }, { "entropy": 1.7895995572209358, "epoch": 0.21460694271904177, "grad_norm": 4.206018447875977, "learning_rate": 5.4610220780779335e-06, "loss": 0.451, "mean_token_accuracy": 0.8465156257152557, "num_tokens": 83272545.0, "step": 69230 }, { "entropy": 1.7522881373763084, "epoch": 0.21463794184409146, "grad_norm": 5.796809673309326, "learning_rate": 5.46062770357278e-06, "loss": 0.4137, "mean_token_accuracy": 0.8636647120118142, "num_tokens": 83285897.0, "step": 69240 }, { "entropy": 1.8799547150731086, "epoch": 0.21466894096914116, "grad_norm": 4.128879070281982, "learning_rate": 5.460233414496026e-06, "loss": 0.4874, "mean_token_accuracy": 0.8423994764685631, "num_tokens": 83297294.0, "step": 69250 }, { "entropy": 1.9117163017392158, "epoch": 0.21469994009419083, "grad_norm": 8.076613426208496, "learning_rate": 5.459839210816833e-06, "loss": 0.5012, "mean_token_accuracy": 0.8484107598662376, "num_tokens": 83308283.0, "step": 69260 }, { "entropy": 1.8744409173727035, "epoch": 0.21473093921924052, "grad_norm": 8.670746803283691, "learning_rate": 5.459445092504379e-06, "loss": 0.4878, "mean_token_accuracy": 0.840450718998909, "num_tokens": 83319959.0, "step": 69270 }, { "entropy": 1.83160520195961, "epoch": 0.21476193834429022, "grad_norm": 9.35455322265625, "learning_rate": 5.459051059527855e-06, "loss": 0.4709, "mean_token_accuracy": 0.8386999174952507, "num_tokens": 83332935.0, "step": 69280 }, { "entropy": 1.9142531588673593, "epoch": 0.21479293746933992, "grad_norm": 9.220640182495117, "learning_rate": 5.458657111856474e-06, "loss": 0.4902, "mean_token_accuracy": 0.8410219177603722, "num_tokens": 83343839.0, "step": 69290 }, { "entropy": 1.859879168868065, "epoch": 0.21482393659438961, "grad_norm": 8.967870712280273, "learning_rate": 5.458263249459458e-06, "loss": 0.4535, "mean_token_accuracy": 0.8425408810377121, "num_tokens": 83356149.0, "step": 69300 }, { "entropy": 1.9017312452197075, "epoch": 0.2148549357194393, "grad_norm": 8.242757797241211, "learning_rate": 5.4578694723060475e-06, "loss": 0.4979, "mean_token_accuracy": 0.8415887534618378, "num_tokens": 83367770.0, "step": 69310 }, { "entropy": 1.8691633999347688, "epoch": 0.214885934844489, "grad_norm": 8.582977294921875, "learning_rate": 5.4574757803654994e-06, "loss": 0.4712, "mean_token_accuracy": 0.850647197663784, "num_tokens": 83378887.0, "step": 69320 }, { "entropy": 1.908175478875637, "epoch": 0.2149169339695387, "grad_norm": 7.830326080322266, "learning_rate": 5.457082173607083e-06, "loss": 0.5134, "mean_token_accuracy": 0.8378239721059799, "num_tokens": 83390350.0, "step": 69330 }, { "entropy": 1.9030898958444595, "epoch": 0.2149479330945884, "grad_norm": 6.693634510040283, "learning_rate": 5.456688652000087e-06, "loss": 0.5441, "mean_token_accuracy": 0.8313490644097328, "num_tokens": 83402282.0, "step": 69340 }, { "entropy": 1.9227802157402039, "epoch": 0.2149789322196381, "grad_norm": 13.45190143585205, "learning_rate": 5.456295215513813e-06, "loss": 0.514, "mean_token_accuracy": 0.8335293993353844, "num_tokens": 83413989.0, "step": 69350 }, { "entropy": 1.957323005795479, "epoch": 0.2150099313446878, "grad_norm": 12.361275672912598, "learning_rate": 5.455901864117576e-06, "loss": 0.5202, "mean_token_accuracy": 0.837789298593998, "num_tokens": 83424597.0, "step": 69360 }, { "entropy": 1.9297591596841812, "epoch": 0.2150409304697375, "grad_norm": 12.451163291931152, "learning_rate": 5.455508597780713e-06, "loss": 0.5219, "mean_token_accuracy": 0.831664165854454, "num_tokens": 83436096.0, "step": 69370 }, { "entropy": 1.9417940333485604, "epoch": 0.2150719295947872, "grad_norm": 8.612032890319824, "learning_rate": 5.455115416472572e-06, "loss": 0.5074, "mean_token_accuracy": 0.8476990014314651, "num_tokens": 83447255.0, "step": 69380 }, { "entropy": 1.9677758157253264, "epoch": 0.21510292871983688, "grad_norm": 7.140786170959473, "learning_rate": 5.454722320162514e-06, "loss": 0.5591, "mean_token_accuracy": 0.8349123015999794, "num_tokens": 83458140.0, "step": 69390 }, { "entropy": 1.8533436045050622, "epoch": 0.21513392784488658, "grad_norm": 8.395837783813477, "learning_rate": 5.454329308819922e-06, "loss": 0.4613, "mean_token_accuracy": 0.835472346842289, "num_tokens": 83470694.0, "step": 69400 }, { "entropy": 1.9218518555164337, "epoch": 0.21516492696993628, "grad_norm": Infinity, "learning_rate": 5.45393638241419e-06, "loss": 0.518, "mean_token_accuracy": 0.8393091827630996, "num_tokens": 83482363.0, "step": 69410 }, { "entropy": 1.9040866911411285, "epoch": 0.21519592609498597, "grad_norm": 7.9769673347473145, "learning_rate": 5.4535435409147265e-06, "loss": 0.4728, "mean_token_accuracy": 0.842051412165165, "num_tokens": 83494631.0, "step": 69420 }, { "entropy": 1.9231672808527946, "epoch": 0.21522692522003567, "grad_norm": 9.245956420898438, "learning_rate": 5.45315078429096e-06, "loss": 0.4454, "mean_token_accuracy": 0.847654415667057, "num_tokens": 83506941.0, "step": 69430 }, { "entropy": 1.9047624886035919, "epoch": 0.21525792434508537, "grad_norm": 9.397164344787598, "learning_rate": 5.452758112512331e-06, "loss": 0.5046, "mean_token_accuracy": 0.8385005071759224, "num_tokens": 83518932.0, "step": 69440 }, { "entropy": 1.8809319645166398, "epoch": 0.21528892347013506, "grad_norm": 10.352240562438965, "learning_rate": 5.452365525548295e-06, "loss": 0.5005, "mean_token_accuracy": 0.8362722516059875, "num_tokens": 83531350.0, "step": 69450 }, { "entropy": 1.8688334867358207, "epoch": 0.21531992259518476, "grad_norm": 4.476759433746338, "learning_rate": 5.451973023368325e-06, "loss": 0.4834, "mean_token_accuracy": 0.8403964400291443, "num_tokens": 83543584.0, "step": 69460 }, { "entropy": 1.9141950756311417, "epoch": 0.21535092172023446, "grad_norm": 8.979772567749023, "learning_rate": 5.451580605941909e-06, "loss": 0.5066, "mean_token_accuracy": 0.8465558111667633, "num_tokens": 83555457.0, "step": 69470 }, { "entropy": 1.9037456959486008, "epoch": 0.21538192084528415, "grad_norm": 12.75918197631836, "learning_rate": 5.451188273238549e-06, "loss": 0.4657, "mean_token_accuracy": 0.8453539073467254, "num_tokens": 83567575.0, "step": 69480 }, { "entropy": 1.886037102341652, "epoch": 0.21541291997033385, "grad_norm": 2.314100503921509, "learning_rate": 5.450796025227764e-06, "loss": 0.463, "mean_token_accuracy": 0.8452310040593147, "num_tokens": 83580210.0, "step": 69490 }, { "entropy": 1.9223955109715463, "epoch": 0.21544391909538355, "grad_norm": 7.768220901489258, "learning_rate": 5.450403861879085e-06, "loss": 0.436, "mean_token_accuracy": 0.8553515776991845, "num_tokens": 83592037.0, "step": 69500 }, { "entropy": 1.8389130905270576, "epoch": 0.21547491822043321, "grad_norm": 3.468709945678711, "learning_rate": 5.450011783162063e-06, "loss": 0.4376, "mean_token_accuracy": 0.8565122127532959, "num_tokens": 83604963.0, "step": 69510 }, { "entropy": 1.9391876861453057, "epoch": 0.2155059173454829, "grad_norm": 7.677327632904053, "learning_rate": 5.449619789046263e-06, "loss": 0.5053, "mean_token_accuracy": 0.8431411743164062, "num_tokens": 83616136.0, "step": 69520 }, { "entropy": 1.923940536379814, "epoch": 0.2155369164705326, "grad_norm": 9.04365348815918, "learning_rate": 5.449227879501263e-06, "loss": 0.4923, "mean_token_accuracy": 0.8554372638463974, "num_tokens": 83627773.0, "step": 69530 }, { "entropy": 1.8013005346059798, "epoch": 0.2155679155955823, "grad_norm": 7.584324359893799, "learning_rate": 5.448836054496658e-06, "loss": 0.4881, "mean_token_accuracy": 0.8366676792502403, "num_tokens": 83642089.0, "step": 69540 }, { "entropy": 1.9511119529604912, "epoch": 0.215598914720632, "grad_norm": 8.205597877502441, "learning_rate": 5.448444314002058e-06, "loss": 0.561, "mean_token_accuracy": 0.8263240188360215, "num_tokens": 83653309.0, "step": 69550 }, { "entropy": 1.8489666640758515, "epoch": 0.2156299138456817, "grad_norm": 9.638209342956543, "learning_rate": 5.448052657987088e-06, "loss": 0.4825, "mean_token_accuracy": 0.8453381776809692, "num_tokens": 83666503.0, "step": 69560 }, { "entropy": 1.9822421237826346, "epoch": 0.2156609129707314, "grad_norm": 7.235997676849365, "learning_rate": 5.4476610864213905e-06, "loss": 0.5225, "mean_token_accuracy": 0.8368374884128571, "num_tokens": 83677549.0, "step": 69570 }, { "entropy": 1.9172227129340171, "epoch": 0.2156919120957811, "grad_norm": 9.961241722106934, "learning_rate": 5.447269599274621e-06, "loss": 0.5732, "mean_token_accuracy": 0.8320469737052918, "num_tokens": 83689740.0, "step": 69580 }, { "entropy": 1.8688659638166427, "epoch": 0.2157229112208308, "grad_norm": 4.677713394165039, "learning_rate": 5.446878196516448e-06, "loss": 0.4483, "mean_token_accuracy": 0.8490154132246971, "num_tokens": 83701923.0, "step": 69590 }, { "entropy": 1.9590770199894905, "epoch": 0.21575391034588048, "grad_norm": 4.404328346252441, "learning_rate": 5.446486878116561e-06, "loss": 0.4984, "mean_token_accuracy": 0.8431831479072571, "num_tokens": 83713981.0, "step": 69600 }, { "entropy": 1.9376450017094613, "epoch": 0.21578490947093018, "grad_norm": 9.207627296447754, "learning_rate": 5.446095644044661e-06, "loss": 0.5232, "mean_token_accuracy": 0.8345242127776146, "num_tokens": 83725681.0, "step": 69610 }, { "entropy": 1.9425865828990936, "epoch": 0.21581590859597988, "grad_norm": 9.27120304107666, "learning_rate": 5.445704494270465e-06, "loss": 0.4888, "mean_token_accuracy": 0.8417545929551125, "num_tokens": 83737057.0, "step": 69620 }, { "entropy": 1.794182512164116, "epoch": 0.21584690772102957, "grad_norm": 9.344676971435547, "learning_rate": 5.445313428763705e-06, "loss": 0.4512, "mean_token_accuracy": 0.8520261868834496, "num_tokens": 83750518.0, "step": 69630 }, { "entropy": 1.8970528110861777, "epoch": 0.21587790684607927, "grad_norm": 10.588910102844238, "learning_rate": 5.444922447494128e-06, "loss": 0.4665, "mean_token_accuracy": 0.8415069609880448, "num_tokens": 83763123.0, "step": 69640 }, { "entropy": 1.9143267586827277, "epoch": 0.21590890597112897, "grad_norm": 8.116543769836426, "learning_rate": 5.444531550431497e-06, "loss": 0.487, "mean_token_accuracy": 0.8432750076055526, "num_tokens": 83774896.0, "step": 69650 }, { "entropy": 1.8820920512080193, "epoch": 0.21593990509617866, "grad_norm": 9.194504737854004, "learning_rate": 5.444140737545589e-06, "loss": 0.4566, "mean_token_accuracy": 0.8378720536828042, "num_tokens": 83786830.0, "step": 69660 }, { "entropy": 1.8950974389910697, "epoch": 0.21597090422122836, "grad_norm": 9.50033187866211, "learning_rate": 5.443750008806198e-06, "loss": 0.5127, "mean_token_accuracy": 0.840063288807869, "num_tokens": 83798450.0, "step": 69670 }, { "entropy": 1.9445108637213706, "epoch": 0.21600190334627806, "grad_norm": 7.2820892333984375, "learning_rate": 5.443359364183132e-06, "loss": 0.5052, "mean_token_accuracy": 0.8457172334194183, "num_tokens": 83809795.0, "step": 69680 }, { "entropy": 1.8929253578186036, "epoch": 0.21603290247132775, "grad_norm": 8.967961311340332, "learning_rate": 5.442968803646214e-06, "loss": 0.4744, "mean_token_accuracy": 0.8517738983035088, "num_tokens": 83821954.0, "step": 69690 }, { "entropy": 1.8933438271284104, "epoch": 0.21606390159637745, "grad_norm": 8.333928108215332, "learning_rate": 5.4425783271652824e-06, "loss": 0.4771, "mean_token_accuracy": 0.8390064716339112, "num_tokens": 83834653.0, "step": 69700 }, { "entropy": 1.8290221512317657, "epoch": 0.21609490072142715, "grad_norm": 4.229883670806885, "learning_rate": 5.44218793471019e-06, "loss": 0.4057, "mean_token_accuracy": 0.8560205176472664, "num_tokens": 83847132.0, "step": 69710 }, { "entropy": 1.9552450522780418, "epoch": 0.21612589984647684, "grad_norm": 8.421517372131348, "learning_rate": 5.441797626250808e-06, "loss": 0.5362, "mean_token_accuracy": 0.8295035645365715, "num_tokens": 83858171.0, "step": 69720 }, { "entropy": 1.912724708020687, "epoch": 0.21615689897152654, "grad_norm": 7.742305278778076, "learning_rate": 5.441407401757017e-06, "loss": 0.5841, "mean_token_accuracy": 0.8269262716174126, "num_tokens": 83870397.0, "step": 69730 }, { "entropy": 1.9444334492087365, "epoch": 0.21618789809657624, "grad_norm": 4.496399402618408, "learning_rate": 5.44101726119872e-06, "loss": 0.4989, "mean_token_accuracy": 0.8388503283262253, "num_tokens": 83882469.0, "step": 69740 }, { "entropy": 1.8968336150050162, "epoch": 0.21621889722162593, "grad_norm": 7.71511697769165, "learning_rate": 5.440627204545827e-06, "loss": 0.4742, "mean_token_accuracy": 0.8507803946733474, "num_tokens": 83894493.0, "step": 69750 }, { "entropy": 1.9784729063510895, "epoch": 0.2162498963466756, "grad_norm": 10.393377304077148, "learning_rate": 5.440237231768271e-06, "loss": 0.5628, "mean_token_accuracy": 0.8280197530984879, "num_tokens": 83905959.0, "step": 69760 }, { "entropy": 1.9202745750546455, "epoch": 0.2162808954717253, "grad_norm": 7.838612079620361, "learning_rate": 5.439847342835992e-06, "loss": 0.517, "mean_token_accuracy": 0.8408389419317246, "num_tokens": 83917828.0, "step": 69770 }, { "entropy": 1.9422619119286537, "epoch": 0.216311894596775, "grad_norm": 3.9136011600494385, "learning_rate": 5.439457537718953e-06, "loss": 0.474, "mean_token_accuracy": 0.8431269928812981, "num_tokens": 83929517.0, "step": 69780 }, { "entropy": 2.040539500117302, "epoch": 0.2163428937218247, "grad_norm": 10.292856216430664, "learning_rate": 5.4390678163871265e-06, "loss": 0.585, "mean_token_accuracy": 0.8160558506846428, "num_tokens": 83940294.0, "step": 69790 }, { "entropy": 2.000251492857933, "epoch": 0.2163738928468744, "grad_norm": 7.672717571258545, "learning_rate": 5.438678178810503e-06, "loss": 0.5194, "mean_token_accuracy": 0.8404903680086135, "num_tokens": 83951400.0, "step": 69800 }, { "entropy": 1.9795056134462357, "epoch": 0.21640489197192408, "grad_norm": 9.820237159729004, "learning_rate": 5.438288624959086e-06, "loss": 0.5402, "mean_token_accuracy": 0.8368617355823517, "num_tokens": 83963042.0, "step": 69810 }, { "entropy": 1.9222991615533829, "epoch": 0.21643589109697378, "grad_norm": 9.5084810256958, "learning_rate": 5.437899154802895e-06, "loss": 0.5009, "mean_token_accuracy": 0.8351835623383522, "num_tokens": 83974409.0, "step": 69820 }, { "entropy": 1.945703822374344, "epoch": 0.21646689022202348, "grad_norm": 11.023735046386719, "learning_rate": 5.437509768311967e-06, "loss": 0.5189, "mean_token_accuracy": 0.8369563966989517, "num_tokens": 83985791.0, "step": 69830 }, { "entropy": 1.8587691098451615, "epoch": 0.21649788934707317, "grad_norm": 3.4747183322906494, "learning_rate": 5.437120465456348e-06, "loss": 0.4441, "mean_token_accuracy": 0.8473239108920098, "num_tokens": 83998427.0, "step": 69840 }, { "entropy": 1.891299197077751, "epoch": 0.21652888847212287, "grad_norm": 7.577733993530273, "learning_rate": 5.436731246206105e-06, "loss": 0.4721, "mean_token_accuracy": 0.846482339501381, "num_tokens": 84010579.0, "step": 69850 }, { "entropy": 1.9225863426923753, "epoch": 0.21655988759717257, "grad_norm": 6.7725324630737305, "learning_rate": 5.436342110531316e-06, "loss": 0.5036, "mean_token_accuracy": 0.8383674845099449, "num_tokens": 84022493.0, "step": 69860 }, { "entropy": 1.955282860994339, "epoch": 0.21659088672222226, "grad_norm": 9.114567756652832, "learning_rate": 5.435953058402078e-06, "loss": 0.5296, "mean_token_accuracy": 0.8373220950365067, "num_tokens": 84033749.0, "step": 69870 }, { "entropy": 2.0027921319007875, "epoch": 0.21662188584727196, "grad_norm": 8.859573364257812, "learning_rate": 5.435564089788498e-06, "loss": 0.5998, "mean_token_accuracy": 0.8251512601971627, "num_tokens": 84044757.0, "step": 69880 }, { "entropy": 1.8767832040786743, "epoch": 0.21665288497232166, "grad_norm": 8.931615829467773, "learning_rate": 5.435175204660702e-06, "loss": 0.516, "mean_token_accuracy": 0.8378856316208839, "num_tokens": 84056798.0, "step": 69890 }, { "entropy": 1.8918719656765461, "epoch": 0.21668388409737135, "grad_norm": 9.630752563476562, "learning_rate": 5.4347864029888284e-06, "loss": 0.4562, "mean_token_accuracy": 0.8510532274842262, "num_tokens": 84068455.0, "step": 69900 }, { "entropy": 1.9158081158995628, "epoch": 0.21671488322242105, "grad_norm": 4.970503807067871, "learning_rate": 5.434397684743034e-06, "loss": 0.4817, "mean_token_accuracy": 0.8565353244543076, "num_tokens": 84080433.0, "step": 69910 }, { "entropy": 1.8488872632384301, "epoch": 0.21674588234747075, "grad_norm": 8.853670120239258, "learning_rate": 5.434009049893485e-06, "loss": 0.4252, "mean_token_accuracy": 0.8482698664069176, "num_tokens": 84093473.0, "step": 69920 }, { "entropy": 1.941627450287342, "epoch": 0.21677688147252044, "grad_norm": 8.593602180480957, "learning_rate": 5.433620498410368e-06, "loss": 0.5158, "mean_token_accuracy": 0.8356386333703995, "num_tokens": 84105334.0, "step": 69930 }, { "entropy": 1.9388030782341956, "epoch": 0.21680788059757014, "grad_norm": 8.22298526763916, "learning_rate": 5.433232030263881e-06, "loss": 0.5375, "mean_token_accuracy": 0.8294572129845619, "num_tokens": 84117279.0, "step": 69940 }, { "entropy": 1.844594794511795, "epoch": 0.21683887972261984, "grad_norm": 9.116217613220215, "learning_rate": 5.432843645424239e-06, "loss": 0.4281, "mean_token_accuracy": 0.8563014373183251, "num_tokens": 84129513.0, "step": 69950 }, { "entropy": 1.860558421909809, "epoch": 0.21686987884766953, "grad_norm": 8.21207332611084, "learning_rate": 5.43245534386167e-06, "loss": 0.449, "mean_token_accuracy": 0.8442764699459075, "num_tokens": 84142071.0, "step": 69960 }, { "entropy": 1.9477815553545952, "epoch": 0.21690087797271923, "grad_norm": 9.044229507446289, "learning_rate": 5.432067125546419e-06, "loss": 0.507, "mean_token_accuracy": 0.8379213094711304, "num_tokens": 84153180.0, "step": 69970 }, { "entropy": 1.9056160926818848, "epoch": 0.21693187709776893, "grad_norm": 8.381840705871582, "learning_rate": 5.431678990448746e-06, "loss": 0.5013, "mean_token_accuracy": 0.8412555903196335, "num_tokens": 84165277.0, "step": 69980 }, { "entropy": 1.9368507653474807, "epoch": 0.21696287622281862, "grad_norm": 9.226876258850098, "learning_rate": 5.431290938538921e-06, "loss": 0.5589, "mean_token_accuracy": 0.8354489773511886, "num_tokens": 84176763.0, "step": 69990 }, { "entropy": 1.9172518044710158, "epoch": 0.21699387534786832, "grad_norm": 4.2976603507995605, "learning_rate": 5.430902969787236e-06, "loss": 0.5022, "mean_token_accuracy": 0.8291812673211097, "num_tokens": 84189241.0, "step": 70000 }, { "entropy": 1.8809144780039788, "epoch": 0.217024874472918, "grad_norm": 7.371922016143799, "learning_rate": 5.430515084163993e-06, "loss": 0.4669, "mean_token_accuracy": 0.8422490224242211, "num_tokens": 84201347.0, "step": 70010 }, { "entropy": 1.951662066578865, "epoch": 0.21705587359796769, "grad_norm": 10.154802322387695, "learning_rate": 5.43012728163951e-06, "loss": 0.5137, "mean_token_accuracy": 0.8398729935288429, "num_tokens": 84213095.0, "step": 70020 }, { "entropy": 1.8219065442681313, "epoch": 0.21708687272301738, "grad_norm": 8.22426700592041, "learning_rate": 5.429739562184121e-06, "loss": 0.4161, "mean_token_accuracy": 0.8590575277805328, "num_tokens": 84225978.0, "step": 70030 }, { "entropy": 1.8591913282871246, "epoch": 0.21711787184806708, "grad_norm": 9.920334815979004, "learning_rate": 5.429351925768173e-06, "loss": 0.4268, "mean_token_accuracy": 0.8464103743433953, "num_tokens": 84239200.0, "step": 70040 }, { "entropy": 1.88853762447834, "epoch": 0.21714887097311678, "grad_norm": 4.088785171508789, "learning_rate": 5.428964372362031e-06, "loss": 0.4223, "mean_token_accuracy": 0.8548205360770226, "num_tokens": 84252272.0, "step": 70050 }, { "entropy": 1.8458364441990853, "epoch": 0.21717987009816647, "grad_norm": 9.360186576843262, "learning_rate": 5.428576901936069e-06, "loss": 0.4356, "mean_token_accuracy": 0.852430385351181, "num_tokens": 84265230.0, "step": 70060 }, { "entropy": 1.9471904829144477, "epoch": 0.21721086922321617, "grad_norm": 3.6564393043518066, "learning_rate": 5.428189514460681e-06, "loss": 0.484, "mean_token_accuracy": 0.8501205831766129, "num_tokens": 84276446.0, "step": 70070 }, { "entropy": 1.8922279462218285, "epoch": 0.21724186834826587, "grad_norm": 3.798719644546509, "learning_rate": 5.427802209906274e-06, "loss": 0.5129, "mean_token_accuracy": 0.838223971426487, "num_tokens": 84288449.0, "step": 70080 }, { "entropy": 1.9325299307703971, "epoch": 0.21727286747331556, "grad_norm": 9.839792251586914, "learning_rate": 5.427414988243273e-06, "loss": 0.5621, "mean_token_accuracy": 0.83007542937994, "num_tokens": 84299853.0, "step": 70090 }, { "entropy": 1.8988226309418679, "epoch": 0.21730386659836526, "grad_norm": 7.719738960266113, "learning_rate": 5.427027849442109e-06, "loss": 0.4806, "mean_token_accuracy": 0.8383999258279801, "num_tokens": 84312190.0, "step": 70100 }, { "entropy": 1.8617843106389045, "epoch": 0.21733486572341496, "grad_norm": 8.315159797668457, "learning_rate": 5.426640793473237e-06, "loss": 0.4371, "mean_token_accuracy": 0.8574797883629799, "num_tokens": 84324677.0, "step": 70110 }, { "entropy": 1.9673965632915498, "epoch": 0.21736586484846465, "grad_norm": 8.235828399658203, "learning_rate": 5.426253820307122e-06, "loss": 0.4979, "mean_token_accuracy": 0.8495687767863274, "num_tokens": 84335718.0, "step": 70120 }, { "entropy": 1.9207556203007699, "epoch": 0.21739686397351435, "grad_norm": 8.374682426452637, "learning_rate": 5.4258669299142465e-06, "loss": 0.4762, "mean_token_accuracy": 0.8351808488368988, "num_tokens": 84347566.0, "step": 70130 }, { "entropy": 1.9010922938585282, "epoch": 0.21742786309856404, "grad_norm": 9.063957214355469, "learning_rate": 5.425480122265106e-06, "loss": 0.501, "mean_token_accuracy": 0.8409019201993942, "num_tokens": 84360542.0, "step": 70140 }, { "entropy": 1.9091226264834404, "epoch": 0.21745886222361374, "grad_norm": 9.379876136779785, "learning_rate": 5.425093397330208e-06, "loss": 0.4669, "mean_token_accuracy": 0.8458316907286644, "num_tokens": 84372835.0, "step": 70150 }, { "entropy": 1.7553482845425605, "epoch": 0.21748986134866344, "grad_norm": 3.750493288040161, "learning_rate": 5.424706755080079e-06, "loss": 0.4027, "mean_token_accuracy": 0.8478049889206887, "num_tokens": 84387102.0, "step": 70160 }, { "entropy": 1.9269262328743935, "epoch": 0.21752086047371313, "grad_norm": 11.567874908447266, "learning_rate": 5.4243201954852605e-06, "loss": 0.5347, "mean_token_accuracy": 0.837294514477253, "num_tokens": 84398445.0, "step": 70170 }, { "entropy": 1.757249329984188, "epoch": 0.21755185959876283, "grad_norm": 2.541597604751587, "learning_rate": 5.423933718516307e-06, "loss": 0.4143, "mean_token_accuracy": 0.8519966840744019, "num_tokens": 84412057.0, "step": 70180 }, { "entropy": 1.9713240012526512, "epoch": 0.21758285872381253, "grad_norm": 9.577204704284668, "learning_rate": 5.423547324143784e-06, "loss": 0.4958, "mean_token_accuracy": 0.8448547378182412, "num_tokens": 84423064.0, "step": 70190 }, { "entropy": 1.9091683775186539, "epoch": 0.21761385784886222, "grad_norm": 7.392279624938965, "learning_rate": 5.423161012338279e-06, "loss": 0.494, "mean_token_accuracy": 0.8476917743682861, "num_tokens": 84435063.0, "step": 70200 }, { "entropy": 1.926077064871788, "epoch": 0.21764485697391192, "grad_norm": 8.991741180419922, "learning_rate": 5.422774783070391e-06, "loss": 0.542, "mean_token_accuracy": 0.8332406044006347, "num_tokens": 84446653.0, "step": 70210 }, { "entropy": 1.9285721063613892, "epoch": 0.21767585609896162, "grad_norm": 4.427052974700928, "learning_rate": 5.42238863631073e-06, "loss": 0.5299, "mean_token_accuracy": 0.8307848066091538, "num_tokens": 84458387.0, "step": 70220 }, { "entropy": 1.8635422542691231, "epoch": 0.21770685522401131, "grad_norm": 8.436079025268555, "learning_rate": 5.422002572029925e-06, "loss": 0.4617, "mean_token_accuracy": 0.8484039083123207, "num_tokens": 84470516.0, "step": 70230 }, { "entropy": 1.9386792957782746, "epoch": 0.217737854349061, "grad_norm": 4.04444694519043, "learning_rate": 5.421616590198619e-06, "loss": 0.479, "mean_token_accuracy": 0.8558227822184563, "num_tokens": 84481818.0, "step": 70240 }, { "entropy": 1.9428984984755515, "epoch": 0.21776885347411068, "grad_norm": 7.593319892883301, "learning_rate": 5.4212306907874705e-06, "loss": 0.4931, "mean_token_accuracy": 0.8404227659106255, "num_tokens": 84493867.0, "step": 70250 }, { "entropy": 1.854594275355339, "epoch": 0.21779985259916038, "grad_norm": 10.006421089172363, "learning_rate": 5.420844873767147e-06, "loss": 0.4851, "mean_token_accuracy": 0.8399219319224358, "num_tokens": 84506467.0, "step": 70260 }, { "entropy": 1.9580897375941277, "epoch": 0.21783085172421007, "grad_norm": 9.24285888671875, "learning_rate": 5.420459139108339e-06, "loss": 0.5512, "mean_token_accuracy": 0.8321612671017646, "num_tokens": 84517655.0, "step": 70270 }, { "entropy": 1.9946820080280303, "epoch": 0.21786185084925977, "grad_norm": 9.095592498779297, "learning_rate": 5.420073486781746e-06, "loss": 0.5713, "mean_token_accuracy": 0.8361142173409462, "num_tokens": 84527947.0, "step": 70280 }, { "entropy": 1.819739530980587, "epoch": 0.21789284997430947, "grad_norm": 9.337332725524902, "learning_rate": 5.419687916758083e-06, "loss": 0.414, "mean_token_accuracy": 0.857285912334919, "num_tokens": 84540966.0, "step": 70290 }, { "entropy": 1.959464368224144, "epoch": 0.21792384909935916, "grad_norm": 9.26564884185791, "learning_rate": 5.419302429008081e-06, "loss": 0.5639, "mean_token_accuracy": 0.8280451089143753, "num_tokens": 84552392.0, "step": 70300 }, { "entropy": 1.824575850367546, "epoch": 0.21795484822440886, "grad_norm": 8.046067237854004, "learning_rate": 5.418917023502482e-06, "loss": 0.496, "mean_token_accuracy": 0.8442411884665489, "num_tokens": 84565104.0, "step": 70310 }, { "entropy": 1.9438457012176513, "epoch": 0.21798584734945856, "grad_norm": 8.829744338989258, "learning_rate": 5.41853170021205e-06, "loss": 0.5719, "mean_token_accuracy": 0.8261488035321236, "num_tokens": 84576914.0, "step": 70320 }, { "entropy": 1.9732747316360473, "epoch": 0.21801684647450825, "grad_norm": 8.278334617614746, "learning_rate": 5.418146459107554e-06, "loss": 0.5788, "mean_token_accuracy": 0.8282134965062141, "num_tokens": 84588102.0, "step": 70330 }, { "entropy": 1.9039635524153709, "epoch": 0.21804784559955795, "grad_norm": 7.718066692352295, "learning_rate": 5.417761300159784e-06, "loss": 0.4893, "mean_token_accuracy": 0.8389883413910866, "num_tokens": 84599955.0, "step": 70340 }, { "entropy": 1.910678581893444, "epoch": 0.21807884472460765, "grad_norm": 3.6842432022094727, "learning_rate": 5.417376223339544e-06, "loss": 0.5259, "mean_token_accuracy": 0.8448718905448913, "num_tokens": 84611103.0, "step": 70350 }, { "entropy": 1.8811346575617791, "epoch": 0.21810984384965734, "grad_norm": 10.019588470458984, "learning_rate": 5.41699122861765e-06, "loss": 0.5111, "mean_token_accuracy": 0.8373315662145615, "num_tokens": 84622901.0, "step": 70360 }, { "entropy": 1.751522246003151, "epoch": 0.21814084297470704, "grad_norm": 9.472254753112793, "learning_rate": 5.416606315964937e-06, "loss": 0.4021, "mean_token_accuracy": 0.8536264047026634, "num_tokens": 84636784.0, "step": 70370 }, { "entropy": 1.9128979295492172, "epoch": 0.21817184209975674, "grad_norm": 7.596695899963379, "learning_rate": 5.416221485352247e-06, "loss": 0.5573, "mean_token_accuracy": 0.8282625824213028, "num_tokens": 84649019.0, "step": 70380 }, { "entropy": 1.90406776368618, "epoch": 0.21820284122480643, "grad_norm": 9.435015678405762, "learning_rate": 5.415836736750441e-06, "loss": 0.4832, "mean_token_accuracy": 0.8421972319483757, "num_tokens": 84660281.0, "step": 70390 }, { "entropy": 1.77169189453125, "epoch": 0.21823384034985613, "grad_norm": 3.901792287826538, "learning_rate": 5.415452070130397e-06, "loss": 0.3818, "mean_token_accuracy": 0.8625829800963402, "num_tokens": 84673251.0, "step": 70400 }, { "entropy": 1.9100979268550873, "epoch": 0.21826483947490583, "grad_norm": 8.25220012664795, "learning_rate": 5.415067485463005e-06, "loss": 0.4971, "mean_token_accuracy": 0.8386782616376877, "num_tokens": 84685476.0, "step": 70410 }, { "entropy": 1.8133963495492935, "epoch": 0.21829583859995552, "grad_norm": 3.5599453449249268, "learning_rate": 5.414682982719167e-06, "loss": 0.4691, "mean_token_accuracy": 0.8472143262624741, "num_tokens": 84698468.0, "step": 70420 }, { "entropy": 1.9142759516835213, "epoch": 0.21832683772500522, "grad_norm": 7.450267314910889, "learning_rate": 5.414298561869803e-06, "loss": 0.4914, "mean_token_accuracy": 0.8527796879410744, "num_tokens": 84709995.0, "step": 70430 }, { "entropy": 1.9114891082048415, "epoch": 0.21835783685005491, "grad_norm": 8.194131851196289, "learning_rate": 5.413914222885847e-06, "loss": 0.5451, "mean_token_accuracy": 0.838103885948658, "num_tokens": 84720745.0, "step": 70440 }, { "entropy": 1.8565336361527442, "epoch": 0.2183888359751046, "grad_norm": 7.725749969482422, "learning_rate": 5.413529965738245e-06, "loss": 0.4834, "mean_token_accuracy": 0.8396166414022446, "num_tokens": 84732619.0, "step": 70450 }, { "entropy": 1.933957888185978, "epoch": 0.2184198351001543, "grad_norm": 9.328766822814941, "learning_rate": 5.41314579039796e-06, "loss": 0.5431, "mean_token_accuracy": 0.8291135087609292, "num_tokens": 84744044.0, "step": 70460 }, { "entropy": 1.881012487411499, "epoch": 0.218450834225204, "grad_norm": 7.425388813018799, "learning_rate": 5.412761696835969e-06, "loss": 0.5131, "mean_token_accuracy": 0.8454090863466263, "num_tokens": 84756091.0, "step": 70470 }, { "entropy": 1.8883946731686592, "epoch": 0.2184818333502537, "grad_norm": 7.838623523712158, "learning_rate": 5.412377685023262e-06, "loss": 0.4852, "mean_token_accuracy": 0.8344195529818534, "num_tokens": 84768181.0, "step": 70480 }, { "entropy": 1.8858201622962951, "epoch": 0.2185128324753034, "grad_norm": 8.257635116577148, "learning_rate": 5.411993754930844e-06, "loss": 0.4949, "mean_token_accuracy": 0.8470271736383438, "num_tokens": 84779613.0, "step": 70490 }, { "entropy": 1.87155821621418, "epoch": 0.21854383160035307, "grad_norm": 11.120357513427734, "learning_rate": 5.411609906529737e-06, "loss": 0.5173, "mean_token_accuracy": 0.8390228837728501, "num_tokens": 84792236.0, "step": 70500 }, { "entropy": 1.836224192380905, "epoch": 0.21857483072540276, "grad_norm": 9.826521873474121, "learning_rate": 5.411226139790973e-06, "loss": 0.4767, "mean_token_accuracy": 0.8449719101190567, "num_tokens": 84805022.0, "step": 70510 }, { "entropy": 1.9778549373149872, "epoch": 0.21860582985045246, "grad_norm": 9.500571250915527, "learning_rate": 5.410842454685601e-06, "loss": 0.6753, "mean_token_accuracy": 0.8305225998163224, "num_tokens": 84816577.0, "step": 70520 }, { "entropy": 1.8446165218949317, "epoch": 0.21863682897550216, "grad_norm": 9.461227416992188, "learning_rate": 5.4104588511846846e-06, "loss": 0.4405, "mean_token_accuracy": 0.8477078288793564, "num_tokens": 84829132.0, "step": 70530 }, { "entropy": 1.859433715045452, "epoch": 0.21866782810055185, "grad_norm": 7.800492286682129, "learning_rate": 5.410075329259299e-06, "loss": 0.4696, "mean_token_accuracy": 0.8472880437970162, "num_tokens": 84841353.0, "step": 70540 }, { "entropy": 1.9165452167391777, "epoch": 0.21869882722560155, "grad_norm": 8.318398475646973, "learning_rate": 5.4096918888805385e-06, "loss": 0.5168, "mean_token_accuracy": 0.8420958235859871, "num_tokens": 84852397.0, "step": 70550 }, { "entropy": 1.8914957597851754, "epoch": 0.21872982635065125, "grad_norm": 9.693846702575684, "learning_rate": 5.409308530019507e-06, "loss": 0.5111, "mean_token_accuracy": 0.8405283436179161, "num_tokens": 84863491.0, "step": 70560 }, { "entropy": 1.945120519399643, "epoch": 0.21876082547570094, "grad_norm": 9.057845115661621, "learning_rate": 5.408925252647326e-06, "loss": 0.4925, "mean_token_accuracy": 0.8449465945363045, "num_tokens": 84874898.0, "step": 70570 }, { "entropy": 1.9313878312706947, "epoch": 0.21879182460075064, "grad_norm": 8.234183311462402, "learning_rate": 5.408542056735129e-06, "loss": 0.5658, "mean_token_accuracy": 0.8252523899078369, "num_tokens": 84886394.0, "step": 70580 }, { "entropy": 1.9243649192154408, "epoch": 0.21882282372580034, "grad_norm": 8.189180374145508, "learning_rate": 5.408158942254065e-06, "loss": 0.5435, "mean_token_accuracy": 0.8330753937363624, "num_tokens": 84898510.0, "step": 70590 }, { "entropy": 1.8462709829211235, "epoch": 0.21885382285085003, "grad_norm": 7.798243522644043, "learning_rate": 5.407775909175298e-06, "loss": 0.4693, "mean_token_accuracy": 0.8495460674166679, "num_tokens": 84910723.0, "step": 70600 }, { "entropy": 1.8084265619516373, "epoch": 0.21888482197589973, "grad_norm": 10.60255241394043, "learning_rate": 5.407392957470005e-06, "loss": 0.418, "mean_token_accuracy": 0.8456479325890541, "num_tokens": 84923960.0, "step": 70610 }, { "entropy": 1.880794422328472, "epoch": 0.21891582110094943, "grad_norm": 4.523313045501709, "learning_rate": 5.4070100871093764e-06, "loss": 0.4671, "mean_token_accuracy": 0.8479678109288216, "num_tokens": 84935570.0, "step": 70620 }, { "entropy": 1.8770657986402512, "epoch": 0.21894682022599912, "grad_norm": 2.5397913455963135, "learning_rate": 5.406627298064622e-06, "loss": 0.5017, "mean_token_accuracy": 0.8388467326760292, "num_tokens": 84948444.0, "step": 70630 }, { "entropy": 1.946268130838871, "epoch": 0.21897781935104882, "grad_norm": 9.20574951171875, "learning_rate": 5.406244590306958e-06, "loss": 0.5068, "mean_token_accuracy": 0.8426952421665191, "num_tokens": 84959437.0, "step": 70640 }, { "entropy": 1.9465517818927764, "epoch": 0.21900881847609852, "grad_norm": 9.563549995422363, "learning_rate": 5.405861963807622e-06, "loss": 0.5409, "mean_token_accuracy": 0.834696726500988, "num_tokens": 84970287.0, "step": 70650 }, { "entropy": 1.9270946726202964, "epoch": 0.2190398176011482, "grad_norm": 7.783606052398682, "learning_rate": 5.4054794185378615e-06, "loss": 0.5061, "mean_token_accuracy": 0.8351917177438736, "num_tokens": 84982065.0, "step": 70660 }, { "entropy": 1.8967857383191586, "epoch": 0.2190708167261979, "grad_norm": 8.703535079956055, "learning_rate": 5.405096954468938e-06, "loss": 0.4889, "mean_token_accuracy": 0.8460932150483131, "num_tokens": 84993902.0, "step": 70670 }, { "entropy": 1.9372851148247718, "epoch": 0.2191018158512476, "grad_norm": 4.806000232696533, "learning_rate": 5.4047145715721315e-06, "loss": 0.566, "mean_token_accuracy": 0.8275101691484451, "num_tokens": 85005220.0, "step": 70680 }, { "entropy": 1.88345315605402, "epoch": 0.2191328149762973, "grad_norm": 7.77454948425293, "learning_rate": 5.404332269818732e-06, "loss": 0.4613, "mean_token_accuracy": 0.8458261653780937, "num_tokens": 85017018.0, "step": 70690 }, { "entropy": 1.7957565858960152, "epoch": 0.219163814101347, "grad_norm": 8.677156448364258, "learning_rate": 5.403950049180046e-06, "loss": 0.4538, "mean_token_accuracy": 0.8442620366811753, "num_tokens": 85030620.0, "step": 70700 }, { "entropy": 1.8943561017513275, "epoch": 0.2191948132263967, "grad_norm": 7.602232933044434, "learning_rate": 5.403567909627393e-06, "loss": 0.5309, "mean_token_accuracy": 0.8428284287452698, "num_tokens": 85041835.0, "step": 70710 }, { "entropy": 1.9183341562747955, "epoch": 0.2192258123514464, "grad_norm": 8.18143367767334, "learning_rate": 5.4031858511321065e-06, "loss": 0.5451, "mean_token_accuracy": 0.8351936027407646, "num_tokens": 85053607.0, "step": 70720 }, { "entropy": 1.8503445282578468, "epoch": 0.2192568114764961, "grad_norm": 8.790541648864746, "learning_rate": 5.402803873665535e-06, "loss": 0.414, "mean_token_accuracy": 0.8500261098146439, "num_tokens": 85065898.0, "step": 70730 }, { "entropy": 1.869480137526989, "epoch": 0.21928781060154579, "grad_norm": 10.168290138244629, "learning_rate": 5.402421977199042e-06, "loss": 0.4884, "mean_token_accuracy": 0.838524155318737, "num_tokens": 85078033.0, "step": 70740 }, { "entropy": 1.9085437297821044, "epoch": 0.21931880972659545, "grad_norm": 8.224616050720215, "learning_rate": 5.402040161704004e-06, "loss": 0.4843, "mean_token_accuracy": 0.8398436903953552, "num_tokens": 85090009.0, "step": 70750 }, { "entropy": 1.8690147161483766, "epoch": 0.21934980885164515, "grad_norm": 7.7431416511535645, "learning_rate": 5.4016584271518116e-06, "loss": 0.4382, "mean_token_accuracy": 0.8496010720729827, "num_tokens": 85102267.0, "step": 70760 }, { "entropy": 1.908894456923008, "epoch": 0.21938080797669485, "grad_norm": 9.895110130310059, "learning_rate": 5.401276773513869e-06, "loss": 0.5064, "mean_token_accuracy": 0.839865879714489, "num_tokens": 85113947.0, "step": 70770 }, { "entropy": 1.8951399207115174, "epoch": 0.21941180710174454, "grad_norm": 7.593353271484375, "learning_rate": 5.400895200761596e-06, "loss": 0.5009, "mean_token_accuracy": 0.8396402359008789, "num_tokens": 85125637.0, "step": 70780 }, { "entropy": 1.8372249327600003, "epoch": 0.21944280622679424, "grad_norm": 10.441277503967285, "learning_rate": 5.400513708866425e-06, "loss": 0.4661, "mean_token_accuracy": 0.8503782153129578, "num_tokens": 85138593.0, "step": 70790 }, { "entropy": 1.9151126846671105, "epoch": 0.21947380535184394, "grad_norm": 11.181161880493164, "learning_rate": 5.400132297799804e-06, "loss": 0.4699, "mean_token_accuracy": 0.8408087193965912, "num_tokens": 85150130.0, "step": 70800 }, { "entropy": 1.9188938543200493, "epoch": 0.21950480447689363, "grad_norm": 7.681055545806885, "learning_rate": 5.399750967533195e-06, "loss": 0.5204, "mean_token_accuracy": 0.8343793347477912, "num_tokens": 85161618.0, "step": 70810 }, { "entropy": 1.8500541925430298, "epoch": 0.21953580360194333, "grad_norm": 9.042118072509766, "learning_rate": 5.399369718038073e-06, "loss": 0.5029, "mean_token_accuracy": 0.8330987498164177, "num_tokens": 85174263.0, "step": 70820 }, { "entropy": 1.8322535023093223, "epoch": 0.21956680272699303, "grad_norm": 4.178494930267334, "learning_rate": 5.398988549285927e-06, "loss": 0.4795, "mean_token_accuracy": 0.8345492288470269, "num_tokens": 85187028.0, "step": 70830 }, { "entropy": 1.8351117119193077, "epoch": 0.21959780185204272, "grad_norm": 4.005007743835449, "learning_rate": 5.398607461248263e-06, "loss": 0.4246, "mean_token_accuracy": 0.8548778548836709, "num_tokens": 85199485.0, "step": 70840 }, { "entropy": 1.8513669028878212, "epoch": 0.21962880097709242, "grad_norm": 8.433499336242676, "learning_rate": 5.398226453896596e-06, "loss": 0.5017, "mean_token_accuracy": 0.8487704336643219, "num_tokens": 85211879.0, "step": 70850 }, { "entropy": 1.8096089884638786, "epoch": 0.21965980010214212, "grad_norm": 10.63721752166748, "learning_rate": 5.39784552720246e-06, "loss": 0.4147, "mean_token_accuracy": 0.8548229023814201, "num_tokens": 85224815.0, "step": 70860 }, { "entropy": 1.8579236298799515, "epoch": 0.2196907992271918, "grad_norm": 8.522771835327148, "learning_rate": 5.3974646811373986e-06, "loss": 0.4972, "mean_token_accuracy": 0.8453032687306404, "num_tokens": 85236804.0, "step": 70870 }, { "entropy": 1.8571562334895133, "epoch": 0.2197217983522415, "grad_norm": 8.076016426086426, "learning_rate": 5.397083915672975e-06, "loss": 0.4687, "mean_token_accuracy": 0.8438248604536056, "num_tokens": 85248947.0, "step": 70880 }, { "entropy": 1.851157009601593, "epoch": 0.2197527974772912, "grad_norm": 9.514842987060547, "learning_rate": 5.396703230780761e-06, "loss": 0.4632, "mean_token_accuracy": 0.8403349593281746, "num_tokens": 85261568.0, "step": 70890 }, { "entropy": 1.8390909820795058, "epoch": 0.2197837966023409, "grad_norm": 9.045916557312012, "learning_rate": 5.396322626432345e-06, "loss": 0.4379, "mean_token_accuracy": 0.8455731064081192, "num_tokens": 85274322.0, "step": 70900 }, { "entropy": 1.9245983332395553, "epoch": 0.2198147957273906, "grad_norm": 7.920559406280518, "learning_rate": 5.39594210259933e-06, "loss": 0.5211, "mean_token_accuracy": 0.8428880482912063, "num_tokens": 85285617.0, "step": 70910 }, { "entropy": 1.8781443014740944, "epoch": 0.2198457948524403, "grad_norm": 4.677145481109619, "learning_rate": 5.395561659253331e-06, "loss": 0.4892, "mean_token_accuracy": 0.8449257969856262, "num_tokens": 85297208.0, "step": 70920 }, { "entropy": 1.9284519299864769, "epoch": 0.21987679397749, "grad_norm": 3.986211061477661, "learning_rate": 5.395181296365979e-06, "loss": 0.5201, "mean_token_accuracy": 0.8414773017168045, "num_tokens": 85308876.0, "step": 70930 }, { "entropy": 1.7243415489792824, "epoch": 0.2199077931025397, "grad_norm": 4.131589412689209, "learning_rate": 5.394801013908917e-06, "loss": 0.3716, "mean_token_accuracy": 0.865143957734108, "num_tokens": 85323468.0, "step": 70940 }, { "entropy": 1.9777496784925461, "epoch": 0.21993879222758939, "grad_norm": 7.485686779022217, "learning_rate": 5.394420811853805e-06, "loss": 0.5543, "mean_token_accuracy": 0.8345079109072685, "num_tokens": 85334331.0, "step": 70950 }, { "entropy": 1.9227191910147667, "epoch": 0.21996979135263908, "grad_norm": 4.045227527618408, "learning_rate": 5.394040690172313e-06, "loss": 0.5687, "mean_token_accuracy": 0.8265855267643929, "num_tokens": 85346105.0, "step": 70960 }, { "entropy": 1.9031540900468826, "epoch": 0.22000079047768878, "grad_norm": 4.864028453826904, "learning_rate": 5.393660648836128e-06, "loss": 0.4885, "mean_token_accuracy": 0.8400901973247528, "num_tokens": 85358132.0, "step": 70970 }, { "entropy": 1.98341506421566, "epoch": 0.22003178960273848, "grad_norm": 8.267914772033691, "learning_rate": 5.393280687816951e-06, "loss": 0.5651, "mean_token_accuracy": 0.8373934581875802, "num_tokens": 85369306.0, "step": 70980 }, { "entropy": 1.9509702712297439, "epoch": 0.22006278872778814, "grad_norm": 8.570839881896973, "learning_rate": 5.392900807086495e-06, "loss": 0.5279, "mean_token_accuracy": 0.8445903062820435, "num_tokens": 85380557.0, "step": 70990 }, { "entropy": 1.9639266401529312, "epoch": 0.22009378785283784, "grad_norm": 9.221185684204102, "learning_rate": 5.392521006616488e-06, "loss": 0.4951, "mean_token_accuracy": 0.8395860701799392, "num_tokens": 85392420.0, "step": 71000 }, { "entropy": 1.9102492406964302, "epoch": 0.22012478697788754, "grad_norm": 3.7444820404052734, "learning_rate": 5.392141286378672e-06, "loss": 0.497, "mean_token_accuracy": 0.8374612465500831, "num_tokens": 85404213.0, "step": 71010 }, { "entropy": 1.8868770197033882, "epoch": 0.22015578610293723, "grad_norm": 2.842402219772339, "learning_rate": 5.391761646344802e-06, "loss": 0.4818, "mean_token_accuracy": 0.8464625418186188, "num_tokens": 85416634.0, "step": 71020 }, { "entropy": 1.8559535294771194, "epoch": 0.22018678522798693, "grad_norm": 8.821431159973145, "learning_rate": 5.391382086486649e-06, "loss": 0.5071, "mean_token_accuracy": 0.8445154055953026, "num_tokens": 85430022.0, "step": 71030 }, { "entropy": 1.9842773735523225, "epoch": 0.22021778435303663, "grad_norm": 8.328567504882812, "learning_rate": 5.391002606775996e-06, "loss": 0.5205, "mean_token_accuracy": 0.8406598642468452, "num_tokens": 85441410.0, "step": 71040 }, { "entropy": 1.9358683452010155, "epoch": 0.22024878347808632, "grad_norm": 9.156634330749512, "learning_rate": 5.3906232071846385e-06, "loss": 0.4785, "mean_token_accuracy": 0.8384227842092514, "num_tokens": 85453655.0, "step": 71050 }, { "entropy": 1.8562579780817032, "epoch": 0.22027978260313602, "grad_norm": 7.700805187225342, "learning_rate": 5.390243887684392e-06, "loss": 0.4197, "mean_token_accuracy": 0.854043036699295, "num_tokens": 85466539.0, "step": 71060 }, { "entropy": 1.9701458364725113, "epoch": 0.22031078172818572, "grad_norm": 7.824542045593262, "learning_rate": 5.38986464824708e-06, "loss": 0.5203, "mean_token_accuracy": 0.8268503859639168, "num_tokens": 85478191.0, "step": 71070 }, { "entropy": 1.9239497467875482, "epoch": 0.2203417808532354, "grad_norm": 10.349506378173828, "learning_rate": 5.3894854888445415e-06, "loss": 0.5199, "mean_token_accuracy": 0.8427152633666992, "num_tokens": 85490546.0, "step": 71080 }, { "entropy": 1.9393352545797824, "epoch": 0.2203727799782851, "grad_norm": 8.5408935546875, "learning_rate": 5.389106409448628e-06, "loss": 0.4947, "mean_token_accuracy": 0.8323530539870262, "num_tokens": 85502425.0, "step": 71090 }, { "entropy": 1.997844734787941, "epoch": 0.2204037791033348, "grad_norm": 7.891308307647705, "learning_rate": 5.38872741003121e-06, "loss": 0.5502, "mean_token_accuracy": 0.8281294673681259, "num_tokens": 85513168.0, "step": 71100 }, { "entropy": 1.9478082045912744, "epoch": 0.2204347782283845, "grad_norm": 8.2448091506958, "learning_rate": 5.388348490564164e-06, "loss": 0.5257, "mean_token_accuracy": 0.8402855768799782, "num_tokens": 85524556.0, "step": 71110 }, { "entropy": 1.931041233241558, "epoch": 0.2204657773534342, "grad_norm": 9.231786727905273, "learning_rate": 5.387969651019387e-06, "loss": 0.4919, "mean_token_accuracy": 0.8433810248970985, "num_tokens": 85536049.0, "step": 71120 }, { "entropy": 1.9640550106763839, "epoch": 0.2204967764784839, "grad_norm": 8.980378150939941, "learning_rate": 5.387590891368787e-06, "loss": 0.5885, "mean_token_accuracy": 0.8172830164432525, "num_tokens": 85547228.0, "step": 71130 }, { "entropy": 1.8748082131147386, "epoch": 0.2205277756035336, "grad_norm": 8.130640983581543, "learning_rate": 5.387212211584286e-06, "loss": 0.4647, "mean_token_accuracy": 0.838733246922493, "num_tokens": 85559551.0, "step": 71140 }, { "entropy": 1.8640730693936347, "epoch": 0.2205587747285833, "grad_norm": 7.803213596343994, "learning_rate": 5.386833611637822e-06, "loss": 0.5066, "mean_token_accuracy": 0.844725139439106, "num_tokens": 85571361.0, "step": 71150 }, { "entropy": 1.8213941514492036, "epoch": 0.220589773853633, "grad_norm": 9.419188499450684, "learning_rate": 5.386455091501342e-06, "loss": 0.447, "mean_token_accuracy": 0.8451396659016609, "num_tokens": 85584550.0, "step": 71160 }, { "entropy": 1.8446722000837326, "epoch": 0.22062077297868268, "grad_norm": 9.770685195922852, "learning_rate": 5.3860766511468095e-06, "loss": 0.4354, "mean_token_accuracy": 0.849824532866478, "num_tokens": 85597173.0, "step": 71170 }, { "entropy": 1.8369273975491525, "epoch": 0.22065177210373238, "grad_norm": 7.802650451660156, "learning_rate": 5.385698290546205e-06, "loss": 0.4305, "mean_token_accuracy": 0.8467299669981003, "num_tokens": 85611218.0, "step": 71180 }, { "entropy": 1.9682567581534385, "epoch": 0.22068277122878208, "grad_norm": 8.1614408493042, "learning_rate": 5.3853200096715175e-06, "loss": 0.5049, "mean_token_accuracy": 0.8387658104300499, "num_tokens": 85622538.0, "step": 71190 }, { "entropy": 1.8726427100598813, "epoch": 0.22071377035383177, "grad_norm": 7.950981616973877, "learning_rate": 5.384941808494753e-06, "loss": 0.4462, "mean_token_accuracy": 0.8444047793745995, "num_tokens": 85635421.0, "step": 71200 }, { "entropy": 1.893759909272194, "epoch": 0.22074476947888147, "grad_norm": 8.878076553344727, "learning_rate": 5.384563686987928e-06, "loss": 0.4765, "mean_token_accuracy": 0.8406562879681587, "num_tokens": 85648124.0, "step": 71210 }, { "entropy": 1.9666777163743974, "epoch": 0.22077576860393117, "grad_norm": 7.653510093688965, "learning_rate": 5.384185645123078e-06, "loss": 0.5379, "mean_token_accuracy": 0.840550334751606, "num_tokens": 85659636.0, "step": 71220 }, { "entropy": 1.898690427839756, "epoch": 0.22080676772898086, "grad_norm": 4.228546619415283, "learning_rate": 5.383807682872247e-06, "loss": 0.4725, "mean_token_accuracy": 0.8438500598073005, "num_tokens": 85671836.0, "step": 71230 }, { "entropy": 1.9357111945748329, "epoch": 0.22083776685403053, "grad_norm": 5.342586517333984, "learning_rate": 5.383429800207497e-06, "loss": 0.5501, "mean_token_accuracy": 0.8345894768834115, "num_tokens": 85684340.0, "step": 71240 }, { "entropy": 1.9341301143169403, "epoch": 0.22086876597908023, "grad_norm": 8.754364013671875, "learning_rate": 5.3830519971009e-06, "loss": 0.5377, "mean_token_accuracy": 0.8313232839107514, "num_tokens": 85695420.0, "step": 71250 }, { "entropy": 1.9368458151817323, "epoch": 0.22089976510412992, "grad_norm": 8.349538803100586, "learning_rate": 5.382674273524544e-06, "loss": 0.5105, "mean_token_accuracy": 0.8416445463895798, "num_tokens": 85707345.0, "step": 71260 }, { "entropy": 1.7997360065579415, "epoch": 0.22093076422917962, "grad_norm": 8.042038917541504, "learning_rate": 5.382296629450529e-06, "loss": 0.4267, "mean_token_accuracy": 0.845326641201973, "num_tokens": 85720742.0, "step": 71270 }, { "entropy": 1.958139282464981, "epoch": 0.22096176335422932, "grad_norm": 10.129528045654297, "learning_rate": 5.3819190648509714e-06, "loss": 0.555, "mean_token_accuracy": 0.8313161879777908, "num_tokens": 85730990.0, "step": 71280 }, { "entropy": 1.959393960237503, "epoch": 0.22099276247927901, "grad_norm": 8.495170593261719, "learning_rate": 5.381541579697999e-06, "loss": 0.5183, "mean_token_accuracy": 0.8422238364815712, "num_tokens": 85742428.0, "step": 71290 }, { "entropy": 1.851896020770073, "epoch": 0.2210237616043287, "grad_norm": 8.348380088806152, "learning_rate": 5.381164173963755e-06, "loss": 0.4896, "mean_token_accuracy": 0.8479127526283264, "num_tokens": 85755347.0, "step": 71300 }, { "entropy": 1.875650581717491, "epoch": 0.2210547607293784, "grad_norm": 3.1359446048736572, "learning_rate": 5.380786847620394e-06, "loss": 0.5207, "mean_token_accuracy": 0.8393297314643859, "num_tokens": 85767657.0, "step": 71310 }, { "entropy": 1.8281921789050102, "epoch": 0.2210857598544281, "grad_norm": 9.828446388244629, "learning_rate": 5.3804096006400844e-06, "loss": 0.4102, "mean_token_accuracy": 0.8533445596694946, "num_tokens": 85780812.0, "step": 71320 }, { "entropy": 1.8348768278956413, "epoch": 0.2211167589794778, "grad_norm": 8.210378646850586, "learning_rate": 5.380032432995013e-06, "loss": 0.4545, "mean_token_accuracy": 0.8492519795894623, "num_tokens": 85794004.0, "step": 71330 }, { "entropy": 1.8405587255954743, "epoch": 0.2211477581045275, "grad_norm": 8.21066951751709, "learning_rate": 5.379655344657373e-06, "loss": 0.427, "mean_token_accuracy": 0.8470315858721733, "num_tokens": 85807268.0, "step": 71340 }, { "entropy": 1.908400295674801, "epoch": 0.2211787572295772, "grad_norm": 8.391548156738281, "learning_rate": 5.379278335599377e-06, "loss": 0.5263, "mean_token_accuracy": 0.8420346319675446, "num_tokens": 85819177.0, "step": 71350 }, { "entropy": 1.929537844657898, "epoch": 0.2212097563546269, "grad_norm": 9.014862060546875, "learning_rate": 5.378901405793249e-06, "loss": 0.5488, "mean_token_accuracy": 0.8366276562213898, "num_tokens": 85830729.0, "step": 71360 }, { "entropy": 1.9423951536417008, "epoch": 0.2212407554796766, "grad_norm": 7.291965484619141, "learning_rate": 5.378524555211225e-06, "loss": 0.5274, "mean_token_accuracy": 0.8441259831190109, "num_tokens": 85841037.0, "step": 71370 }, { "entropy": 1.88891644179821, "epoch": 0.22127175460472628, "grad_norm": 8.193086624145508, "learning_rate": 5.378147783825558e-06, "loss": 0.4727, "mean_token_accuracy": 0.8404372721910477, "num_tokens": 85852678.0, "step": 71380 }, { "entropy": 1.911217801272869, "epoch": 0.22130275372977598, "grad_norm": 6.810231685638428, "learning_rate": 5.3777710916085125e-06, "loss": 0.4821, "mean_token_accuracy": 0.8454770565032959, "num_tokens": 85864412.0, "step": 71390 }, { "entropy": 1.9492921188473702, "epoch": 0.22133375285482568, "grad_norm": 9.480756759643555, "learning_rate": 5.377394478532367e-06, "loss": 0.5085, "mean_token_accuracy": 0.843833090364933, "num_tokens": 85875825.0, "step": 71400 }, { "entropy": 1.8765543788671493, "epoch": 0.22136475197987537, "grad_norm": 10.186013221740723, "learning_rate": 5.377017944569414e-06, "loss": 0.4708, "mean_token_accuracy": 0.8491380795836448, "num_tokens": 85887745.0, "step": 71410 }, { "entropy": 1.9640293627977372, "epoch": 0.22139575110492507, "grad_norm": 4.53644323348999, "learning_rate": 5.376641489691959e-06, "loss": 0.5024, "mean_token_accuracy": 0.8418907791376113, "num_tokens": 85898854.0, "step": 71420 }, { "entropy": 1.9712566941976548, "epoch": 0.22142675022997477, "grad_norm": 8.63244915008545, "learning_rate": 5.37626511387232e-06, "loss": 0.5354, "mean_token_accuracy": 0.8289510354399681, "num_tokens": 85909616.0, "step": 71430 }, { "entropy": 1.9296884000301362, "epoch": 0.22145774935502446, "grad_norm": 8.477029800415039, "learning_rate": 5.375888817082833e-06, "loss": 0.5219, "mean_token_accuracy": 0.8430458202958107, "num_tokens": 85920633.0, "step": 71440 }, { "entropy": 1.8276248887181281, "epoch": 0.22148874848007416, "grad_norm": 4.144367694854736, "learning_rate": 5.37551259929584e-06, "loss": 0.4666, "mean_token_accuracy": 0.8453167825937271, "num_tokens": 85933483.0, "step": 71450 }, { "entropy": 1.9024877399206161, "epoch": 0.22151974760512386, "grad_norm": 7.674788475036621, "learning_rate": 5.375136460483704e-06, "loss": 0.476, "mean_token_accuracy": 0.8434097394347191, "num_tokens": 85945186.0, "step": 71460 }, { "entropy": 1.8075610235333444, "epoch": 0.22155074673017355, "grad_norm": 8.375659942626953, "learning_rate": 5.374760400618798e-06, "loss": 0.4304, "mean_token_accuracy": 0.8462614819407464, "num_tokens": 85958094.0, "step": 71470 }, { "entropy": 1.8221330136060714, "epoch": 0.22158174585522325, "grad_norm": 7.434113025665283, "learning_rate": 5.37438441967351e-06, "loss": 0.4377, "mean_token_accuracy": 0.8467920750379563, "num_tokens": 85971956.0, "step": 71480 }, { "entropy": 1.9221985384821891, "epoch": 0.22161274498027292, "grad_norm": 8.287166595458984, "learning_rate": 5.374008517620237e-06, "loss": 0.5122, "mean_token_accuracy": 0.8432189226150513, "num_tokens": 85982856.0, "step": 71490 }, { "entropy": 1.9569891929626464, "epoch": 0.22164374410532262, "grad_norm": 9.78246021270752, "learning_rate": 5.373632694431396e-06, "loss": 0.563, "mean_token_accuracy": 0.826215885579586, "num_tokens": 85994575.0, "step": 71500 }, { "entropy": 1.9260801374912262, "epoch": 0.2216747432303723, "grad_norm": 7.269822120666504, "learning_rate": 5.373256950079414e-06, "loss": 0.5272, "mean_token_accuracy": 0.8399269118905067, "num_tokens": 86006104.0, "step": 71510 }, { "entropy": 1.8692869395017624, "epoch": 0.221705742355422, "grad_norm": 8.818297386169434, "learning_rate": 5.372881284536732e-06, "loss": 0.5117, "mean_token_accuracy": 0.8455665573477745, "num_tokens": 86019217.0, "step": 71520 }, { "entropy": 1.897246205806732, "epoch": 0.2217367414804717, "grad_norm": 7.545614242553711, "learning_rate": 5.372505697775805e-06, "loss": 0.4867, "mean_token_accuracy": 0.8487001821398735, "num_tokens": 86030738.0, "step": 71530 }, { "entropy": 1.922917690873146, "epoch": 0.2217677406055214, "grad_norm": 9.24394416809082, "learning_rate": 5.372130189769099e-06, "loss": 0.5345, "mean_token_accuracy": 0.8366933539509773, "num_tokens": 86042441.0, "step": 71540 }, { "entropy": 1.8682702884078026, "epoch": 0.2217987397305711, "grad_norm": 8.580945014953613, "learning_rate": 5.371754760489097e-06, "loss": 0.5302, "mean_token_accuracy": 0.8417121097445488, "num_tokens": 86054427.0, "step": 71550 }, { "entropy": 1.8668757036328316, "epoch": 0.2218297388556208, "grad_norm": 10.871688842773438, "learning_rate": 5.371379409908294e-06, "loss": 0.4733, "mean_token_accuracy": 0.8479284450411797, "num_tokens": 86066725.0, "step": 71560 }, { "entropy": 1.915563191473484, "epoch": 0.2218607379806705, "grad_norm": 10.612263679504395, "learning_rate": 5.371004137999198e-06, "loss": 0.5192, "mean_token_accuracy": 0.8447510316967964, "num_tokens": 86078349.0, "step": 71570 }, { "entropy": 1.897247090935707, "epoch": 0.2218917371057202, "grad_norm": 8.9105863571167, "learning_rate": 5.370628944734331e-06, "loss": 0.5511, "mean_token_accuracy": 0.8368135377764702, "num_tokens": 86091101.0, "step": 71580 }, { "entropy": 1.6814722761511802, "epoch": 0.22192273623076988, "grad_norm": 3.993140697479248, "learning_rate": 5.370253830086228e-06, "loss": 0.33, "mean_token_accuracy": 0.8649098068475723, "num_tokens": 86105722.0, "step": 71590 }, { "entropy": 1.9403679892420769, "epoch": 0.22195373535581958, "grad_norm": 8.66745662689209, "learning_rate": 5.369878794027438e-06, "loss": 0.504, "mean_token_accuracy": 0.8391989499330521, "num_tokens": 86116963.0, "step": 71600 }, { "entropy": 1.8736673273146152, "epoch": 0.22198473448086928, "grad_norm": 9.14969539642334, "learning_rate": 5.369503836530523e-06, "loss": 0.4614, "mean_token_accuracy": 0.8440220966935158, "num_tokens": 86129346.0, "step": 71610 }, { "entropy": 1.901363869011402, "epoch": 0.22201573360591897, "grad_norm": 8.975959777832031, "learning_rate": 5.369128957568058e-06, "loss": 0.487, "mean_token_accuracy": 0.843775762617588, "num_tokens": 86141322.0, "step": 71620 }, { "entropy": 1.836611707508564, "epoch": 0.22204673273096867, "grad_norm": 3.2881882190704346, "learning_rate": 5.368754157112632e-06, "loss": 0.4551, "mean_token_accuracy": 0.8536363363265991, "num_tokens": 86154410.0, "step": 71630 }, { "entropy": 1.817706936597824, "epoch": 0.22207773185601837, "grad_norm": 7.758713722229004, "learning_rate": 5.368379435136848e-06, "loss": 0.4165, "mean_token_accuracy": 0.8535854294896126, "num_tokens": 86167987.0, "step": 71640 }, { "entropy": 1.8554053410887719, "epoch": 0.22210873098106806, "grad_norm": 3.6705055236816406, "learning_rate": 5.368004791613321e-06, "loss": 0.4786, "mean_token_accuracy": 0.8438238322734832, "num_tokens": 86180395.0, "step": 71650 }, { "entropy": 1.8957013592123986, "epoch": 0.22213973010611776, "grad_norm": 4.396717548370361, "learning_rate": 5.36763022651468e-06, "loss": 0.4914, "mean_token_accuracy": 0.8409201994538307, "num_tokens": 86192690.0, "step": 71660 }, { "entropy": 1.8508854925632476, "epoch": 0.22217072923116746, "grad_norm": 4.068726539611816, "learning_rate": 5.367255739813568e-06, "loss": 0.4696, "mean_token_accuracy": 0.8503674864768982, "num_tokens": 86205641.0, "step": 71670 }, { "entropy": 1.8594465300440788, "epoch": 0.22220172835621715, "grad_norm": 8.009425163269043, "learning_rate": 5.3668813314826414e-06, "loss": 0.4362, "mean_token_accuracy": 0.8513711959123611, "num_tokens": 86218005.0, "step": 71680 }, { "entropy": 1.9636985063552856, "epoch": 0.22223272748126685, "grad_norm": 9.654029846191406, "learning_rate": 5.366507001494568e-06, "loss": 0.5595, "mean_token_accuracy": 0.8238719955086709, "num_tokens": 86229342.0, "step": 71690 }, { "entropy": 1.8041048809885978, "epoch": 0.22226372660631655, "grad_norm": 8.672904014587402, "learning_rate": 5.3661327498220305e-06, "loss": 0.4395, "mean_token_accuracy": 0.8550098910927773, "num_tokens": 86241797.0, "step": 71700 }, { "entropy": 1.78623516112566, "epoch": 0.22229472573136624, "grad_norm": 8.161778450012207, "learning_rate": 5.365758576437724e-06, "loss": 0.413, "mean_token_accuracy": 0.8487135574221611, "num_tokens": 86255083.0, "step": 71710 }, { "entropy": 1.904472067952156, "epoch": 0.22232572485641594, "grad_norm": 9.36921501159668, "learning_rate": 5.365384481314359e-06, "loss": 0.5776, "mean_token_accuracy": 0.8299209102988243, "num_tokens": 86266459.0, "step": 71720 }, { "entropy": 1.8936071269214154, "epoch": 0.2223567239814656, "grad_norm": 8.524149894714355, "learning_rate": 5.365010464424658e-06, "loss": 0.4546, "mean_token_accuracy": 0.8460559591650962, "num_tokens": 86278539.0, "step": 71730 }, { "entropy": 1.8948934614658355, "epoch": 0.2223877231065153, "grad_norm": 8.934950828552246, "learning_rate": 5.364636525741356e-06, "loss": 0.5051, "mean_token_accuracy": 0.838769344985485, "num_tokens": 86290914.0, "step": 71740 }, { "entropy": 1.913631896674633, "epoch": 0.222418722231565, "grad_norm": 3.6874070167541504, "learning_rate": 5.364262665237202e-06, "loss": 0.4744, "mean_token_accuracy": 0.8442276403307915, "num_tokens": 86302745.0, "step": 71750 }, { "entropy": 1.92730543166399, "epoch": 0.2224497213566147, "grad_norm": 8.071089744567871, "learning_rate": 5.363888882884958e-06, "loss": 0.4861, "mean_token_accuracy": 0.8468718230724335, "num_tokens": 86314083.0, "step": 71760 }, { "entropy": 1.9443558216094972, "epoch": 0.2224807204816644, "grad_norm": 8.098677635192871, "learning_rate": 5.363515178657401e-06, "loss": 0.5392, "mean_token_accuracy": 0.8254296198487282, "num_tokens": 86325749.0, "step": 71770 }, { "entropy": 1.8496044874191284, "epoch": 0.2225117196067141, "grad_norm": 9.303723335266113, "learning_rate": 5.36314155252732e-06, "loss": 0.4445, "mean_token_accuracy": 0.8531778752803802, "num_tokens": 86339269.0, "step": 71780 }, { "entropy": 1.8947973191738128, "epoch": 0.2225427187317638, "grad_norm": 3.389211654663086, "learning_rate": 5.362768004467516e-06, "loss": 0.4637, "mean_token_accuracy": 0.8556076481938362, "num_tokens": 86351044.0, "step": 71790 }, { "entropy": 1.9089409783482552, "epoch": 0.22257371785681349, "grad_norm": 9.990097999572754, "learning_rate": 5.362394534450803e-06, "loss": 0.5464, "mean_token_accuracy": 0.8289933249354362, "num_tokens": 86362914.0, "step": 71800 }, { "entropy": 1.8263745561242104, "epoch": 0.22260471698186318, "grad_norm": 8.100030899047852, "learning_rate": 5.362021142450014e-06, "loss": 0.4337, "mean_token_accuracy": 0.8659976094961166, "num_tokens": 86375568.0, "step": 71810 }, { "entropy": 1.9488488882780075, "epoch": 0.22263571610691288, "grad_norm": 8.042920112609863, "learning_rate": 5.361647828437985e-06, "loss": 0.5051, "mean_token_accuracy": 0.8422571077942849, "num_tokens": 86386717.0, "step": 71820 }, { "entropy": 1.9189837276935577, "epoch": 0.22266671523196258, "grad_norm": 7.569156646728516, "learning_rate": 5.361274592387578e-06, "loss": 0.533, "mean_token_accuracy": 0.8239206343889236, "num_tokens": 86399499.0, "step": 71830 }, { "entropy": 1.8419446378946305, "epoch": 0.22269771435701227, "grad_norm": 3.669304609298706, "learning_rate": 5.360901434271656e-06, "loss": 0.4513, "mean_token_accuracy": 0.8487843006849289, "num_tokens": 86412252.0, "step": 71840 }, { "entropy": 1.8220521062612534, "epoch": 0.22272871348206197, "grad_norm": 5.168817043304443, "learning_rate": 5.360528354063102e-06, "loss": 0.4524, "mean_token_accuracy": 0.8524244442582131, "num_tokens": 86424935.0, "step": 71850 }, { "entropy": 1.836111642420292, "epoch": 0.22275971260711167, "grad_norm": 4.996338367462158, "learning_rate": 5.360155351734812e-06, "loss": 0.464, "mean_token_accuracy": 0.8411974281072616, "num_tokens": 86438125.0, "step": 71860 }, { "entropy": 1.985448981821537, "epoch": 0.22279071173216136, "grad_norm": 8.615861892700195, "learning_rate": 5.359782427259694e-06, "loss": 0.5502, "mean_token_accuracy": 0.8280460953712463, "num_tokens": 86448970.0, "step": 71870 }, { "entropy": 1.8285388305783272, "epoch": 0.22282171085721106, "grad_norm": 4.234899520874023, "learning_rate": 5.359409580610668e-06, "loss": 0.4333, "mean_token_accuracy": 0.8518346786499024, "num_tokens": 86461661.0, "step": 71880 }, { "entropy": 2.012462750822306, "epoch": 0.22285270998226075, "grad_norm": 3.0699379444122314, "learning_rate": 5.359036811760669e-06, "loss": 0.5702, "mean_token_accuracy": 0.8206193700432778, "num_tokens": 86473970.0, "step": 71890 }, { "entropy": 1.941874098777771, "epoch": 0.22288370910731045, "grad_norm": 6.572078227996826, "learning_rate": 5.358664120682644e-06, "loss": 0.5122, "mean_token_accuracy": 0.8426774546504021, "num_tokens": 86485521.0, "step": 71900 }, { "entropy": 1.9939334601163865, "epoch": 0.22291470823236015, "grad_norm": 9.066808700561523, "learning_rate": 5.358291507349554e-06, "loss": 0.5545, "mean_token_accuracy": 0.8358783975243569, "num_tokens": 86496329.0, "step": 71910 }, { "entropy": 1.8996268406510353, "epoch": 0.22294570735740984, "grad_norm": 7.218948841094971, "learning_rate": 5.357918971734374e-06, "loss": 0.4499, "mean_token_accuracy": 0.8539462700486183, "num_tokens": 86508655.0, "step": 71920 }, { "entropy": 1.8882664322853089, "epoch": 0.22297670648245954, "grad_norm": 7.739889621734619, "learning_rate": 5.35754651381009e-06, "loss": 0.495, "mean_token_accuracy": 0.8399605572223663, "num_tokens": 86521278.0, "step": 71930 }, { "entropy": 1.89051483720541, "epoch": 0.22300770560750924, "grad_norm": 3.720012903213501, "learning_rate": 5.357174133549702e-06, "loss": 0.4798, "mean_token_accuracy": 0.8391882091760635, "num_tokens": 86533583.0, "step": 71940 }, { "entropy": 1.8325127944350244, "epoch": 0.22303870473255893, "grad_norm": 4.477136611938477, "learning_rate": 5.356801830926224e-06, "loss": 0.4477, "mean_token_accuracy": 0.837264607846737, "num_tokens": 86546642.0, "step": 71950 }, { "entropy": 1.916852205991745, "epoch": 0.22306970385760863, "grad_norm": 8.114299774169922, "learning_rate": 5.356429605912681e-06, "loss": 0.5005, "mean_token_accuracy": 0.8481098964810372, "num_tokens": 86558470.0, "step": 71960 }, { "entropy": 1.869056871533394, "epoch": 0.22310070298265833, "grad_norm": 4.9179182052612305, "learning_rate": 5.356057458482115e-06, "loss": 0.4322, "mean_token_accuracy": 0.8442103147506714, "num_tokens": 86571713.0, "step": 71970 }, { "entropy": 1.9630917876958847, "epoch": 0.223131702107708, "grad_norm": 9.305983543395996, "learning_rate": 5.355685388607575e-06, "loss": 0.4825, "mean_token_accuracy": 0.8500207409262657, "num_tokens": 86582632.0, "step": 71980 }, { "entropy": 1.9708934336900712, "epoch": 0.2231627012327577, "grad_norm": 9.161256790161133, "learning_rate": 5.3553133962621305e-06, "loss": 0.5558, "mean_token_accuracy": 0.8375864192843437, "num_tokens": 86593114.0, "step": 71990 }, { "entropy": 1.892447827756405, "epoch": 0.2231937003578074, "grad_norm": 8.874557495117188, "learning_rate": 5.35494148141886e-06, "loss": 0.505, "mean_token_accuracy": 0.8418182253837585, "num_tokens": 86605405.0, "step": 72000 }, { "entropy": 1.9689023926854134, "epoch": 0.2232246994828571, "grad_norm": 8.011334419250488, "learning_rate": 5.354569644050853e-06, "loss": 0.5485, "mean_token_accuracy": 0.8215542823076248, "num_tokens": 86616609.0, "step": 72010 }, { "entropy": 1.7857394725084306, "epoch": 0.22325569860790678, "grad_norm": 8.13255786895752, "learning_rate": 5.354197884131216e-06, "loss": 0.3809, "mean_token_accuracy": 0.8540096297860146, "num_tokens": 86630878.0, "step": 72020 }, { "entropy": 1.8428432375192643, "epoch": 0.22328669773295648, "grad_norm": 10.879768371582031, "learning_rate": 5.353826201633068e-06, "loss": 0.4629, "mean_token_accuracy": 0.8473896354436874, "num_tokens": 86643315.0, "step": 72030 }, { "entropy": 1.9004595071077346, "epoch": 0.22331769685800618, "grad_norm": 10.056571960449219, "learning_rate": 5.3534545965295384e-06, "loss": 0.5032, "mean_token_accuracy": 0.8414341598749161, "num_tokens": 86654883.0, "step": 72040 }, { "entropy": 1.8974163249135017, "epoch": 0.22334869598305587, "grad_norm": 4.734619140625, "learning_rate": 5.353083068793772e-06, "loss": 0.5136, "mean_token_accuracy": 0.8393331229686737, "num_tokens": 86666404.0, "step": 72050 }, { "entropy": 1.8827458634972571, "epoch": 0.22337969510810557, "grad_norm": 7.965644359588623, "learning_rate": 5.352711618398927e-06, "loss": 0.4825, "mean_token_accuracy": 0.8474840223789215, "num_tokens": 86678813.0, "step": 72060 }, { "entropy": 1.9129985481500626, "epoch": 0.22341069423315527, "grad_norm": 3.682452917098999, "learning_rate": 5.352340245318172e-06, "loss": 0.4979, "mean_token_accuracy": 0.8426467835903168, "num_tokens": 86691188.0, "step": 72070 }, { "entropy": 1.8033430457115174, "epoch": 0.22344169335820496, "grad_norm": 8.872450828552246, "learning_rate": 5.351968949524691e-06, "loss": 0.3794, "mean_token_accuracy": 0.8609629422426224, "num_tokens": 86704614.0, "step": 72080 }, { "entropy": 1.8621386557817459, "epoch": 0.22347269248325466, "grad_norm": 3.9304046630859375, "learning_rate": 5.351597730991682e-06, "loss": 0.4775, "mean_token_accuracy": 0.8484826713800431, "num_tokens": 86717142.0, "step": 72090 }, { "entropy": 1.978152585029602, "epoch": 0.22350369160830436, "grad_norm": 8.142024040222168, "learning_rate": 5.351226589692352e-06, "loss": 0.5474, "mean_token_accuracy": 0.8380045786499977, "num_tokens": 86727790.0, "step": 72100 }, { "entropy": 1.861580203473568, "epoch": 0.22353469073335405, "grad_norm": 9.076419830322266, "learning_rate": 5.350855525599924e-06, "loss": 0.4839, "mean_token_accuracy": 0.8400816440582275, "num_tokens": 86739827.0, "step": 72110 }, { "entropy": 1.813391050696373, "epoch": 0.22356568985840375, "grad_norm": 8.701443672180176, "learning_rate": 5.350484538687634e-06, "loss": 0.4251, "mean_token_accuracy": 0.8458169072866439, "num_tokens": 86752977.0, "step": 72120 }, { "entropy": 1.9347064226865769, "epoch": 0.22359668898345345, "grad_norm": 8.15721607208252, "learning_rate": 5.350113628928731e-06, "loss": 0.499, "mean_token_accuracy": 0.8386592581868172, "num_tokens": 86764531.0, "step": 72130 }, { "entropy": 1.82811646014452, "epoch": 0.22362768810850314, "grad_norm": 3.8640589714050293, "learning_rate": 5.349742796296475e-06, "loss": 0.4314, "mean_token_accuracy": 0.8612099349498749, "num_tokens": 86777148.0, "step": 72140 }, { "entropy": 1.9188909232616425, "epoch": 0.22365868723355284, "grad_norm": 8.598167419433594, "learning_rate": 5.349372040764139e-06, "loss": 0.5201, "mean_token_accuracy": 0.8361388146877289, "num_tokens": 86788330.0, "step": 72150 }, { "entropy": 1.8747940585017204, "epoch": 0.22368968635860254, "grad_norm": 4.909088134765625, "learning_rate": 5.349001362305013e-06, "loss": 0.4986, "mean_token_accuracy": 0.836989839375019, "num_tokens": 86800353.0, "step": 72160 }, { "entropy": 1.9335097655653954, "epoch": 0.22372068548365223, "grad_norm": 8.049686431884766, "learning_rate": 5.348630760892396e-06, "loss": 0.5019, "mean_token_accuracy": 0.8366736158728599, "num_tokens": 86811632.0, "step": 72170 }, { "entropy": 1.8739148452877998, "epoch": 0.22375168460870193, "grad_norm": 8.556473731994629, "learning_rate": 5.3482602364996015e-06, "loss": 0.5167, "mean_token_accuracy": 0.8425387993454934, "num_tokens": 86824293.0, "step": 72180 }, { "entropy": 1.9024907439947127, "epoch": 0.22378268373375163, "grad_norm": 8.516105651855469, "learning_rate": 5.347889789099956e-06, "loss": 0.5412, "mean_token_accuracy": 0.825544822216034, "num_tokens": 86836349.0, "step": 72190 }, { "entropy": 1.897923794388771, "epoch": 0.22381368285880132, "grad_norm": 7.900509357452393, "learning_rate": 5.347519418666795e-06, "loss": 0.4713, "mean_token_accuracy": 0.8481002271175384, "num_tokens": 86847118.0, "step": 72200 }, { "entropy": 1.8177719444036484, "epoch": 0.22384468198385102, "grad_norm": 7.703773498535156, "learning_rate": 5.347149125173477e-06, "loss": 0.4398, "mean_token_accuracy": 0.8518023356795311, "num_tokens": 86860044.0, "step": 72210 }, { "entropy": 1.8685875788331032, "epoch": 0.22387568110890071, "grad_norm": 6.817202091217041, "learning_rate": 5.3467789085933605e-06, "loss": 0.4636, "mean_token_accuracy": 0.8450110226869583, "num_tokens": 86872544.0, "step": 72220 }, { "entropy": 1.9101796388626098, "epoch": 0.22390668023395038, "grad_norm": 8.793787002563477, "learning_rate": 5.346408768899827e-06, "loss": 0.538, "mean_token_accuracy": 0.8359600931406022, "num_tokens": 86884268.0, "step": 72230 }, { "entropy": 1.8767978221178054, "epoch": 0.22393767935900008, "grad_norm": 8.353222846984863, "learning_rate": 5.3460387060662665e-06, "loss": 0.4822, "mean_token_accuracy": 0.8356214210391044, "num_tokens": 86895860.0, "step": 72240 }, { "entropy": 1.8598055988550186, "epoch": 0.22396867848404978, "grad_norm": 8.898566246032715, "learning_rate": 5.345668720066082e-06, "loss": 0.5261, "mean_token_accuracy": 0.8349154770374299, "num_tokens": 86907668.0, "step": 72250 }, { "entropy": 1.9466781616210938, "epoch": 0.22399967760909947, "grad_norm": 8.537947654724121, "learning_rate": 5.34529881087269e-06, "loss": 0.5206, "mean_token_accuracy": 0.8424281865358353, "num_tokens": 86919261.0, "step": 72260 }, { "entropy": 1.89094198346138, "epoch": 0.22403067673414917, "grad_norm": 8.919629096984863, "learning_rate": 5.344928978459521e-06, "loss": 0.469, "mean_token_accuracy": 0.8445121660828591, "num_tokens": 86930676.0, "step": 72270 }, { "entropy": 1.896059663593769, "epoch": 0.22406167585919887, "grad_norm": 8.955682754516602, "learning_rate": 5.344559222800014e-06, "loss": 0.5143, "mean_token_accuracy": 0.8374183923006058, "num_tokens": 86941738.0, "step": 72280 }, { "entropy": 1.883140115439892, "epoch": 0.22409267498424856, "grad_norm": 4.690348148345947, "learning_rate": 5.344189543867627e-06, "loss": 0.4934, "mean_token_accuracy": 0.844383516907692, "num_tokens": 86953896.0, "step": 72290 }, { "entropy": 1.95089001506567, "epoch": 0.22412367410929826, "grad_norm": 10.894330024719238, "learning_rate": 5.3438199416358285e-06, "loss": 0.5338, "mean_token_accuracy": 0.8346414759755134, "num_tokens": 86965358.0, "step": 72300 }, { "entropy": 1.991261574625969, "epoch": 0.22415467323434796, "grad_norm": 8.711053848266602, "learning_rate": 5.343450416078097e-06, "loss": 0.5846, "mean_token_accuracy": 0.8269345715641976, "num_tokens": 86976161.0, "step": 72310 }, { "entropy": 1.9148213535547256, "epoch": 0.22418567235939765, "grad_norm": 8.75413703918457, "learning_rate": 5.343080967167927e-06, "loss": 0.5037, "mean_token_accuracy": 0.8315785259008408, "num_tokens": 86988247.0, "step": 72320 }, { "entropy": 1.967397329211235, "epoch": 0.22421667148444735, "grad_norm": 8.436080932617188, "learning_rate": 5.342711594878823e-06, "loss": 0.5688, "mean_token_accuracy": 0.8347294554114342, "num_tokens": 86999086.0, "step": 72330 }, { "entropy": 2.000737062096596, "epoch": 0.22424767060949705, "grad_norm": 9.230005264282227, "learning_rate": 5.342342299184309e-06, "loss": 0.6598, "mean_token_accuracy": 0.8300457715988159, "num_tokens": 87010382.0, "step": 72340 }, { "entropy": 1.9067621529102325, "epoch": 0.22427866973454674, "grad_norm": 8.503578186035156, "learning_rate": 5.341973080057913e-06, "loss": 0.5206, "mean_token_accuracy": 0.831950668990612, "num_tokens": 87022828.0, "step": 72350 }, { "entropy": 1.9217410042881966, "epoch": 0.22430966885959644, "grad_norm": 9.517120361328125, "learning_rate": 5.34160393747318e-06, "loss": 0.4946, "mean_token_accuracy": 0.8437418282032013, "num_tokens": 87034285.0, "step": 72360 }, { "entropy": 1.8522989198565483, "epoch": 0.22434066798464614, "grad_norm": 9.239526748657227, "learning_rate": 5.34123487140367e-06, "loss": 0.4158, "mean_token_accuracy": 0.8460891872644425, "num_tokens": 87046754.0, "step": 72370 }, { "entropy": 1.9420243352651596, "epoch": 0.22437166710969583, "grad_norm": 8.482245445251465, "learning_rate": 5.340865881822951e-06, "loss": 0.5542, "mean_token_accuracy": 0.8409739390015603, "num_tokens": 87058047.0, "step": 72380 }, { "entropy": 1.9989713937044145, "epoch": 0.22440266623474553, "grad_norm": 10.160581588745117, "learning_rate": 5.340496968704607e-06, "loss": 0.6208, "mean_token_accuracy": 0.82189432233572, "num_tokens": 87068796.0, "step": 72390 }, { "entropy": 1.8937209725379944, "epoch": 0.22443366535979523, "grad_norm": 8.602519035339355, "learning_rate": 5.340128132022235e-06, "loss": 0.4805, "mean_token_accuracy": 0.8464359551668167, "num_tokens": 87081062.0, "step": 72400 }, { "entropy": 1.923159296810627, "epoch": 0.22446466448484492, "grad_norm": 3.398913860321045, "learning_rate": 5.339759371749443e-06, "loss": 0.501, "mean_token_accuracy": 0.8402744174003601, "num_tokens": 87093053.0, "step": 72410 }, { "entropy": 1.8120801776647568, "epoch": 0.22449566360989462, "grad_norm": 4.736013412475586, "learning_rate": 5.339390687859851e-06, "loss": 0.4571, "mean_token_accuracy": 0.8469538927078247, "num_tokens": 87106652.0, "step": 72420 }, { "entropy": 1.976495975255966, "epoch": 0.22452666273494432, "grad_norm": 9.108383178710938, "learning_rate": 5.339022080327097e-06, "loss": 0.5906, "mean_token_accuracy": 0.8242412880063057, "num_tokens": 87117466.0, "step": 72430 }, { "entropy": 1.8691235825419426, "epoch": 0.224557661859994, "grad_norm": 3.6879398822784424, "learning_rate": 5.338653549124824e-06, "loss": 0.4588, "mean_token_accuracy": 0.8466471612453461, "num_tokens": 87130079.0, "step": 72440 }, { "entropy": 1.872739316523075, "epoch": 0.2245886609850437, "grad_norm": 8.234794616699219, "learning_rate": 5.338285094226693e-06, "loss": 0.5574, "mean_token_accuracy": 0.8330441653728485, "num_tokens": 87143430.0, "step": 72450 }, { "entropy": 1.911289119720459, "epoch": 0.2246196601100934, "grad_norm": 11.46174430847168, "learning_rate": 5.337916715606378e-06, "loss": 0.5162, "mean_token_accuracy": 0.8353003263473511, "num_tokens": 87155144.0, "step": 72460 }, { "entropy": 1.9616554155945778, "epoch": 0.22465065923514307, "grad_norm": 7.765041351318359, "learning_rate": 5.337548413237561e-06, "loss": 0.5378, "mean_token_accuracy": 0.8335460603237153, "num_tokens": 87166518.0, "step": 72470 }, { "entropy": 1.9011431649327277, "epoch": 0.22468165836019277, "grad_norm": 8.76104736328125, "learning_rate": 5.337180187093943e-06, "loss": 0.5007, "mean_token_accuracy": 0.8393405005335808, "num_tokens": 87178008.0, "step": 72480 }, { "entropy": 1.8503070279955864, "epoch": 0.22471265748524247, "grad_norm": 10.410445213317871, "learning_rate": 5.336812037149233e-06, "loss": 0.4587, "mean_token_accuracy": 0.8424379363656044, "num_tokens": 87191011.0, "step": 72490 }, { "entropy": 1.9341685771942139, "epoch": 0.22474365661029216, "grad_norm": 3.9696474075317383, "learning_rate": 5.336443963377155e-06, "loss": 0.4707, "mean_token_accuracy": 0.8476704210042953, "num_tokens": 87202897.0, "step": 72500 }, { "entropy": 1.9511468440294266, "epoch": 0.22477465573534186, "grad_norm": 9.15610122680664, "learning_rate": 5.336075965751444e-06, "loss": 0.5751, "mean_token_accuracy": 0.8327615946531296, "num_tokens": 87213746.0, "step": 72510 }, { "entropy": 1.9269428536295892, "epoch": 0.22480565486039156, "grad_norm": 9.162718772888184, "learning_rate": 5.335708044245848e-06, "loss": 0.5052, "mean_token_accuracy": 0.840380209684372, "num_tokens": 87225670.0, "step": 72520 }, { "entropy": 1.940378698706627, "epoch": 0.22483665398544125, "grad_norm": 6.751579284667969, "learning_rate": 5.335340198834132e-06, "loss": 0.532, "mean_token_accuracy": 0.8332238659262657, "num_tokens": 87236688.0, "step": 72530 }, { "entropy": 1.958859845995903, "epoch": 0.22486765311049095, "grad_norm": 10.634821891784668, "learning_rate": 5.334972429490065e-06, "loss": 0.5331, "mean_token_accuracy": 0.8343855604529381, "num_tokens": 87247914.0, "step": 72540 }, { "entropy": 1.9445181697607041, "epoch": 0.22489865223554065, "grad_norm": 9.388453483581543, "learning_rate": 5.334604736187437e-06, "loss": 0.4933, "mean_token_accuracy": 0.8429577887058258, "num_tokens": 87258915.0, "step": 72550 }, { "entropy": 1.920157741010189, "epoch": 0.22492965136059034, "grad_norm": 8.916130065917969, "learning_rate": 5.334237118900046e-06, "loss": 0.4968, "mean_token_accuracy": 0.8405413702130318, "num_tokens": 87271134.0, "step": 72560 }, { "entropy": 1.9094747498631477, "epoch": 0.22496065048564004, "grad_norm": 8.803078651428223, "learning_rate": 5.333869577601703e-06, "loss": 0.5004, "mean_token_accuracy": 0.8441318541765213, "num_tokens": 87282774.0, "step": 72570 }, { "entropy": 1.8855116859078407, "epoch": 0.22499164961068974, "grad_norm": 9.416301727294922, "learning_rate": 5.333502112266234e-06, "loss": 0.4659, "mean_token_accuracy": 0.8383870780467987, "num_tokens": 87295016.0, "step": 72580 }, { "entropy": 1.8912652000784873, "epoch": 0.22502264873573943, "grad_norm": 7.569722652435303, "learning_rate": 5.333134722867477e-06, "loss": 0.5354, "mean_token_accuracy": 0.8289127513766289, "num_tokens": 87307357.0, "step": 72590 }, { "entropy": 1.8563939124345779, "epoch": 0.22505364786078913, "grad_norm": 3.901181221008301, "learning_rate": 5.332767409379278e-06, "loss": 0.4824, "mean_token_accuracy": 0.8382441207766533, "num_tokens": 87321430.0, "step": 72600 }, { "entropy": 1.7983256116509438, "epoch": 0.22508464698583883, "grad_norm": 8.528167724609375, "learning_rate": 5.332400171775503e-06, "loss": 0.479, "mean_token_accuracy": 0.8504438042640686, "num_tokens": 87334567.0, "step": 72610 }, { "entropy": 1.8781714573502541, "epoch": 0.22511564611088852, "grad_norm": 8.67835521697998, "learning_rate": 5.332033010030026e-06, "loss": 0.4685, "mean_token_accuracy": 0.8410458341240883, "num_tokens": 87347091.0, "step": 72620 }, { "entropy": 1.869327275454998, "epoch": 0.22514664523593822, "grad_norm": 7.16377592086792, "learning_rate": 5.331665924116734e-06, "loss": 0.4843, "mean_token_accuracy": 0.8444514736533165, "num_tokens": 87359313.0, "step": 72630 }, { "entropy": 1.9245384186506271, "epoch": 0.22517764436098792, "grad_norm": 8.11259651184082, "learning_rate": 5.331298914009525e-06, "loss": 0.4793, "mean_token_accuracy": 0.8420319616794586, "num_tokens": 87370592.0, "step": 72640 }, { "entropy": 1.9539393037557602, "epoch": 0.2252086434860376, "grad_norm": 8.318745613098145, "learning_rate": 5.3309319796823165e-06, "loss": 0.5345, "mean_token_accuracy": 0.8356306239962578, "num_tokens": 87381867.0, "step": 72650 }, { "entropy": 1.860501480102539, "epoch": 0.2252396426110873, "grad_norm": 11.79776668548584, "learning_rate": 5.33056512110903e-06, "loss": 0.4856, "mean_token_accuracy": 0.8360062450170517, "num_tokens": 87394488.0, "step": 72660 }, { "entropy": 1.9126732975244523, "epoch": 0.225270641736137, "grad_norm": 7.536765098571777, "learning_rate": 5.330198338263605e-06, "loss": 0.5059, "mean_token_accuracy": 0.8378516137599945, "num_tokens": 87406482.0, "step": 72670 }, { "entropy": 1.950472255051136, "epoch": 0.2253016408611867, "grad_norm": 6.861098766326904, "learning_rate": 5.329831631119992e-06, "loss": 0.5095, "mean_token_accuracy": 0.8359086707234382, "num_tokens": 87417682.0, "step": 72680 }, { "entropy": 1.8644071131944657, "epoch": 0.2253326399862364, "grad_norm": 3.9185712337493896, "learning_rate": 5.329464999652151e-06, "loss": 0.4551, "mean_token_accuracy": 0.8424749970436096, "num_tokens": 87430239.0, "step": 72690 }, { "entropy": 1.8736916035413742, "epoch": 0.2253636391112861, "grad_norm": 8.611660957336426, "learning_rate": 5.329098443834062e-06, "loss": 0.4387, "mean_token_accuracy": 0.857531276345253, "num_tokens": 87442834.0, "step": 72700 }, { "entropy": 1.903275479376316, "epoch": 0.2253946382363358, "grad_norm": 2.624007225036621, "learning_rate": 5.328731963639709e-06, "loss": 0.492, "mean_token_accuracy": 0.8541991651058197, "num_tokens": 87453877.0, "step": 72710 }, { "entropy": 1.8286138609051705, "epoch": 0.22542563736138546, "grad_norm": 2.311256170272827, "learning_rate": 5.328365559043095e-06, "loss": 0.4711, "mean_token_accuracy": 0.8497246339917183, "num_tokens": 87466560.0, "step": 72720 }, { "entropy": 1.7663344264030456, "epoch": 0.22545663648643516, "grad_norm": 9.070472717285156, "learning_rate": 5.327999230018231e-06, "loss": 0.3978, "mean_token_accuracy": 0.8562262952327728, "num_tokens": 87480091.0, "step": 72730 }, { "entropy": 1.8812779873609542, "epoch": 0.22548763561148485, "grad_norm": 10.331746101379395, "learning_rate": 5.327632976539146e-06, "loss": 0.5269, "mean_token_accuracy": 0.8345619887113571, "num_tokens": 87492404.0, "step": 72740 }, { "entropy": 1.8396708235144614, "epoch": 0.22551863473653455, "grad_norm": 4.0618462562561035, "learning_rate": 5.327266798579874e-06, "loss": 0.414, "mean_token_accuracy": 0.8638743400573731, "num_tokens": 87505236.0, "step": 72750 }, { "entropy": 1.8074708625674247, "epoch": 0.22554963386158425, "grad_norm": 4.086684703826904, "learning_rate": 5.326900696114468e-06, "loss": 0.3903, "mean_token_accuracy": 0.8573868215084076, "num_tokens": 87518657.0, "step": 72760 }, { "entropy": 1.912179996073246, "epoch": 0.22558063298663394, "grad_norm": 6.899926662445068, "learning_rate": 5.326534669116988e-06, "loss": 0.4744, "mean_token_accuracy": 0.8395515128970146, "num_tokens": 87530348.0, "step": 72770 }, { "entropy": 1.95667557567358, "epoch": 0.22561163211168364, "grad_norm": 7.5052361488342285, "learning_rate": 5.326168717561514e-06, "loss": 0.5686, "mean_token_accuracy": 0.8295163378119469, "num_tokens": 87541743.0, "step": 72780 }, { "entropy": 1.7888675391674043, "epoch": 0.22564263123673334, "grad_norm": 6.81760311126709, "learning_rate": 5.325802841422131e-06, "loss": 0.3976, "mean_token_accuracy": 0.8517789766192436, "num_tokens": 87554783.0, "step": 72790 }, { "entropy": 1.9340469419956208, "epoch": 0.22567363036178303, "grad_norm": 8.59512996673584, "learning_rate": 5.325437040672939e-06, "loss": 0.4909, "mean_token_accuracy": 0.8468787103891373, "num_tokens": 87565222.0, "step": 72800 }, { "entropy": 1.8804991841316223, "epoch": 0.22570462948683273, "grad_norm": 4.417396545410156, "learning_rate": 5.3250713152880525e-06, "loss": 0.5092, "mean_token_accuracy": 0.8418024495244026, "num_tokens": 87577513.0, "step": 72810 }, { "entropy": 1.889168956875801, "epoch": 0.22573562861188243, "grad_norm": 6.085160732269287, "learning_rate": 5.3247056652415955e-06, "loss": 0.4342, "mean_token_accuracy": 0.8587741032242775, "num_tokens": 87589761.0, "step": 72820 }, { "entropy": 1.8290639758110045, "epoch": 0.22576662773693212, "grad_norm": 9.560019493103027, "learning_rate": 5.324340090507707e-06, "loss": 0.4573, "mean_token_accuracy": 0.8536442771553994, "num_tokens": 87602453.0, "step": 72830 }, { "entropy": 1.9466052517294883, "epoch": 0.22579762686198182, "grad_norm": 10.381073951721191, "learning_rate": 5.323974591060536e-06, "loss": 0.5507, "mean_token_accuracy": 0.8318102985620499, "num_tokens": 87614036.0, "step": 72840 }, { "entropy": 1.8601842939853668, "epoch": 0.22582862598703152, "grad_norm": 7.986728668212891, "learning_rate": 5.323609166874244e-06, "loss": 0.4184, "mean_token_accuracy": 0.8537519767880439, "num_tokens": 87625934.0, "step": 72850 }, { "entropy": 1.9298621758818626, "epoch": 0.2258596251120812, "grad_norm": 7.938553810119629, "learning_rate": 5.3232438179230086e-06, "loss": 0.5259, "mean_token_accuracy": 0.8388584434986115, "num_tokens": 87637522.0, "step": 72860 }, { "entropy": 1.8683239459991454, "epoch": 0.2258906242371309, "grad_norm": 7.229837894439697, "learning_rate": 5.322878544181015e-06, "loss": 0.4459, "mean_token_accuracy": 0.8578355640172959, "num_tokens": 87649965.0, "step": 72870 }, { "entropy": 1.9087668746709823, "epoch": 0.2259216233621806, "grad_norm": 8.52033519744873, "learning_rate": 5.322513345622464e-06, "loss": 0.4597, "mean_token_accuracy": 0.8454820603132248, "num_tokens": 87662212.0, "step": 72880 }, { "entropy": 1.8324894472956657, "epoch": 0.2259526224872303, "grad_norm": 9.297342300415039, "learning_rate": 5.322148222221568e-06, "loss": 0.4355, "mean_token_accuracy": 0.8492673173546791, "num_tokens": 87675147.0, "step": 72890 }, { "entropy": 1.8852784946560859, "epoch": 0.22598362161228, "grad_norm": 3.968653440475464, "learning_rate": 5.3217831739525515e-06, "loss": 0.53, "mean_token_accuracy": 0.8285475313663483, "num_tokens": 87686898.0, "step": 72900 }, { "entropy": 1.9112021766602993, "epoch": 0.2260146207373297, "grad_norm": 4.359899044036865, "learning_rate": 5.321418200789649e-06, "loss": 0.5096, "mean_token_accuracy": 0.8378346741199494, "num_tokens": 87699547.0, "step": 72910 }, { "entropy": 1.9921528965234756, "epoch": 0.2260456198623794, "grad_norm": 8.43260383605957, "learning_rate": 5.321053302707114e-06, "loss": 0.5883, "mean_token_accuracy": 0.8277798056602478, "num_tokens": 87710308.0, "step": 72920 }, { "entropy": 1.847052039206028, "epoch": 0.2260766189874291, "grad_norm": 8.61585521697998, "learning_rate": 5.320688479679204e-06, "loss": 0.4228, "mean_token_accuracy": 0.855183681845665, "num_tokens": 87723526.0, "step": 72930 }, { "entropy": 1.9104703694581986, "epoch": 0.2261076181124788, "grad_norm": 9.710681915283203, "learning_rate": 5.320323731680197e-06, "loss": 0.4608, "mean_token_accuracy": 0.8491173967719078, "num_tokens": 87735916.0, "step": 72940 }, { "entropy": 1.9582516461610795, "epoch": 0.22613861723752848, "grad_norm": 7.941116809844971, "learning_rate": 5.319959058684375e-06, "loss": 0.5148, "mean_token_accuracy": 0.8455424636602402, "num_tokens": 87747201.0, "step": 72950 }, { "entropy": 1.8938917353749276, "epoch": 0.22616961636257818, "grad_norm": 8.022253036499023, "learning_rate": 5.319594460666041e-06, "loss": 0.5049, "mean_token_accuracy": 0.8349814653396607, "num_tokens": 87758501.0, "step": 72960 }, { "entropy": 1.9582582622766496, "epoch": 0.22620061548762785, "grad_norm": 8.384016036987305, "learning_rate": 5.319229937599502e-06, "loss": 0.5318, "mean_token_accuracy": 0.8396169364452362, "num_tokens": 87769542.0, "step": 72970 }, { "entropy": 1.9805827289819717, "epoch": 0.22623161461267755, "grad_norm": 7.869809150695801, "learning_rate": 5.318865489459086e-06, "loss": 0.4925, "mean_token_accuracy": 0.8493814244866371, "num_tokens": 87779962.0, "step": 72980 }, { "entropy": 1.9055181667208672, "epoch": 0.22626261373772724, "grad_norm": 8.316291809082031, "learning_rate": 5.3185011162191226e-06, "loss": 0.4932, "mean_token_accuracy": 0.8454388037323952, "num_tokens": 87791102.0, "step": 72990 }, { "entropy": 1.9191015899181365, "epoch": 0.22629361286277694, "grad_norm": 7.545352458953857, "learning_rate": 5.318136817853964e-06, "loss": 0.5087, "mean_token_accuracy": 0.8402454987168312, "num_tokens": 87803046.0, "step": 73000 }, { "entropy": 1.9037675946950912, "epoch": 0.22632461198782663, "grad_norm": 9.174158096313477, "learning_rate": 5.317772594337969e-06, "loss": 0.5138, "mean_token_accuracy": 0.8370854437351227, "num_tokens": 87814781.0, "step": 73010 }, { "entropy": 1.941840186715126, "epoch": 0.22635561111287633, "grad_norm": 11.267866134643555, "learning_rate": 5.317408445645512e-06, "loss": 0.5302, "mean_token_accuracy": 0.834077525138855, "num_tokens": 87826000.0, "step": 73020 }, { "entropy": 1.9527758836746216, "epoch": 0.22638661023792603, "grad_norm": 7.3283514976501465, "learning_rate": 5.3170443717509745e-06, "loss": 0.5012, "mean_token_accuracy": 0.8412612110376358, "num_tokens": 87836886.0, "step": 73030 }, { "entropy": 1.8808117777109146, "epoch": 0.22641760936297572, "grad_norm": 10.775453567504883, "learning_rate": 5.316680372628757e-06, "loss": 0.4909, "mean_token_accuracy": 0.8417520016431809, "num_tokens": 87848489.0, "step": 73040 }, { "entropy": 1.8674452692270278, "epoch": 0.22644860848802542, "grad_norm": 4.4659223556518555, "learning_rate": 5.316316448253266e-06, "loss": 0.4594, "mean_token_accuracy": 0.8506308153271676, "num_tokens": 87860795.0, "step": 73050 }, { "entropy": 1.8244705572724342, "epoch": 0.22647960761307512, "grad_norm": 8.522405624389648, "learning_rate": 5.315952598598925e-06, "loss": 0.4971, "mean_token_accuracy": 0.8474853068590165, "num_tokens": 87873677.0, "step": 73060 }, { "entropy": 1.923226311802864, "epoch": 0.22651060673812481, "grad_norm": 7.727634906768799, "learning_rate": 5.315588823640166e-06, "loss": 0.5555, "mean_token_accuracy": 0.8321527540683746, "num_tokens": 87885403.0, "step": 73070 }, { "entropy": 1.8770409598946571, "epoch": 0.2265416058631745, "grad_norm": 7.156225681304932, "learning_rate": 5.315225123351437e-06, "loss": 0.4545, "mean_token_accuracy": 0.8440602973103524, "num_tokens": 87897387.0, "step": 73080 }, { "entropy": 1.8762115344405175, "epoch": 0.2265726049882242, "grad_norm": 7.807554721832275, "learning_rate": 5.3148614977071956e-06, "loss": 0.4957, "mean_token_accuracy": 0.8331007972359658, "num_tokens": 87910085.0, "step": 73090 }, { "entropy": 1.9102078214287759, "epoch": 0.2266036041132739, "grad_norm": 7.591177940368652, "learning_rate": 5.314497946681913e-06, "loss": 0.5056, "mean_token_accuracy": 0.8310081690549851, "num_tokens": 87922858.0, "step": 73100 }, { "entropy": 1.9448012560606003, "epoch": 0.2266346032383236, "grad_norm": 8.734160423278809, "learning_rate": 5.31413447025007e-06, "loss": 0.4943, "mean_token_accuracy": 0.8439796343445778, "num_tokens": 87933821.0, "step": 73110 }, { "entropy": 1.8482655853033065, "epoch": 0.2266656023633733, "grad_norm": 6.68172550201416, "learning_rate": 5.3137710683861635e-06, "loss": 0.4315, "mean_token_accuracy": 0.8570908337831498, "num_tokens": 87946480.0, "step": 73120 }, { "entropy": 1.9056684881448747, "epoch": 0.226696601488423, "grad_norm": 7.89918327331543, "learning_rate": 5.3134077410647004e-06, "loss": 0.4895, "mean_token_accuracy": 0.8478031545877457, "num_tokens": 87958529.0, "step": 73130 }, { "entropy": 1.8404195442795754, "epoch": 0.2267276006134727, "grad_norm": 8.014739990234375, "learning_rate": 5.3130444882601994e-06, "loss": 0.4243, "mean_token_accuracy": 0.8541824370622635, "num_tokens": 87971303.0, "step": 73140 }, { "entropy": 1.8912012234330178, "epoch": 0.2267585997385224, "grad_norm": 9.917512893676758, "learning_rate": 5.312681309947193e-06, "loss": 0.4525, "mean_token_accuracy": 0.8446354269981384, "num_tokens": 87983955.0, "step": 73150 }, { "entropy": 1.9031813889741898, "epoch": 0.22678959886357208, "grad_norm": 8.86811637878418, "learning_rate": 5.3123182061002245e-06, "loss": 0.4985, "mean_token_accuracy": 0.8356417074799538, "num_tokens": 87995437.0, "step": 73160 }, { "entropy": 1.9914841502904892, "epoch": 0.22682059798862178, "grad_norm": 8.847204208374023, "learning_rate": 5.311955176693849e-06, "loss": 0.5371, "mean_token_accuracy": 0.8395547032356262, "num_tokens": 88006625.0, "step": 73170 }, { "entropy": 1.9578437075018882, "epoch": 0.22685159711367148, "grad_norm": 8.657424926757812, "learning_rate": 5.311592221702637e-06, "loss": 0.5205, "mean_token_accuracy": 0.8370061025023461, "num_tokens": 88017862.0, "step": 73180 }, { "entropy": 1.843288530409336, "epoch": 0.22688259623872117, "grad_norm": 7.968525409698486, "learning_rate": 5.311229341101166e-06, "loss": 0.4257, "mean_token_accuracy": 0.8541227072477341, "num_tokens": 88030463.0, "step": 73190 }, { "entropy": 1.949458932876587, "epoch": 0.22691359536377087, "grad_norm": 8.945792198181152, "learning_rate": 5.3108665348640306e-06, "loss": 0.5382, "mean_token_accuracy": 0.8297375872731209, "num_tokens": 88041754.0, "step": 73200 }, { "entropy": 1.858209890127182, "epoch": 0.22694459448882057, "grad_norm": 4.584981441497803, "learning_rate": 5.3105038029658355e-06, "loss": 0.4564, "mean_token_accuracy": 0.849499624967575, "num_tokens": 88054627.0, "step": 73210 }, { "entropy": 1.9475027039647101, "epoch": 0.22697559361387024, "grad_norm": 3.7373814582824707, "learning_rate": 5.310141145381194e-06, "loss": 0.5527, "mean_token_accuracy": 0.8340747594833374, "num_tokens": 88066580.0, "step": 73220 }, { "entropy": 1.8866298824548722, "epoch": 0.22700659273891993, "grad_norm": 8.264451026916504, "learning_rate": 5.30977856208474e-06, "loss": 0.4415, "mean_token_accuracy": 0.8434574559330941, "num_tokens": 88078483.0, "step": 73230 }, { "entropy": 1.8932233542203902, "epoch": 0.22703759186396963, "grad_norm": 12.378700256347656, "learning_rate": 5.309416053051112e-06, "loss": 0.4815, "mean_token_accuracy": 0.8487199395895004, "num_tokens": 88090079.0, "step": 73240 }, { "entropy": 1.8719527080655098, "epoch": 0.22706859098901933, "grad_norm": 9.49503231048584, "learning_rate": 5.309053618254963e-06, "loss": 0.5081, "mean_token_accuracy": 0.8392317876219749, "num_tokens": 88103225.0, "step": 73250 }, { "entropy": 1.8768180578947067, "epoch": 0.22709959011406902, "grad_norm": 4.083436965942383, "learning_rate": 5.308691257670956e-06, "loss": 0.4518, "mean_token_accuracy": 0.8495214492082596, "num_tokens": 88115707.0, "step": 73260 }, { "entropy": 1.9521862715482712, "epoch": 0.22713058923911872, "grad_norm": 4.048558712005615, "learning_rate": 5.308328971273773e-06, "loss": 0.5133, "mean_token_accuracy": 0.8412155851721763, "num_tokens": 88126836.0, "step": 73270 }, { "entropy": 1.9520546540617942, "epoch": 0.22716158836416842, "grad_norm": 4.938279151916504, "learning_rate": 5.3079667590381004e-06, "loss": 0.5319, "mean_token_accuracy": 0.8300433591008186, "num_tokens": 88138872.0, "step": 73280 }, { "entropy": 1.8862208157777787, "epoch": 0.2271925874892181, "grad_norm": 7.787830829620361, "learning_rate": 5.3076046209386405e-06, "loss": 0.475, "mean_token_accuracy": 0.8360537528991699, "num_tokens": 88151734.0, "step": 73290 }, { "entropy": 1.9074003919959068, "epoch": 0.2272235866142678, "grad_norm": 7.785778045654297, "learning_rate": 5.307242556950106e-06, "loss": 0.4628, "mean_token_accuracy": 0.8495361775159835, "num_tokens": 88163810.0, "step": 73300 }, { "entropy": 1.9791472971439361, "epoch": 0.2272545857393175, "grad_norm": 8.446614265441895, "learning_rate": 5.306880567047223e-06, "loss": 0.6625, "mean_token_accuracy": 0.8312054976820946, "num_tokens": 88176079.0, "step": 73310 }, { "entropy": 1.8590899035334587, "epoch": 0.2272855848643672, "grad_norm": 4.120956897735596, "learning_rate": 5.306518651204732e-06, "loss": 0.4315, "mean_token_accuracy": 0.8539949178695678, "num_tokens": 88188688.0, "step": 73320 }, { "entropy": 1.92610152810812, "epoch": 0.2273165839894169, "grad_norm": 8.005793571472168, "learning_rate": 5.306156809397379e-06, "loss": 0.4976, "mean_token_accuracy": 0.8419401630759239, "num_tokens": 88200847.0, "step": 73330 }, { "entropy": 1.9598787605762482, "epoch": 0.2273475831144666, "grad_norm": 8.136838912963867, "learning_rate": 5.305795041599927e-06, "loss": 0.4955, "mean_token_accuracy": 0.8463526502251625, "num_tokens": 88212189.0, "step": 73340 }, { "entropy": 1.8861487179994583, "epoch": 0.2273785822395163, "grad_norm": 8.918937683105469, "learning_rate": 5.30543334778715e-06, "loss": 0.4946, "mean_token_accuracy": 0.8471100255846977, "num_tokens": 88223697.0, "step": 73350 }, { "entropy": 2.0112386524677275, "epoch": 0.227409581364566, "grad_norm": 8.22529411315918, "learning_rate": 5.305071727933835e-06, "loss": 0.5405, "mean_token_accuracy": 0.8350095435976982, "num_tokens": 88233981.0, "step": 73360 }, { "entropy": 1.9829117342829705, "epoch": 0.22744058048961568, "grad_norm": 8.34417724609375, "learning_rate": 5.304710182014778e-06, "loss": 0.524, "mean_token_accuracy": 0.8332961440086365, "num_tokens": 88245931.0, "step": 73370 }, { "entropy": 1.8644984647631646, "epoch": 0.22747157961466538, "grad_norm": 9.018733024597168, "learning_rate": 5.304348710004791e-06, "loss": 0.4726, "mean_token_accuracy": 0.8427291125059128, "num_tokens": 88258399.0, "step": 73380 }, { "entropy": 1.8523142382502555, "epoch": 0.22750257873971508, "grad_norm": 7.3071370124816895, "learning_rate": 5.303987311878693e-06, "loss": 0.4313, "mean_token_accuracy": 0.856279893219471, "num_tokens": 88270512.0, "step": 73390 }, { "entropy": 1.948920576274395, "epoch": 0.22753357786476477, "grad_norm": 8.815826416015625, "learning_rate": 5.303625987611321e-06, "loss": 0.5292, "mean_token_accuracy": 0.8310361623764038, "num_tokens": 88282293.0, "step": 73400 }, { "entropy": 1.8547866210341453, "epoch": 0.22756457698981447, "grad_norm": 4.00172233581543, "learning_rate": 5.30326473717752e-06, "loss": 0.4735, "mean_token_accuracy": 0.8407047167420387, "num_tokens": 88295869.0, "step": 73410 }, { "entropy": 1.9882050842046737, "epoch": 0.22759557611486417, "grad_norm": 4.15764045715332, "learning_rate": 5.3029035605521485e-06, "loss": 0.5662, "mean_token_accuracy": 0.824670821428299, "num_tokens": 88307629.0, "step": 73420 }, { "entropy": 1.896325621008873, "epoch": 0.22762657523991386, "grad_norm": 7.764687538146973, "learning_rate": 5.302542457710075e-06, "loss": 0.4483, "mean_token_accuracy": 0.8539949029684066, "num_tokens": 88319325.0, "step": 73430 }, { "entropy": 1.9740970045328141, "epoch": 0.22765757436496356, "grad_norm": 8.697678565979004, "learning_rate": 5.302181428626182e-06, "loss": 0.5632, "mean_token_accuracy": 0.8374375849962234, "num_tokens": 88330494.0, "step": 73440 }, { "entropy": 1.9539963483810425, "epoch": 0.22768857349001326, "grad_norm": 9.064693450927734, "learning_rate": 5.301820473275364e-06, "loss": 0.5405, "mean_token_accuracy": 0.8300345674157142, "num_tokens": 88341702.0, "step": 73450 }, { "entropy": 1.8306120559573174, "epoch": 0.22771957261506293, "grad_norm": 9.045393943786621, "learning_rate": 5.301459591632527e-06, "loss": 0.4311, "mean_token_accuracy": 0.847631786763668, "num_tokens": 88354694.0, "step": 73460 }, { "entropy": 1.9496394276618958, "epoch": 0.22775057174011262, "grad_norm": 9.405258178710938, "learning_rate": 5.301098783672588e-06, "loss": 0.5447, "mean_token_accuracy": 0.837113332748413, "num_tokens": 88365566.0, "step": 73470 }, { "entropy": 1.832155992090702, "epoch": 0.22778157086516232, "grad_norm": 8.784136772155762, "learning_rate": 5.300738049370477e-06, "loss": 0.4445, "mean_token_accuracy": 0.8398911356925964, "num_tokens": 88378055.0, "step": 73480 }, { "entropy": 1.8397590324282647, "epoch": 0.22781256999021202, "grad_norm": 10.566763877868652, "learning_rate": 5.3003773887011364e-06, "loss": 0.5284, "mean_token_accuracy": 0.8380313396453858, "num_tokens": 88391505.0, "step": 73490 }, { "entropy": 1.9307547613978386, "epoch": 0.2278435691152617, "grad_norm": 7.682570934295654, "learning_rate": 5.3000168016395195e-06, "loss": 0.529, "mean_token_accuracy": 0.8304173111915588, "num_tokens": 88402732.0, "step": 73500 }, { "entropy": 1.9600440084934234, "epoch": 0.2278745682403114, "grad_norm": 8.14580249786377, "learning_rate": 5.299656288160591e-06, "loss": 0.5391, "mean_token_accuracy": 0.8324763268232346, "num_tokens": 88414140.0, "step": 73510 }, { "entropy": 1.9035349115729332, "epoch": 0.2279055673653611, "grad_norm": 8.5955171585083, "learning_rate": 5.299295848239329e-06, "loss": 0.4725, "mean_token_accuracy": 0.8473742425441741, "num_tokens": 88425821.0, "step": 73520 }, { "entropy": 1.9537647753953933, "epoch": 0.2279365664904108, "grad_norm": 8.174680709838867, "learning_rate": 5.298935481850723e-06, "loss": 0.5177, "mean_token_accuracy": 0.8415812566876412, "num_tokens": 88436784.0, "step": 73530 }, { "entropy": 1.9054030612111093, "epoch": 0.2279675656154605, "grad_norm": 9.483674049377441, "learning_rate": 5.2985751889697746e-06, "loss": 0.4891, "mean_token_accuracy": 0.8276939764618874, "num_tokens": 88448671.0, "step": 73540 }, { "entropy": 1.7816397354006768, "epoch": 0.2279985647405102, "grad_norm": 9.179033279418945, "learning_rate": 5.2982149695714964e-06, "loss": 0.3919, "mean_token_accuracy": 0.8552224412560463, "num_tokens": 88463062.0, "step": 73550 }, { "entropy": 1.926460900902748, "epoch": 0.2280295638655599, "grad_norm": 8.515421867370605, "learning_rate": 5.297854823630913e-06, "loss": 0.503, "mean_token_accuracy": 0.8481031745672226, "num_tokens": 88474333.0, "step": 73560 }, { "entropy": 1.944996650516987, "epoch": 0.2280605629906096, "grad_norm": 7.795518398284912, "learning_rate": 5.2974947511230635e-06, "loss": 0.4867, "mean_token_accuracy": 0.847733362019062, "num_tokens": 88486183.0, "step": 73570 }, { "entropy": 1.924484086036682, "epoch": 0.22809156211565929, "grad_norm": 9.433268547058105, "learning_rate": 5.297134752022996e-06, "loss": 0.5195, "mean_token_accuracy": 0.83953125923872, "num_tokens": 88498763.0, "step": 73580 }, { "entropy": 1.8909065306186676, "epoch": 0.22812256124070898, "grad_norm": 4.520410537719727, "learning_rate": 5.2967748263057685e-06, "loss": 0.4292, "mean_token_accuracy": 0.8501924559473991, "num_tokens": 88511080.0, "step": 73590 }, { "entropy": 1.855413518846035, "epoch": 0.22815356036575868, "grad_norm": 7.426053047180176, "learning_rate": 5.296414973946457e-06, "loss": 0.4606, "mean_token_accuracy": 0.848104490339756, "num_tokens": 88523301.0, "step": 73600 }, { "entropy": 1.9100253120064736, "epoch": 0.22818455949080838, "grad_norm": 3.6768534183502197, "learning_rate": 5.2960551949201445e-06, "loss": 0.5232, "mean_token_accuracy": 0.831768749654293, "num_tokens": 88535393.0, "step": 73610 }, { "entropy": 1.9303764522075653, "epoch": 0.22821555861585807, "grad_norm": 9.903493881225586, "learning_rate": 5.295695489201927e-06, "loss": 0.522, "mean_token_accuracy": 0.8348423153162002, "num_tokens": 88547298.0, "step": 73620 }, { "entropy": 1.8810430273413659, "epoch": 0.22824655774090777, "grad_norm": 8.662088394165039, "learning_rate": 5.295335856766913e-06, "loss": 0.4731, "mean_token_accuracy": 0.836891371011734, "num_tokens": 88559818.0, "step": 73630 }, { "entropy": 1.9283401042222976, "epoch": 0.22827755686595746, "grad_norm": 8.903078079223633, "learning_rate": 5.294976297590223e-06, "loss": 0.5273, "mean_token_accuracy": 0.8383133113384247, "num_tokens": 88570970.0, "step": 73640 }, { "entropy": 1.9302942425012588, "epoch": 0.22830855599100716, "grad_norm": 8.803302764892578, "learning_rate": 5.294616811646988e-06, "loss": 0.5433, "mean_token_accuracy": 0.833921717107296, "num_tokens": 88582398.0, "step": 73650 }, { "entropy": 1.8926509320735931, "epoch": 0.22833955511605686, "grad_norm": 7.454058647155762, "learning_rate": 5.294257398912351e-06, "loss": 0.5225, "mean_token_accuracy": 0.8442865967750549, "num_tokens": 88594112.0, "step": 73660 }, { "entropy": 1.811450758576393, "epoch": 0.22837055424110655, "grad_norm": 3.648226499557495, "learning_rate": 5.29389805936147e-06, "loss": 0.3976, "mean_token_accuracy": 0.8563667595386505, "num_tokens": 88606926.0, "step": 73670 }, { "entropy": 1.927273753285408, "epoch": 0.22840155336615625, "grad_norm": 7.629867076873779, "learning_rate": 5.293538792969509e-06, "loss": 0.55, "mean_token_accuracy": 0.8345969125628472, "num_tokens": 88617995.0, "step": 73680 }, { "entropy": 1.866581754386425, "epoch": 0.22843255249120595, "grad_norm": 10.446778297424316, "learning_rate": 5.293179599711649e-06, "loss": 0.4725, "mean_token_accuracy": 0.8569563254714012, "num_tokens": 88630088.0, "step": 73690 }, { "entropy": 1.9038088709115981, "epoch": 0.22846355161625564, "grad_norm": 8.493614196777344, "learning_rate": 5.29282047956308e-06, "loss": 0.5245, "mean_token_accuracy": 0.8368158757686615, "num_tokens": 88642108.0, "step": 73700 }, { "entropy": 1.8956671848893165, "epoch": 0.2284945507413053, "grad_norm": 8.689042091369629, "learning_rate": 5.2924614324990045e-06, "loss": 0.4814, "mean_token_accuracy": 0.8425026118755341, "num_tokens": 88653698.0, "step": 73710 }, { "entropy": 1.8952711433172227, "epoch": 0.228525549866355, "grad_norm": 8.7789306640625, "learning_rate": 5.292102458494637e-06, "loss": 0.4779, "mean_token_accuracy": 0.8493198037147522, "num_tokens": 88665017.0, "step": 73720 }, { "entropy": 1.925877857208252, "epoch": 0.2285565489914047, "grad_norm": 9.960886001586914, "learning_rate": 5.2917435575252045e-06, "loss": 0.5461, "mean_token_accuracy": 0.833611187338829, "num_tokens": 88676741.0, "step": 73730 }, { "entropy": 1.9205459102988243, "epoch": 0.2285875481164544, "grad_norm": 4.133283615112305, "learning_rate": 5.291384729565944e-06, "loss": 0.525, "mean_token_accuracy": 0.8389394223690033, "num_tokens": 88688321.0, "step": 73740 }, { "entropy": 1.974983049929142, "epoch": 0.2286185472415041, "grad_norm": 8.154329299926758, "learning_rate": 5.291025974592104e-06, "loss": 0.582, "mean_token_accuracy": 0.8283642366528511, "num_tokens": 88700262.0, "step": 73750 }, { "entropy": 1.8185023814439774, "epoch": 0.2286495463665538, "grad_norm": 8.861298561096191, "learning_rate": 5.290667292578948e-06, "loss": 0.469, "mean_token_accuracy": 0.8509408429265022, "num_tokens": 88713127.0, "step": 73760 }, { "entropy": 1.8385506071150304, "epoch": 0.2286805454916035, "grad_norm": 3.4774937629699707, "learning_rate": 5.290308683501748e-06, "loss": 0.447, "mean_token_accuracy": 0.8416382968425751, "num_tokens": 88726534.0, "step": 73770 }, { "entropy": 2.0139817029237745, "epoch": 0.2287115446166532, "grad_norm": 9.335980415344238, "learning_rate": 5.289950147335788e-06, "loss": 0.5409, "mean_token_accuracy": 0.8362293437123298, "num_tokens": 88737523.0, "step": 73780 }, { "entropy": 1.7716790959239006, "epoch": 0.2287425437417029, "grad_norm": 2.6843504905700684, "learning_rate": 5.2895916840563675e-06, "loss": 0.3855, "mean_token_accuracy": 0.8626002490520477, "num_tokens": 88751293.0, "step": 73790 }, { "entropy": 1.9235130712389945, "epoch": 0.22877354286675258, "grad_norm": 9.159330368041992, "learning_rate": 5.289233293638791e-06, "loss": 0.5496, "mean_token_accuracy": 0.8392347306013107, "num_tokens": 88763247.0, "step": 73800 }, { "entropy": 1.8758223891258239, "epoch": 0.22880454199180228, "grad_norm": 7.979211330413818, "learning_rate": 5.288874976058381e-06, "loss": 0.4729, "mean_token_accuracy": 0.8446521013975143, "num_tokens": 88775511.0, "step": 73810 }, { "entropy": 1.9276006370782852, "epoch": 0.22883554111685198, "grad_norm": 7.672868251800537, "learning_rate": 5.288516731290468e-06, "loss": 0.514, "mean_token_accuracy": 0.8356775805354119, "num_tokens": 88786826.0, "step": 73820 }, { "entropy": 1.9099656268954277, "epoch": 0.22886654024190167, "grad_norm": 9.19827651977539, "learning_rate": 5.288158559310397e-06, "loss": 0.4861, "mean_token_accuracy": 0.8499478042125702, "num_tokens": 88799014.0, "step": 73830 }, { "entropy": 1.8794591814279555, "epoch": 0.22889753936695137, "grad_norm": 9.211488723754883, "learning_rate": 5.287800460093521e-06, "loss": 0.4953, "mean_token_accuracy": 0.8426800593733788, "num_tokens": 88811087.0, "step": 73840 }, { "entropy": 1.8956602096557618, "epoch": 0.22892853849200107, "grad_norm": 8.783346176147461, "learning_rate": 5.287442433615207e-06, "loss": 0.5048, "mean_token_accuracy": 0.8364124745130539, "num_tokens": 88823246.0, "step": 73850 }, { "entropy": 1.8410479247570037, "epoch": 0.22895953761705076, "grad_norm": 7.995346546173096, "learning_rate": 5.287084479850834e-06, "loss": 0.4777, "mean_token_accuracy": 0.8437046140432358, "num_tokens": 88836150.0, "step": 73860 }, { "entropy": 1.9268400833010673, "epoch": 0.22899053674210046, "grad_norm": 6.432771682739258, "learning_rate": 5.286726598775794e-06, "loss": 0.559, "mean_token_accuracy": 0.8304910391569138, "num_tokens": 88846935.0, "step": 73870 }, { "entropy": 1.967933678627014, "epoch": 0.22902153586715016, "grad_norm": 8.738357543945312, "learning_rate": 5.286368790365485e-06, "loss": 0.5395, "mean_token_accuracy": 0.8432664573192596, "num_tokens": 88857965.0, "step": 73880 }, { "entropy": 1.8915558218955995, "epoch": 0.22905253499219985, "grad_norm": 8.925724983215332, "learning_rate": 5.286011054595324e-06, "loss": 0.4876, "mean_token_accuracy": 0.8440207138657569, "num_tokens": 88869852.0, "step": 73890 }, { "entropy": 1.9460553407669068, "epoch": 0.22908353411724955, "grad_norm": 10.097593307495117, "learning_rate": 5.285653391440732e-06, "loss": 0.5563, "mean_token_accuracy": 0.8398781731724739, "num_tokens": 88880507.0, "step": 73900 }, { "entropy": 1.9188208684325219, "epoch": 0.22911453324229925, "grad_norm": 7.534275531768799, "learning_rate": 5.285295800877149e-06, "loss": 0.4985, "mean_token_accuracy": 0.845045380294323, "num_tokens": 88891773.0, "step": 73910 }, { "entropy": 1.9317274510860443, "epoch": 0.22914553236734894, "grad_norm": 7.515310764312744, "learning_rate": 5.284938282880022e-06, "loss": 0.5103, "mean_token_accuracy": 0.830031855404377, "num_tokens": 88903628.0, "step": 73920 }, { "entropy": 1.8418959081172943, "epoch": 0.22917653149239864, "grad_norm": 10.1459379196167, "learning_rate": 5.284580837424812e-06, "loss": 0.4781, "mean_token_accuracy": 0.8359077215194702, "num_tokens": 88915956.0, "step": 73930 }, { "entropy": 1.9436316937208176, "epoch": 0.22920753061744834, "grad_norm": 9.291312217712402, "learning_rate": 5.2842234644869895e-06, "loss": 0.56, "mean_token_accuracy": 0.8290729984641075, "num_tokens": 88926709.0, "step": 73940 }, { "entropy": 1.997511848807335, "epoch": 0.22923852974249803, "grad_norm": 8.855123519897461, "learning_rate": 5.283866164042037e-06, "loss": 0.5755, "mean_token_accuracy": 0.8371048003435135, "num_tokens": 88937889.0, "step": 73950 }, { "entropy": 1.9249091997742653, "epoch": 0.2292695288675477, "grad_norm": 10.297185897827148, "learning_rate": 5.283508936065452e-06, "loss": 0.5007, "mean_token_accuracy": 0.8376726984977723, "num_tokens": 88950020.0, "step": 73960 }, { "entropy": 1.974366408586502, "epoch": 0.2293005279925974, "grad_norm": 9.853002548217773, "learning_rate": 5.283151780532737e-06, "loss": 0.5265, "mean_token_accuracy": 0.8404949456453323, "num_tokens": 88960953.0, "step": 73970 }, { "entropy": 1.9446517005562782, "epoch": 0.2293315271176471, "grad_norm": 7.498370170593262, "learning_rate": 5.282794697419412e-06, "loss": 0.5435, "mean_token_accuracy": 0.8361097246408462, "num_tokens": 88972293.0, "step": 73980 }, { "entropy": 1.9057669252157212, "epoch": 0.2293625262426968, "grad_norm": 9.31442928314209, "learning_rate": 5.282437686701009e-06, "loss": 0.5119, "mean_token_accuracy": 0.8370606362819671, "num_tokens": 88983890.0, "step": 73990 }, { "entropy": 1.9226253479719162, "epoch": 0.2293935253677465, "grad_norm": 8.424675941467285, "learning_rate": 5.2820807483530635e-06, "loss": 0.5432, "mean_token_accuracy": 0.8275117412209511, "num_tokens": 88995595.0, "step": 74000 }, { "entropy": 1.9122430935502053, "epoch": 0.22942452449279618, "grad_norm": 7.728245258331299, "learning_rate": 5.281723882351132e-06, "loss": 0.4993, "mean_token_accuracy": 0.8327098101377487, "num_tokens": 89008305.0, "step": 74010 }, { "entropy": 1.8688533097505569, "epoch": 0.22945552361784588, "grad_norm": 4.136612415313721, "learning_rate": 5.281367088670779e-06, "loss": 0.4394, "mean_token_accuracy": 0.852810050547123, "num_tokens": 89020921.0, "step": 74020 }, { "entropy": 1.8244116827845573, "epoch": 0.22948652274289558, "grad_norm": 9.678080558776855, "learning_rate": 5.281010367287579e-06, "loss": 0.3859, "mean_token_accuracy": 0.8560571730136871, "num_tokens": 89034197.0, "step": 74030 }, { "entropy": 1.9837775856256485, "epoch": 0.22951752186794527, "grad_norm": 8.24919319152832, "learning_rate": 5.280653718177119e-06, "loss": 0.519, "mean_token_accuracy": 0.83905139118433, "num_tokens": 89044845.0, "step": 74040 }, { "entropy": 1.9067578569054604, "epoch": 0.22954852099299497, "grad_norm": 4.405426502227783, "learning_rate": 5.2802971413149995e-06, "loss": 0.4771, "mean_token_accuracy": 0.8524753168225289, "num_tokens": 89056288.0, "step": 74050 }, { "entropy": 1.8309132128953933, "epoch": 0.22957952011804467, "grad_norm": 7.820222854614258, "learning_rate": 5.279940636676828e-06, "loss": 0.4325, "mean_token_accuracy": 0.8572394847869873, "num_tokens": 89069166.0, "step": 74060 }, { "entropy": 1.9012271910905838, "epoch": 0.22961051924309436, "grad_norm": 9.459297180175781, "learning_rate": 5.27958420423823e-06, "loss": 0.4733, "mean_token_accuracy": 0.8430428206920624, "num_tokens": 89081030.0, "step": 74070 }, { "entropy": 1.937754437327385, "epoch": 0.22964151836814406, "grad_norm": 7.659812927246094, "learning_rate": 5.279227843974837e-06, "loss": 0.5038, "mean_token_accuracy": 0.8409286737442017, "num_tokens": 89093165.0, "step": 74080 }, { "entropy": 1.8637029841542243, "epoch": 0.22967251749319376, "grad_norm": 8.62972354888916, "learning_rate": 5.278871555862294e-06, "loss": 0.5112, "mean_token_accuracy": 0.8332642212510109, "num_tokens": 89105778.0, "step": 74090 }, { "entropy": 1.9526731699705124, "epoch": 0.22970351661824345, "grad_norm": 8.800896644592285, "learning_rate": 5.278515339876257e-06, "loss": 0.5603, "mean_token_accuracy": 0.8348133593797684, "num_tokens": 89116443.0, "step": 74100 }, { "entropy": 1.906109546124935, "epoch": 0.22973451574329315, "grad_norm": 8.518561363220215, "learning_rate": 5.278159195992395e-06, "loss": 0.5096, "mean_token_accuracy": 0.8357308685779572, "num_tokens": 89128471.0, "step": 74110 }, { "entropy": 1.9231334283947945, "epoch": 0.22976551486834285, "grad_norm": 5.9242401123046875, "learning_rate": 5.277803124186387e-06, "loss": 0.5068, "mean_token_accuracy": 0.8310579225420952, "num_tokens": 89140663.0, "step": 74120 }, { "entropy": 1.8632835254073143, "epoch": 0.22979651399339254, "grad_norm": 7.401041507720947, "learning_rate": 5.277447124433924e-06, "loss": 0.4631, "mean_token_accuracy": 0.8411552801728248, "num_tokens": 89153122.0, "step": 74130 }, { "entropy": 1.8588541388511657, "epoch": 0.22982751311844224, "grad_norm": 8.74480152130127, "learning_rate": 5.277091196710709e-06, "loss": 0.4855, "mean_token_accuracy": 0.8416764706373214, "num_tokens": 89166007.0, "step": 74140 }, { "entropy": 1.953472825884819, "epoch": 0.22985851224349194, "grad_norm": 10.276031494140625, "learning_rate": 5.276735340992454e-06, "loss": 0.5669, "mean_token_accuracy": 0.835921137034893, "num_tokens": 89177892.0, "step": 74150 }, { "entropy": 1.9886727631092072, "epoch": 0.22988951136854163, "grad_norm": 8.81214714050293, "learning_rate": 5.276379557254886e-06, "loss": 0.5502, "mean_token_accuracy": 0.8341899603605271, "num_tokens": 89188418.0, "step": 74160 }, { "entropy": 1.8839535772800446, "epoch": 0.22992051049359133, "grad_norm": 7.365606307983398, "learning_rate": 5.276023845473741e-06, "loss": 0.4427, "mean_token_accuracy": 0.8486846655607223, "num_tokens": 89201029.0, "step": 74170 }, { "entropy": 1.8606535702943803, "epoch": 0.22995150961864103, "grad_norm": 6.745984077453613, "learning_rate": 5.275668205624769e-06, "loss": 0.432, "mean_token_accuracy": 0.8506320536136627, "num_tokens": 89213185.0, "step": 74180 }, { "entropy": 1.9073518499732018, "epoch": 0.22998250874369072, "grad_norm": 8.412069320678711, "learning_rate": 5.275312637683727e-06, "loss": 0.5009, "mean_token_accuracy": 0.839865280687809, "num_tokens": 89225044.0, "step": 74190 }, { "entropy": 1.9035982072353363, "epoch": 0.2300135078687404, "grad_norm": 9.212803840637207, "learning_rate": 5.274957141626388e-06, "loss": 0.4738, "mean_token_accuracy": 0.8478123605251312, "num_tokens": 89236364.0, "step": 74200 }, { "entropy": 1.847705954313278, "epoch": 0.2300445069937901, "grad_norm": 8.281103134155273, "learning_rate": 5.274601717428534e-06, "loss": 0.4457, "mean_token_accuracy": 0.8482792422175407, "num_tokens": 89249228.0, "step": 74210 }, { "entropy": 1.8957882165908813, "epoch": 0.23007550611883978, "grad_norm": 9.440603256225586, "learning_rate": 5.274246365065958e-06, "loss": 0.5042, "mean_token_accuracy": 0.8471298664808273, "num_tokens": 89260514.0, "step": 74220 }, { "entropy": 1.8138180442154408, "epoch": 0.23010650524388948, "grad_norm": 3.8155839443206787, "learning_rate": 5.273891084514467e-06, "loss": 0.3897, "mean_token_accuracy": 0.8571711733937264, "num_tokens": 89274014.0, "step": 74230 }, { "entropy": 1.913990643620491, "epoch": 0.23013750436893918, "grad_norm": 9.489317893981934, "learning_rate": 5.273535875749878e-06, "loss": 0.5619, "mean_token_accuracy": 0.832446351647377, "num_tokens": 89286179.0, "step": 74240 }, { "entropy": 1.921476237475872, "epoch": 0.23016850349398887, "grad_norm": 4.419705867767334, "learning_rate": 5.273180738748017e-06, "loss": 0.458, "mean_token_accuracy": 0.8520811811089516, "num_tokens": 89298896.0, "step": 74250 }, { "entropy": 1.9407318532466888, "epoch": 0.23019950261903857, "grad_norm": 9.360230445861816, "learning_rate": 5.2728256734847265e-06, "loss": 0.4911, "mean_token_accuracy": 0.8388498574495316, "num_tokens": 89310196.0, "step": 74260 }, { "entropy": 1.822094811499119, "epoch": 0.23023050174408827, "grad_norm": 9.136000633239746, "learning_rate": 5.272470679935853e-06, "loss": 0.4498, "mean_token_accuracy": 0.8359728991985321, "num_tokens": 89323522.0, "step": 74270 }, { "entropy": 1.8379074111580849, "epoch": 0.23026150086913796, "grad_norm": 8.908660888671875, "learning_rate": 5.2721157580772635e-06, "loss": 0.4626, "mean_token_accuracy": 0.8487562105059624, "num_tokens": 89336641.0, "step": 74280 }, { "entropy": 1.9069863885641098, "epoch": 0.23029249999418766, "grad_norm": 9.512563705444336, "learning_rate": 5.27176090788483e-06, "loss": 0.4891, "mean_token_accuracy": 0.8323235869407654, "num_tokens": 89348756.0, "step": 74290 }, { "entropy": 1.9753981336951256, "epoch": 0.23032349911923736, "grad_norm": 8.980364799499512, "learning_rate": 5.271406129334436e-06, "loss": 0.5229, "mean_token_accuracy": 0.8389256626367569, "num_tokens": 89360026.0, "step": 74300 }, { "entropy": 1.9259448662400245, "epoch": 0.23035449824428705, "grad_norm": 8.680480003356934, "learning_rate": 5.271051422401982e-06, "loss": 0.5148, "mean_token_accuracy": 0.8409480020403862, "num_tokens": 89372328.0, "step": 74310 }, { "entropy": 1.9059909775853157, "epoch": 0.23038549736933675, "grad_norm": 3.84014892578125, "learning_rate": 5.2706967870633704e-06, "loss": 0.507, "mean_token_accuracy": 0.8374297887086868, "num_tokens": 89385781.0, "step": 74320 }, { "entropy": 1.9672224968671799, "epoch": 0.23041649649438645, "grad_norm": 8.767590522766113, "learning_rate": 5.270342223294524e-06, "loss": 0.5579, "mean_token_accuracy": 0.8284627929329872, "num_tokens": 89396717.0, "step": 74330 }, { "entropy": 1.9303541094064713, "epoch": 0.23044749561943614, "grad_norm": 7.687719345092773, "learning_rate": 5.2699877310713735e-06, "loss": 0.4751, "mean_token_accuracy": 0.846529133617878, "num_tokens": 89408365.0, "step": 74340 }, { "entropy": 1.8513358294963838, "epoch": 0.23047849474448584, "grad_norm": 4.211158275604248, "learning_rate": 5.26963331036986e-06, "loss": 0.4756, "mean_token_accuracy": 0.8417376056313515, "num_tokens": 89420728.0, "step": 74350 }, { "entropy": 1.9158269882202148, "epoch": 0.23050949386953554, "grad_norm": 8.824563026428223, "learning_rate": 5.2692789611659345e-06, "loss": 0.4523, "mean_token_accuracy": 0.8564553380012512, "num_tokens": 89432042.0, "step": 74360 }, { "entropy": 1.8483793511986732, "epoch": 0.23054049299458523, "grad_norm": 8.350893020629883, "learning_rate": 5.268924683435563e-06, "loss": 0.4704, "mean_token_accuracy": 0.8427222266793251, "num_tokens": 89445106.0, "step": 74370 }, { "entropy": 1.9533838108181953, "epoch": 0.23057149211963493, "grad_norm": 7.688497066497803, "learning_rate": 5.268570477154723e-06, "loss": 0.487, "mean_token_accuracy": 0.8480934947729111, "num_tokens": 89456517.0, "step": 74380 }, { "entropy": 1.7940612569451333, "epoch": 0.23060249124468463, "grad_norm": 10.271604537963867, "learning_rate": 5.268216342299399e-06, "loss": 0.4255, "mean_token_accuracy": 0.8567822203040123, "num_tokens": 89470177.0, "step": 74390 }, { "entropy": 1.905691820383072, "epoch": 0.23063349036973432, "grad_norm": 9.242095947265625, "learning_rate": 5.267862278845591e-06, "loss": 0.4724, "mean_token_accuracy": 0.8492756888270379, "num_tokens": 89481980.0, "step": 74400 }, { "entropy": 1.9306585028767587, "epoch": 0.23066448949478402, "grad_norm": 8.073739051818848, "learning_rate": 5.267508286769307e-06, "loss": 0.4924, "mean_token_accuracy": 0.8462639227509499, "num_tokens": 89493310.0, "step": 74410 }, { "entropy": 1.9487823605537415, "epoch": 0.23069548861983372, "grad_norm": 7.810174942016602, "learning_rate": 5.267154366046571e-06, "loss": 0.5225, "mean_token_accuracy": 0.8337769046425819, "num_tokens": 89504297.0, "step": 74420 }, { "entropy": 2.020465725660324, "epoch": 0.2307264877448834, "grad_norm": 8.833086967468262, "learning_rate": 5.266800516653412e-06, "loss": 0.5802, "mean_token_accuracy": 0.8244875445961952, "num_tokens": 89515014.0, "step": 74430 }, { "entropy": 1.8529538474977016, "epoch": 0.2307574868699331, "grad_norm": 8.668907165527344, "learning_rate": 5.266446738565875e-06, "loss": 0.4542, "mean_token_accuracy": 0.8453396156430244, "num_tokens": 89527739.0, "step": 74440 }, { "entropy": 1.938523431122303, "epoch": 0.23078848599498278, "grad_norm": 3.760244846343994, "learning_rate": 5.266093031760013e-06, "loss": 0.5123, "mean_token_accuracy": 0.8392288982868195, "num_tokens": 89539449.0, "step": 74450 }, { "entropy": 1.8795533359050751, "epoch": 0.23081948512003247, "grad_norm": 8.304437637329102, "learning_rate": 5.265739396211895e-06, "loss": 0.4722, "mean_token_accuracy": 0.8489022374153137, "num_tokens": 89551459.0, "step": 74460 }, { "entropy": 1.8862609788775444, "epoch": 0.23085048424508217, "grad_norm": 8.59811782836914, "learning_rate": 5.265385831897596e-06, "loss": 0.4956, "mean_token_accuracy": 0.8412604182958603, "num_tokens": 89563656.0, "step": 74470 }, { "entropy": 1.906290727853775, "epoch": 0.23088148337013187, "grad_norm": 8.959325790405273, "learning_rate": 5.2650323387932055e-06, "loss": 0.5424, "mean_token_accuracy": 0.8369390457868576, "num_tokens": 89575960.0, "step": 74480 }, { "entropy": 1.8360799536108972, "epoch": 0.23091248249518156, "grad_norm": 8.049399375915527, "learning_rate": 5.264678916874822e-06, "loss": 0.413, "mean_token_accuracy": 0.8528993338346481, "num_tokens": 89588989.0, "step": 74490 }, { "entropy": 1.9210985012352466, "epoch": 0.23094348162023126, "grad_norm": 9.01938533782959, "learning_rate": 5.264325566118559e-06, "loss": 0.5033, "mean_token_accuracy": 0.8337433129549027, "num_tokens": 89601183.0, "step": 74500 }, { "entropy": 1.925881953537464, "epoch": 0.23097448074528096, "grad_norm": 9.546213150024414, "learning_rate": 5.263972286500535e-06, "loss": 0.5117, "mean_token_accuracy": 0.8354784622788429, "num_tokens": 89612244.0, "step": 74510 }, { "entropy": 1.85896704941988, "epoch": 0.23100547987033065, "grad_norm": 9.076498031616211, "learning_rate": 5.263619077996888e-06, "loss": 0.4686, "mean_token_accuracy": 0.8546704173088073, "num_tokens": 89623851.0, "step": 74520 }, { "entropy": 1.9326560199260712, "epoch": 0.23103647899538035, "grad_norm": 3.5278940200805664, "learning_rate": 5.263265940583757e-06, "loss": 0.4875, "mean_token_accuracy": 0.8429828837513924, "num_tokens": 89635763.0, "step": 74530 }, { "entropy": 1.8581456407904624, "epoch": 0.23106747812043005, "grad_norm": 3.4591639041900635, "learning_rate": 5.262912874237302e-06, "loss": 0.4786, "mean_token_accuracy": 0.8445979550480842, "num_tokens": 89648396.0, "step": 74540 }, { "entropy": 1.983565354347229, "epoch": 0.23109847724547974, "grad_norm": 9.090567588806152, "learning_rate": 5.262559878933689e-06, "loss": 0.54, "mean_token_accuracy": 0.8390719383955002, "num_tokens": 89659197.0, "step": 74550 }, { "entropy": 1.8371982112526895, "epoch": 0.23112947637052944, "grad_norm": 8.092364311218262, "learning_rate": 5.262206954649097e-06, "loss": 0.4645, "mean_token_accuracy": 0.8549615368247032, "num_tokens": 89671458.0, "step": 74560 }, { "entropy": 1.8131123587489129, "epoch": 0.23116047549557914, "grad_norm": 4.235384941101074, "learning_rate": 5.2618541013597135e-06, "loss": 0.4451, "mean_token_accuracy": 0.8519744485616684, "num_tokens": 89684238.0, "step": 74570 }, { "entropy": 1.8373139381408692, "epoch": 0.23119147462062883, "grad_norm": 7.887211322784424, "learning_rate": 5.26150131904174e-06, "loss": 0.4723, "mean_token_accuracy": 0.8530314326286316, "num_tokens": 89697119.0, "step": 74580 }, { "entropy": 1.9169560462236404, "epoch": 0.23122247374567853, "grad_norm": 7.602973937988281, "learning_rate": 5.261148607671387e-06, "loss": 0.493, "mean_token_accuracy": 0.8439723640680313, "num_tokens": 89708694.0, "step": 74590 }, { "entropy": 1.9264726474881173, "epoch": 0.23125347287072823, "grad_norm": 8.548664093017578, "learning_rate": 5.26079596722488e-06, "loss": 0.4944, "mean_token_accuracy": 0.844902828335762, "num_tokens": 89720007.0, "step": 74600 }, { "entropy": 1.9451578676700592, "epoch": 0.23128447199577792, "grad_norm": 8.304434776306152, "learning_rate": 5.260443397678451e-06, "loss": 0.4489, "mean_token_accuracy": 0.8561795771121978, "num_tokens": 89731316.0, "step": 74610 }, { "entropy": 1.9096089273691177, "epoch": 0.23131547112082762, "grad_norm": 5.2652459144592285, "learning_rate": 5.260090899008346e-06, "loss": 0.5157, "mean_token_accuracy": 0.8433238387107849, "num_tokens": 89742801.0, "step": 74620 }, { "entropy": 1.9577663838863373, "epoch": 0.23134647024587732, "grad_norm": 9.235634803771973, "learning_rate": 5.25973847119082e-06, "loss": 0.5354, "mean_token_accuracy": 0.8379147708415985, "num_tokens": 89754662.0, "step": 74630 }, { "entropy": 1.8390830487012864, "epoch": 0.231377469370927, "grad_norm": 10.85385799407959, "learning_rate": 5.259386114202142e-06, "loss": 0.4284, "mean_token_accuracy": 0.8555236831307411, "num_tokens": 89767357.0, "step": 74640 }, { "entropy": 1.8444815009832383, "epoch": 0.2314084684959767, "grad_norm": 7.597925186157227, "learning_rate": 5.25903382801859e-06, "loss": 0.466, "mean_token_accuracy": 0.8467239618301392, "num_tokens": 89779687.0, "step": 74650 }, { "entropy": 1.8531509324908257, "epoch": 0.2314394676210264, "grad_norm": 9.575505256652832, "learning_rate": 5.2586816126164544e-06, "loss": 0.4716, "mean_token_accuracy": 0.846321189403534, "num_tokens": 89792615.0, "step": 74660 }, { "entropy": 1.8784852564334868, "epoch": 0.2314704667460761, "grad_norm": 6.865838527679443, "learning_rate": 5.258329467972034e-06, "loss": 0.4986, "mean_token_accuracy": 0.8459220543503762, "num_tokens": 89804495.0, "step": 74670 }, { "entropy": 1.8415795341134071, "epoch": 0.2315014658711258, "grad_norm": 4.12829065322876, "learning_rate": 5.257977394061643e-06, "loss": 0.4897, "mean_token_accuracy": 0.8459180906414986, "num_tokens": 89816717.0, "step": 74680 }, { "entropy": 1.8776524886488914, "epoch": 0.2315324649961755, "grad_norm": 8.387575149536133, "learning_rate": 5.257625390861604e-06, "loss": 0.5371, "mean_token_accuracy": 0.8276556923985481, "num_tokens": 89828650.0, "step": 74690 }, { "entropy": 1.8143211975693703, "epoch": 0.23156346412122517, "grad_norm": 9.098214149475098, "learning_rate": 5.25727345834825e-06, "loss": 0.4038, "mean_token_accuracy": 0.8600962340831757, "num_tokens": 89841929.0, "step": 74700 }, { "entropy": 1.9715292781591416, "epoch": 0.23159446324627486, "grad_norm": 8.993027687072754, "learning_rate": 5.256921596497926e-06, "loss": 0.5908, "mean_token_accuracy": 0.8187263280153274, "num_tokens": 89853031.0, "step": 74710 }, { "entropy": 1.8571769908070563, "epoch": 0.23162546237132456, "grad_norm": 9.328154563903809, "learning_rate": 5.256569805286989e-06, "loss": 0.4653, "mean_token_accuracy": 0.8409965321421623, "num_tokens": 89866198.0, "step": 74720 }, { "entropy": 1.930174747109413, "epoch": 0.23165646149637426, "grad_norm": 8.642340660095215, "learning_rate": 5.256218084691808e-06, "loss": 0.532, "mean_token_accuracy": 0.8321129947900772, "num_tokens": 89877322.0, "step": 74730 }, { "entropy": 1.8410791546106338, "epoch": 0.23168746062142395, "grad_norm": 7.149337291717529, "learning_rate": 5.255866434688759e-06, "loss": 0.4858, "mean_token_accuracy": 0.8385511308908462, "num_tokens": 89890110.0, "step": 74740 }, { "entropy": 1.8869677096605302, "epoch": 0.23171845974647365, "grad_norm": 4.138514518737793, "learning_rate": 5.255514855254232e-06, "loss": 0.4707, "mean_token_accuracy": 0.8496659383177757, "num_tokens": 89901498.0, "step": 74750 }, { "entropy": 1.9108739644289017, "epoch": 0.23174945887152335, "grad_norm": 3.6030426025390625, "learning_rate": 5.255163346364628e-06, "loss": 0.4849, "mean_token_accuracy": 0.8481365412473678, "num_tokens": 89913794.0, "step": 74760 }, { "entropy": 1.8742002308368684, "epoch": 0.23178045799657304, "grad_norm": 4.183920383453369, "learning_rate": 5.25481190799636e-06, "loss": 0.4603, "mean_token_accuracy": 0.854676042497158, "num_tokens": 89926016.0, "step": 74770 }, { "entropy": 1.8252388328313827, "epoch": 0.23181145712162274, "grad_norm": 8.657533645629883, "learning_rate": 5.254460540125848e-06, "loss": 0.4536, "mean_token_accuracy": 0.855050428211689, "num_tokens": 89939164.0, "step": 74780 }, { "entropy": 1.891065989434719, "epoch": 0.23184245624667243, "grad_norm": 7.831802845001221, "learning_rate": 5.254109242729526e-06, "loss": 0.5505, "mean_token_accuracy": 0.837081053853035, "num_tokens": 89950621.0, "step": 74790 }, { "entropy": 1.8497989937663077, "epoch": 0.23187345537172213, "grad_norm": 9.593158721923828, "learning_rate": 5.2537580157838395e-06, "loss": 0.487, "mean_token_accuracy": 0.8481601446866989, "num_tokens": 89962644.0, "step": 74800 }, { "entropy": 1.9117231592535973, "epoch": 0.23190445449677183, "grad_norm": 8.208306312561035, "learning_rate": 5.253406859265246e-06, "loss": 0.4946, "mean_token_accuracy": 0.8326343983411789, "num_tokens": 89973841.0, "step": 74810 }, { "entropy": 1.8912553757429122, "epoch": 0.23193545362182152, "grad_norm": 6.744791507720947, "learning_rate": 5.253055773150209e-06, "loss": 0.4592, "mean_token_accuracy": 0.8465361073613167, "num_tokens": 89985153.0, "step": 74820 }, { "entropy": 1.9291711524128914, "epoch": 0.23196645274687122, "grad_norm": 8.88097858428955, "learning_rate": 5.252704757415207e-06, "loss": 0.5299, "mean_token_accuracy": 0.8393183097243309, "num_tokens": 89996749.0, "step": 74830 }, { "entropy": 1.9174196228384972, "epoch": 0.23199745187192092, "grad_norm": 8.680536270141602, "learning_rate": 5.25235381203673e-06, "loss": 0.4885, "mean_token_accuracy": 0.8294728398323059, "num_tokens": 90008694.0, "step": 74840 }, { "entropy": 1.8946995601058005, "epoch": 0.23202845099697061, "grad_norm": 7.604076385498047, "learning_rate": 5.252002936991277e-06, "loss": 0.5037, "mean_token_accuracy": 0.843429696559906, "num_tokens": 90021433.0, "step": 74850 }, { "entropy": 1.8540204659104347, "epoch": 0.2320594501220203, "grad_norm": 4.1174635887146, "learning_rate": 5.251652132255359e-06, "loss": 0.4499, "mean_token_accuracy": 0.851498831808567, "num_tokens": 90032932.0, "step": 74860 }, { "entropy": 1.823809015750885, "epoch": 0.23209044924707, "grad_norm": 8.341029167175293, "learning_rate": 5.251301397805497e-06, "loss": 0.4716, "mean_token_accuracy": 0.8412299945950508, "num_tokens": 90045600.0, "step": 74870 }, { "entropy": 1.804415312409401, "epoch": 0.2321214483721197, "grad_norm": 9.450553894042969, "learning_rate": 5.250950733618225e-06, "loss": 0.4902, "mean_token_accuracy": 0.8472607269883156, "num_tokens": 90059093.0, "step": 74880 }, { "entropy": 1.8313181832432748, "epoch": 0.2321524474971694, "grad_norm": 8.513847351074219, "learning_rate": 5.250600139670086e-06, "loss": 0.4528, "mean_token_accuracy": 0.8442484304308892, "num_tokens": 90071931.0, "step": 74890 }, { "entropy": 1.805875188112259, "epoch": 0.2321834466222191, "grad_norm": 8.688248634338379, "learning_rate": 5.2502496159376335e-06, "loss": 0.4062, "mean_token_accuracy": 0.8494523644447327, "num_tokens": 90085027.0, "step": 74900 }, { "entropy": 1.8620700597763062, "epoch": 0.2322144457472688, "grad_norm": 7.525083065032959, "learning_rate": 5.249899162397435e-06, "loss": 0.4726, "mean_token_accuracy": 0.8424142166972161, "num_tokens": 90097799.0, "step": 74910 }, { "entropy": 1.8805269077420235, "epoch": 0.2322454448723185, "grad_norm": 9.527615547180176, "learning_rate": 5.249548779026064e-06, "loss": 0.4867, "mean_token_accuracy": 0.8365740343928337, "num_tokens": 90110343.0, "step": 74920 }, { "entropy": 1.88951036632061, "epoch": 0.2322764439973682, "grad_norm": 8.741109848022461, "learning_rate": 5.249198465800112e-06, "loss": 0.4995, "mean_token_accuracy": 0.8468944728374481, "num_tokens": 90122279.0, "step": 74930 }, { "entropy": 1.921808835864067, "epoch": 0.23230744312241786, "grad_norm": 7.858125686645508, "learning_rate": 5.248848222696175e-06, "loss": 0.4864, "mean_token_accuracy": 0.8521002262830735, "num_tokens": 90133878.0, "step": 74940 }, { "entropy": 1.9586533635854722, "epoch": 0.23233844224746755, "grad_norm": 7.217672348022461, "learning_rate": 5.248498049690861e-06, "loss": 0.5898, "mean_token_accuracy": 0.8337603956460953, "num_tokens": 90144689.0, "step": 74950 }, { "entropy": 1.942648607492447, "epoch": 0.23236944137251725, "grad_norm": 8.679248809814453, "learning_rate": 5.248147946760793e-06, "loss": 0.5142, "mean_token_accuracy": 0.8394230246543884, "num_tokens": 90155720.0, "step": 74960 }, { "entropy": 1.791871838271618, "epoch": 0.23240044049756695, "grad_norm": 4.113439559936523, "learning_rate": 5.247797913882602e-06, "loss": 0.3892, "mean_token_accuracy": 0.8623890161514283, "num_tokens": 90169661.0, "step": 74970 }, { "entropy": 1.8388600483536721, "epoch": 0.23243143962261664, "grad_norm": 3.5801477432250977, "learning_rate": 5.247447951032928e-06, "loss": 0.4552, "mean_token_accuracy": 0.8412294924259186, "num_tokens": 90182346.0, "step": 74980 }, { "entropy": 1.814172099530697, "epoch": 0.23246243874766634, "grad_norm": 3.8935399055480957, "learning_rate": 5.247098058188425e-06, "loss": 0.4632, "mean_token_accuracy": 0.8537375554442406, "num_tokens": 90195133.0, "step": 74990 }, { "entropy": 1.9371330052614213, "epoch": 0.23249343787271604, "grad_norm": 7.545095443725586, "learning_rate": 5.246748235325756e-06, "loss": 0.4919, "mean_token_accuracy": 0.8489918291568757, "num_tokens": 90205935.0, "step": 75000 }, { "entropy": 1.9675442904233933, "epoch": 0.23252443699776573, "grad_norm": 7.987549304962158, "learning_rate": 5.246398482421598e-06, "loss": 0.555, "mean_token_accuracy": 0.822739377617836, "num_tokens": 90216793.0, "step": 75010 }, { "entropy": 1.9193257197737694, "epoch": 0.23255543612281543, "grad_norm": 8.14727783203125, "learning_rate": 5.246048799452634e-06, "loss": 0.4991, "mean_token_accuracy": 0.8483793914318085, "num_tokens": 90227782.0, "step": 75020 }, { "entropy": 1.8773325115442276, "epoch": 0.23258643524786513, "grad_norm": 8.173250198364258, "learning_rate": 5.245699186395562e-06, "loss": 0.4775, "mean_token_accuracy": 0.8404614195227623, "num_tokens": 90239587.0, "step": 75030 }, { "entropy": 1.78584865629673, "epoch": 0.23261743437291482, "grad_norm": 8.468997955322266, "learning_rate": 5.24534964322709e-06, "loss": 0.4254, "mean_token_accuracy": 0.8459458023309707, "num_tokens": 90253213.0, "step": 75040 }, { "entropy": 1.860848817229271, "epoch": 0.23264843349796452, "grad_norm": 8.477463722229004, "learning_rate": 5.245000169923935e-06, "loss": 0.4891, "mean_token_accuracy": 0.8364793375134468, "num_tokens": 90265280.0, "step": 75050 }, { "entropy": 1.9089404866099358, "epoch": 0.23267943262301422, "grad_norm": 8.954151153564453, "learning_rate": 5.244650766462827e-06, "loss": 0.5373, "mean_token_accuracy": 0.8399009183049202, "num_tokens": 90276657.0, "step": 75060 }, { "entropy": 1.9542981505393981, "epoch": 0.2327104317480639, "grad_norm": 8.272551536560059, "learning_rate": 5.244301432820503e-06, "loss": 0.5212, "mean_token_accuracy": 0.8442383721470833, "num_tokens": 90287635.0, "step": 75070 }, { "entropy": 1.9224041730165482, "epoch": 0.2327414308731136, "grad_norm": 8.821447372436523, "learning_rate": 5.243952168973718e-06, "loss": 0.6275, "mean_token_accuracy": 0.8298459380865097, "num_tokens": 90300036.0, "step": 75080 }, { "entropy": 1.9240923762321471, "epoch": 0.2327724299981633, "grad_norm": 3.3205227851867676, "learning_rate": 5.243602974899231e-06, "loss": 0.5128, "mean_token_accuracy": 0.8405191898345947, "num_tokens": 90310850.0, "step": 75090 }, { "entropy": 1.7889164254069327, "epoch": 0.232803429123213, "grad_norm": 8.739767074584961, "learning_rate": 5.243253850573816e-06, "loss": 0.4138, "mean_token_accuracy": 0.8499681517481804, "num_tokens": 90323572.0, "step": 75100 }, { "entropy": 1.928827765583992, "epoch": 0.2328344282482627, "grad_norm": 6.958417892456055, "learning_rate": 5.2429047959742555e-06, "loss": 0.5652, "mean_token_accuracy": 0.8277982458472252, "num_tokens": 90334888.0, "step": 75110 }, { "entropy": 1.8841903924942016, "epoch": 0.2328654273733124, "grad_norm": 4.459338665008545, "learning_rate": 5.242555811077344e-06, "loss": 0.5206, "mean_token_accuracy": 0.8400102391839027, "num_tokens": 90346732.0, "step": 75120 }, { "entropy": 1.8876192346215248, "epoch": 0.2328964264983621, "grad_norm": 8.372060775756836, "learning_rate": 5.242206895859884e-06, "loss": 0.536, "mean_token_accuracy": 0.8402991250157357, "num_tokens": 90358781.0, "step": 75130 }, { "entropy": 1.9489302426576613, "epoch": 0.2329274256234118, "grad_norm": 7.9244384765625, "learning_rate": 5.241858050298695e-06, "loss": 0.5783, "mean_token_accuracy": 0.8289978623390197, "num_tokens": 90369723.0, "step": 75140 }, { "entropy": 1.9526596829295157, "epoch": 0.23295842474846148, "grad_norm": 9.39733600616455, "learning_rate": 5.241509274370601e-06, "loss": 0.5026, "mean_token_accuracy": 0.8326995015144348, "num_tokens": 90381573.0, "step": 75150 }, { "entropy": 1.855046857893467, "epoch": 0.23298942387351118, "grad_norm": 8.04930305480957, "learning_rate": 5.24116056805244e-06, "loss": 0.4655, "mean_token_accuracy": 0.8553052991628647, "num_tokens": 90394574.0, "step": 75160 }, { "entropy": 1.9039323568344115, "epoch": 0.23302042299856088, "grad_norm": 8.174781799316406, "learning_rate": 5.24081193132106e-06, "loss": 0.4824, "mean_token_accuracy": 0.8432982727885246, "num_tokens": 90406242.0, "step": 75170 }, { "entropy": 1.8142014488577842, "epoch": 0.23305142212361057, "grad_norm": 10.09156608581543, "learning_rate": 5.240463364153321e-06, "loss": 0.4246, "mean_token_accuracy": 0.8498436227440834, "num_tokens": 90419691.0, "step": 75180 }, { "entropy": 1.8953629210591316, "epoch": 0.23308242124866024, "grad_norm": 7.493961334228516, "learning_rate": 5.24011486652609e-06, "loss": 0.506, "mean_token_accuracy": 0.8336662292480469, "num_tokens": 90432358.0, "step": 75190 }, { "entropy": 1.8964366748929025, "epoch": 0.23311342037370994, "grad_norm": 8.871479034423828, "learning_rate": 5.2397664384162504e-06, "loss": 0.496, "mean_token_accuracy": 0.8406847149133683, "num_tokens": 90443925.0, "step": 75200 }, { "entropy": 1.8410288974642754, "epoch": 0.23314441949875964, "grad_norm": 9.23766803741455, "learning_rate": 5.239418079800691e-06, "loss": 0.4533, "mean_token_accuracy": 0.8419799253344535, "num_tokens": 90456444.0, "step": 75210 }, { "entropy": 1.9168146222829818, "epoch": 0.23317541862380933, "grad_norm": 8.451478004455566, "learning_rate": 5.239069790656316e-06, "loss": 0.5372, "mean_token_accuracy": 0.8360653147101402, "num_tokens": 90467399.0, "step": 75220 }, { "entropy": 1.7803557753562926, "epoch": 0.23320641774885903, "grad_norm": 4.116580963134766, "learning_rate": 5.238721570960036e-06, "loss": 0.4092, "mean_token_accuracy": 0.853205819427967, "num_tokens": 90479796.0, "step": 75230 }, { "entropy": 1.8654976204037665, "epoch": 0.23323741687390873, "grad_norm": 9.181920051574707, "learning_rate": 5.238373420688775e-06, "loss": 0.5074, "mean_token_accuracy": 0.8490437477827072, "num_tokens": 90491819.0, "step": 75240 }, { "entropy": 1.9017037138342858, "epoch": 0.23326841599895842, "grad_norm": 7.338005542755127, "learning_rate": 5.238025339819467e-06, "loss": 0.4745, "mean_token_accuracy": 0.838523656129837, "num_tokens": 90503606.0, "step": 75250 }, { "entropy": 1.8169363185763359, "epoch": 0.23329941512400812, "grad_norm": 11.015266418457031, "learning_rate": 5.237677328329057e-06, "loss": 0.423, "mean_token_accuracy": 0.8545296162366867, "num_tokens": 90516076.0, "step": 75260 }, { "entropy": 1.8559274673461914, "epoch": 0.23333041424905782, "grad_norm": 7.690032482147217, "learning_rate": 5.237329386194502e-06, "loss": 0.4926, "mean_token_accuracy": 0.8428187996149064, "num_tokens": 90528110.0, "step": 75270 }, { "entropy": 1.8206583276391028, "epoch": 0.2333614133741075, "grad_norm": 4.115688323974609, "learning_rate": 5.236981513392766e-06, "loss": 0.4286, "mean_token_accuracy": 0.8493341699242591, "num_tokens": 90540476.0, "step": 75280 }, { "entropy": 1.8399672955274582, "epoch": 0.2333924124991572, "grad_norm": 8.14948558807373, "learning_rate": 5.2366337099008265e-06, "loss": 0.4485, "mean_token_accuracy": 0.848198390007019, "num_tokens": 90552496.0, "step": 75290 }, { "entropy": 1.7906280860304833, "epoch": 0.2334234116242069, "grad_norm": 6.901556968688965, "learning_rate": 5.236285975695673e-06, "loss": 0.4279, "mean_token_accuracy": 0.8607045665383339, "num_tokens": 90565388.0, "step": 75300 }, { "entropy": 1.8817636847496033, "epoch": 0.2334544107492566, "grad_norm": 10.196159362792969, "learning_rate": 5.235938310754303e-06, "loss": 0.5052, "mean_token_accuracy": 0.8407386288046836, "num_tokens": 90577351.0, "step": 75310 }, { "entropy": 1.9196124032139779, "epoch": 0.2334854098743063, "grad_norm": 7.629261016845703, "learning_rate": 5.2355907150537245e-06, "loss": 0.5032, "mean_token_accuracy": 0.8403740376234055, "num_tokens": 90588716.0, "step": 75320 }, { "entropy": 1.9231160968542098, "epoch": 0.233516408999356, "grad_norm": 8.21264362335205, "learning_rate": 5.2352431885709585e-06, "loss": 0.5624, "mean_token_accuracy": 0.8309063494205475, "num_tokens": 90599515.0, "step": 75330 }, { "entropy": 1.8250703021883965, "epoch": 0.2335474081244057, "grad_norm": 8.739277839660645, "learning_rate": 5.234895731283034e-06, "loss": 0.4434, "mean_token_accuracy": 0.8584131434559822, "num_tokens": 90612420.0, "step": 75340 }, { "entropy": 1.832010643184185, "epoch": 0.2335784072494554, "grad_norm": 8.311301231384277, "learning_rate": 5.234548343166994e-06, "loss": 0.494, "mean_token_accuracy": 0.837817907333374, "num_tokens": 90624549.0, "step": 75350 }, { "entropy": 1.9003486275672912, "epoch": 0.23360940637450509, "grad_norm": 8.429434776306152, "learning_rate": 5.234201024199889e-06, "loss": 0.4714, "mean_token_accuracy": 0.8493073508143425, "num_tokens": 90635571.0, "step": 75360 }, { "entropy": 1.8496991708874702, "epoch": 0.23364040549955478, "grad_norm": 8.533575057983398, "learning_rate": 5.233853774358781e-06, "loss": 0.4762, "mean_token_accuracy": 0.8495924681425094, "num_tokens": 90647578.0, "step": 75370 }, { "entropy": 1.9252197608351707, "epoch": 0.23367140462460448, "grad_norm": 7.957611083984375, "learning_rate": 5.233506593620745e-06, "loss": 0.5422, "mean_token_accuracy": 0.834588311612606, "num_tokens": 90658684.0, "step": 75380 }, { "entropy": 1.914403536915779, "epoch": 0.23370240374965418, "grad_norm": 8.381974220275879, "learning_rate": 5.233159481962864e-06, "loss": 0.544, "mean_token_accuracy": 0.836005237698555, "num_tokens": 90670080.0, "step": 75390 }, { "entropy": 1.8845387056469918, "epoch": 0.23373340287470387, "grad_norm": 7.998114109039307, "learning_rate": 5.232812439362231e-06, "loss": 0.4658, "mean_token_accuracy": 0.8451636284589767, "num_tokens": 90682811.0, "step": 75400 }, { "entropy": 1.8718926057219505, "epoch": 0.23376440199975357, "grad_norm": 3.7296555042266846, "learning_rate": 5.232465465795954e-06, "loss": 0.569, "mean_token_accuracy": 0.8414446488022804, "num_tokens": 90694457.0, "step": 75410 }, { "entropy": 1.9601527959108354, "epoch": 0.23379540112480326, "grad_norm": 8.428439140319824, "learning_rate": 5.232118561241144e-06, "loss": 0.5494, "mean_token_accuracy": 0.8311134546995163, "num_tokens": 90705460.0, "step": 75420 }, { "entropy": 1.803976234793663, "epoch": 0.23382640024985296, "grad_norm": 6.161757946014404, "learning_rate": 5.231771725674932e-06, "loss": 0.4983, "mean_token_accuracy": 0.8395594924688339, "num_tokens": 90718488.0, "step": 75430 }, { "entropy": 1.8432627946138382, "epoch": 0.23385739937490263, "grad_norm": 8.392098426818848, "learning_rate": 5.231424959074452e-06, "loss": 0.4113, "mean_token_accuracy": 0.8638657048344612, "num_tokens": 90730557.0, "step": 75440 }, { "entropy": 1.9326068967580796, "epoch": 0.23388839849995233, "grad_norm": 9.61757755279541, "learning_rate": 5.2310782614168524e-06, "loss": 0.5731, "mean_token_accuracy": 0.828363224864006, "num_tokens": 90741224.0, "step": 75450 }, { "entropy": 1.9034574344754218, "epoch": 0.23391939762500202, "grad_norm": 9.683328628540039, "learning_rate": 5.230731632679291e-06, "loss": 0.5022, "mean_token_accuracy": 0.8382599085569382, "num_tokens": 90752495.0, "step": 75460 }, { "entropy": 1.889817087352276, "epoch": 0.23395039675005172, "grad_norm": 7.614642143249512, "learning_rate": 5.230385072838936e-06, "loss": 0.4736, "mean_token_accuracy": 0.8519553110003472, "num_tokens": 90764504.0, "step": 75470 }, { "entropy": 1.846573531627655, "epoch": 0.23398139587510142, "grad_norm": 8.647238731384277, "learning_rate": 5.230038581872968e-06, "loss": 0.4794, "mean_token_accuracy": 0.845521879196167, "num_tokens": 90777223.0, "step": 75480 }, { "entropy": 1.887799009680748, "epoch": 0.2340123950001511, "grad_norm": 9.934295654296875, "learning_rate": 5.2296921597585774e-06, "loss": 0.513, "mean_token_accuracy": 0.8402909606695175, "num_tokens": 90788632.0, "step": 75490 }, { "entropy": 1.8547486677765845, "epoch": 0.2340433941252008, "grad_norm": 7.495747089385986, "learning_rate": 5.229345806472961e-06, "loss": 0.4797, "mean_token_accuracy": 0.8464630424976349, "num_tokens": 90800492.0, "step": 75500 }, { "entropy": 1.8723068907856941, "epoch": 0.2340743932502505, "grad_norm": 8.363115310668945, "learning_rate": 5.228999521993333e-06, "loss": 0.4835, "mean_token_accuracy": 0.8466327175498009, "num_tokens": 90812390.0, "step": 75510 }, { "entropy": 1.8291691452264787, "epoch": 0.2341053923753002, "grad_norm": 4.456087112426758, "learning_rate": 5.228653306296913e-06, "loss": 0.4529, "mean_token_accuracy": 0.8438918322324753, "num_tokens": 90825014.0, "step": 75520 }, { "entropy": 1.9418904185295105, "epoch": 0.2341363915003499, "grad_norm": 8.06656265258789, "learning_rate": 5.228307159360937e-06, "loss": 0.5371, "mean_token_accuracy": 0.8240366965532303, "num_tokens": 90836051.0, "step": 75530 }, { "entropy": 1.8666341871023178, "epoch": 0.2341673906253996, "grad_norm": 9.48065185546875, "learning_rate": 5.2279610811626425e-06, "loss": 0.473, "mean_token_accuracy": 0.8425935983657837, "num_tokens": 90847835.0, "step": 75540 }, { "entropy": 1.8927869558334351, "epoch": 0.2341983897504493, "grad_norm": 9.044870376586914, "learning_rate": 5.227615071679285e-06, "loss": 0.5159, "mean_token_accuracy": 0.8357618257403374, "num_tokens": 90859514.0, "step": 75550 }, { "entropy": 1.7879324212670327, "epoch": 0.234229388875499, "grad_norm": 4.232240200042725, "learning_rate": 5.227269130888129e-06, "loss": 0.368, "mean_token_accuracy": 0.8541668400168418, "num_tokens": 90873762.0, "step": 75560 }, { "entropy": 1.86094261854887, "epoch": 0.23426038800054869, "grad_norm": 2.110261917114258, "learning_rate": 5.226923258766447e-06, "loss": 0.4928, "mean_token_accuracy": 0.8436441987752914, "num_tokens": 90886426.0, "step": 75570 }, { "entropy": 1.7484543666243553, "epoch": 0.23429138712559838, "grad_norm": 4.710477352142334, "learning_rate": 5.226577455291525e-06, "loss": 0.3791, "mean_token_accuracy": 0.8541481211781502, "num_tokens": 90900094.0, "step": 75580 }, { "entropy": 1.945685862004757, "epoch": 0.23432238625064808, "grad_norm": 8.194548606872559, "learning_rate": 5.226231720440659e-06, "loss": 0.5229, "mean_token_accuracy": 0.8371816173195838, "num_tokens": 90911642.0, "step": 75590 }, { "entropy": 1.8348077073693276, "epoch": 0.23435338537569778, "grad_norm": 9.100287437438965, "learning_rate": 5.2258860541911514e-06, "loss": 0.4516, "mean_token_accuracy": 0.8539791435003281, "num_tokens": 90924678.0, "step": 75600 }, { "entropy": 1.8494689092040062, "epoch": 0.23438438450074747, "grad_norm": 7.481054782867432, "learning_rate": 5.2255404565203234e-06, "loss": 0.469, "mean_token_accuracy": 0.8396744161844254, "num_tokens": 90937460.0, "step": 75610 }, { "entropy": 1.9320793822407722, "epoch": 0.23441538362579717, "grad_norm": 9.441415786743164, "learning_rate": 5.225194927405498e-06, "loss": 0.5608, "mean_token_accuracy": 0.8343402355909347, "num_tokens": 90949563.0, "step": 75620 }, { "entropy": 1.8647829070687294, "epoch": 0.23444638275084687, "grad_norm": 8.169805526733398, "learning_rate": 5.224849466824015e-06, "loss": 0.5014, "mean_token_accuracy": 0.839415080845356, "num_tokens": 90961677.0, "step": 75630 }, { "entropy": 1.8755572125315667, "epoch": 0.23447738187589656, "grad_norm": 7.456679344177246, "learning_rate": 5.2245040747532205e-06, "loss": 0.5616, "mean_token_accuracy": 0.8273000553250313, "num_tokens": 90974918.0, "step": 75640 }, { "entropy": 1.9732760652899741, "epoch": 0.23450838100094626, "grad_norm": 9.377166748046875, "learning_rate": 5.224158751170473e-06, "loss": 0.5314, "mean_token_accuracy": 0.8365205094218254, "num_tokens": 90986087.0, "step": 75650 }, { "entropy": 1.9110179245471954, "epoch": 0.23453938012599596, "grad_norm": 8.639516830444336, "learning_rate": 5.223813496053142e-06, "loss": 0.4806, "mean_token_accuracy": 0.8477229610085487, "num_tokens": 90997997.0, "step": 75660 }, { "entropy": 1.933686862885952, "epoch": 0.23457037925104565, "grad_norm": 8.902299880981445, "learning_rate": 5.223468309378605e-06, "loss": 0.5609, "mean_token_accuracy": 0.8341797336935997, "num_tokens": 91009272.0, "step": 75670 }, { "entropy": 1.968608994781971, "epoch": 0.23460137837609532, "grad_norm": 4.5855712890625, "learning_rate": 5.2231231911242555e-06, "loss": 0.5086, "mean_token_accuracy": 0.8370487987995148, "num_tokens": 91020661.0, "step": 75680 }, { "entropy": 1.8787442803382874, "epoch": 0.23463237750114502, "grad_norm": 7.962042331695557, "learning_rate": 5.222778141267488e-06, "loss": 0.4506, "mean_token_accuracy": 0.8470729902386666, "num_tokens": 91033230.0, "step": 75690 }, { "entropy": 1.8811139300465585, "epoch": 0.23466337662619471, "grad_norm": 7.824191093444824, "learning_rate": 5.222433159785718e-06, "loss": 0.481, "mean_token_accuracy": 0.8375716552138328, "num_tokens": 91045252.0, "step": 75700 }, { "entropy": 1.9257870644330979, "epoch": 0.2346943757512444, "grad_norm": 7.708620548248291, "learning_rate": 5.222088246656365e-06, "loss": 0.5305, "mean_token_accuracy": 0.8385186135768891, "num_tokens": 91056958.0, "step": 75710 }, { "entropy": 1.9052812889218331, "epoch": 0.2347253748762941, "grad_norm": 7.920125484466553, "learning_rate": 5.22174340185686e-06, "loss": 0.4891, "mean_token_accuracy": 0.8514411136507988, "num_tokens": 91068010.0, "step": 75720 }, { "entropy": 1.8879928544163704, "epoch": 0.2347563740013438, "grad_norm": 8.99319839477539, "learning_rate": 5.221398625364644e-06, "loss": 0.4728, "mean_token_accuracy": 0.8493816792964936, "num_tokens": 91079694.0, "step": 75730 }, { "entropy": 1.860645368695259, "epoch": 0.2347873731263935, "grad_norm": 8.87778377532959, "learning_rate": 5.2210539171571715e-06, "loss": 0.4795, "mean_token_accuracy": 0.8425444066524506, "num_tokens": 91091618.0, "step": 75740 }, { "entropy": 1.8990991547703744, "epoch": 0.2348183722514432, "grad_norm": 4.461852550506592, "learning_rate": 5.220709277211903e-06, "loss": 0.519, "mean_token_accuracy": 0.83267183303833, "num_tokens": 91104008.0, "step": 75750 }, { "entropy": 1.9108994349837303, "epoch": 0.2348493713764929, "grad_norm": 7.736074924468994, "learning_rate": 5.220364705506313e-06, "loss": 0.4869, "mean_token_accuracy": 0.8414252296090126, "num_tokens": 91115759.0, "step": 75760 }, { "entropy": 1.9764820635318756, "epoch": 0.2348803705015426, "grad_norm": 9.64189624786377, "learning_rate": 5.220020202017887e-06, "loss": 0.5797, "mean_token_accuracy": 0.8334431812167168, "num_tokens": 91125911.0, "step": 75770 }, { "entropy": 1.8024020060896873, "epoch": 0.2349113696265923, "grad_norm": 6.673051357269287, "learning_rate": 5.219675766724114e-06, "loss": 0.4302, "mean_token_accuracy": 0.8567843988537789, "num_tokens": 91139395.0, "step": 75780 }, { "entropy": 1.9167088627815247, "epoch": 0.23494236875164198, "grad_norm": 6.79614782333374, "learning_rate": 5.219331399602503e-06, "loss": 0.4774, "mean_token_accuracy": 0.8450982421636581, "num_tokens": 91151124.0, "step": 75790 }, { "entropy": 1.8575886994600297, "epoch": 0.23497336787669168, "grad_norm": 9.397269248962402, "learning_rate": 5.218987100630566e-06, "loss": 0.4664, "mean_token_accuracy": 0.8560428529977798, "num_tokens": 91163168.0, "step": 75800 }, { "entropy": 1.9414154648780824, "epoch": 0.23500436700174138, "grad_norm": 6.633089065551758, "learning_rate": 5.21864286978583e-06, "loss": 0.4886, "mean_token_accuracy": 0.8463586211204529, "num_tokens": 91174212.0, "step": 75810 }, { "entropy": 1.7952500343322755, "epoch": 0.23503536612679107, "grad_norm": 3.056459426879883, "learning_rate": 5.218298707045828e-06, "loss": 0.4439, "mean_token_accuracy": 0.8485618963837623, "num_tokens": 91188049.0, "step": 75820 }, { "entropy": 1.9243362337350844, "epoch": 0.23506636525184077, "grad_norm": 8.155814170837402, "learning_rate": 5.217954612388109e-06, "loss": 0.5203, "mean_token_accuracy": 0.8437091827392578, "num_tokens": 91199119.0, "step": 75830 }, { "entropy": 1.9802058964967728, "epoch": 0.23509736437689047, "grad_norm": 8.803181648254395, "learning_rate": 5.217610585790226e-06, "loss": 0.5569, "mean_token_accuracy": 0.8368562594056129, "num_tokens": 91209740.0, "step": 75840 }, { "entropy": 1.8397732719779014, "epoch": 0.23512836350194016, "grad_norm": 7.574740886688232, "learning_rate": 5.217266627229748e-06, "loss": 0.4793, "mean_token_accuracy": 0.8465480893850327, "num_tokens": 91223580.0, "step": 75850 }, { "entropy": 1.9436632886528968, "epoch": 0.23515936262698986, "grad_norm": 7.771301746368408, "learning_rate": 5.216922736684251e-06, "loss": 0.5232, "mean_token_accuracy": 0.8337218299508095, "num_tokens": 91235380.0, "step": 75860 }, { "entropy": 1.8707226827740668, "epoch": 0.23519036175203956, "grad_norm": 7.848099231719971, "learning_rate": 5.216578914131323e-06, "loss": 0.4367, "mean_token_accuracy": 0.8528139501810074, "num_tokens": 91247966.0, "step": 75870 }, { "entropy": 2.011591988801956, "epoch": 0.23522136087708925, "grad_norm": 9.857985496520996, "learning_rate": 5.216235159548561e-06, "loss": 0.6008, "mean_token_accuracy": 0.8229044884443283, "num_tokens": 91259077.0, "step": 75880 }, { "entropy": 1.9336156010627747, "epoch": 0.23525236000213895, "grad_norm": 3.4189701080322266, "learning_rate": 5.215891472913572e-06, "loss": 0.4667, "mean_token_accuracy": 0.8479844376444816, "num_tokens": 91271077.0, "step": 75890 }, { "entropy": 1.9687816753983498, "epoch": 0.23528335912718865, "grad_norm": 10.6126070022583, "learning_rate": 5.215547854203976e-06, "loss": 0.5246, "mean_token_accuracy": 0.839890718460083, "num_tokens": 91282469.0, "step": 75900 }, { "entropy": 1.8437663704156875, "epoch": 0.23531435825223834, "grad_norm": 4.00623083114624, "learning_rate": 5.2152043033974e-06, "loss": 0.4231, "mean_token_accuracy": 0.8508341655135154, "num_tokens": 91295928.0, "step": 75910 }, { "entropy": 1.9553633004426956, "epoch": 0.23534535737728804, "grad_norm": 8.508515357971191, "learning_rate": 5.214860820471484e-06, "loss": 0.5215, "mean_token_accuracy": 0.8452081516385078, "num_tokens": 91307197.0, "step": 75920 }, { "entropy": 1.880798263847828, "epoch": 0.2353763565023377, "grad_norm": 6.8838791847229, "learning_rate": 5.214517405403878e-06, "loss": 0.415, "mean_token_accuracy": 0.8529353275895118, "num_tokens": 91319587.0, "step": 75930 }, { "entropy": 1.8892224609851838, "epoch": 0.2354073556273874, "grad_norm": 8.371787071228027, "learning_rate": 5.214174058172241e-06, "loss": 0.4759, "mean_token_accuracy": 0.8420174464583396, "num_tokens": 91332095.0, "step": 75940 }, { "entropy": 1.9798435151576996, "epoch": 0.2354383547524371, "grad_norm": 10.347806930541992, "learning_rate": 5.213830778754241e-06, "loss": 0.6038, "mean_token_accuracy": 0.8171585813164711, "num_tokens": 91342922.0, "step": 75950 }, { "entropy": 1.9197808623313903, "epoch": 0.2354693538774868, "grad_norm": 8.591217041015625, "learning_rate": 5.213487567127559e-06, "loss": 0.4662, "mean_token_accuracy": 0.8502022624015808, "num_tokens": 91354825.0, "step": 75960 }, { "entropy": 1.9522427856922149, "epoch": 0.2355003530025365, "grad_norm": 9.180562973022461, "learning_rate": 5.213144423269887e-06, "loss": 0.538, "mean_token_accuracy": 0.8390685006976127, "num_tokens": 91365900.0, "step": 75970 }, { "entropy": 1.9248853296041488, "epoch": 0.2355313521275862, "grad_norm": 7.8382248878479, "learning_rate": 5.212801347158925e-06, "loss": 0.5151, "mean_token_accuracy": 0.8395644649863243, "num_tokens": 91377884.0, "step": 75980 }, { "entropy": 1.869673204421997, "epoch": 0.2355623512526359, "grad_norm": 10.19648551940918, "learning_rate": 5.212458338772383e-06, "loss": 0.4567, "mean_token_accuracy": 0.8442730501294136, "num_tokens": 91389637.0, "step": 75990 }, { "entropy": 1.8467582926154136, "epoch": 0.23559335037768558, "grad_norm": 4.066962718963623, "learning_rate": 5.212115398087981e-06, "loss": 0.4431, "mean_token_accuracy": 0.8452418327331543, "num_tokens": 91402099.0, "step": 76000 }, { "entropy": 1.8998699069023133, "epoch": 0.23562434950273528, "grad_norm": 7.766880035400391, "learning_rate": 5.211772525083454e-06, "loss": 0.5449, "mean_token_accuracy": 0.8390342697501183, "num_tokens": 91415037.0, "step": 76010 }, { "entropy": 1.8747850939631463, "epoch": 0.23565534862778498, "grad_norm": 9.178610801696777, "learning_rate": 5.2114297197365406e-06, "loss": 0.5079, "mean_token_accuracy": 0.8318077206611634, "num_tokens": 91427375.0, "step": 76020 }, { "entropy": 1.8805608436465264, "epoch": 0.23568634775283467, "grad_norm": 8.223982810974121, "learning_rate": 5.211086982024995e-06, "loss": 0.5108, "mean_token_accuracy": 0.8412626773118973, "num_tokens": 91439742.0, "step": 76030 }, { "entropy": 1.8259792655706406, "epoch": 0.23571734687788437, "grad_norm": 4.048516273498535, "learning_rate": 5.210744311926578e-06, "loss": 0.4072, "mean_token_accuracy": 0.8551635116338729, "num_tokens": 91451962.0, "step": 76040 }, { "entropy": 1.9941584020853043, "epoch": 0.23574834600293407, "grad_norm": 8.59123420715332, "learning_rate": 5.210401709419061e-06, "loss": 0.6048, "mean_token_accuracy": 0.8241194665431977, "num_tokens": 91462894.0, "step": 76050 }, { "entropy": 1.9185423642396926, "epoch": 0.23577934512798376, "grad_norm": 11.106711387634277, "learning_rate": 5.210059174480229e-06, "loss": 0.5074, "mean_token_accuracy": 0.8400084570050239, "num_tokens": 91474440.0, "step": 76060 }, { "entropy": 1.9143582820892333, "epoch": 0.23581034425303346, "grad_norm": 7.75733757019043, "learning_rate": 5.2097167070878754e-06, "loss": 0.5088, "mean_token_accuracy": 0.8454731091856956, "num_tokens": 91485553.0, "step": 76070 }, { "entropy": 1.9490876853466035, "epoch": 0.23584134337808316, "grad_norm": 11.029200553894043, "learning_rate": 5.209374307219801e-06, "loss": 0.5343, "mean_token_accuracy": 0.8368668779730797, "num_tokens": 91495910.0, "step": 76080 }, { "entropy": 1.9713003784418106, "epoch": 0.23587234250313285, "grad_norm": 8.084342956542969, "learning_rate": 5.20903197485382e-06, "loss": 0.5299, "mean_token_accuracy": 0.8417750880122185, "num_tokens": 91506865.0, "step": 76090 }, { "entropy": 1.8941795021295547, "epoch": 0.23590334162818255, "grad_norm": 7.648937702178955, "learning_rate": 5.208689709967756e-06, "loss": 0.5137, "mean_token_accuracy": 0.8375332474708557, "num_tokens": 91518800.0, "step": 76100 }, { "entropy": 1.8584771752357483, "epoch": 0.23593434075323225, "grad_norm": 8.80497932434082, "learning_rate": 5.208347512539442e-06, "loss": 0.5252, "mean_token_accuracy": 0.8386746376752854, "num_tokens": 91530792.0, "step": 76110 }, { "entropy": 1.8739138320088387, "epoch": 0.23596533987828194, "grad_norm": 8.877378463745117, "learning_rate": 5.2080053825467235e-06, "loss": 0.4915, "mean_token_accuracy": 0.8397540152072906, "num_tokens": 91542992.0, "step": 76120 }, { "entropy": 1.840997065603733, "epoch": 0.23599633900333164, "grad_norm": 7.287625789642334, "learning_rate": 5.207663319967453e-06, "loss": 0.4201, "mean_token_accuracy": 0.8495836406946182, "num_tokens": 91555954.0, "step": 76130 }, { "entropy": 1.826866079866886, "epoch": 0.23602733812838134, "grad_norm": 8.53991985321045, "learning_rate": 5.207321324779495e-06, "loss": 0.4131, "mean_token_accuracy": 0.8605758026242256, "num_tokens": 91568711.0, "step": 76140 }, { "entropy": 1.8429628267884255, "epoch": 0.23605833725343103, "grad_norm": 9.361772537231445, "learning_rate": 5.2069793969607265e-06, "loss": 0.4777, "mean_token_accuracy": 0.8468737691640854, "num_tokens": 91581173.0, "step": 76150 }, { "entropy": 1.8500198744237424, "epoch": 0.23608933637848073, "grad_norm": 8.414274215698242, "learning_rate": 5.206637536489028e-06, "loss": 0.4802, "mean_token_accuracy": 0.8451015651226044, "num_tokens": 91594341.0, "step": 76160 }, { "entropy": 1.93178521245718, "epoch": 0.23612033550353043, "grad_norm": 7.025452136993408, "learning_rate": 5.206295743342297e-06, "loss": 0.5028, "mean_token_accuracy": 0.8408794820308685, "num_tokens": 91605890.0, "step": 76170 }, { "entropy": 1.8896970108151436, "epoch": 0.2361513346285801, "grad_norm": 10.62279224395752, "learning_rate": 5.205954017498437e-06, "loss": 0.5293, "mean_token_accuracy": 0.8361875951290131, "num_tokens": 91617830.0, "step": 76180 }, { "entropy": 1.8840731859207154, "epoch": 0.2361823337536298, "grad_norm": 8.596649169921875, "learning_rate": 5.205612358935365e-06, "loss": 0.55, "mean_token_accuracy": 0.8344793394207954, "num_tokens": 91629222.0, "step": 76190 }, { "entropy": 1.8855000153183936, "epoch": 0.2362133328786795, "grad_norm": 7.801729202270508, "learning_rate": 5.205270767631004e-06, "loss": 0.4518, "mean_token_accuracy": 0.849211573600769, "num_tokens": 91641322.0, "step": 76200 }, { "entropy": 1.9867444038391113, "epoch": 0.23624433200372918, "grad_norm": 10.91826057434082, "learning_rate": 5.2049292435632915e-06, "loss": 0.5572, "mean_token_accuracy": 0.8317925870418549, "num_tokens": 91651690.0, "step": 76210 }, { "entropy": 1.9123092323541642, "epoch": 0.23627533112877888, "grad_norm": 8.301390647888184, "learning_rate": 5.2045877867101715e-06, "loss": 0.527, "mean_token_accuracy": 0.8369743451476097, "num_tokens": 91662933.0, "step": 76220 }, { "entropy": 1.9738686069846154, "epoch": 0.23630633025382858, "grad_norm": 9.968904495239258, "learning_rate": 5.2042463970496e-06, "loss": 0.5797, "mean_token_accuracy": 0.8200091898441315, "num_tokens": 91674479.0, "step": 76230 }, { "entropy": 1.8900879070162773, "epoch": 0.23633732937887827, "grad_norm": 10.34799575805664, "learning_rate": 5.203905074559543e-06, "loss": 0.4925, "mean_token_accuracy": 0.8390724554657936, "num_tokens": 91686217.0, "step": 76240 }, { "entropy": 1.8866849929094314, "epoch": 0.23636832850392797, "grad_norm": 8.001737594604492, "learning_rate": 5.203563819217977e-06, "loss": 0.4716, "mean_token_accuracy": 0.8539379611611366, "num_tokens": 91697971.0, "step": 76250 }, { "entropy": 1.9403339833021165, "epoch": 0.23639932762897767, "grad_norm": 8.635519027709961, "learning_rate": 5.203222631002886e-06, "loss": 0.5552, "mean_token_accuracy": 0.8294428750872612, "num_tokens": 91709307.0, "step": 76260 }, { "entropy": 1.9339663892984391, "epoch": 0.23643032675402736, "grad_norm": 8.026914596557617, "learning_rate": 5.202881509892268e-06, "loss": 0.4877, "mean_token_accuracy": 0.8449966043233872, "num_tokens": 91720715.0, "step": 76270 }, { "entropy": 1.769917319715023, "epoch": 0.23646132587907706, "grad_norm": 3.137450933456421, "learning_rate": 5.202540455864128e-06, "loss": 0.4872, "mean_token_accuracy": 0.8480799734592438, "num_tokens": 91734185.0, "step": 76280 }, { "entropy": 1.9272101998329163, "epoch": 0.23649232500412676, "grad_norm": 9.050545692443848, "learning_rate": 5.202199468896483e-06, "loss": 0.5076, "mean_token_accuracy": 0.8370890215039253, "num_tokens": 91746051.0, "step": 76290 }, { "entropy": 1.890957424044609, "epoch": 0.23652332412917645, "grad_norm": 7.916098117828369, "learning_rate": 5.201858548967359e-06, "loss": 0.4782, "mean_token_accuracy": 0.8495743572711945, "num_tokens": 91757115.0, "step": 76300 }, { "entropy": 1.904859295487404, "epoch": 0.23655432325422615, "grad_norm": 7.8038105964660645, "learning_rate": 5.201517696054792e-06, "loss": 0.4633, "mean_token_accuracy": 0.8543992474675178, "num_tokens": 91768690.0, "step": 76310 }, { "entropy": 1.8198333993554114, "epoch": 0.23658532237927585, "grad_norm": 8.247322082519531, "learning_rate": 5.2011769101368294e-06, "loss": 0.4175, "mean_token_accuracy": 0.8562105298042297, "num_tokens": 91781845.0, "step": 76320 }, { "entropy": 1.879896378517151, "epoch": 0.23661632150432554, "grad_norm": 3.9357030391693115, "learning_rate": 5.200836191191528e-06, "loss": 0.5112, "mean_token_accuracy": 0.8404986575245857, "num_tokens": 91794225.0, "step": 76330 }, { "entropy": 1.8717378214001656, "epoch": 0.23664732062937524, "grad_norm": 3.682194948196411, "learning_rate": 5.200495539196953e-06, "loss": 0.4489, "mean_token_accuracy": 0.8454118952155113, "num_tokens": 91807385.0, "step": 76340 }, { "entropy": 1.9737626880407333, "epoch": 0.23667831975442494, "grad_norm": 7.0051069259643555, "learning_rate": 5.200154954131182e-06, "loss": 0.5498, "mean_token_accuracy": 0.8395712092518807, "num_tokens": 91818225.0, "step": 76350 }, { "entropy": 1.9173112154006957, "epoch": 0.23670931887947463, "grad_norm": 7.16079568862915, "learning_rate": 5.199814435972302e-06, "loss": 0.5045, "mean_token_accuracy": 0.8442193806171417, "num_tokens": 91830394.0, "step": 76360 }, { "entropy": 1.8628339901566506, "epoch": 0.23674031800452433, "grad_norm": 4.580597400665283, "learning_rate": 5.19947398469841e-06, "loss": 0.4827, "mean_token_accuracy": 0.8353284701704979, "num_tokens": 91843266.0, "step": 76370 }, { "entropy": 1.9519380658864975, "epoch": 0.23677131712957403, "grad_norm": 7.013167381286621, "learning_rate": 5.1991336002876116e-06, "loss": 0.5079, "mean_token_accuracy": 0.8405268996953964, "num_tokens": 91853694.0, "step": 76380 }, { "entropy": 1.8750559836626053, "epoch": 0.23680231625462372, "grad_norm": 3.834982395172119, "learning_rate": 5.198793282718023e-06, "loss": 0.5224, "mean_token_accuracy": 0.8327495470643044, "num_tokens": 91866658.0, "step": 76390 }, { "entropy": 1.778690068423748, "epoch": 0.23683331537967342, "grad_norm": 8.452054977416992, "learning_rate": 5.198453031967774e-06, "loss": 0.4267, "mean_token_accuracy": 0.8489452451467514, "num_tokens": 91880171.0, "step": 76400 }, { "entropy": 1.8419625982642174, "epoch": 0.23686431450472312, "grad_norm": 5.224298000335693, "learning_rate": 5.198112848015e-06, "loss": 0.428, "mean_token_accuracy": 0.8543521463871002, "num_tokens": 91892362.0, "step": 76410 }, { "entropy": 1.8903858274221421, "epoch": 0.2368953136297728, "grad_norm": 8.055941581726074, "learning_rate": 5.197772730837848e-06, "loss": 0.4622, "mean_token_accuracy": 0.8432309225201606, "num_tokens": 91904734.0, "step": 76420 }, { "entropy": 1.8139331057667731, "epoch": 0.23692631275482248, "grad_norm": 4.4151506423950195, "learning_rate": 5.197432680414474e-06, "loss": 0.3915, "mean_token_accuracy": 0.855991254746914, "num_tokens": 91917912.0, "step": 76430 }, { "entropy": 1.7869535863399506, "epoch": 0.23695731187987218, "grad_norm": 8.555081367492676, "learning_rate": 5.1970926967230455e-06, "loss": 0.3974, "mean_token_accuracy": 0.8637645095586777, "num_tokens": 91931141.0, "step": 76440 }, { "entropy": 1.8419126272201538, "epoch": 0.23698831100492188, "grad_norm": 4.1938910484313965, "learning_rate": 5.196752779741738e-06, "loss": 0.4166, "mean_token_accuracy": 0.8464212834835052, "num_tokens": 91943834.0, "step": 76450 }, { "entropy": 1.8724951505661012, "epoch": 0.23701931012997157, "grad_norm": 9.071846008300781, "learning_rate": 5.196412929448742e-06, "loss": 0.5126, "mean_token_accuracy": 0.8366881161928177, "num_tokens": 91956524.0, "step": 76460 }, { "entropy": 1.9315653994679451, "epoch": 0.23705030925502127, "grad_norm": 4.576693534851074, "learning_rate": 5.1960731458222526e-06, "loss": 0.5247, "mean_token_accuracy": 0.832519143819809, "num_tokens": 91968089.0, "step": 76470 }, { "entropy": 1.8646988362073897, "epoch": 0.23708130838007097, "grad_norm": 9.52219009399414, "learning_rate": 5.195733428840475e-06, "loss": 0.486, "mean_token_accuracy": 0.8468518033623695, "num_tokens": 91979897.0, "step": 76480 }, { "entropy": 1.8760708898305893, "epoch": 0.23711230750512066, "grad_norm": 8.841971397399902, "learning_rate": 5.1953937784816275e-06, "loss": 0.4938, "mean_token_accuracy": 0.8391115218400955, "num_tokens": 91992057.0, "step": 76490 }, { "entropy": 1.9153295263648034, "epoch": 0.23714330663017036, "grad_norm": 9.128143310546875, "learning_rate": 5.195054194723937e-06, "loss": 0.5021, "mean_token_accuracy": 0.840194196999073, "num_tokens": 92004385.0, "step": 76500 }, { "entropy": 1.871627089381218, "epoch": 0.23717430575522006, "grad_norm": 5.556238651275635, "learning_rate": 5.19471467754564e-06, "loss": 0.4935, "mean_token_accuracy": 0.8399588122963906, "num_tokens": 92016826.0, "step": 76510 }, { "entropy": 1.9132946953177452, "epoch": 0.23720530488026975, "grad_norm": 10.617101669311523, "learning_rate": 5.194375226924984e-06, "loss": 0.549, "mean_token_accuracy": 0.8375571563839912, "num_tokens": 92028616.0, "step": 76520 }, { "entropy": 1.8806725934147834, "epoch": 0.23723630400531945, "grad_norm": 7.826884746551514, "learning_rate": 5.194035842840225e-06, "loss": 0.501, "mean_token_accuracy": 0.8441179230809212, "num_tokens": 92040482.0, "step": 76530 }, { "entropy": 1.9325586080551147, "epoch": 0.23726730313036914, "grad_norm": 10.202268600463867, "learning_rate": 5.193696525269629e-06, "loss": 0.4956, "mean_token_accuracy": 0.8491972535848618, "num_tokens": 92052267.0, "step": 76540 }, { "entropy": 1.9318912595510482, "epoch": 0.23729830225541884, "grad_norm": 8.407309532165527, "learning_rate": 5.1933572741914726e-06, "loss": 0.4927, "mean_token_accuracy": 0.8412642747163772, "num_tokens": 92063760.0, "step": 76550 }, { "entropy": 1.9290201410651207, "epoch": 0.23732930138046854, "grad_norm": 8.155739784240723, "learning_rate": 5.193018089584044e-06, "loss": 0.517, "mean_token_accuracy": 0.8428064584732056, "num_tokens": 92075347.0, "step": 76560 }, { "entropy": 1.8454096555709838, "epoch": 0.23736030050551823, "grad_norm": 10.54609203338623, "learning_rate": 5.192678971425639e-06, "loss": 0.4513, "mean_token_accuracy": 0.8462561905384064, "num_tokens": 92087679.0, "step": 76570 }, { "entropy": 1.8874484971165657, "epoch": 0.23739129963056793, "grad_norm": 9.038010597229004, "learning_rate": 5.192339919694561e-06, "loss": 0.4568, "mean_token_accuracy": 0.8541138485074043, "num_tokens": 92099382.0, "step": 76580 }, { "entropy": 1.9192816317081451, "epoch": 0.23742229875561763, "grad_norm": 9.369587898254395, "learning_rate": 5.192000934369129e-06, "loss": 0.4767, "mean_token_accuracy": 0.8446416437625885, "num_tokens": 92111204.0, "step": 76590 }, { "entropy": 1.9248986691236496, "epoch": 0.23745329788066732, "grad_norm": 8.209547996520996, "learning_rate": 5.19166201542767e-06, "loss": 0.5492, "mean_token_accuracy": 0.8277073204517365, "num_tokens": 92122562.0, "step": 76600 }, { "entropy": 1.8582263216376305, "epoch": 0.23748429700571702, "grad_norm": 8.712994575500488, "learning_rate": 5.191323162848518e-06, "loss": 0.4827, "mean_token_accuracy": 0.8368935108184814, "num_tokens": 92135073.0, "step": 76610 }, { "entropy": 1.9173593461513518, "epoch": 0.23751529613076672, "grad_norm": 7.943591594696045, "learning_rate": 5.190984376610021e-06, "loss": 0.5192, "mean_token_accuracy": 0.8503361091017723, "num_tokens": 92146436.0, "step": 76620 }, { "entropy": 1.8543553605675698, "epoch": 0.23754629525581641, "grad_norm": 7.117832183837891, "learning_rate": 5.190645656690533e-06, "loss": 0.4328, "mean_token_accuracy": 0.851584343612194, "num_tokens": 92158280.0, "step": 76630 }, { "entropy": 1.868790790438652, "epoch": 0.2375772943808661, "grad_norm": 7.763202667236328, "learning_rate": 5.19030700306842e-06, "loss": 0.4687, "mean_token_accuracy": 0.8473842933773994, "num_tokens": 92170413.0, "step": 76640 }, { "entropy": 1.9384655416011811, "epoch": 0.2376082935059158, "grad_norm": 8.59603214263916, "learning_rate": 5.189968415722057e-06, "loss": 0.5514, "mean_token_accuracy": 0.8368967100977898, "num_tokens": 92181528.0, "step": 76650 }, { "entropy": 1.7697623759508132, "epoch": 0.2376392926309655, "grad_norm": 9.119267463684082, "learning_rate": 5.189629894629832e-06, "loss": 0.414, "mean_token_accuracy": 0.8625062674283981, "num_tokens": 92195483.0, "step": 76660 }, { "entropy": 1.8749955371022224, "epoch": 0.23767029175601517, "grad_norm": 7.122738361358643, "learning_rate": 5.189291439770136e-06, "loss": 0.4683, "mean_token_accuracy": 0.8516977295279503, "num_tokens": 92208548.0, "step": 76670 }, { "entropy": 1.9658852204680444, "epoch": 0.23770129088106487, "grad_norm": 9.981183052062988, "learning_rate": 5.18895305112138e-06, "loss": 0.5554, "mean_token_accuracy": 0.8255149856209755, "num_tokens": 92219807.0, "step": 76680 }, { "entropy": 1.9098512694239616, "epoch": 0.23773229000611457, "grad_norm": 8.188959121704102, "learning_rate": 5.188614728661975e-06, "loss": 0.4968, "mean_token_accuracy": 0.8477085337042809, "num_tokens": 92231598.0, "step": 76690 }, { "entropy": 1.9508659318089485, "epoch": 0.23776328913116426, "grad_norm": 9.962569236755371, "learning_rate": 5.188276472370346e-06, "loss": 0.5163, "mean_token_accuracy": 0.8311663627624511, "num_tokens": 92243313.0, "step": 76700 }, { "entropy": 1.880191344022751, "epoch": 0.23779428825621396, "grad_norm": 8.909346580505371, "learning_rate": 5.187938282224929e-06, "loss": 0.5086, "mean_token_accuracy": 0.8435371667146683, "num_tokens": 92255339.0, "step": 76710 }, { "entropy": 1.8568480342626572, "epoch": 0.23782528738126366, "grad_norm": 3.7800729274749756, "learning_rate": 5.187600158204169e-06, "loss": 0.4585, "mean_token_accuracy": 0.8489156365394592, "num_tokens": 92268366.0, "step": 76720 }, { "entropy": 1.9763164937496185, "epoch": 0.23785628650631335, "grad_norm": 7.895925998687744, "learning_rate": 5.187262100286519e-06, "loss": 0.6004, "mean_token_accuracy": 0.8200380340218544, "num_tokens": 92278966.0, "step": 76730 }, { "entropy": 1.9572820693254471, "epoch": 0.23788728563136305, "grad_norm": 8.173888206481934, "learning_rate": 5.186924108450444e-06, "loss": 0.5823, "mean_token_accuracy": 0.831984531879425, "num_tokens": 92290141.0, "step": 76740 }, { "entropy": 1.9257027983665467, "epoch": 0.23791828475641275, "grad_norm": 9.356558799743652, "learning_rate": 5.186586182674418e-06, "loss": 0.4902, "mean_token_accuracy": 0.8511083468794822, "num_tokens": 92301596.0, "step": 76750 }, { "entropy": 1.9547690361738206, "epoch": 0.23794928388146244, "grad_norm": 9.034090995788574, "learning_rate": 5.186248322936925e-06, "loss": 0.5407, "mean_token_accuracy": 0.8326706364750862, "num_tokens": 92313300.0, "step": 76760 }, { "entropy": 1.933908286690712, "epoch": 0.23798028300651214, "grad_norm": 7.58587646484375, "learning_rate": 5.1859105292164594e-06, "loss": 0.4783, "mean_token_accuracy": 0.8463876083493233, "num_tokens": 92324745.0, "step": 76770 }, { "entropy": 1.9384202346205712, "epoch": 0.23801128213156184, "grad_norm": 7.760578632354736, "learning_rate": 5.185572801491523e-06, "loss": 0.5237, "mean_token_accuracy": 0.8321873590350151, "num_tokens": 92335893.0, "step": 76780 }, { "entropy": 1.7943998739123344, "epoch": 0.23804228125661153, "grad_norm": 3.816833734512329, "learning_rate": 5.18523513974063e-06, "loss": 0.4183, "mean_token_accuracy": 0.8502035543322564, "num_tokens": 92348555.0, "step": 76790 }, { "entropy": 1.9405929937958717, "epoch": 0.23807328038166123, "grad_norm": 8.975422859191895, "learning_rate": 5.184897543942303e-06, "loss": 0.5237, "mean_token_accuracy": 0.8349242195487022, "num_tokens": 92359797.0, "step": 76800 }, { "entropy": 1.9422178596258164, "epoch": 0.23810427950671093, "grad_norm": 11.456317901611328, "learning_rate": 5.184560014075075e-06, "loss": 0.5249, "mean_token_accuracy": 0.8406445398926735, "num_tokens": 92370595.0, "step": 76810 }, { "entropy": 1.7581544771790505, "epoch": 0.23813527863176062, "grad_norm": 4.420853614807129, "learning_rate": 5.184222550117491e-06, "loss": 0.3966, "mean_token_accuracy": 0.8657904848456383, "num_tokens": 92383924.0, "step": 76820 }, { "entropy": 1.8917575940489768, "epoch": 0.23816627775681032, "grad_norm": 7.945245742797852, "learning_rate": 5.1838851520481e-06, "loss": 0.4557, "mean_token_accuracy": 0.8527548462152481, "num_tokens": 92395855.0, "step": 76830 }, { "entropy": 1.9285854771733284, "epoch": 0.23819727688186002, "grad_norm": 8.082216262817383, "learning_rate": 5.1835478198454654e-06, "loss": 0.4869, "mean_token_accuracy": 0.8462440460920334, "num_tokens": 92406773.0, "step": 76840 }, { "entropy": 1.887254835665226, "epoch": 0.2382282760069097, "grad_norm": 8.663360595703125, "learning_rate": 5.1832105534881614e-06, "loss": 0.4671, "mean_token_accuracy": 0.8472326830029487, "num_tokens": 92418785.0, "step": 76850 }, { "entropy": 1.8737476989626884, "epoch": 0.2382592751319594, "grad_norm": 8.888916969299316, "learning_rate": 5.182873352954766e-06, "loss": 0.5369, "mean_token_accuracy": 0.8380342468619346, "num_tokens": 92431715.0, "step": 76860 }, { "entropy": 1.8650729537010193, "epoch": 0.2382902742570091, "grad_norm": 4.145284175872803, "learning_rate": 5.182536218223874e-06, "loss": 0.4606, "mean_token_accuracy": 0.8427535608410835, "num_tokens": 92443505.0, "step": 76870 }, { "entropy": 1.9061666548252105, "epoch": 0.2383212733820588, "grad_norm": 9.312609672546387, "learning_rate": 5.182199149274083e-06, "loss": 0.4714, "mean_token_accuracy": 0.8470250204205513, "num_tokens": 92454897.0, "step": 76880 }, { "entropy": 1.9185165598988534, "epoch": 0.2383522725071085, "grad_norm": 7.761290550231934, "learning_rate": 5.181862146084008e-06, "loss": 0.5389, "mean_token_accuracy": 0.8365787997841835, "num_tokens": 92467145.0, "step": 76890 }, { "entropy": 1.893602092564106, "epoch": 0.2383832716321582, "grad_norm": 9.334866523742676, "learning_rate": 5.181525208632266e-06, "loss": 0.5021, "mean_token_accuracy": 0.834423853456974, "num_tokens": 92478937.0, "step": 76900 }, { "entropy": 1.9183743864297866, "epoch": 0.2384142707572079, "grad_norm": 8.78905200958252, "learning_rate": 5.18118833689749e-06, "loss": 0.5265, "mean_token_accuracy": 0.8335022673010826, "num_tokens": 92490133.0, "step": 76910 }, { "entropy": 1.9075113162398338, "epoch": 0.23844526988225756, "grad_norm": 8.945623397827148, "learning_rate": 5.18085153085832e-06, "loss": 0.5024, "mean_token_accuracy": 0.8388106390833855, "num_tokens": 92502046.0, "step": 76920 }, { "entropy": 1.904564779996872, "epoch": 0.23847626900730726, "grad_norm": 7.865208625793457, "learning_rate": 5.180514790493405e-06, "loss": 0.4981, "mean_token_accuracy": 0.8397575378417969, "num_tokens": 92513471.0, "step": 76930 }, { "entropy": 1.9215335533022881, "epoch": 0.23850726813235695, "grad_norm": 4.170154571533203, "learning_rate": 5.180178115781404e-06, "loss": 0.4663, "mean_token_accuracy": 0.8430800303816796, "num_tokens": 92525752.0, "step": 76940 }, { "entropy": 1.8328320786356926, "epoch": 0.23853826725740665, "grad_norm": 10.626273155212402, "learning_rate": 5.179841506700989e-06, "loss": 0.4936, "mean_token_accuracy": 0.8432081758975982, "num_tokens": 92539459.0, "step": 76950 }, { "entropy": 1.8328202441334724, "epoch": 0.23856926638245635, "grad_norm": 9.927977561950684, "learning_rate": 5.179504963230835e-06, "loss": 0.4327, "mean_token_accuracy": 0.8494842126965523, "num_tokens": 92551165.0, "step": 76960 }, { "entropy": 1.9268303319811821, "epoch": 0.23860026550750604, "grad_norm": 8.204046249389648, "learning_rate": 5.179168485349633e-06, "loss": 0.5037, "mean_token_accuracy": 0.8421676337718964, "num_tokens": 92562511.0, "step": 76970 }, { "entropy": 1.799267826974392, "epoch": 0.23863126463255574, "grad_norm": 3.754833221435547, "learning_rate": 5.178832073036083e-06, "loss": 0.4261, "mean_token_accuracy": 0.860039333999157, "num_tokens": 92575004.0, "step": 76980 }, { "entropy": 1.9153159961104393, "epoch": 0.23866226375760544, "grad_norm": 8.974563598632812, "learning_rate": 5.178495726268889e-06, "loss": 0.5255, "mean_token_accuracy": 0.8295607075095177, "num_tokens": 92586146.0, "step": 76990 }, { "entropy": 1.8078723505139351, "epoch": 0.23869326288265513, "grad_norm": 7.464177131652832, "learning_rate": 5.178159445026772e-06, "loss": 0.3837, "mean_token_accuracy": 0.863148321211338, "num_tokens": 92599499.0, "step": 77000 }, { "entropy": 1.9081050038337708, "epoch": 0.23872426200770483, "grad_norm": 10.655149459838867, "learning_rate": 5.17782322928846e-06, "loss": 0.4907, "mean_token_accuracy": 0.8429564580321312, "num_tokens": 92610592.0, "step": 77010 }, { "entropy": 1.9061660870909691, "epoch": 0.23875526113275453, "grad_norm": 8.927163124084473, "learning_rate": 5.177487079032687e-06, "loss": 0.5, "mean_token_accuracy": 0.8398579880595207, "num_tokens": 92622264.0, "step": 77020 }, { "entropy": 1.8448384143412113, "epoch": 0.23878626025780422, "grad_norm": 9.433097839355469, "learning_rate": 5.177150994238202e-06, "loss": 0.4609, "mean_token_accuracy": 0.8562486276030541, "num_tokens": 92634783.0, "step": 77030 }, { "entropy": 1.8873264119029045, "epoch": 0.23881725938285392, "grad_norm": 3.7170372009277344, "learning_rate": 5.176814974883761e-06, "loss": 0.4873, "mean_token_accuracy": 0.8441817805171012, "num_tokens": 92646458.0, "step": 77040 }, { "entropy": 1.9377914816141129, "epoch": 0.23884825850790362, "grad_norm": 7.460235118865967, "learning_rate": 5.176479020948127e-06, "loss": 0.4937, "mean_token_accuracy": 0.8441906869411469, "num_tokens": 92657504.0, "step": 77050 }, { "entropy": 1.9209259897470474, "epoch": 0.2388792576329533, "grad_norm": 8.706901550292969, "learning_rate": 5.1761431324100805e-06, "loss": 0.5253, "mean_token_accuracy": 0.8378788083791733, "num_tokens": 92668549.0, "step": 77060 }, { "entropy": 1.939936462044716, "epoch": 0.238910256758003, "grad_norm": 7.477108001708984, "learning_rate": 5.175807309248405e-06, "loss": 0.5586, "mean_token_accuracy": 0.8331178665161133, "num_tokens": 92679326.0, "step": 77070 }, { "entropy": 1.9476049482822417, "epoch": 0.2389412558830527, "grad_norm": 10.547106742858887, "learning_rate": 5.175471551441896e-06, "loss": 0.5478, "mean_token_accuracy": 0.8249987348914146, "num_tokens": 92690605.0, "step": 77080 }, { "entropy": 1.8310050159692763, "epoch": 0.2389722550081024, "grad_norm": 8.774078369140625, "learning_rate": 5.175135858969356e-06, "loss": 0.466, "mean_token_accuracy": 0.8467973753809929, "num_tokens": 92702798.0, "step": 77090 }, { "entropy": 1.8465019643306733, "epoch": 0.2390032541331521, "grad_norm": 4.085997104644775, "learning_rate": 5.174800231809601e-06, "loss": 0.455, "mean_token_accuracy": 0.8533832848072052, "num_tokens": 92714580.0, "step": 77100 }, { "entropy": 1.85148034542799, "epoch": 0.2390342532582018, "grad_norm": 9.144734382629395, "learning_rate": 5.174464669941455e-06, "loss": 0.5163, "mean_token_accuracy": 0.8366668865084648, "num_tokens": 92726728.0, "step": 77110 }, { "entropy": 1.8749561220407487, "epoch": 0.2390652523832515, "grad_norm": 4.692014217376709, "learning_rate": 5.17412917334375e-06, "loss": 0.4937, "mean_token_accuracy": 0.8414742290973664, "num_tokens": 92739555.0, "step": 77120 }, { "entropy": 1.850008523464203, "epoch": 0.2390962515083012, "grad_norm": 4.577549934387207, "learning_rate": 5.17379374199533e-06, "loss": 0.4734, "mean_token_accuracy": 0.8429133415222168, "num_tokens": 92751167.0, "step": 77130 }, { "entropy": 1.7252720057964326, "epoch": 0.23912725063335089, "grad_norm": 2.2258358001708984, "learning_rate": 5.173458375875047e-06, "loss": 0.3858, "mean_token_accuracy": 0.8607361957430839, "num_tokens": 92764800.0, "step": 77140 }, { "entropy": 1.7218406319618225, "epoch": 0.23915824975840058, "grad_norm": 4.97647762298584, "learning_rate": 5.1731230749617645e-06, "loss": 0.4179, "mean_token_accuracy": 0.8532850712537765, "num_tokens": 92779776.0, "step": 77150 }, { "entropy": 1.80644121915102, "epoch": 0.23918924888345028, "grad_norm": 9.306987762451172, "learning_rate": 5.172787839234355e-06, "loss": 0.4713, "mean_token_accuracy": 0.8367184489965439, "num_tokens": 92793137.0, "step": 77160 }, { "entropy": 1.7925639390945434, "epoch": 0.23922024800849995, "grad_norm": 10.766707420349121, "learning_rate": 5.172452668671697e-06, "loss": 0.4109, "mean_token_accuracy": 0.8472514748573303, "num_tokens": 92806007.0, "step": 77170 }, { "entropy": 1.9033477440476418, "epoch": 0.23925124713354964, "grad_norm": 7.035861492156982, "learning_rate": 5.172117563252683e-06, "loss": 0.4931, "mean_token_accuracy": 0.8353909865021706, "num_tokens": 92817579.0, "step": 77180 }, { "entropy": 1.7921052902936936, "epoch": 0.23928224625859934, "grad_norm": 7.975375175476074, "learning_rate": 5.171782522956215e-06, "loss": 0.4114, "mean_token_accuracy": 0.8562724024057389, "num_tokens": 92830855.0, "step": 77190 }, { "entropy": 1.8471863463521003, "epoch": 0.23931324538364904, "grad_norm": 7.514707088470459, "learning_rate": 5.1714475477612005e-06, "loss": 0.4613, "mean_token_accuracy": 0.8399458363652229, "num_tokens": 92843224.0, "step": 77200 }, { "entropy": 1.9269508570432663, "epoch": 0.23934424450869873, "grad_norm": 7.373228073120117, "learning_rate": 5.17111263764656e-06, "loss": 0.535, "mean_token_accuracy": 0.8384528383612633, "num_tokens": 92854284.0, "step": 77210 }, { "entropy": 1.9275295376777648, "epoch": 0.23937524363374843, "grad_norm": 7.924658298492432, "learning_rate": 5.170777792591225e-06, "loss": 0.521, "mean_token_accuracy": 0.8433918222784996, "num_tokens": 92865684.0, "step": 77220 }, { "entropy": 1.902247828245163, "epoch": 0.23940624275879813, "grad_norm": 12.593827247619629, "learning_rate": 5.170443012574131e-06, "loss": 0.5043, "mean_token_accuracy": 0.8516241997480393, "num_tokens": 92876734.0, "step": 77230 }, { "entropy": 1.8969348236918449, "epoch": 0.23943724188384782, "grad_norm": 8.934786796569824, "learning_rate": 5.170108297574229e-06, "loss": 0.4461, "mean_token_accuracy": 0.8509429812431335, "num_tokens": 92888454.0, "step": 77240 }, { "entropy": 1.9256169840693473, "epoch": 0.23946824100889752, "grad_norm": 8.596548080444336, "learning_rate": 5.169773647570475e-06, "loss": 0.5858, "mean_token_accuracy": 0.8136735886335373, "num_tokens": 92900087.0, "step": 77250 }, { "entropy": 1.9090922564268111, "epoch": 0.23949924013394722, "grad_norm": 8.0187406539917, "learning_rate": 5.169439062541838e-06, "loss": 0.5194, "mean_token_accuracy": 0.8492131948471069, "num_tokens": 92910900.0, "step": 77260 }, { "entropy": 1.9346697509288788, "epoch": 0.2395302392589969, "grad_norm": 8.380687713623047, "learning_rate": 5.1691045424672945e-06, "loss": 0.5272, "mean_token_accuracy": 0.8416820123791695, "num_tokens": 92922693.0, "step": 77270 }, { "entropy": 1.9072400107979774, "epoch": 0.2395612383840466, "grad_norm": 6.178520679473877, "learning_rate": 5.16877008732583e-06, "loss": 0.4838, "mean_token_accuracy": 0.8457990005612374, "num_tokens": 92934239.0, "step": 77280 }, { "entropy": 1.8505976639688015, "epoch": 0.2395922375090963, "grad_norm": 8.494392395019531, "learning_rate": 5.16843569709644e-06, "loss": 0.4444, "mean_token_accuracy": 0.8447763130068779, "num_tokens": 92948046.0, "step": 77290 }, { "entropy": 1.8789535805583, "epoch": 0.239623236634146, "grad_norm": 8.061935424804688, "learning_rate": 5.168101371758133e-06, "loss": 0.5276, "mean_token_accuracy": 0.8349943101406098, "num_tokens": 92959929.0, "step": 77300 }, { "entropy": 1.827258250117302, "epoch": 0.2396542357591957, "grad_norm": 3.748342275619507, "learning_rate": 5.1677671112899204e-06, "loss": 0.4749, "mean_token_accuracy": 0.8416868060827255, "num_tokens": 92972031.0, "step": 77310 }, { "entropy": 1.892835232615471, "epoch": 0.2396852348842454, "grad_norm": 8.331387519836426, "learning_rate": 5.1674329156708305e-06, "loss": 0.4986, "mean_token_accuracy": 0.8418567150831222, "num_tokens": 92983623.0, "step": 77320 }, { "entropy": 1.8563826560974122, "epoch": 0.2397162340092951, "grad_norm": 7.926171779632568, "learning_rate": 5.1670987848798935e-06, "loss": 0.4651, "mean_token_accuracy": 0.8450048446655274, "num_tokens": 92996094.0, "step": 77330 }, { "entropy": 1.93873221129179, "epoch": 0.2397472331343448, "grad_norm": 3.8182835578918457, "learning_rate": 5.1667647188961544e-06, "loss": 0.5358, "mean_token_accuracy": 0.8399959117174148, "num_tokens": 93007758.0, "step": 77340 }, { "entropy": 1.8997079834342003, "epoch": 0.23977823225939449, "grad_norm": 3.8245768547058105, "learning_rate": 5.166430717698667e-06, "loss": 0.4947, "mean_token_accuracy": 0.8303521856665611, "num_tokens": 93019975.0, "step": 77350 }, { "entropy": 1.9382802799344063, "epoch": 0.23980923138444418, "grad_norm": 9.995282173156738, "learning_rate": 5.166096781266493e-06, "loss": 0.4967, "mean_token_accuracy": 0.846211564540863, "num_tokens": 93031399.0, "step": 77360 }, { "entropy": 1.872287529706955, "epoch": 0.23984023050949388, "grad_norm": 10.423619270324707, "learning_rate": 5.1657629095787045e-06, "loss": 0.5554, "mean_token_accuracy": 0.8343263044953346, "num_tokens": 93044307.0, "step": 77370 }, { "entropy": 1.9333751276135445, "epoch": 0.23987122963454358, "grad_norm": 9.734416007995605, "learning_rate": 5.165429102614382e-06, "loss": 0.5131, "mean_token_accuracy": 0.8426417291164399, "num_tokens": 93055636.0, "step": 77380 }, { "entropy": 1.858459161221981, "epoch": 0.23990222875959327, "grad_norm": 8.448528289794922, "learning_rate": 5.165095360352618e-06, "loss": 0.4225, "mean_token_accuracy": 0.8568536981940269, "num_tokens": 93068414.0, "step": 77390 }, { "entropy": 1.884053786098957, "epoch": 0.23993322788464297, "grad_norm": 9.751893043518066, "learning_rate": 5.164761682772511e-06, "loss": 0.4692, "mean_token_accuracy": 0.8394820794463158, "num_tokens": 93080506.0, "step": 77400 }, { "entropy": 1.9235117584466934, "epoch": 0.23996422700969264, "grad_norm": 8.76278305053711, "learning_rate": 5.164428069853172e-06, "loss": 0.495, "mean_token_accuracy": 0.8517300844192505, "num_tokens": 93091210.0, "step": 77410 }, { "entropy": 1.8631570398807527, "epoch": 0.23999522613474233, "grad_norm": 8.752660751342773, "learning_rate": 5.16409452157372e-06, "loss": 0.4473, "mean_token_accuracy": 0.845121631026268, "num_tokens": 93103714.0, "step": 77420 }, { "entropy": 1.831365491449833, "epoch": 0.24002622525979203, "grad_norm": 3.933938503265381, "learning_rate": 5.163761037913284e-06, "loss": 0.4581, "mean_token_accuracy": 0.8512499988079071, "num_tokens": 93116096.0, "step": 77430 }, { "entropy": 1.8161715433001517, "epoch": 0.24005722438484173, "grad_norm": 7.886687755584717, "learning_rate": 5.163427618851002e-06, "loss": 0.4417, "mean_token_accuracy": 0.8520355001091957, "num_tokens": 93128310.0, "step": 77440 }, { "entropy": 1.9185721650719643, "epoch": 0.24008822350989142, "grad_norm": 8.354714393615723, "learning_rate": 5.163094264366018e-06, "loss": 0.4982, "mean_token_accuracy": 0.8505765274167061, "num_tokens": 93139885.0, "step": 77450 }, { "entropy": 1.881207676231861, "epoch": 0.24011922263494112, "grad_norm": 3.9551658630371094, "learning_rate": 5.162760974437495e-06, "loss": 0.4783, "mean_token_accuracy": 0.8480890035629273, "num_tokens": 93152362.0, "step": 77460 }, { "entropy": 1.8166447281837463, "epoch": 0.24015022175999082, "grad_norm": 9.925832748413086, "learning_rate": 5.162427749044595e-06, "loss": 0.432, "mean_token_accuracy": 0.8519127413630485, "num_tokens": 93165382.0, "step": 77470 }, { "entropy": 1.9012980431318283, "epoch": 0.24018122088504051, "grad_norm": 6.780560493469238, "learning_rate": 5.162094588166495e-06, "loss": 0.5202, "mean_token_accuracy": 0.8458901271224022, "num_tokens": 93177192.0, "step": 77480 }, { "entropy": 1.8438314184546472, "epoch": 0.2402122200100902, "grad_norm": 9.430864334106445, "learning_rate": 5.161761491782381e-06, "loss": 0.4953, "mean_token_accuracy": 0.8461054712533951, "num_tokens": 93189929.0, "step": 77490 }, { "entropy": 1.8030365750193595, "epoch": 0.2402432191351399, "grad_norm": 8.649742126464844, "learning_rate": 5.1614284598714455e-06, "loss": 0.3987, "mean_token_accuracy": 0.8661585509777069, "num_tokens": 93203184.0, "step": 77500 }, { "entropy": 1.8316243276000024, "epoch": 0.2402742182601896, "grad_norm": 8.923284530639648, "learning_rate": 5.1610954924128944e-06, "loss": 0.4327, "mean_token_accuracy": 0.8588346287608146, "num_tokens": 93215255.0, "step": 77510 }, { "entropy": 1.9364321306347847, "epoch": 0.2403052173852393, "grad_norm": 8.266458511352539, "learning_rate": 5.160762589385941e-06, "loss": 0.5334, "mean_token_accuracy": 0.8307982221245765, "num_tokens": 93226205.0, "step": 77520 }, { "entropy": 1.868191882967949, "epoch": 0.240336216510289, "grad_norm": 9.114846229553223, "learning_rate": 5.160429750769805e-06, "loss": 0.4474, "mean_token_accuracy": 0.8514704316854477, "num_tokens": 93238768.0, "step": 77530 }, { "entropy": 1.9055574059486389, "epoch": 0.2403672156353387, "grad_norm": 4.970535755157471, "learning_rate": 5.160096976543722e-06, "loss": 0.4885, "mean_token_accuracy": 0.8339187651872635, "num_tokens": 93251174.0, "step": 77540 }, { "entropy": 1.8639071702957153, "epoch": 0.2403982147603884, "grad_norm": 9.217086791992188, "learning_rate": 5.159764266686933e-06, "loss": 0.4784, "mean_token_accuracy": 0.8411014005541801, "num_tokens": 93263487.0, "step": 77550 }, { "entropy": 1.9354406654834748, "epoch": 0.2404292138854381, "grad_norm": 7.856409072875977, "learning_rate": 5.159431621178688e-06, "loss": 0.5139, "mean_token_accuracy": 0.8418198600411415, "num_tokens": 93275107.0, "step": 77560 }, { "entropy": 1.9459631145000458, "epoch": 0.24046021301048778, "grad_norm": 10.06895923614502, "learning_rate": 5.159099039998247e-06, "loss": 0.5186, "mean_token_accuracy": 0.8353525027632713, "num_tokens": 93285373.0, "step": 77570 }, { "entropy": 1.8992941722273826, "epoch": 0.24049121213553748, "grad_norm": 9.065903663635254, "learning_rate": 5.158766523124879e-06, "loss": 0.5077, "mean_token_accuracy": 0.8420209974050522, "num_tokens": 93297887.0, "step": 77580 }, { "entropy": 1.8735466703772545, "epoch": 0.24052221126058718, "grad_norm": 8.686766624450684, "learning_rate": 5.158434070537864e-06, "loss": 0.4796, "mean_token_accuracy": 0.8445511654019355, "num_tokens": 93309645.0, "step": 77590 }, { "entropy": 1.9320282906293869, "epoch": 0.24055321038563687, "grad_norm": 4.7969770431518555, "learning_rate": 5.158101682216491e-06, "loss": 0.5188, "mean_token_accuracy": 0.8482805505394936, "num_tokens": 93320996.0, "step": 77600 }, { "entropy": 1.8526810958981514, "epoch": 0.24058420951068657, "grad_norm": 3.4721367359161377, "learning_rate": 5.157769358140056e-06, "loss": 0.4649, "mean_token_accuracy": 0.8560703948140145, "num_tokens": 93332605.0, "step": 77610 }, { "entropy": 1.8123348727822304, "epoch": 0.24061520863573627, "grad_norm": 4.217021465301514, "learning_rate": 5.157437098287867e-06, "loss": 0.4348, "mean_token_accuracy": 0.8512080743908882, "num_tokens": 93344938.0, "step": 77620 }, { "entropy": 1.955248984694481, "epoch": 0.24064620776078596, "grad_norm": 9.06552505493164, "learning_rate": 5.157104902639239e-06, "loss": 0.5756, "mean_token_accuracy": 0.8257305637001991, "num_tokens": 93356260.0, "step": 77630 }, { "entropy": 1.8736144185066224, "epoch": 0.24067720688583566, "grad_norm": 8.679845809936523, "learning_rate": 5.156772771173499e-06, "loss": 0.4804, "mean_token_accuracy": 0.835148498415947, "num_tokens": 93368497.0, "step": 77640 }, { "entropy": 1.785961812734604, "epoch": 0.24070820601088536, "grad_norm": 8.887287139892578, "learning_rate": 5.15644070386998e-06, "loss": 0.4267, "mean_token_accuracy": 0.8569586247205734, "num_tokens": 93381040.0, "step": 77650 }, { "entropy": 1.8760237216949462, "epoch": 0.24073920513593502, "grad_norm": 7.9106011390686035, "learning_rate": 5.15610870070803e-06, "loss": 0.4746, "mean_token_accuracy": 0.8385165154933929, "num_tokens": 93392906.0, "step": 77660 }, { "entropy": 1.9115961521863938, "epoch": 0.24077020426098472, "grad_norm": 7.96343469619751, "learning_rate": 5.155776761666998e-06, "loss": 0.4577, "mean_token_accuracy": 0.8567873150110245, "num_tokens": 93404492.0, "step": 77670 }, { "entropy": 1.8684505842626096, "epoch": 0.24080120338603442, "grad_norm": 8.68083667755127, "learning_rate": 5.15544488672625e-06, "loss": 0.5003, "mean_token_accuracy": 0.8393196210265159, "num_tokens": 93417211.0, "step": 77680 }, { "entropy": 1.7916448466479777, "epoch": 0.24083220251108411, "grad_norm": 7.983458518981934, "learning_rate": 5.155113075865157e-06, "loss": 0.419, "mean_token_accuracy": 0.8451001644134521, "num_tokens": 93430639.0, "step": 77690 }, { "entropy": 1.8966735377907753, "epoch": 0.2408632016361338, "grad_norm": 8.755640029907227, "learning_rate": 5.1547813290631e-06, "loss": 0.4627, "mean_token_accuracy": 0.8501760110259056, "num_tokens": 93441446.0, "step": 77700 }, { "entropy": 1.8320280969142915, "epoch": 0.2408942007611835, "grad_norm": 7.649133205413818, "learning_rate": 5.154449646299469e-06, "loss": 0.443, "mean_token_accuracy": 0.849604444205761, "num_tokens": 93453520.0, "step": 77710 }, { "entropy": 1.826886311173439, "epoch": 0.2409251998862332, "grad_norm": 3.657710313796997, "learning_rate": 5.154118027553669e-06, "loss": 0.4318, "mean_token_accuracy": 0.849964652955532, "num_tokens": 93465485.0, "step": 77720 }, { "entropy": 1.84391158670187, "epoch": 0.2409561990112829, "grad_norm": 8.545806884765625, "learning_rate": 5.153786472805101e-06, "loss": 0.4595, "mean_token_accuracy": 0.8504501432180405, "num_tokens": 93478499.0, "step": 77730 }, { "entropy": 1.8420162439346313, "epoch": 0.2409871981363326, "grad_norm": 4.115400791168213, "learning_rate": 5.15345498203319e-06, "loss": 0.454, "mean_token_accuracy": 0.8493165418505668, "num_tokens": 93490375.0, "step": 77740 }, { "entropy": 1.897342699766159, "epoch": 0.2410181972613823, "grad_norm": 9.050869941711426, "learning_rate": 5.153123555217362e-06, "loss": 0.5218, "mean_token_accuracy": 0.8383392289280891, "num_tokens": 93501729.0, "step": 77750 }, { "entropy": 1.8093791991472243, "epoch": 0.241049196386432, "grad_norm": 8.670755386352539, "learning_rate": 5.1527921923370536e-06, "loss": 0.4341, "mean_token_accuracy": 0.8506680727005005, "num_tokens": 93514603.0, "step": 77760 }, { "entropy": 1.7984491378068923, "epoch": 0.2410801955114817, "grad_norm": 2.556086540222168, "learning_rate": 5.15246089337171e-06, "loss": 0.4706, "mean_token_accuracy": 0.8423952326178551, "num_tokens": 93527732.0, "step": 77770 }, { "entropy": 1.857538816332817, "epoch": 0.24111119463653138, "grad_norm": 5.878166675567627, "learning_rate": 5.15212965830079e-06, "loss": 0.4816, "mean_token_accuracy": 0.8417029067873955, "num_tokens": 93539767.0, "step": 77780 }, { "entropy": 1.9089462801814079, "epoch": 0.24114219376158108, "grad_norm": 8.4704008102417, "learning_rate": 5.151798487103755e-06, "loss": 0.5014, "mean_token_accuracy": 0.8439002588391304, "num_tokens": 93550984.0, "step": 77790 }, { "entropy": 1.8456274837255477, "epoch": 0.24117319288663078, "grad_norm": 4.18363618850708, "learning_rate": 5.151467379760081e-06, "loss": 0.4618, "mean_token_accuracy": 0.8480399310588836, "num_tokens": 93563757.0, "step": 77800 }, { "entropy": 1.899016997218132, "epoch": 0.24120419201168047, "grad_norm": 7.410801887512207, "learning_rate": 5.1511363362492515e-06, "loss": 0.5028, "mean_token_accuracy": 0.8440502628684043, "num_tokens": 93575421.0, "step": 77810 }, { "entropy": 1.8831632077693938, "epoch": 0.24123519113673017, "grad_norm": 2.9362380504608154, "learning_rate": 5.150805356550758e-06, "loss": 0.475, "mean_token_accuracy": 0.842276705801487, "num_tokens": 93587824.0, "step": 77820 }, { "entropy": 1.9149335369467735, "epoch": 0.24126619026177987, "grad_norm": 9.272772789001465, "learning_rate": 5.150474440644102e-06, "loss": 0.5188, "mean_token_accuracy": 0.8346376657485962, "num_tokens": 93599403.0, "step": 77830 }, { "entropy": 1.8213121712207794, "epoch": 0.24129718938682956, "grad_norm": 11.08968448638916, "learning_rate": 5.150143588508796e-06, "loss": 0.4433, "mean_token_accuracy": 0.8508032530546188, "num_tokens": 93612087.0, "step": 77840 }, { "entropy": 1.832365171611309, "epoch": 0.24132818851187926, "grad_norm": 8.193666458129883, "learning_rate": 5.149812800124359e-06, "loss": 0.5089, "mean_token_accuracy": 0.8404935225844383, "num_tokens": 93623785.0, "step": 77850 }, { "entropy": 1.917369608581066, "epoch": 0.24135918763692896, "grad_norm": 8.672357559204102, "learning_rate": 5.149482075470319e-06, "loss": 0.495, "mean_token_accuracy": 0.8380819782614708, "num_tokens": 93635723.0, "step": 77860 }, { "entropy": 1.8775453761219978, "epoch": 0.24139018676197865, "grad_norm": 9.257630348205566, "learning_rate": 5.1491514145262174e-06, "loss": 0.4797, "mean_token_accuracy": 0.8449780121445656, "num_tokens": 93647450.0, "step": 77870 }, { "entropy": 1.8986103981733322, "epoch": 0.24142118588702835, "grad_norm": 8.680018424987793, "learning_rate": 5.148820817271601e-06, "loss": 0.4554, "mean_token_accuracy": 0.8502163380384445, "num_tokens": 93659062.0, "step": 77880 }, { "entropy": 1.7778849676251411, "epoch": 0.24145218501207805, "grad_norm": 4.6502604484558105, "learning_rate": 5.148490283686026e-06, "loss": 0.4188, "mean_token_accuracy": 0.8509754404425621, "num_tokens": 93673049.0, "step": 77890 }, { "entropy": 1.83993206769228, "epoch": 0.24148318413712774, "grad_norm": 3.465973138809204, "learning_rate": 5.14815981374906e-06, "loss": 0.4708, "mean_token_accuracy": 0.8415236309170723, "num_tokens": 93685767.0, "step": 77900 }, { "entropy": 1.863934588432312, "epoch": 0.2415141832621774, "grad_norm": 8.9150972366333, "learning_rate": 5.1478294074402756e-06, "loss": 0.4804, "mean_token_accuracy": 0.844880360364914, "num_tokens": 93698133.0, "step": 77910 }, { "entropy": 1.8324093401432038, "epoch": 0.2415451823872271, "grad_norm": 4.7063188552856445, "learning_rate": 5.14749906473926e-06, "loss": 0.4428, "mean_token_accuracy": 0.851141095161438, "num_tokens": 93710437.0, "step": 77920 }, { "entropy": 1.9022783473134042, "epoch": 0.2415761815122768, "grad_norm": 8.489745140075684, "learning_rate": 5.147168785625606e-06, "loss": 0.5028, "mean_token_accuracy": 0.8330109208822251, "num_tokens": 93722370.0, "step": 77930 }, { "entropy": 1.837750643491745, "epoch": 0.2416071806373265, "grad_norm": 8.317510604858398, "learning_rate": 5.146838570078916e-06, "loss": 0.453, "mean_token_accuracy": 0.8525749742984772, "num_tokens": 93735012.0, "step": 77940 }, { "entropy": 1.8423487588763237, "epoch": 0.2416381797623762, "grad_norm": 3.3121819496154785, "learning_rate": 5.146508418078802e-06, "loss": 0.4678, "mean_token_accuracy": 0.8401482835412025, "num_tokens": 93747947.0, "step": 77950 }, { "entropy": 1.858578224480152, "epoch": 0.2416691788874259, "grad_norm": 4.246115207672119, "learning_rate": 5.146178329604885e-06, "loss": 0.5066, "mean_token_accuracy": 0.8309834390878678, "num_tokens": 93760294.0, "step": 77960 }, { "entropy": 1.8050771802663803, "epoch": 0.2417001780124756, "grad_norm": 8.161648750305176, "learning_rate": 5.145848304636797e-06, "loss": 0.4388, "mean_token_accuracy": 0.8522293284535408, "num_tokens": 93773292.0, "step": 77970 }, { "entropy": 1.856981235742569, "epoch": 0.2417311771375253, "grad_norm": 9.97853946685791, "learning_rate": 5.1455183431541755e-06, "loss": 0.4607, "mean_token_accuracy": 0.8453084260225296, "num_tokens": 93785337.0, "step": 77980 }, { "entropy": 1.8165911972522735, "epoch": 0.24176217626257498, "grad_norm": 8.15949535369873, "learning_rate": 5.145188445136669e-06, "loss": 0.4365, "mean_token_accuracy": 0.8522542849183082, "num_tokens": 93798839.0, "step": 77990 }, { "entropy": 1.8631312981247903, "epoch": 0.24179317538762468, "grad_norm": 9.838682174682617, "learning_rate": 5.144858610563938e-06, "loss": 0.486, "mean_token_accuracy": 0.8407356634736061, "num_tokens": 93811151.0, "step": 78000 }, { "entropy": 1.9338207572698594, "epoch": 0.24182417451267438, "grad_norm": 9.109156608581543, "learning_rate": 5.144528839415645e-06, "loss": 0.5236, "mean_token_accuracy": 0.8407235726714134, "num_tokens": 93822175.0, "step": 78010 }, { "entropy": 1.8040778622031213, "epoch": 0.24185517363772407, "grad_norm": 4.23243522644043, "learning_rate": 5.1441991316714694e-06, "loss": 0.4351, "mean_token_accuracy": 0.8540575191378593, "num_tokens": 93834880.0, "step": 78020 }, { "entropy": 1.893469262123108, "epoch": 0.24188617276277377, "grad_norm": 8.863125801086426, "learning_rate": 5.143869487311095e-06, "loss": 0.4412, "mean_token_accuracy": 0.8541948691010475, "num_tokens": 93846671.0, "step": 78030 }, { "entropy": 1.8411024257540702, "epoch": 0.24191717188782347, "grad_norm": 4.275692462921143, "learning_rate": 5.143539906314216e-06, "loss": 0.4935, "mean_token_accuracy": 0.8460016861557961, "num_tokens": 93859086.0, "step": 78040 }, { "entropy": 1.887072142958641, "epoch": 0.24194817101287316, "grad_norm": 3.7341136932373047, "learning_rate": 5.143210388660536e-06, "loss": 0.464, "mean_token_accuracy": 0.8523519471287727, "num_tokens": 93870472.0, "step": 78050 }, { "entropy": 1.9175701081752776, "epoch": 0.24197917013792286, "grad_norm": 10.164228439331055, "learning_rate": 5.142880934329766e-06, "loss": 0.5293, "mean_token_accuracy": 0.8337636604905129, "num_tokens": 93881653.0, "step": 78060 }, { "entropy": 1.9110116347670556, "epoch": 0.24201016926297256, "grad_norm": 8.151057243347168, "learning_rate": 5.142551543301631e-06, "loss": 0.4783, "mean_token_accuracy": 0.843655027449131, "num_tokens": 93893107.0, "step": 78070 }, { "entropy": 1.8371216788887978, "epoch": 0.24204116838802225, "grad_norm": 4.863457679748535, "learning_rate": 5.142222215555856e-06, "loss": 0.4841, "mean_token_accuracy": 0.845946654677391, "num_tokens": 93905843.0, "step": 78080 }, { "entropy": 1.9443501621484756, "epoch": 0.24207216751307195, "grad_norm": 9.060688972473145, "learning_rate": 5.141892951072186e-06, "loss": 0.5785, "mean_token_accuracy": 0.8199516415596009, "num_tokens": 93916955.0, "step": 78090 }, { "entropy": 1.8930419281125068, "epoch": 0.24210316663812165, "grad_norm": 3.962271213531494, "learning_rate": 5.141563749830367e-06, "loss": 0.5161, "mean_token_accuracy": 0.8397519171237946, "num_tokens": 93928520.0, "step": 78100 }, { "entropy": 1.88976591527462, "epoch": 0.24213416576317134, "grad_norm": 8.086103439331055, "learning_rate": 5.141234611810158e-06, "loss": 0.4857, "mean_token_accuracy": 0.8491416946053505, "num_tokens": 93939987.0, "step": 78110 }, { "entropy": 1.9858234167098998, "epoch": 0.24216516488822104, "grad_norm": 7.99221658706665, "learning_rate": 5.140905536991324e-06, "loss": 0.5678, "mean_token_accuracy": 0.8298144713044167, "num_tokens": 93950877.0, "step": 78120 }, { "entropy": 1.8197284460067749, "epoch": 0.24219616401327074, "grad_norm": 8.68156909942627, "learning_rate": 5.140576525353643e-06, "loss": 0.4926, "mean_token_accuracy": 0.8402933716773987, "num_tokens": 93963270.0, "step": 78130 }, { "entropy": 1.8739581018686295, "epoch": 0.24222716313832043, "grad_norm": 8.117925643920898, "learning_rate": 5.1402475768769e-06, "loss": 0.5305, "mean_token_accuracy": 0.8324276804924011, "num_tokens": 93975569.0, "step": 78140 }, { "entropy": 1.9055853977799415, "epoch": 0.2422581622633701, "grad_norm": 8.360283851623535, "learning_rate": 5.139918691540887e-06, "loss": 0.5192, "mean_token_accuracy": 0.8316447660326958, "num_tokens": 93987298.0, "step": 78150 }, { "entropy": 2.0304242044687273, "epoch": 0.2422891613884198, "grad_norm": 8.648979187011719, "learning_rate": 5.13958986932541e-06, "loss": 0.6085, "mean_token_accuracy": 0.820050160586834, "num_tokens": 93997995.0, "step": 78160 }, { "entropy": 1.9047956734895706, "epoch": 0.2423201605134695, "grad_norm": 3.7517290115356445, "learning_rate": 5.139261110210278e-06, "loss": 0.4528, "mean_token_accuracy": 0.8566797718405723, "num_tokens": 94009266.0, "step": 78170 }, { "entropy": 1.8741468638181686, "epoch": 0.2423511596385192, "grad_norm": 8.909819602966309, "learning_rate": 5.138932414175315e-06, "loss": 0.5123, "mean_token_accuracy": 0.8410193353891373, "num_tokens": 94021292.0, "step": 78180 }, { "entropy": 1.9001810044050216, "epoch": 0.2423821587635689, "grad_norm": 8.745471000671387, "learning_rate": 5.138603781200349e-06, "loss": 0.4857, "mean_token_accuracy": 0.8419952884316444, "num_tokens": 94033054.0, "step": 78190 }, { "entropy": 1.9287573903799058, "epoch": 0.24241315788861859, "grad_norm": 3.352858543395996, "learning_rate": 5.138275211265221e-06, "loss": 0.4676, "mean_token_accuracy": 0.8414346948266029, "num_tokens": 94045999.0, "step": 78200 }, { "entropy": 1.878933884203434, "epoch": 0.24244415701366828, "grad_norm": 8.681049346923828, "learning_rate": 5.137946704349778e-06, "loss": 0.494, "mean_token_accuracy": 0.8540082618594169, "num_tokens": 94057480.0, "step": 78210 }, { "entropy": 1.913780263066292, "epoch": 0.24247515613871798, "grad_norm": 9.51512336730957, "learning_rate": 5.137618260433878e-06, "loss": 0.5343, "mean_token_accuracy": 0.8367677152156829, "num_tokens": 94068651.0, "step": 78220 }, { "entropy": 1.8172692015767098, "epoch": 0.24250615526376768, "grad_norm": 10.392766952514648, "learning_rate": 5.137289879497387e-06, "loss": 0.4076, "mean_token_accuracy": 0.8513308480381966, "num_tokens": 94081805.0, "step": 78230 }, { "entropy": 1.9101085662841797, "epoch": 0.24253715438881737, "grad_norm": 4.027338027954102, "learning_rate": 5.136961561520181e-06, "loss": 0.5266, "mean_token_accuracy": 0.8364538699388504, "num_tokens": 94093434.0, "step": 78240 }, { "entropy": 1.9400910884141922, "epoch": 0.24256815351386707, "grad_norm": 8.5806303024292, "learning_rate": 5.1366333064821426e-06, "loss": 0.5406, "mean_token_accuracy": 0.8367814645171165, "num_tokens": 94104660.0, "step": 78250 }, { "entropy": 1.9483735159039497, "epoch": 0.24259915263891677, "grad_norm": 4.43411922454834, "learning_rate": 5.136305114363167e-06, "loss": 0.4996, "mean_token_accuracy": 0.8446143805980683, "num_tokens": 94116243.0, "step": 78260 }, { "entropy": 1.9460048735141755, "epoch": 0.24263015176396646, "grad_norm": 8.685762405395508, "learning_rate": 5.1359769851431565e-06, "loss": 0.5085, "mean_token_accuracy": 0.8413349702954293, "num_tokens": 94127332.0, "step": 78270 }, { "entropy": 1.9539130926132202, "epoch": 0.24266115088901616, "grad_norm": 9.817856788635254, "learning_rate": 5.13564891880202e-06, "loss": 0.5339, "mean_token_accuracy": 0.8239889681339264, "num_tokens": 94138346.0, "step": 78280 }, { "entropy": 1.8358333230018615, "epoch": 0.24269215001406585, "grad_norm": 10.21696662902832, "learning_rate": 5.135320915319681e-06, "loss": 0.4144, "mean_token_accuracy": 0.8529067754745483, "num_tokens": 94151143.0, "step": 78290 }, { "entropy": 1.9505225792527199, "epoch": 0.24272314913911555, "grad_norm": 9.021811485290527, "learning_rate": 5.134992974676065e-06, "loss": 0.5422, "mean_token_accuracy": 0.8432751014828682, "num_tokens": 94162767.0, "step": 78300 }, { "entropy": 1.927219384908676, "epoch": 0.24275414826416525, "grad_norm": 4.087057590484619, "learning_rate": 5.134665096851114e-06, "loss": 0.5086, "mean_token_accuracy": 0.8410610109567642, "num_tokens": 94174630.0, "step": 78310 }, { "entropy": 1.8358907088637353, "epoch": 0.24278514738921494, "grad_norm": 9.222640991210938, "learning_rate": 5.134337281824774e-06, "loss": 0.4711, "mean_token_accuracy": 0.8533089682459831, "num_tokens": 94186789.0, "step": 78320 }, { "entropy": 1.8353676095604896, "epoch": 0.24281614651426464, "grad_norm": 9.232024192810059, "learning_rate": 5.1340095295769985e-06, "loss": 0.4311, "mean_token_accuracy": 0.859758959710598, "num_tokens": 94198760.0, "step": 78330 }, { "entropy": 1.9933633387088776, "epoch": 0.24284714563931434, "grad_norm": 7.663358688354492, "learning_rate": 5.1336818400877575e-06, "loss": 0.6016, "mean_token_accuracy": 0.8219748124480247, "num_tokens": 94209973.0, "step": 78340 }, { "entropy": 1.9356787115335465, "epoch": 0.24287814476436403, "grad_norm": 7.0271100997924805, "learning_rate": 5.1333542133370205e-06, "loss": 0.5275, "mean_token_accuracy": 0.8462739869952202, "num_tokens": 94221268.0, "step": 78350 }, { "entropy": 1.9141018971800805, "epoch": 0.24290914388941373, "grad_norm": 8.173383712768555, "learning_rate": 5.133026649304772e-06, "loss": 0.5224, "mean_token_accuracy": 0.8365029126405716, "num_tokens": 94233710.0, "step": 78360 }, { "entropy": 1.8921899944543839, "epoch": 0.24294014301446343, "grad_norm": 6.497464656829834, "learning_rate": 5.132699147971007e-06, "loss": 0.4443, "mean_token_accuracy": 0.8493005961179734, "num_tokens": 94245927.0, "step": 78370 }, { "entropy": 1.8819943577051164, "epoch": 0.24297114213951312, "grad_norm": 8.804264068603516, "learning_rate": 5.132371709315721e-06, "loss": 0.4768, "mean_token_accuracy": 0.8414627909660339, "num_tokens": 94258498.0, "step": 78380 }, { "entropy": 1.9550851792097093, "epoch": 0.24300214126456282, "grad_norm": 8.61777400970459, "learning_rate": 5.1320443333189265e-06, "loss": 0.5731, "mean_token_accuracy": 0.8282885104417801, "num_tokens": 94270225.0, "step": 78390 }, { "entropy": 1.8870521187782288, "epoch": 0.2430331403896125, "grad_norm": 10.647053718566895, "learning_rate": 5.131717019960643e-06, "loss": 0.46, "mean_token_accuracy": 0.847943240404129, "num_tokens": 94282222.0, "step": 78400 }, { "entropy": 1.942089530825615, "epoch": 0.2430641395146622, "grad_norm": 9.514321327209473, "learning_rate": 5.131389769220897e-06, "loss": 0.5166, "mean_token_accuracy": 0.8450910165905953, "num_tokens": 94293068.0, "step": 78410 }, { "entropy": 1.9375642448663712, "epoch": 0.24309513863971188, "grad_norm": 10.78248405456543, "learning_rate": 5.131062581079726e-06, "loss": 0.5297, "mean_token_accuracy": 0.8404152438044548, "num_tokens": 94304441.0, "step": 78420 }, { "entropy": 1.901486437022686, "epoch": 0.24312613776476158, "grad_norm": 10.459092140197754, "learning_rate": 5.130735455517173e-06, "loss": 0.4886, "mean_token_accuracy": 0.8432300522923469, "num_tokens": 94316598.0, "step": 78430 }, { "entropy": 1.9251017332077027, "epoch": 0.24315713688981128, "grad_norm": 10.798014640808105, "learning_rate": 5.130408392513295e-06, "loss": 0.5176, "mean_token_accuracy": 0.8358462870121002, "num_tokens": 94328615.0, "step": 78440 }, { "entropy": 1.7331562623381616, "epoch": 0.24318813601486097, "grad_norm": 6.879677772521973, "learning_rate": 5.130081392048156e-06, "loss": 0.4168, "mean_token_accuracy": 0.8590050891041756, "num_tokens": 94342439.0, "step": 78450 }, { "entropy": 1.876169492304325, "epoch": 0.24321913513991067, "grad_norm": 6.808117389678955, "learning_rate": 5.129754454101825e-06, "loss": 0.4903, "mean_token_accuracy": 0.8520722165703773, "num_tokens": 94354438.0, "step": 78460 }, { "entropy": 1.9563359498977662, "epoch": 0.24325013426496037, "grad_norm": 8.930147171020508, "learning_rate": 5.129427578654386e-06, "loss": 0.5295, "mean_token_accuracy": 0.84179867208004, "num_tokens": 94365036.0, "step": 78470 }, { "entropy": 1.9348897516727448, "epoch": 0.24328113339001006, "grad_norm": 8.5611572265625, "learning_rate": 5.129100765685926e-06, "loss": 0.5008, "mean_token_accuracy": 0.851142629981041, "num_tokens": 94375811.0, "step": 78480 }, { "entropy": 1.9472255200147628, "epoch": 0.24331213251505976, "grad_norm": 10.067473411560059, "learning_rate": 5.1287740151765464e-06, "loss": 0.5399, "mean_token_accuracy": 0.8335594087839127, "num_tokens": 94387094.0, "step": 78490 }, { "entropy": 1.8310412108898162, "epoch": 0.24334313164010946, "grad_norm": 8.096028327941895, "learning_rate": 5.128447327106353e-06, "loss": 0.4257, "mean_token_accuracy": 0.8536302044987678, "num_tokens": 94400414.0, "step": 78500 }, { "entropy": 1.8954817593097686, "epoch": 0.24337413076515915, "grad_norm": 8.50610637664795, "learning_rate": 5.128120701455464e-06, "loss": 0.4761, "mean_token_accuracy": 0.8491655126214027, "num_tokens": 94412241.0, "step": 78510 }, { "entropy": 1.8588109910488129, "epoch": 0.24340512989020885, "grad_norm": 8.121857643127441, "learning_rate": 5.127794138204003e-06, "loss": 0.4307, "mean_token_accuracy": 0.8583784848451614, "num_tokens": 94424289.0, "step": 78520 }, { "entropy": 1.9707530647516251, "epoch": 0.24343612901525855, "grad_norm": 11.943368911743164, "learning_rate": 5.127467637332106e-06, "loss": 0.5347, "mean_token_accuracy": 0.843015332520008, "num_tokens": 94435163.0, "step": 78530 }, { "entropy": 1.8756818160414697, "epoch": 0.24346712814030824, "grad_norm": 4.680079936981201, "learning_rate": 5.127141198819916e-06, "loss": 0.4559, "mean_token_accuracy": 0.8448122501373291, "num_tokens": 94447563.0, "step": 78540 }, { "entropy": 1.9033118396997453, "epoch": 0.24349812726535794, "grad_norm": 7.533633708953857, "learning_rate": 5.126814822647584e-06, "loss": 0.46, "mean_token_accuracy": 0.8476984471082687, "num_tokens": 94459089.0, "step": 78550 }, { "entropy": 1.9617197692394257, "epoch": 0.24352912639040764, "grad_norm": 9.499611854553223, "learning_rate": 5.126488508795272e-06, "loss": 0.5812, "mean_token_accuracy": 0.8220301568508148, "num_tokens": 94469681.0, "step": 78560 }, { "entropy": 1.8323184587061405, "epoch": 0.24356012551545733, "grad_norm": 9.460269927978516, "learning_rate": 5.126162257243148e-06, "loss": 0.4517, "mean_token_accuracy": 0.8555850937962532, "num_tokens": 94481974.0, "step": 78570 }, { "entropy": 1.8734732553362847, "epoch": 0.24359112464050703, "grad_norm": 8.973634719848633, "learning_rate": 5.1258360679713916e-06, "loss": 0.4844, "mean_token_accuracy": 0.8513824939727783, "num_tokens": 94493390.0, "step": 78580 }, { "entropy": 1.935685896873474, "epoch": 0.24362212376555673, "grad_norm": 7.930929183959961, "learning_rate": 5.125509940960189e-06, "loss": 0.5584, "mean_token_accuracy": 0.8314585655927658, "num_tokens": 94504378.0, "step": 78590 }, { "entropy": 1.881124185025692, "epoch": 0.24365312289060642, "grad_norm": 4.80453634262085, "learning_rate": 5.12518387618974e-06, "loss": 0.4441, "mean_token_accuracy": 0.8595498457551003, "num_tokens": 94515741.0, "step": 78600 }, { "entropy": 1.867940901219845, "epoch": 0.24368412201565612, "grad_norm": 8.655344009399414, "learning_rate": 5.124857873640244e-06, "loss": 0.4633, "mean_token_accuracy": 0.8418323248624802, "num_tokens": 94528325.0, "step": 78610 }, { "entropy": 1.9250017136335373, "epoch": 0.24371512114070581, "grad_norm": 9.36817741394043, "learning_rate": 5.124531933291918e-06, "loss": 0.5308, "mean_token_accuracy": 0.8363332346081733, "num_tokens": 94539198.0, "step": 78620 }, { "entropy": 1.782043893635273, "epoch": 0.2437461202657555, "grad_norm": 6.611782073974609, "learning_rate": 5.124206055124986e-06, "loss": 0.4036, "mean_token_accuracy": 0.8502190038561821, "num_tokens": 94553598.0, "step": 78630 }, { "entropy": 1.9345451429486276, "epoch": 0.2437771193908052, "grad_norm": 8.913650512695312, "learning_rate": 5.123880239119677e-06, "loss": 0.5216, "mean_token_accuracy": 0.8417821496725082, "num_tokens": 94564564.0, "step": 78640 }, { "entropy": 1.900984486937523, "epoch": 0.24380811851585488, "grad_norm": 7.233798503875732, "learning_rate": 5.123554485256231e-06, "loss": 0.5571, "mean_token_accuracy": 0.835866068303585, "num_tokens": 94577049.0, "step": 78650 }, { "entropy": 1.898327873647213, "epoch": 0.24383911764090457, "grad_norm": 9.916297912597656, "learning_rate": 5.123228793514897e-06, "loss": 0.5133, "mean_token_accuracy": 0.8348048180341721, "num_tokens": 94588672.0, "step": 78660 }, { "entropy": 1.9103514924645424, "epoch": 0.24387011676595427, "grad_norm": 7.947087287902832, "learning_rate": 5.122903163875935e-06, "loss": 0.5346, "mean_token_accuracy": 0.8312714904546737, "num_tokens": 94600977.0, "step": 78670 }, { "entropy": 1.9822331309318542, "epoch": 0.24390111589100397, "grad_norm": 7.2357001304626465, "learning_rate": 5.1225775963196104e-06, "loss": 0.5092, "mean_token_accuracy": 0.8357154637575149, "num_tokens": 94612105.0, "step": 78680 }, { "entropy": 1.8677599892020225, "epoch": 0.24393211501605366, "grad_norm": 8.366761207580566, "learning_rate": 5.1222520908261965e-06, "loss": 0.4345, "mean_token_accuracy": 0.8575931638479233, "num_tokens": 94624016.0, "step": 78690 }, { "entropy": 1.9394556313753128, "epoch": 0.24396311414110336, "grad_norm": 10.68550968170166, "learning_rate": 5.12192664737598e-06, "loss": 0.5114, "mean_token_accuracy": 0.8385277986526489, "num_tokens": 94635343.0, "step": 78700 }, { "entropy": 1.8500966012477875, "epoch": 0.24399411326615306, "grad_norm": 8.567591667175293, "learning_rate": 5.121601265949253e-06, "loss": 0.432, "mean_token_accuracy": 0.8590314507484436, "num_tokens": 94647452.0, "step": 78710 }, { "entropy": 1.900219763815403, "epoch": 0.24402511239120275, "grad_norm": 9.341552734375, "learning_rate": 5.121275946526316e-06, "loss": 0.4803, "mean_token_accuracy": 0.8450950443744659, "num_tokens": 94659220.0, "step": 78720 }, { "entropy": 1.8428411841392518, "epoch": 0.24405611151625245, "grad_norm": 5.176535129547119, "learning_rate": 5.120950689087481e-06, "loss": 0.4266, "mean_token_accuracy": 0.8477652072906494, "num_tokens": 94672010.0, "step": 78730 }, { "entropy": 1.960240864753723, "epoch": 0.24408711064130215, "grad_norm": 7.885514736175537, "learning_rate": 5.120625493613066e-06, "loss": 0.539, "mean_token_accuracy": 0.8431348979473114, "num_tokens": 94682844.0, "step": 78740 }, { "entropy": 1.9407020330429077, "epoch": 0.24411810976635184, "grad_norm": 9.455016136169434, "learning_rate": 5.1203003600834e-06, "loss": 0.4972, "mean_token_accuracy": 0.8410814523696899, "num_tokens": 94694739.0, "step": 78750 }, { "entropy": 1.922006744146347, "epoch": 0.24414910889140154, "grad_norm": 4.940672874450684, "learning_rate": 5.119975288478818e-06, "loss": 0.4887, "mean_token_accuracy": 0.8352184310555458, "num_tokens": 94707168.0, "step": 78760 }, { "entropy": 1.8744517169892787, "epoch": 0.24418010801645124, "grad_norm": 8.631587982177734, "learning_rate": 5.119650278779667e-06, "loss": 0.4898, "mean_token_accuracy": 0.841625614464283, "num_tokens": 94719921.0, "step": 78770 }, { "entropy": 1.8324996635317803, "epoch": 0.24421110714150093, "grad_norm": 7.340449333190918, "learning_rate": 5.119325330966301e-06, "loss": 0.5179, "mean_token_accuracy": 0.8373197898268699, "num_tokens": 94732979.0, "step": 78780 }, { "entropy": 1.820642825961113, "epoch": 0.24424210626655063, "grad_norm": 3.907349109649658, "learning_rate": 5.119000445019081e-06, "loss": 0.4211, "mean_token_accuracy": 0.853115190565586, "num_tokens": 94745715.0, "step": 78790 }, { "entropy": 1.992446595430374, "epoch": 0.24427310539160033, "grad_norm": 8.855254173278809, "learning_rate": 5.118675620918381e-06, "loss": 0.5751, "mean_token_accuracy": 0.8356500491499901, "num_tokens": 94757429.0, "step": 78800 }, { "entropy": 1.915099148452282, "epoch": 0.24430410451665002, "grad_norm": 8.259538650512695, "learning_rate": 5.11835085864458e-06, "loss": 0.5032, "mean_token_accuracy": 0.8401688992977142, "num_tokens": 94769751.0, "step": 78810 }, { "entropy": 1.999873200058937, "epoch": 0.24433510364169972, "grad_norm": 8.896110534667969, "learning_rate": 5.118026158178065e-06, "loss": 0.5863, "mean_token_accuracy": 0.8252399504184723, "num_tokens": 94781117.0, "step": 78820 }, { "entropy": 1.9294245585799217, "epoch": 0.24436610276674942, "grad_norm": 8.745575904846191, "learning_rate": 5.117701519499237e-06, "loss": 0.5684, "mean_token_accuracy": 0.8319064795970916, "num_tokens": 94793063.0, "step": 78830 }, { "entropy": 1.9275419771671296, "epoch": 0.2443971018917991, "grad_norm": 9.089315414428711, "learning_rate": 5.1173769425885015e-06, "loss": 0.456, "mean_token_accuracy": 0.8480811953544617, "num_tokens": 94804637.0, "step": 78840 }, { "entropy": 1.8522877663373947, "epoch": 0.2444281010168488, "grad_norm": 8.082670211791992, "learning_rate": 5.117052427426272e-06, "loss": 0.4151, "mean_token_accuracy": 0.8565731942653656, "num_tokens": 94817168.0, "step": 78850 }, { "entropy": 1.9246091678738595, "epoch": 0.2444591001418985, "grad_norm": 7.712028980255127, "learning_rate": 5.116727973992975e-06, "loss": 0.5084, "mean_token_accuracy": 0.8396865114569664, "num_tokens": 94828976.0, "step": 78860 }, { "entropy": 1.9487199917435647, "epoch": 0.2444900992669482, "grad_norm": 8.486215591430664, "learning_rate": 5.116403582269041e-06, "loss": 0.4936, "mean_token_accuracy": 0.8476333856582642, "num_tokens": 94840153.0, "step": 78870 }, { "entropy": 1.8542052239179612, "epoch": 0.2445210983919979, "grad_norm": 9.091276168823242, "learning_rate": 5.11607925223491e-06, "loss": 0.4383, "mean_token_accuracy": 0.8544638007879257, "num_tokens": 94852408.0, "step": 78880 }, { "entropy": 1.938251782208681, "epoch": 0.24455209751704757, "grad_norm": 8.672202110290527, "learning_rate": 5.115754983871035e-06, "loss": 0.5264, "mean_token_accuracy": 0.8358112215995789, "num_tokens": 94864131.0, "step": 78890 }, { "entropy": 1.9440853491425514, "epoch": 0.24458309664209726, "grad_norm": 8.338716506958008, "learning_rate": 5.115430777157873e-06, "loss": 0.5453, "mean_token_accuracy": 0.8359151259064674, "num_tokens": 94875953.0, "step": 78900 }, { "entropy": 1.9307004496455193, "epoch": 0.24461409576714696, "grad_norm": 8.26486587524414, "learning_rate": 5.11510663207589e-06, "loss": 0.503, "mean_token_accuracy": 0.8382903128862381, "num_tokens": 94887663.0, "step": 78910 }, { "entropy": 1.88083965331316, "epoch": 0.24464509489219666, "grad_norm": 9.269944190979004, "learning_rate": 5.114782548605563e-06, "loss": 0.4844, "mean_token_accuracy": 0.8377712652087211, "num_tokens": 94899958.0, "step": 78920 }, { "entropy": 1.872906593978405, "epoch": 0.24467609401724635, "grad_norm": 9.342957496643066, "learning_rate": 5.1144585267273764e-06, "loss": 0.4659, "mean_token_accuracy": 0.8415094420313836, "num_tokens": 94912389.0, "step": 78930 }, { "entropy": 2.0011187493801117, "epoch": 0.24470709314229605, "grad_norm": 9.931851387023926, "learning_rate": 5.114134566421823e-06, "loss": 0.5745, "mean_token_accuracy": 0.8279946282505989, "num_tokens": 94923039.0, "step": 78940 }, { "entropy": 1.8712818920612335, "epoch": 0.24473809226734575, "grad_norm": 8.743650436401367, "learning_rate": 5.113810667669406e-06, "loss": 0.4473, "mean_token_accuracy": 0.8516668871045112, "num_tokens": 94935553.0, "step": 78950 }, { "entropy": 1.8285594224929809, "epoch": 0.24476909139239544, "grad_norm": 6.643009662628174, "learning_rate": 5.1134868304506335e-06, "loss": 0.3967, "mean_token_accuracy": 0.8515212833881378, "num_tokens": 94949056.0, "step": 78960 }, { "entropy": 1.8055023223161697, "epoch": 0.24480009051744514, "grad_norm": 8.94931411743164, "learning_rate": 5.1131630547460264e-06, "loss": 0.419, "mean_token_accuracy": 0.8564708262681962, "num_tokens": 94962250.0, "step": 78970 }, { "entropy": 1.8580831050872804, "epoch": 0.24483108964249484, "grad_norm": 8.115015983581543, "learning_rate": 5.112839340536111e-06, "loss": 0.4625, "mean_token_accuracy": 0.8429727420210839, "num_tokens": 94975092.0, "step": 78980 }, { "entropy": 1.8813864275813104, "epoch": 0.24486208876754453, "grad_norm": 3.8340635299682617, "learning_rate": 5.112515687801425e-06, "loss": 0.4508, "mean_token_accuracy": 0.8568834289908409, "num_tokens": 94986957.0, "step": 78990 }, { "entropy": 1.9341845840215683, "epoch": 0.24489308789259423, "grad_norm": 9.034078598022461, "learning_rate": 5.112192096522513e-06, "loss": 0.5372, "mean_token_accuracy": 0.8297013834118843, "num_tokens": 94998820.0, "step": 79000 }, { "entropy": 1.8774139389395714, "epoch": 0.24492408701764393, "grad_norm": 8.676530838012695, "learning_rate": 5.1118685666799276e-06, "loss": 0.5233, "mean_token_accuracy": 0.8356031611561775, "num_tokens": 95011150.0, "step": 79010 }, { "entropy": 1.7854979574680327, "epoch": 0.24495508614269362, "grad_norm": 4.426049709320068, "learning_rate": 5.111545098254231e-06, "loss": 0.3809, "mean_token_accuracy": 0.8604443162679672, "num_tokens": 95024853.0, "step": 79020 }, { "entropy": 1.9145899727940558, "epoch": 0.24498608526774332, "grad_norm": 3.6612796783447266, "learning_rate": 5.111221691225996e-06, "loss": 0.4687, "mean_token_accuracy": 0.8452530711889267, "num_tokens": 95036962.0, "step": 79030 }, { "entropy": 1.9246152505278586, "epoch": 0.24501708439279302, "grad_norm": 4.121425628662109, "learning_rate": 5.1108983455758e-06, "loss": 0.4913, "mean_token_accuracy": 0.8439809292554855, "num_tokens": 95048716.0, "step": 79040 }, { "entropy": 1.8734241530299187, "epoch": 0.2450480835178427, "grad_norm": 7.86496114730835, "learning_rate": 5.110575061284232e-06, "loss": 0.4875, "mean_token_accuracy": 0.8426591798663139, "num_tokens": 95061683.0, "step": 79050 }, { "entropy": 1.9878466486930848, "epoch": 0.2450790826428924, "grad_norm": 7.495734691619873, "learning_rate": 5.110251838331888e-06, "loss": 0.5437, "mean_token_accuracy": 0.8370781674981117, "num_tokens": 95072706.0, "step": 79060 }, { "entropy": 1.8844273030757903, "epoch": 0.2451100817679421, "grad_norm": 8.038873672485352, "learning_rate": 5.109928676699374e-06, "loss": 0.5024, "mean_token_accuracy": 0.8393307909369468, "num_tokens": 95084183.0, "step": 79070 }, { "entropy": 1.9518539249897002, "epoch": 0.2451410808929918, "grad_norm": 8.668111801147461, "learning_rate": 5.109605576367302e-06, "loss": 0.5666, "mean_token_accuracy": 0.8321204602718353, "num_tokens": 95095191.0, "step": 79080 }, { "entropy": 1.8990130126476288, "epoch": 0.2451720800180415, "grad_norm": 9.605693817138672, "learning_rate": 5.1092825373162965e-06, "loss": 0.4926, "mean_token_accuracy": 0.8330251052975655, "num_tokens": 95107290.0, "step": 79090 }, { "entropy": 1.853539851307869, "epoch": 0.2452030791430912, "grad_norm": 8.671045303344727, "learning_rate": 5.108959559526987e-06, "loss": 0.4328, "mean_token_accuracy": 0.8565944582223892, "num_tokens": 95119603.0, "step": 79100 }, { "entropy": 1.9775980800390243, "epoch": 0.2452340782681409, "grad_norm": 10.049057960510254, "learning_rate": 5.108636642980014e-06, "loss": 0.5379, "mean_token_accuracy": 0.8332414567470551, "num_tokens": 95130785.0, "step": 79110 }, { "entropy": 1.8694923490285873, "epoch": 0.2452650773931906, "grad_norm": 5.189975738525391, "learning_rate": 5.108313787656024e-06, "loss": 0.4671, "mean_token_accuracy": 0.8414556428790092, "num_tokens": 95142468.0, "step": 79120 }, { "entropy": 1.8693754494190216, "epoch": 0.24529607651824029, "grad_norm": 9.248258590698242, "learning_rate": 5.107990993535676e-06, "loss": 0.4554, "mean_token_accuracy": 0.848180778324604, "num_tokens": 95154856.0, "step": 79130 }, { "entropy": 1.8843225702643394, "epoch": 0.24532707564328995, "grad_norm": 8.56281852722168, "learning_rate": 5.107668260599633e-06, "loss": 0.4779, "mean_token_accuracy": 0.8529567748308182, "num_tokens": 95165960.0, "step": 79140 }, { "entropy": 1.8180608585476876, "epoch": 0.24535807476833965, "grad_norm": 3.8715312480926514, "learning_rate": 5.107345588828569e-06, "loss": 0.4352, "mean_token_accuracy": 0.85515176653862, "num_tokens": 95178500.0, "step": 79150 }, { "entropy": 1.9070587247610091, "epoch": 0.24538907389338935, "grad_norm": 7.640878677368164, "learning_rate": 5.107022978203167e-06, "loss": 0.4931, "mean_token_accuracy": 0.8319504678249359, "num_tokens": 95190327.0, "step": 79160 }, { "entropy": 1.8788634032011031, "epoch": 0.24542007301843904, "grad_norm": 7.821987628936768, "learning_rate": 5.106700428704119e-06, "loss": 0.5155, "mean_token_accuracy": 0.8348909318447113, "num_tokens": 95202577.0, "step": 79170 }, { "entropy": 1.8948452711105346, "epoch": 0.24545107214348874, "grad_norm": 9.048386573791504, "learning_rate": 5.1063779403121214e-06, "loss": 0.4898, "mean_token_accuracy": 0.837831036746502, "num_tokens": 95214436.0, "step": 79180 }, { "entropy": 1.889863994717598, "epoch": 0.24548207126853844, "grad_norm": 11.092171669006348, "learning_rate": 5.106055513007883e-06, "loss": 0.5045, "mean_token_accuracy": 0.8403749257326126, "num_tokens": 95225844.0, "step": 79190 }, { "entropy": 1.8009657189249992, "epoch": 0.24551307039358813, "grad_norm": 10.045639038085938, "learning_rate": 5.105733146772122e-06, "loss": 0.4362, "mean_token_accuracy": 0.8504743695259094, "num_tokens": 95239379.0, "step": 79200 }, { "entropy": 1.9157941043376923, "epoch": 0.24554406951863783, "grad_norm": 8.408306121826172, "learning_rate": 5.105410841585562e-06, "loss": 0.4829, "mean_token_accuracy": 0.837490102648735, "num_tokens": 95251567.0, "step": 79210 }, { "entropy": 1.8614981457591058, "epoch": 0.24557506864368753, "grad_norm": 8.256009101867676, "learning_rate": 5.1050885974289354e-06, "loss": 0.4565, "mean_token_accuracy": 0.8346313327550888, "num_tokens": 95264155.0, "step": 79220 }, { "entropy": 1.9501688122749328, "epoch": 0.24560606776873722, "grad_norm": 8.228469848632812, "learning_rate": 5.104766414282987e-06, "loss": 0.5083, "mean_token_accuracy": 0.8341888338327408, "num_tokens": 95275602.0, "step": 79230 }, { "entropy": 1.8669834539294243, "epoch": 0.24563706689378692, "grad_norm": 7.618318557739258, "learning_rate": 5.1044442921284635e-06, "loss": 0.4577, "mean_token_accuracy": 0.8466882199048996, "num_tokens": 95288087.0, "step": 79240 }, { "entropy": 2.0039695501327515, "epoch": 0.24566806601883662, "grad_norm": 10.323431968688965, "learning_rate": 5.104122230946127e-06, "loss": 0.5297, "mean_token_accuracy": 0.8389030545949936, "num_tokens": 95298644.0, "step": 79250 }, { "entropy": 1.9950080424547196, "epoch": 0.2456990651438863, "grad_norm": 8.642731666564941, "learning_rate": 5.103800230716744e-06, "loss": 0.5272, "mean_token_accuracy": 0.8389644399285316, "num_tokens": 95309147.0, "step": 79260 }, { "entropy": 1.9348495423793792, "epoch": 0.245730064268936, "grad_norm": 9.167795181274414, "learning_rate": 5.10347829142109e-06, "loss": 0.5279, "mean_token_accuracy": 0.8349600344896316, "num_tokens": 95320103.0, "step": 79270 }, { "entropy": 1.925625742971897, "epoch": 0.2457610633939857, "grad_norm": 9.226808547973633, "learning_rate": 5.103156413039949e-06, "loss": 0.4999, "mean_token_accuracy": 0.8468294337391853, "num_tokens": 95331516.0, "step": 79280 }, { "entropy": 1.890987327694893, "epoch": 0.2457920625190354, "grad_norm": 9.890280723571777, "learning_rate": 5.102834595554116e-06, "loss": 0.4966, "mean_token_accuracy": 0.8447692885994911, "num_tokens": 95343047.0, "step": 79290 }, { "entropy": 1.9169722646474838, "epoch": 0.2458230616440851, "grad_norm": 7.960250377655029, "learning_rate": 5.102512838944389e-06, "loss": 0.5274, "mean_token_accuracy": 0.8344372197985649, "num_tokens": 95354554.0, "step": 79300 }, { "entropy": 1.9122997641563415, "epoch": 0.2458540607691348, "grad_norm": 7.443688869476318, "learning_rate": 5.102191143191582e-06, "loss": 0.4783, "mean_token_accuracy": 0.844282315671444, "num_tokens": 95367170.0, "step": 79310 }, { "entropy": 1.989633098244667, "epoch": 0.2458850598941845, "grad_norm": 8.14206600189209, "learning_rate": 5.101869508276509e-06, "loss": 0.5527, "mean_token_accuracy": 0.8317234337329864, "num_tokens": 95377788.0, "step": 79320 }, { "entropy": 1.8243707031011582, "epoch": 0.2459160590192342, "grad_norm": 8.314239501953125, "learning_rate": 5.10154793418e-06, "loss": 0.3902, "mean_token_accuracy": 0.863274447619915, "num_tokens": 95391018.0, "step": 79330 }, { "entropy": 2.020896875858307, "epoch": 0.2459470581442839, "grad_norm": 9.07596492767334, "learning_rate": 5.10122642088289e-06, "loss": 0.5552, "mean_token_accuracy": 0.8318601101636887, "num_tokens": 95401518.0, "step": 79340 }, { "entropy": 1.9158252328634262, "epoch": 0.24597805726933358, "grad_norm": 7.7173614501953125, "learning_rate": 5.100904968366021e-06, "loss": 0.4968, "mean_token_accuracy": 0.8370094135403633, "num_tokens": 95413144.0, "step": 79350 }, { "entropy": 1.8825179994106294, "epoch": 0.24600905639438328, "grad_norm": 9.029237747192383, "learning_rate": 5.100583576610246e-06, "loss": 0.4686, "mean_token_accuracy": 0.8415081441402436, "num_tokens": 95426258.0, "step": 79360 }, { "entropy": 1.8851216346025468, "epoch": 0.24604005551943298, "grad_norm": 9.818262100219727, "learning_rate": 5.100262245596426e-06, "loss": 0.5106, "mean_token_accuracy": 0.8332912772893906, "num_tokens": 95438222.0, "step": 79370 }, { "entropy": 1.8941182017326355, "epoch": 0.24607105464448267, "grad_norm": 8.11664867401123, "learning_rate": 5.099940975305429e-06, "loss": 0.4486, "mean_token_accuracy": 0.8558889240026474, "num_tokens": 95449373.0, "step": 79380 }, { "entropy": 1.870937429368496, "epoch": 0.24610205376953234, "grad_norm": 9.294564247131348, "learning_rate": 5.099619765718133e-06, "loss": 0.4675, "mean_token_accuracy": 0.8433859512209892, "num_tokens": 95461757.0, "step": 79390 }, { "entropy": 1.9412083461880685, "epoch": 0.24613305289458204, "grad_norm": 9.425738334655762, "learning_rate": 5.099298616815426e-06, "loss": 0.5611, "mean_token_accuracy": 0.8349045276641845, "num_tokens": 95473347.0, "step": 79400 }, { "entropy": 1.8201213255524635, "epoch": 0.24616405201963173, "grad_norm": 4.238409042358398, "learning_rate": 5.098977528578199e-06, "loss": 0.4338, "mean_token_accuracy": 0.8556158289313316, "num_tokens": 95486624.0, "step": 79410 }, { "entropy": 1.8622691087424754, "epoch": 0.24619505114468143, "grad_norm": 7.4858012199401855, "learning_rate": 5.098656500987356e-06, "loss": 0.4253, "mean_token_accuracy": 0.8564441815018654, "num_tokens": 95499802.0, "step": 79420 }, { "entropy": 1.823809403181076, "epoch": 0.24622605026973113, "grad_norm": 2.4532666206359863, "learning_rate": 5.0983355340238096e-06, "loss": 0.4602, "mean_token_accuracy": 0.8558837413787842, "num_tokens": 95512410.0, "step": 79430 }, { "entropy": 1.8783535480499267, "epoch": 0.24625704939478082, "grad_norm": 7.133194446563721, "learning_rate": 5.0980146276684775e-06, "loss": 0.5046, "mean_token_accuracy": 0.8473603904247284, "num_tokens": 95524775.0, "step": 79440 }, { "entropy": 1.8303729638457298, "epoch": 0.24628804851983052, "grad_norm": 8.516709327697754, "learning_rate": 5.097693781902286e-06, "loss": 0.4894, "mean_token_accuracy": 0.8431586921215057, "num_tokens": 95537638.0, "step": 79450 }, { "entropy": 1.9185088947415352, "epoch": 0.24631904764488022, "grad_norm": 10.453644752502441, "learning_rate": 5.097372996706177e-06, "loss": 0.4657, "mean_token_accuracy": 0.84544235765934, "num_tokens": 95549282.0, "step": 79460 }, { "entropy": 1.8807260736823082, "epoch": 0.24635004676992991, "grad_norm": 11.140459060668945, "learning_rate": 5.09705227206109e-06, "loss": 0.4641, "mean_token_accuracy": 0.8533786773681641, "num_tokens": 95561550.0, "step": 79470 }, { "entropy": 1.9539588153362275, "epoch": 0.2463810458949796, "grad_norm": 9.068120002746582, "learning_rate": 5.096731607947981e-06, "loss": 0.5577, "mean_token_accuracy": 0.8324769794940948, "num_tokens": 95572478.0, "step": 79480 }, { "entropy": 1.803953130543232, "epoch": 0.2464120450200293, "grad_norm": 3.0993998050689697, "learning_rate": 5.096411004347811e-06, "loss": 0.4285, "mean_token_accuracy": 0.8539158910512924, "num_tokens": 95585431.0, "step": 79490 }, { "entropy": 1.9023709833621978, "epoch": 0.246443044145079, "grad_norm": 8.446845054626465, "learning_rate": 5.096090461241549e-06, "loss": 0.4672, "mean_token_accuracy": 0.8470258235931396, "num_tokens": 95596888.0, "step": 79500 }, { "entropy": 1.9057755261659621, "epoch": 0.2464740432701287, "grad_norm": 3.741781711578369, "learning_rate": 5.095769978610174e-06, "loss": 0.4954, "mean_token_accuracy": 0.8380037263035774, "num_tokens": 95608611.0, "step": 79510 }, { "entropy": 1.8339520961046218, "epoch": 0.2465050423951784, "grad_norm": 3.7660844326019287, "learning_rate": 5.095449556434673e-06, "loss": 0.4444, "mean_token_accuracy": 0.8561963886022568, "num_tokens": 95621421.0, "step": 79520 }, { "entropy": 1.8732254639267922, "epoch": 0.2465360415202281, "grad_norm": 8.025121688842773, "learning_rate": 5.09512919469604e-06, "loss": 0.4849, "mean_token_accuracy": 0.8454299658536911, "num_tokens": 95633799.0, "step": 79530 }, { "entropy": 1.9583280727267265, "epoch": 0.2465670406452778, "grad_norm": 8.794914245605469, "learning_rate": 5.0948088933752795e-06, "loss": 0.5379, "mean_token_accuracy": 0.8344408705830574, "num_tokens": 95645176.0, "step": 79540 }, { "entropy": 1.8870036020874976, "epoch": 0.2465980397703275, "grad_norm": 3.5362777709960938, "learning_rate": 5.094488652453403e-06, "loss": 0.4673, "mean_token_accuracy": 0.8496022373437881, "num_tokens": 95657347.0, "step": 79550 }, { "entropy": 1.9044120475649833, "epoch": 0.24662903889537718, "grad_norm": 9.073451042175293, "learning_rate": 5.094168471911431e-06, "loss": 0.4952, "mean_token_accuracy": 0.8413415655493737, "num_tokens": 95670276.0, "step": 79560 }, { "entropy": 1.8711192563176156, "epoch": 0.24666003802042688, "grad_norm": 7.03518533706665, "learning_rate": 5.0938483517303914e-06, "loss": 0.4409, "mean_token_accuracy": 0.8590395718812942, "num_tokens": 95682420.0, "step": 79570 }, { "entropy": 1.8627604782581328, "epoch": 0.24669103714547658, "grad_norm": 8.095098495483398, "learning_rate": 5.093528291891321e-06, "loss": 0.4162, "mean_token_accuracy": 0.8616518348455429, "num_tokens": 95695131.0, "step": 79580 }, { "entropy": 1.8479859337210656, "epoch": 0.24672203627052627, "grad_norm": 10.04355525970459, "learning_rate": 5.093208292375264e-06, "loss": 0.4436, "mean_token_accuracy": 0.8598237797617913, "num_tokens": 95707350.0, "step": 79590 }, { "entropy": 1.8836325496435165, "epoch": 0.24675303539557597, "grad_norm": 7.801244735717773, "learning_rate": 5.092888353163278e-06, "loss": 0.4592, "mean_token_accuracy": 0.8486521244049072, "num_tokens": 95719130.0, "step": 79600 }, { "entropy": 1.8783444881439209, "epoch": 0.24678403452062567, "grad_norm": 7.576953411102295, "learning_rate": 5.092568474236419e-06, "loss": 0.4971, "mean_token_accuracy": 0.8356266215443611, "num_tokens": 95730894.0, "step": 79610 }, { "entropy": 1.778034082055092, "epoch": 0.24681503364567536, "grad_norm": 8.774757385253906, "learning_rate": 5.0922486555757615e-06, "loss": 0.4483, "mean_token_accuracy": 0.8508021235466003, "num_tokens": 95743797.0, "step": 79620 }, { "entropy": 1.893149121105671, "epoch": 0.24684603277072506, "grad_norm": 10.234265327453613, "learning_rate": 5.09192889716238e-06, "loss": 0.4511, "mean_token_accuracy": 0.8557818323373795, "num_tokens": 95756512.0, "step": 79630 }, { "entropy": 1.9457867011427878, "epoch": 0.24687703189577473, "grad_norm": 8.266569137573242, "learning_rate": 5.091609198977366e-06, "loss": 0.5249, "mean_token_accuracy": 0.832012552022934, "num_tokens": 95767848.0, "step": 79640 }, { "entropy": 1.8916153132915496, "epoch": 0.24690803102082443, "grad_norm": 8.928706169128418, "learning_rate": 5.091289561001813e-06, "loss": 0.4741, "mean_token_accuracy": 0.8554543048143387, "num_tokens": 95779825.0, "step": 79650 }, { "entropy": 1.843326412141323, "epoch": 0.24693903014587412, "grad_norm": 7.779528617858887, "learning_rate": 5.090969983216823e-06, "loss": 0.4886, "mean_token_accuracy": 0.8484487190842629, "num_tokens": 95792518.0, "step": 79660 }, { "entropy": 1.8858656302094459, "epoch": 0.24697002927092382, "grad_norm": 4.888106822967529, "learning_rate": 5.090650465603507e-06, "loss": 0.4602, "mean_token_accuracy": 0.8411580830812454, "num_tokens": 95805316.0, "step": 79670 }, { "entropy": 1.9305724531412125, "epoch": 0.24700102839597352, "grad_norm": 8.88187026977539, "learning_rate": 5.090331008142988e-06, "loss": 0.5174, "mean_token_accuracy": 0.8380944326519966, "num_tokens": 95816570.0, "step": 79680 }, { "entropy": 1.944003912806511, "epoch": 0.2470320275210232, "grad_norm": 8.070425033569336, "learning_rate": 5.090011610816392e-06, "loss": 0.5157, "mean_token_accuracy": 0.8433306530117989, "num_tokens": 95827116.0, "step": 79690 }, { "entropy": 1.8563833236694336, "epoch": 0.2470630266460729, "grad_norm": 9.2925386428833, "learning_rate": 5.089692273604857e-06, "loss": 0.4667, "mean_token_accuracy": 0.8484684824943542, "num_tokens": 95839368.0, "step": 79700 }, { "entropy": 1.8929186090826988, "epoch": 0.2470940257711226, "grad_norm": 7.603442668914795, "learning_rate": 5.089372996489528e-06, "loss": 0.5032, "mean_token_accuracy": 0.844200924038887, "num_tokens": 95850482.0, "step": 79710 }, { "entropy": 1.9358657032251358, "epoch": 0.2471250248961723, "grad_norm": 10.141935348510742, "learning_rate": 5.089053779451555e-06, "loss": 0.5079, "mean_token_accuracy": 0.8503325775265693, "num_tokens": 95861937.0, "step": 79720 }, { "entropy": 1.7907779216766357, "epoch": 0.247156024021222, "grad_norm": 4.321413040161133, "learning_rate": 5.088734622472102e-06, "loss": 0.4334, "mean_token_accuracy": 0.846727529168129, "num_tokens": 95875553.0, "step": 79730 }, { "entropy": 1.889219006896019, "epoch": 0.2471870231462717, "grad_norm": 4.462600231170654, "learning_rate": 5.0884155255323405e-06, "loss": 0.519, "mean_token_accuracy": 0.8342496454715729, "num_tokens": 95888042.0, "step": 79740 }, { "entropy": 1.9225512325763703, "epoch": 0.2472180222713214, "grad_norm": 8.681279182434082, "learning_rate": 5.088096488613445e-06, "loss": 0.4779, "mean_token_accuracy": 0.8418059065937996, "num_tokens": 95899096.0, "step": 79750 }, { "entropy": 1.9431275591254233, "epoch": 0.2472490213963711, "grad_norm": 10.247178077697754, "learning_rate": 5.087777511696603e-06, "loss": 0.5136, "mean_token_accuracy": 0.8435952246189118, "num_tokens": 95910960.0, "step": 79760 }, { "entropy": 1.9674682512879371, "epoch": 0.24728002052142078, "grad_norm": 10.39339542388916, "learning_rate": 5.087458594763011e-06, "loss": 0.5176, "mean_token_accuracy": 0.8337034076452255, "num_tokens": 95922652.0, "step": 79770 }, { "entropy": 1.9188352286815644, "epoch": 0.24731101964647048, "grad_norm": 8.393038749694824, "learning_rate": 5.087139737793868e-06, "loss": 0.4999, "mean_token_accuracy": 0.8489988818764687, "num_tokens": 95933179.0, "step": 79780 }, { "entropy": 1.9350088462233543, "epoch": 0.24734201877152018, "grad_norm": 7.100698471069336, "learning_rate": 5.086820940770387e-06, "loss": 0.4757, "mean_token_accuracy": 0.8473016381263733, "num_tokens": 95944650.0, "step": 79790 }, { "entropy": 1.9587112590670586, "epoch": 0.24737301789656987, "grad_norm": 10.05678653717041, "learning_rate": 5.0865022036737876e-06, "loss": 0.5308, "mean_token_accuracy": 0.8289535716176033, "num_tokens": 95956214.0, "step": 79800 }, { "entropy": 1.854060137271881, "epoch": 0.24740401702161957, "grad_norm": 7.256962776184082, "learning_rate": 5.086183526485297e-06, "loss": 0.4153, "mean_token_accuracy": 0.8500014215707778, "num_tokens": 95968951.0, "step": 79810 }, { "entropy": 1.9160088315606116, "epoch": 0.24743501614666927, "grad_norm": 8.998300552368164, "learning_rate": 5.08586490918615e-06, "loss": 0.5045, "mean_token_accuracy": 0.8392581641674042, "num_tokens": 95980577.0, "step": 79820 }, { "entropy": 1.88340106010437, "epoch": 0.24746601527171896, "grad_norm": 5.0286030769348145, "learning_rate": 5.08554635175759e-06, "loss": 0.4791, "mean_token_accuracy": 0.8310941204428672, "num_tokens": 95993600.0, "step": 79830 }, { "entropy": 1.831264679133892, "epoch": 0.24749701439676866, "grad_norm": 9.444331169128418, "learning_rate": 5.085227854180872e-06, "loss": 0.4911, "mean_token_accuracy": 0.847031046450138, "num_tokens": 96006803.0, "step": 79840 }, { "entropy": 1.895230557024479, "epoch": 0.24752801352181836, "grad_norm": 9.098989486694336, "learning_rate": 5.0849094164372525e-06, "loss": 0.4721, "mean_token_accuracy": 0.8438019439578056, "num_tokens": 96018578.0, "step": 79850 }, { "entropy": 1.875000685453415, "epoch": 0.24755901264686805, "grad_norm": 8.089691162109375, "learning_rate": 5.084591038508003e-06, "loss": 0.4484, "mean_token_accuracy": 0.8493760719895362, "num_tokens": 96030419.0, "step": 79860 }, { "entropy": 1.9285678580403327, "epoch": 0.24759001177191775, "grad_norm": 8.030923843383789, "learning_rate": 5.0842727203744e-06, "loss": 0.4933, "mean_token_accuracy": 0.8497920066118241, "num_tokens": 96042124.0, "step": 79870 }, { "entropy": 1.9058194294571877, "epoch": 0.24762101089696742, "grad_norm": 6.725025653839111, "learning_rate": 5.083954462017727e-06, "loss": 0.45, "mean_token_accuracy": 0.8582265987992287, "num_tokens": 96053579.0, "step": 79880 }, { "entropy": 1.9877204060554505, "epoch": 0.24765201002201712, "grad_norm": 7.7386555671691895, "learning_rate": 5.083636263419278e-06, "loss": 0.5676, "mean_token_accuracy": 0.8407276496291161, "num_tokens": 96064626.0, "step": 79890 }, { "entropy": 1.9019395023584367, "epoch": 0.2476830091470668, "grad_norm": 8.415245056152344, "learning_rate": 5.083318124560355e-06, "loss": 0.4931, "mean_token_accuracy": 0.8463913604617119, "num_tokens": 96075407.0, "step": 79900 }, { "entropy": 1.9611292690038682, "epoch": 0.2477140082721165, "grad_norm": 7.75224494934082, "learning_rate": 5.083000045422266e-06, "loss": 0.5072, "mean_token_accuracy": 0.8489187434315681, "num_tokens": 96086058.0, "step": 79910 }, { "entropy": 1.8318257644772529, "epoch": 0.2477450073971662, "grad_norm": 4.178698539733887, "learning_rate": 5.082682025986331e-06, "loss": 0.4311, "mean_token_accuracy": 0.8574068948626519, "num_tokens": 96098208.0, "step": 79920 }, { "entropy": 1.8961550071835518, "epoch": 0.2477760065222159, "grad_norm": 9.65478515625, "learning_rate": 5.082364066233872e-06, "loss": 0.5372, "mean_token_accuracy": 0.8370327904820443, "num_tokens": 96109735.0, "step": 79930 }, { "entropy": 1.8611363500356675, "epoch": 0.2478070056472656, "grad_norm": 4.771914482116699, "learning_rate": 5.082046166146227e-06, "loss": 0.4327, "mean_token_accuracy": 0.8574331164360046, "num_tokens": 96121606.0, "step": 79940 }, { "entropy": 1.8779680162668229, "epoch": 0.2478380047723153, "grad_norm": 10.091143608093262, "learning_rate": 5.0817283257047375e-06, "loss": 0.479, "mean_token_accuracy": 0.8469004735350609, "num_tokens": 96133832.0, "step": 79950 }, { "entropy": 1.8258475840091706, "epoch": 0.247869003897365, "grad_norm": 8.701608657836914, "learning_rate": 5.081410544890754e-06, "loss": 0.4852, "mean_token_accuracy": 0.84885775744915, "num_tokens": 96146484.0, "step": 79960 }, { "entropy": 1.856001165509224, "epoch": 0.2479000030224147, "grad_norm": 7.74146842956543, "learning_rate": 5.081092823685633e-06, "loss": 0.4187, "mean_token_accuracy": 0.8504522681236267, "num_tokens": 96159276.0, "step": 79970 }, { "entropy": 1.8811848238110542, "epoch": 0.24793100214746439, "grad_norm": 4.684483051300049, "learning_rate": 5.0807751620707425e-06, "loss": 0.4806, "mean_token_accuracy": 0.841371999680996, "num_tokens": 96171602.0, "step": 79980 }, { "entropy": 1.9496762931346894, "epoch": 0.24796200127251408, "grad_norm": 9.64576530456543, "learning_rate": 5.0804575600274575e-06, "loss": 0.5328, "mean_token_accuracy": 0.8369770348072052, "num_tokens": 96182516.0, "step": 79990 }, { "entropy": 1.8256424218416214, "epoch": 0.24799300039756378, "grad_norm": 7.734647750854492, "learning_rate": 5.080140017537162e-06, "loss": 0.4226, "mean_token_accuracy": 0.8576287388801574, "num_tokens": 96195002.0, "step": 80000 }, { "entropy": 1.8538661286234857, "epoch": 0.24802399952261348, "grad_norm": 4.36631441116333, "learning_rate": 5.079822534581246e-06, "loss": 0.4843, "mean_token_accuracy": 0.8439467817544937, "num_tokens": 96207489.0, "step": 80010 }, { "entropy": 1.9011688023805617, "epoch": 0.24805499864766317, "grad_norm": 10.34740924835205, "learning_rate": 5.07950511114111e-06, "loss": 0.5364, "mean_token_accuracy": 0.8319029092788697, "num_tokens": 96218430.0, "step": 80020 }, { "entropy": 1.8638526298105718, "epoch": 0.24808599777271287, "grad_norm": 3.3797831535339355, "learning_rate": 5.07918774719816e-06, "loss": 0.4521, "mean_token_accuracy": 0.8378943756222725, "num_tokens": 96231778.0, "step": 80030 }, { "entropy": 1.861025558412075, "epoch": 0.24811699689776257, "grad_norm": 8.062844276428223, "learning_rate": 5.078870442733811e-06, "loss": 0.4897, "mean_token_accuracy": 0.8469403609633446, "num_tokens": 96243932.0, "step": 80040 }, { "entropy": 1.835094903409481, "epoch": 0.24814799602281226, "grad_norm": 4.131919860839844, "learning_rate": 5.07855319772949e-06, "loss": 0.4282, "mean_token_accuracy": 0.8548068985342979, "num_tokens": 96256765.0, "step": 80050 }, { "entropy": 1.9781760573387146, "epoch": 0.24817899514786196, "grad_norm": 8.877656936645508, "learning_rate": 5.078236012166626e-06, "loss": 0.546, "mean_token_accuracy": 0.8398910224437713, "num_tokens": 96267560.0, "step": 80060 }, { "entropy": 1.8965528056025505, "epoch": 0.24820999427291165, "grad_norm": 9.912452697753906, "learning_rate": 5.077918886026659e-06, "loss": 0.521, "mean_token_accuracy": 0.8338564082980156, "num_tokens": 96279157.0, "step": 80070 }, { "entropy": 1.8625015437602996, "epoch": 0.24824099339796135, "grad_norm": 7.917440891265869, "learning_rate": 5.077601819291041e-06, "loss": 0.4465, "mean_token_accuracy": 0.8532622784376145, "num_tokens": 96291601.0, "step": 80080 }, { "entropy": 1.8780291944742202, "epoch": 0.24827199252301105, "grad_norm": 4.1151123046875, "learning_rate": 5.077284811941222e-06, "loss": 0.5087, "mean_token_accuracy": 0.837240794301033, "num_tokens": 96303685.0, "step": 80090 }, { "entropy": 1.9204677224159241, "epoch": 0.24830299164806074, "grad_norm": 8.22902774810791, "learning_rate": 5.076967863958671e-06, "loss": 0.5369, "mean_token_accuracy": 0.8477205768227577, "num_tokens": 96314313.0, "step": 80100 }, { "entropy": 1.8754928812384606, "epoch": 0.24833399077311044, "grad_norm": 3.904203176498413, "learning_rate": 5.076650975324857e-06, "loss": 0.4377, "mean_token_accuracy": 0.852228082716465, "num_tokens": 96326655.0, "step": 80110 }, { "entropy": 1.8840164422988892, "epoch": 0.24836498989816014, "grad_norm": 7.1997504234313965, "learning_rate": 5.076334146021265e-06, "loss": 0.459, "mean_token_accuracy": 0.850933963060379, "num_tokens": 96338425.0, "step": 80120 }, { "entropy": 1.923794236779213, "epoch": 0.2483959890232098, "grad_norm": 9.449333190917969, "learning_rate": 5.076017376029378e-06, "loss": 0.5339, "mean_token_accuracy": 0.8383274003863335, "num_tokens": 96349766.0, "step": 80130 }, { "entropy": 1.9393807545304298, "epoch": 0.2484269881482595, "grad_norm": 7.0489397048950195, "learning_rate": 5.0757006653306975e-06, "loss": 0.488, "mean_token_accuracy": 0.8506465151906013, "num_tokens": 96360918.0, "step": 80140 }, { "entropy": 1.9307123482227326, "epoch": 0.2484579872733092, "grad_norm": 8.270514488220215, "learning_rate": 5.075384013906726e-06, "loss": 0.5188, "mean_token_accuracy": 0.8430813983082771, "num_tokens": 96371709.0, "step": 80150 }, { "entropy": 1.8894588977098465, "epoch": 0.2484889863983589, "grad_norm": 8.323099136352539, "learning_rate": 5.075067421738976e-06, "loss": 0.4728, "mean_token_accuracy": 0.8491836950182915, "num_tokens": 96384161.0, "step": 80160 }, { "entropy": 1.9335136204957961, "epoch": 0.2485199855234086, "grad_norm": 8.298654556274414, "learning_rate": 5.074750888808969e-06, "loss": 0.5369, "mean_token_accuracy": 0.8408285826444626, "num_tokens": 96395080.0, "step": 80170 }, { "entropy": 1.9049769386649131, "epoch": 0.2485509846484583, "grad_norm": 8.685126304626465, "learning_rate": 5.074434415098235e-06, "loss": 0.4992, "mean_token_accuracy": 0.8457057103514671, "num_tokens": 96406602.0, "step": 80180 }, { "entropy": 1.8694998741149902, "epoch": 0.248581983773508, "grad_norm": 9.182705879211426, "learning_rate": 5.07411800058831e-06, "loss": 0.4787, "mean_token_accuracy": 0.8451626107096673, "num_tokens": 96418323.0, "step": 80190 }, { "entropy": 1.8484734997153283, "epoch": 0.24861298289855768, "grad_norm": 8.897099494934082, "learning_rate": 5.0738016452607374e-06, "loss": 0.4857, "mean_token_accuracy": 0.8333059296011924, "num_tokens": 96430631.0, "step": 80200 }, { "entropy": 1.9490864470601081, "epoch": 0.24864398202360738, "grad_norm": 8.925657272338867, "learning_rate": 5.073485349097073e-06, "loss": 0.5366, "mean_token_accuracy": 0.8292359367012978, "num_tokens": 96442572.0, "step": 80210 }, { "entropy": 1.9751644432544708, "epoch": 0.24867498114865708, "grad_norm": 8.735760688781738, "learning_rate": 5.073169112078877e-06, "loss": 0.5984, "mean_token_accuracy": 0.8311158329248428, "num_tokens": 96453557.0, "step": 80220 }, { "entropy": 1.8875069230794908, "epoch": 0.24870598027370677, "grad_norm": 3.8038547039031982, "learning_rate": 5.072852934187719e-06, "loss": 0.4878, "mean_token_accuracy": 0.8413284301757813, "num_tokens": 96465184.0, "step": 80230 }, { "entropy": 1.9452220991253852, "epoch": 0.24873697939875647, "grad_norm": 10.38296127319336, "learning_rate": 5.072536815405176e-06, "loss": 0.5095, "mean_token_accuracy": 0.8376503065228462, "num_tokens": 96476422.0, "step": 80240 }, { "entropy": 1.9426960051059723, "epoch": 0.24876797852380617, "grad_norm": 8.761679649353027, "learning_rate": 5.072220755712832e-06, "loss": 0.4998, "mean_token_accuracy": 0.8422357112169265, "num_tokens": 96487607.0, "step": 80250 }, { "entropy": 1.968003484606743, "epoch": 0.24879897764885586, "grad_norm": 9.588860511779785, "learning_rate": 5.071904755092282e-06, "loss": 0.537, "mean_token_accuracy": 0.8418572053313256, "num_tokens": 96498517.0, "step": 80260 }, { "entropy": 1.798211957514286, "epoch": 0.24882997677390556, "grad_norm": 4.688083648681641, "learning_rate": 5.071588813525126e-06, "loss": 0.383, "mean_token_accuracy": 0.8539124757051468, "num_tokens": 96511459.0, "step": 80270 }, { "entropy": 1.9249870374798774, "epoch": 0.24886097589895526, "grad_norm": 9.468367576599121, "learning_rate": 5.071272930992976e-06, "loss": 0.5748, "mean_token_accuracy": 0.8211545005440712, "num_tokens": 96523206.0, "step": 80280 }, { "entropy": 1.9651254445314408, "epoch": 0.24889197502400495, "grad_norm": 8.676048278808594, "learning_rate": 5.070957107477445e-06, "loss": 0.5158, "mean_token_accuracy": 0.8506427973508834, "num_tokens": 96534006.0, "step": 80290 }, { "entropy": 1.9338118746876716, "epoch": 0.24892297414905465, "grad_norm": 10.065473556518555, "learning_rate": 5.070641342960163e-06, "loss": 0.5124, "mean_token_accuracy": 0.8366307452321052, "num_tokens": 96545681.0, "step": 80300 }, { "entropy": 1.9056643381714822, "epoch": 0.24895397327410435, "grad_norm": 7.843869686126709, "learning_rate": 5.070325637422762e-06, "loss": 0.5366, "mean_token_accuracy": 0.8389701470732689, "num_tokens": 96557894.0, "step": 80310 }, { "entropy": 1.766680945456028, "epoch": 0.24898497239915404, "grad_norm": 8.891898155212402, "learning_rate": 5.070009990846881e-06, "loss": 0.4187, "mean_token_accuracy": 0.8574616640806199, "num_tokens": 96572023.0, "step": 80320 }, { "entropy": 1.873360113799572, "epoch": 0.24901597152420374, "grad_norm": 8.903762817382812, "learning_rate": 5.069694403214172e-06, "loss": 0.4984, "mean_token_accuracy": 0.8422929286956787, "num_tokens": 96584169.0, "step": 80330 }, { "entropy": 1.949498575925827, "epoch": 0.24904697064925344, "grad_norm": 5.78523063659668, "learning_rate": 5.069378874506292e-06, "loss": 0.5428, "mean_token_accuracy": 0.8299229055643081, "num_tokens": 96596779.0, "step": 80340 }, { "entropy": 1.8998542904853821, "epoch": 0.24907796977430313, "grad_norm": 8.483941078186035, "learning_rate": 5.069063404704906e-06, "loss": 0.4755, "mean_token_accuracy": 0.8469171851873398, "num_tokens": 96608569.0, "step": 80350 }, { "entropy": 1.8074628964066506, "epoch": 0.24910896889935283, "grad_norm": 8.821971893310547, "learning_rate": 5.068747993791688e-06, "loss": 0.4681, "mean_token_accuracy": 0.850117315351963, "num_tokens": 96620814.0, "step": 80360 }, { "entropy": 1.818614599108696, "epoch": 0.24913996802440252, "grad_norm": 8.12628173828125, "learning_rate": 5.068432641748318e-06, "loss": 0.4463, "mean_token_accuracy": 0.8480607718229294, "num_tokens": 96634266.0, "step": 80370 }, { "entropy": 1.865596404671669, "epoch": 0.2491709671494522, "grad_norm": 7.75429630279541, "learning_rate": 5.068117348556486e-06, "loss": 0.4391, "mean_token_accuracy": 0.8603037342429161, "num_tokens": 96646716.0, "step": 80380 }, { "entropy": 1.9810962960124017, "epoch": 0.2492019662745019, "grad_norm": 9.098136901855469, "learning_rate": 5.06780211419789e-06, "loss": 0.508, "mean_token_accuracy": 0.841071504354477, "num_tokens": 96657927.0, "step": 80390 }, { "entropy": 1.8735760763287543, "epoch": 0.2492329653995516, "grad_norm": 8.33572769165039, "learning_rate": 5.067486938654235e-06, "loss": 0.504, "mean_token_accuracy": 0.8473378553986549, "num_tokens": 96670616.0, "step": 80400 }, { "entropy": 1.9213013619184494, "epoch": 0.24926396452460128, "grad_norm": 8.421183586120605, "learning_rate": 5.067171821907233e-06, "loss": 0.4998, "mean_token_accuracy": 0.8397013619542122, "num_tokens": 96682228.0, "step": 80410 }, { "entropy": 1.8531699359416962, "epoch": 0.24929496364965098, "grad_norm": 7.683859825134277, "learning_rate": 5.066856763938607e-06, "loss": 0.4447, "mean_token_accuracy": 0.8473792493343353, "num_tokens": 96695485.0, "step": 80420 }, { "entropy": 1.8864355862140656, "epoch": 0.24932596277470068, "grad_norm": 8.114187240600586, "learning_rate": 5.066541764730085e-06, "loss": 0.4721, "mean_token_accuracy": 0.8485756784677505, "num_tokens": 96707903.0, "step": 80430 }, { "entropy": 1.9324491962790489, "epoch": 0.24935696189975037, "grad_norm": 7.229660987854004, "learning_rate": 5.066226824263405e-06, "loss": 0.524, "mean_token_accuracy": 0.8466485366225243, "num_tokens": 96719778.0, "step": 80440 }, { "entropy": 1.9333231955766679, "epoch": 0.24938796102480007, "grad_norm": 10.796730041503906, "learning_rate": 5.0659119425203116e-06, "loss": 0.5191, "mean_token_accuracy": 0.8371896788477897, "num_tokens": 96731297.0, "step": 80450 }, { "entropy": 1.8949188530445098, "epoch": 0.24941896014984977, "grad_norm": 8.327264785766602, "learning_rate": 5.0655971194825586e-06, "loss": 0.4731, "mean_token_accuracy": 0.840109933912754, "num_tokens": 96743476.0, "step": 80460 }, { "entropy": 1.9443549513816833, "epoch": 0.24944995927489946, "grad_norm": 8.616156578063965, "learning_rate": 5.065282355131904e-06, "loss": 0.5149, "mean_token_accuracy": 0.8376484885811806, "num_tokens": 96755299.0, "step": 80470 }, { "entropy": 1.8526927456259727, "epoch": 0.24948095839994916, "grad_norm": 4.920670032501221, "learning_rate": 5.06496764945012e-06, "loss": 0.4747, "mean_token_accuracy": 0.8482913061976433, "num_tokens": 96768603.0, "step": 80480 }, { "entropy": 1.9240273088216782, "epoch": 0.24951195752499886, "grad_norm": 7.758111000061035, "learning_rate": 5.064653002418982e-06, "loss": 0.4934, "mean_token_accuracy": 0.8401697605848313, "num_tokens": 96780005.0, "step": 80490 }, { "entropy": 1.8932354971766472, "epoch": 0.24954295665004855, "grad_norm": 8.145684242248535, "learning_rate": 5.064338414020274e-06, "loss": 0.449, "mean_token_accuracy": 0.8508092865347863, "num_tokens": 96792344.0, "step": 80500 }, { "entropy": 1.8841583669185638, "epoch": 0.24957395577509825, "grad_norm": 7.790008068084717, "learning_rate": 5.064023884235791e-06, "loss": 0.499, "mean_token_accuracy": 0.8376784399151802, "num_tokens": 96804429.0, "step": 80510 }, { "entropy": 1.905587163567543, "epoch": 0.24960495490014795, "grad_norm": 8.785361289978027, "learning_rate": 5.063709413047332e-06, "loss": 0.4873, "mean_token_accuracy": 0.8395880877971649, "num_tokens": 96815913.0, "step": 80520 }, { "entropy": 1.884190782904625, "epoch": 0.24963595402519764, "grad_norm": 7.5868659019470215, "learning_rate": 5.063395000436705e-06, "loss": 0.4536, "mean_token_accuracy": 0.8370744809508324, "num_tokens": 96828062.0, "step": 80530 }, { "entropy": 1.9852372884750367, "epoch": 0.24966695315024734, "grad_norm": 7.919397354125977, "learning_rate": 5.063080646385727e-06, "loss": 0.5456, "mean_token_accuracy": 0.8372212365269661, "num_tokens": 96838879.0, "step": 80540 }, { "entropy": 1.8885437712073325, "epoch": 0.24969795227529704, "grad_norm": 8.829280853271484, "learning_rate": 5.062766350876223e-06, "loss": 0.442, "mean_token_accuracy": 0.8520119696855545, "num_tokens": 96851447.0, "step": 80550 }, { "entropy": 1.9246813550591468, "epoch": 0.24972895140034673, "grad_norm": 9.468807220458984, "learning_rate": 5.062452113890023e-06, "loss": 0.4682, "mean_token_accuracy": 0.8505311653017997, "num_tokens": 96863095.0, "step": 80560 }, { "entropy": 1.9204016119241714, "epoch": 0.24975995052539643, "grad_norm": 9.094423294067383, "learning_rate": 5.06213793540897e-06, "loss": 0.4745, "mean_token_accuracy": 0.8493364602327347, "num_tokens": 96874536.0, "step": 80570 }, { "entropy": 1.8319402411580086, "epoch": 0.24979094965044613, "grad_norm": 6.896250247955322, "learning_rate": 5.061823815414909e-06, "loss": 0.4068, "mean_token_accuracy": 0.8573963329195976, "num_tokens": 96887175.0, "step": 80580 }, { "entropy": 1.8221399798989295, "epoch": 0.24982194877549582, "grad_norm": 8.107512474060059, "learning_rate": 5.061509753889697e-06, "loss": 0.4384, "mean_token_accuracy": 0.8606969341635704, "num_tokens": 96899578.0, "step": 80590 }, { "entropy": 1.8691432937979697, "epoch": 0.24985294790054552, "grad_norm": 4.902885913848877, "learning_rate": 5.0611957508152e-06, "loss": 0.4548, "mean_token_accuracy": 0.8467679604887962, "num_tokens": 96911585.0, "step": 80600 }, { "entropy": 1.8758586004376412, "epoch": 0.24988394702559522, "grad_norm": 9.201493263244629, "learning_rate": 5.0608818061732855e-06, "loss": 0.4911, "mean_token_accuracy": 0.8440716713666916, "num_tokens": 96923853.0, "step": 80610 }, { "entropy": 1.9490404814481734, "epoch": 0.24991494615064488, "grad_norm": 9.697155952453613, "learning_rate": 5.0605679199458365e-06, "loss": 0.5383, "mean_token_accuracy": 0.8389123737812042, "num_tokens": 96935124.0, "step": 80620 }, { "entropy": 1.86717938631773, "epoch": 0.24994594527569458, "grad_norm": 8.20489501953125, "learning_rate": 5.060254092114738e-06, "loss": 0.4591, "mean_token_accuracy": 0.8462685376405716, "num_tokens": 96946886.0, "step": 80630 }, { "entropy": 1.856597825884819, "epoch": 0.24997694440074428, "grad_norm": 8.653462409973145, "learning_rate": 5.059940322661886e-06, "loss": 0.4763, "mean_token_accuracy": 0.8464651450514793, "num_tokens": 96958754.0, "step": 80640 }, { "entropy": 1.9769204616546632, "epoch": 0.250007943525794, "grad_norm": 8.563497543334961, "learning_rate": 5.059626611569183e-06, "loss": 0.527, "mean_token_accuracy": 0.8379195779561996, "num_tokens": 96969763.0, "step": 80650 }, { "entropy": 1.8927036389708518, "epoch": 0.2500389426508437, "grad_norm": 10.38947868347168, "learning_rate": 5.059312958818542e-06, "loss": 0.4557, "mean_token_accuracy": 0.8479386597871781, "num_tokens": 96981550.0, "step": 80660 }, { "entropy": 1.9301312297582627, "epoch": 0.25006994177589337, "grad_norm": 10.17356014251709, "learning_rate": 5.058999364391879e-06, "loss": 0.5176, "mean_token_accuracy": 0.8410406053066254, "num_tokens": 96992490.0, "step": 80670 }, { "entropy": 1.89289371073246, "epoch": 0.2501009409009431, "grad_norm": 7.985718250274658, "learning_rate": 5.058685828271122e-06, "loss": 0.4597, "mean_token_accuracy": 0.8415110245347023, "num_tokens": 97004316.0, "step": 80680 }, { "entropy": 1.9630913496017457, "epoch": 0.25013194002599276, "grad_norm": 7.961121559143066, "learning_rate": 5.0583723504382044e-06, "loss": 0.5045, "mean_token_accuracy": 0.8376610234379769, "num_tokens": 97015485.0, "step": 80690 }, { "entropy": 1.934187889099121, "epoch": 0.2501629391510425, "grad_norm": 8.046591758728027, "learning_rate": 5.05805893087507e-06, "loss": 0.5516, "mean_token_accuracy": 0.8271699488162995, "num_tokens": 97027583.0, "step": 80700 }, { "entropy": 1.9274443075060845, "epoch": 0.25019393827609215, "grad_norm": 3.1628236770629883, "learning_rate": 5.057745569563669e-06, "loss": 0.5329, "mean_token_accuracy": 0.8287164881825447, "num_tokens": 97039330.0, "step": 80710 }, { "entropy": 1.9359917491674423, "epoch": 0.2502249374011419, "grad_norm": 7.84744119644165, "learning_rate": 5.057432266485958e-06, "loss": 0.4986, "mean_token_accuracy": 0.8385084256529808, "num_tokens": 97050574.0, "step": 80720 }, { "entropy": 1.9594187870621682, "epoch": 0.25025593652619155, "grad_norm": 7.170385837554932, "learning_rate": 5.057119021623903e-06, "loss": 0.4977, "mean_token_accuracy": 0.8456411883234978, "num_tokens": 97062756.0, "step": 80730 }, { "entropy": 1.9243200287222861, "epoch": 0.25028693565124127, "grad_norm": 7.072485446929932, "learning_rate": 5.056805834959478e-06, "loss": 0.5129, "mean_token_accuracy": 0.8350550681352615, "num_tokens": 97074540.0, "step": 80740 }, { "entropy": 1.9237784013152122, "epoch": 0.25031793477629094, "grad_norm": 8.005240440368652, "learning_rate": 5.056492706474664e-06, "loss": 0.5107, "mean_token_accuracy": 0.8389381229877472, "num_tokens": 97086382.0, "step": 80750 }, { "entropy": 1.939412146806717, "epoch": 0.2503489339013406, "grad_norm": 8.643763542175293, "learning_rate": 5.056179636151449e-06, "loss": 0.4937, "mean_token_accuracy": 0.847502426803112, "num_tokens": 97097391.0, "step": 80760 }, { "entropy": 1.9007414281368256, "epoch": 0.25037993302639033, "grad_norm": 5.617809295654297, "learning_rate": 5.055866623971834e-06, "loss": 0.4676, "mean_token_accuracy": 0.8450678214430809, "num_tokens": 97109904.0, "step": 80770 }, { "entropy": 1.9128052070736885, "epoch": 0.25041093215144, "grad_norm": 10.466955184936523, "learning_rate": 5.05555366991782e-06, "loss": 0.4726, "mean_token_accuracy": 0.8468824103474617, "num_tokens": 97121730.0, "step": 80780 }, { "entropy": 1.9782243341207504, "epoch": 0.2504419312764897, "grad_norm": 9.422632217407227, "learning_rate": 5.0552407739714205e-06, "loss": 0.5243, "mean_token_accuracy": 0.8376658350229264, "num_tokens": 97133221.0, "step": 80790 }, { "entropy": 1.8325415551662445, "epoch": 0.2504729304015394, "grad_norm": 4.565164089202881, "learning_rate": 5.0549279361146554e-06, "loss": 0.3966, "mean_token_accuracy": 0.8581367552280426, "num_tokens": 97145983.0, "step": 80800 }, { "entropy": 1.8578097239136695, "epoch": 0.2505039295265891, "grad_norm": 8.166582107543945, "learning_rate": 5.0546151563295545e-06, "loss": 0.4472, "mean_token_accuracy": 0.8440769106149674, "num_tokens": 97159728.0, "step": 80810 }, { "entropy": 1.8590758338570594, "epoch": 0.2505349286516388, "grad_norm": 10.394712448120117, "learning_rate": 5.054302434598153e-06, "loss": 0.4654, "mean_token_accuracy": 0.8476429939270019, "num_tokens": 97172401.0, "step": 80820 }, { "entropy": 1.8516904532909393, "epoch": 0.2505659277766885, "grad_norm": 4.466270923614502, "learning_rate": 5.053989770902494e-06, "loss": 0.4435, "mean_token_accuracy": 0.8435142487287521, "num_tokens": 97184703.0, "step": 80830 }, { "entropy": 1.9154213652014733, "epoch": 0.2505969269017382, "grad_norm": 3.9221031665802, "learning_rate": 5.053677165224629e-06, "loss": 0.4504, "mean_token_accuracy": 0.8629103854298592, "num_tokens": 97195885.0, "step": 80840 }, { "entropy": 1.8582031592726707, "epoch": 0.2506279260267879, "grad_norm": 4.536048412322998, "learning_rate": 5.053364617546619e-06, "loss": 0.4221, "mean_token_accuracy": 0.8564950287342071, "num_tokens": 97207732.0, "step": 80850 }, { "entropy": 1.953849881887436, "epoch": 0.2506589251518376, "grad_norm": 7.936028003692627, "learning_rate": 5.05305212785053e-06, "loss": 0.5772, "mean_token_accuracy": 0.838682298362255, "num_tokens": 97218464.0, "step": 80860 }, { "entropy": 1.91454386562109, "epoch": 0.2506899242768873, "grad_norm": 10.065215110778809, "learning_rate": 5.052739696118435e-06, "loss": 0.5076, "mean_token_accuracy": 0.837799771130085, "num_tokens": 97230250.0, "step": 80870 }, { "entropy": 1.8838393643498421, "epoch": 0.25072092340193697, "grad_norm": 9.204728126525879, "learning_rate": 5.05242732233242e-06, "loss": 0.4506, "mean_token_accuracy": 0.8544922456145286, "num_tokens": 97242703.0, "step": 80880 }, { "entropy": 1.7693653479218483, "epoch": 0.2507519225269867, "grad_norm": 2.456925392150879, "learning_rate": 5.052115006474571e-06, "loss": 0.38, "mean_token_accuracy": 0.8611262261867523, "num_tokens": 97256526.0, "step": 80890 }, { "entropy": 1.830974417924881, "epoch": 0.25078292165203636, "grad_norm": 9.351210594177246, "learning_rate": 5.051802748526991e-06, "loss": 0.4618, "mean_token_accuracy": 0.8481344610452652, "num_tokens": 97268711.0, "step": 80900 }, { "entropy": 1.8810481294989585, "epoch": 0.2508139207770861, "grad_norm": 7.888736724853516, "learning_rate": 5.051490548471781e-06, "loss": 0.4459, "mean_token_accuracy": 0.8467276513576507, "num_tokens": 97281167.0, "step": 80910 }, { "entropy": 1.9074201628565788, "epoch": 0.25084491990213575, "grad_norm": 9.632227897644043, "learning_rate": 5.051178406291058e-06, "loss": 0.4912, "mean_token_accuracy": 0.8443691492080688, "num_tokens": 97293034.0, "step": 80920 }, { "entropy": 1.9555223256349563, "epoch": 0.2508759190271855, "grad_norm": 9.613581657409668, "learning_rate": 5.050866321966943e-06, "loss": 0.5608, "mean_token_accuracy": 0.8227898702025414, "num_tokens": 97304238.0, "step": 80930 }, { "entropy": 1.852898570895195, "epoch": 0.25090691815223515, "grad_norm": 4.1173553466796875, "learning_rate": 5.050554295481563e-06, "loss": 0.4399, "mean_token_accuracy": 0.8532208919525146, "num_tokens": 97317260.0, "step": 80940 }, { "entropy": 1.9421288311481475, "epoch": 0.25093791727728487, "grad_norm": 8.63619613647461, "learning_rate": 5.0502423268170556e-06, "loss": 0.5326, "mean_token_accuracy": 0.8310228928923606, "num_tokens": 97329358.0, "step": 80950 }, { "entropy": 1.9374548494815826, "epoch": 0.25096891640233454, "grad_norm": 8.348370552062988, "learning_rate": 5.049930415955566e-06, "loss": 0.4762, "mean_token_accuracy": 0.8462329894304276, "num_tokens": 97340759.0, "step": 80960 }, { "entropy": 1.9339235559105874, "epoch": 0.25099991552738427, "grad_norm": 3.9944283962249756, "learning_rate": 5.049618562879247e-06, "loss": 0.5083, "mean_token_accuracy": 0.8473946884274483, "num_tokens": 97351791.0, "step": 80970 }, { "entropy": 1.936272232234478, "epoch": 0.25103091465243393, "grad_norm": 3.7326107025146484, "learning_rate": 5.049306767570257e-06, "loss": 0.5225, "mean_token_accuracy": 0.8310480803251267, "num_tokens": 97363291.0, "step": 80980 }, { "entropy": 1.924396450817585, "epoch": 0.2510619137774836, "grad_norm": 8.854395866394043, "learning_rate": 5.048995030010763e-06, "loss": 0.5065, "mean_token_accuracy": 0.8389732867479325, "num_tokens": 97374667.0, "step": 80990 }, { "entropy": 1.9151296749711038, "epoch": 0.2510929129025333, "grad_norm": 8.511651992797852, "learning_rate": 5.048683350182941e-06, "loss": 0.474, "mean_token_accuracy": 0.8510990381240845, "num_tokens": 97385441.0, "step": 81000 }, { "entropy": 1.9651855736970902, "epoch": 0.251123912027583, "grad_norm": 8.566028594970703, "learning_rate": 5.048371728068976e-06, "loss": 0.4987, "mean_token_accuracy": 0.8424704790115356, "num_tokens": 97396228.0, "step": 81010 }, { "entropy": 1.8919865861535072, "epoch": 0.2511549111526327, "grad_norm": 7.970651149749756, "learning_rate": 5.048060163651056e-06, "loss": 0.4567, "mean_token_accuracy": 0.8576287001371383, "num_tokens": 97408337.0, "step": 81020 }, { "entropy": 1.8696946695446968, "epoch": 0.2511859102776824, "grad_norm": 8.631657600402832, "learning_rate": 5.047748656911381e-06, "loss": 0.4702, "mean_token_accuracy": 0.8475007191300392, "num_tokens": 97420330.0, "step": 81030 }, { "entropy": 1.8537717416882515, "epoch": 0.2512169094027321, "grad_norm": 8.173912048339844, "learning_rate": 5.047437207832157e-06, "loss": 0.4509, "mean_token_accuracy": 0.8481451213359833, "num_tokens": 97432563.0, "step": 81040 }, { "entropy": 1.8485816575586795, "epoch": 0.2512479085277818, "grad_norm": 9.571538925170898, "learning_rate": 5.047125816395597e-06, "loss": 0.4444, "mean_token_accuracy": 0.8497818827629089, "num_tokens": 97445205.0, "step": 81050 }, { "entropy": 1.9077823638916016, "epoch": 0.2512789076528315, "grad_norm": 9.712011337280273, "learning_rate": 5.046814482583923e-06, "loss": 0.532, "mean_token_accuracy": 0.8364897713065147, "num_tokens": 97456191.0, "step": 81060 }, { "entropy": 1.9123283997178078, "epoch": 0.2513099067778812, "grad_norm": 3.9969263076782227, "learning_rate": 5.046503206379363e-06, "loss": 0.5006, "mean_token_accuracy": 0.8428677946329117, "num_tokens": 97468188.0, "step": 81070 }, { "entropy": 1.8334277987480163, "epoch": 0.2513409059029309, "grad_norm": 8.850257873535156, "learning_rate": 5.046191987764155e-06, "loss": 0.472, "mean_token_accuracy": 0.8447373628616333, "num_tokens": 97481082.0, "step": 81080 }, { "entropy": 1.9028025731444358, "epoch": 0.25137190502798057, "grad_norm": 8.100180625915527, "learning_rate": 5.045880826720544e-06, "loss": 0.5003, "mean_token_accuracy": 0.8357649400830269, "num_tokens": 97492640.0, "step": 81090 }, { "entropy": 1.9044124081730842, "epoch": 0.2514029041530303, "grad_norm": 7.797017574310303, "learning_rate": 5.045569723230781e-06, "loss": 0.495, "mean_token_accuracy": 0.8423758924007416, "num_tokens": 97503638.0, "step": 81100 }, { "entropy": 1.9882340669631957, "epoch": 0.25143390327807996, "grad_norm": 7.203917980194092, "learning_rate": 5.045258677277125e-06, "loss": 0.557, "mean_token_accuracy": 0.8339864879846572, "num_tokens": 97514886.0, "step": 81110 }, { "entropy": 1.9591509833931924, "epoch": 0.2514649024031297, "grad_norm": 7.691939830780029, "learning_rate": 5.044947688841846e-06, "loss": 0.5052, "mean_token_accuracy": 0.8347377508878708, "num_tokens": 97526189.0, "step": 81120 }, { "entropy": 1.8668823108077048, "epoch": 0.25149590152817936, "grad_norm": 4.000504970550537, "learning_rate": 5.044636757907217e-06, "loss": 0.4437, "mean_token_accuracy": 0.8517090499401092, "num_tokens": 97538527.0, "step": 81130 }, { "entropy": 1.8323442935943604, "epoch": 0.2515269006532291, "grad_norm": 9.106334686279297, "learning_rate": 5.044325884455522e-06, "loss": 0.4715, "mean_token_accuracy": 0.8402392759919166, "num_tokens": 97551259.0, "step": 81140 }, { "entropy": 1.8727603957057, "epoch": 0.25155789977827875, "grad_norm": 4.572244167327881, "learning_rate": 5.04401506846905e-06, "loss": 0.4993, "mean_token_accuracy": 0.8374785885214806, "num_tokens": 97563511.0, "step": 81150 }, { "entropy": 1.858438329398632, "epoch": 0.2515888989033285, "grad_norm": 7.100371360778809, "learning_rate": 5.0437043099301006e-06, "loss": 0.4453, "mean_token_accuracy": 0.8520154163241387, "num_tokens": 97575837.0, "step": 81160 }, { "entropy": 1.8653832495212554, "epoch": 0.25161989802837814, "grad_norm": 7.727576732635498, "learning_rate": 5.043393608820979e-06, "loss": 0.4841, "mean_token_accuracy": 0.8588355794548989, "num_tokens": 97587041.0, "step": 81170 }, { "entropy": 1.8832016855478286, "epoch": 0.25165089715342787, "grad_norm": 9.95567512512207, "learning_rate": 5.043082965123996e-06, "loss": 0.5331, "mean_token_accuracy": 0.8414867803454399, "num_tokens": 97598721.0, "step": 81180 }, { "entropy": 1.9306304231286049, "epoch": 0.25168189627847753, "grad_norm": 9.012680053710938, "learning_rate": 5.042772378821477e-06, "loss": 0.5017, "mean_token_accuracy": 0.8460228756070137, "num_tokens": 97609705.0, "step": 81190 }, { "entropy": 1.8687114715576172, "epoch": 0.25171289540352726, "grad_norm": 7.753897190093994, "learning_rate": 5.042461849895747e-06, "loss": 0.4928, "mean_token_accuracy": 0.8424293950200081, "num_tokens": 97622075.0, "step": 81200 }, { "entropy": 1.8677958205342293, "epoch": 0.25174389452857693, "grad_norm": 9.426398277282715, "learning_rate": 5.0421513783291445e-06, "loss": 0.4804, "mean_token_accuracy": 0.8376675888895988, "num_tokens": 97634437.0, "step": 81210 }, { "entropy": 1.85673858076334, "epoch": 0.25177489365362665, "grad_norm": 3.3894803524017334, "learning_rate": 5.04184096410401e-06, "loss": 0.449, "mean_token_accuracy": 0.849704897403717, "num_tokens": 97647188.0, "step": 81220 }, { "entropy": 1.8845996797084807, "epoch": 0.2518058927786763, "grad_norm": 4.1136322021484375, "learning_rate": 5.041530607202698e-06, "loss": 0.4931, "mean_token_accuracy": 0.8388149186968803, "num_tokens": 97659181.0, "step": 81230 }, { "entropy": 1.8818348929286004, "epoch": 0.251836891903726, "grad_norm": 8.5837984085083, "learning_rate": 5.041220307607568e-06, "loss": 0.4394, "mean_token_accuracy": 0.8552572011947632, "num_tokens": 97670943.0, "step": 81240 }, { "entropy": 1.9214624166488647, "epoch": 0.2518678910287757, "grad_norm": 8.553311347961426, "learning_rate": 5.040910065300984e-06, "loss": 0.5291, "mean_token_accuracy": 0.8412210613489151, "num_tokens": 97682965.0, "step": 81250 }, { "entropy": 1.7995470568537713, "epoch": 0.2518988901538254, "grad_norm": 10.36379623413086, "learning_rate": 5.04059988026532e-06, "loss": 0.436, "mean_token_accuracy": 0.8609731838107109, "num_tokens": 97696313.0, "step": 81260 }, { "entropy": 1.9689121127128602, "epoch": 0.2519298892788751, "grad_norm": 7.462158679962158, "learning_rate": 5.0402897524829595e-06, "loss": 0.5265, "mean_token_accuracy": 0.8380709275603294, "num_tokens": 97708096.0, "step": 81270 }, { "entropy": 1.9260016784071923, "epoch": 0.2519608884039248, "grad_norm": 3.709622859954834, "learning_rate": 5.039979681936291e-06, "loss": 0.5007, "mean_token_accuracy": 0.8301331534981727, "num_tokens": 97720285.0, "step": 81280 }, { "entropy": 1.868030358850956, "epoch": 0.2519918875289745, "grad_norm": 9.611808776855469, "learning_rate": 5.039669668607713e-06, "loss": 0.4107, "mean_token_accuracy": 0.8606329753994941, "num_tokens": 97733109.0, "step": 81290 }, { "entropy": 1.8924655795097352, "epoch": 0.25202288665402417, "grad_norm": 9.830230712890625, "learning_rate": 5.039359712479628e-06, "loss": 0.4907, "mean_token_accuracy": 0.8400240853428841, "num_tokens": 97745924.0, "step": 81300 }, { "entropy": 1.900289523601532, "epoch": 0.2520538857790739, "grad_norm": 7.719475746154785, "learning_rate": 5.039049813534448e-06, "loss": 0.4758, "mean_token_accuracy": 0.84294343739748, "num_tokens": 97758200.0, "step": 81310 }, { "entropy": 1.8474449053406716, "epoch": 0.25208488490412356, "grad_norm": 4.5430684089660645, "learning_rate": 5.0387399717545945e-06, "loss": 0.4601, "mean_token_accuracy": 0.8552279889583587, "num_tokens": 97770861.0, "step": 81320 }, { "entropy": 1.965096142888069, "epoch": 0.2521158840291733, "grad_norm": 10.388503074645996, "learning_rate": 5.038430187122494e-06, "loss": 0.4992, "mean_token_accuracy": 0.8290927574038506, "num_tokens": 97783190.0, "step": 81330 }, { "entropy": 1.9893094927072525, "epoch": 0.25214688315422296, "grad_norm": 7.405788898468018, "learning_rate": 5.03812045962058e-06, "loss": 0.5311, "mean_token_accuracy": 0.8353447288274765, "num_tokens": 97794623.0, "step": 81340 }, { "entropy": 1.9728745833039283, "epoch": 0.2521778822792727, "grad_norm": 7.6129045486450195, "learning_rate": 5.037810789231295e-06, "loss": 0.5364, "mean_token_accuracy": 0.8378026425838471, "num_tokens": 97806542.0, "step": 81350 }, { "entropy": 1.9820994019508362, "epoch": 0.25220888140432235, "grad_norm": 8.426169395446777, "learning_rate": 5.0375011759370905e-06, "loss": 0.5264, "mean_token_accuracy": 0.8454423397779465, "num_tokens": 97817791.0, "step": 81360 }, { "entropy": 1.9337128177285194, "epoch": 0.2522398805293721, "grad_norm": 9.23006534576416, "learning_rate": 5.037191619720424e-06, "loss": 0.5056, "mean_token_accuracy": 0.8424638077616692, "num_tokens": 97830448.0, "step": 81370 }, { "entropy": 1.8902158245444298, "epoch": 0.25227087965442174, "grad_norm": 8.688264846801758, "learning_rate": 5.036882120563758e-06, "loss": 0.4861, "mean_token_accuracy": 0.8471032664179802, "num_tokens": 97842561.0, "step": 81380 }, { "entropy": 1.7683439910411836, "epoch": 0.25230187877947147, "grad_norm": 7.763362407684326, "learning_rate": 5.036572678449568e-06, "loss": 0.3783, "mean_token_accuracy": 0.8493596822023392, "num_tokens": 97856622.0, "step": 81390 }, { "entropy": 1.9423942849040032, "epoch": 0.25233287790452114, "grad_norm": 7.845071792602539, "learning_rate": 5.036263293360331e-06, "loss": 0.5274, "mean_token_accuracy": 0.8349541559815407, "num_tokens": 97867976.0, "step": 81400 }, { "entropy": 1.9156456768512726, "epoch": 0.25236387702957086, "grad_norm": 9.051328659057617, "learning_rate": 5.035953965278539e-06, "loss": 0.5054, "mean_token_accuracy": 0.8346843421459198, "num_tokens": 97880411.0, "step": 81410 }, { "entropy": 1.9632582008838653, "epoch": 0.25239487615462053, "grad_norm": 8.78569507598877, "learning_rate": 5.035644694186681e-06, "loss": 0.5321, "mean_token_accuracy": 0.8315117686986924, "num_tokens": 97891776.0, "step": 81420 }, { "entropy": 1.9228404372930528, "epoch": 0.25242587527967025, "grad_norm": 3.864513397216797, "learning_rate": 5.035335480067265e-06, "loss": 0.4328, "mean_token_accuracy": 0.8493173897266388, "num_tokens": 97903613.0, "step": 81430 }, { "entropy": 1.8714887380599976, "epoch": 0.2524568744047199, "grad_norm": 2.324655294418335, "learning_rate": 5.035026322902799e-06, "loss": 0.4154, "mean_token_accuracy": 0.8534365460276604, "num_tokens": 97915252.0, "step": 81440 }, { "entropy": 1.9306543365120887, "epoch": 0.25248787352976965, "grad_norm": 3.5631608963012695, "learning_rate": 5.0347172226758e-06, "loss": 0.5313, "mean_token_accuracy": 0.8418522760272026, "num_tokens": 97926415.0, "step": 81450 }, { "entropy": 1.9040459290146827, "epoch": 0.2525188726548193, "grad_norm": 8.15766429901123, "learning_rate": 5.034408179368794e-06, "loss": 0.5042, "mean_token_accuracy": 0.8509362369775773, "num_tokens": 97937394.0, "step": 81460 }, { "entropy": 1.8772794365882874, "epoch": 0.25254987177986904, "grad_norm": 4.18629789352417, "learning_rate": 5.034099192964314e-06, "loss": 0.458, "mean_token_accuracy": 0.8516110569238663, "num_tokens": 97949121.0, "step": 81470 }, { "entropy": 1.9287880495190621, "epoch": 0.2525808709049187, "grad_norm": 9.347918510437012, "learning_rate": 5.033790263444901e-06, "loss": 0.5062, "mean_token_accuracy": 0.8376461073756218, "num_tokens": 97960754.0, "step": 81480 }, { "entropy": 1.9006264075636863, "epoch": 0.2526118700299684, "grad_norm": 4.023174285888672, "learning_rate": 5.0334813907931005e-06, "loss": 0.4957, "mean_token_accuracy": 0.8348308220505715, "num_tokens": 97972812.0, "step": 81490 }, { "entropy": 1.983270612359047, "epoch": 0.2526428691550181, "grad_norm": 10.707830429077148, "learning_rate": 5.033172574991469e-06, "loss": 0.5652, "mean_token_accuracy": 0.8273524597287178, "num_tokens": 97984573.0, "step": 81500 }, { "entropy": 1.8726325988769532, "epoch": 0.25267386828006777, "grad_norm": 9.031349182128906, "learning_rate": 5.03286381602257e-06, "loss": 0.4764, "mean_token_accuracy": 0.8485777363181114, "num_tokens": 97996152.0, "step": 81510 }, { "entropy": 1.9527307838201522, "epoch": 0.2527048674051175, "grad_norm": 8.993306159973145, "learning_rate": 5.032555113868971e-06, "loss": 0.5688, "mean_token_accuracy": 0.8380294933915138, "num_tokens": 98007120.0, "step": 81520 }, { "entropy": 1.9475104868412019, "epoch": 0.25273586653016716, "grad_norm": 8.436033248901367, "learning_rate": 5.032246468513252e-06, "loss": 0.5843, "mean_token_accuracy": 0.8303594037890434, "num_tokens": 98018955.0, "step": 81530 }, { "entropy": 1.8729032024741172, "epoch": 0.2527668656552169, "grad_norm": 7.274577617645264, "learning_rate": 5.031937879937998e-06, "loss": 0.4667, "mean_token_accuracy": 0.8430945709347725, "num_tokens": 98030512.0, "step": 81540 }, { "entropy": 1.8622186571359634, "epoch": 0.25279786478026656, "grad_norm": 7.648394584655762, "learning_rate": 5.031629348125801e-06, "loss": 0.4795, "mean_token_accuracy": 0.8509231805801392, "num_tokens": 98043454.0, "step": 81550 }, { "entropy": 1.8595874547958373, "epoch": 0.2528288639053163, "grad_norm": 8.356622695922852, "learning_rate": 5.031320873059261e-06, "loss": 0.4618, "mean_token_accuracy": 0.8488967612385749, "num_tokens": 98055991.0, "step": 81560 }, { "entropy": 1.8365911930799483, "epoch": 0.25285986303036595, "grad_norm": 3.591733932495117, "learning_rate": 5.031012454720986e-06, "loss": 0.4586, "mean_token_accuracy": 0.8576977401971817, "num_tokens": 98067770.0, "step": 81570 }, { "entropy": 1.7719722762703896, "epoch": 0.2528908621554157, "grad_norm": 3.6505837440490723, "learning_rate": 5.03070409309359e-06, "loss": 0.4642, "mean_token_accuracy": 0.8484556913375855, "num_tokens": 98081361.0, "step": 81580 }, { "entropy": 1.801680639386177, "epoch": 0.25292186128046534, "grad_norm": 8.050759315490723, "learning_rate": 5.030395788159697e-06, "loss": 0.4351, "mean_token_accuracy": 0.8547103390097618, "num_tokens": 98094649.0, "step": 81590 }, { "entropy": 1.9035234346985817, "epoch": 0.25295286040551507, "grad_norm": 9.848466873168945, "learning_rate": 5.030087539901935e-06, "loss": 0.5127, "mean_token_accuracy": 0.8474085479974747, "num_tokens": 98106101.0, "step": 81600 }, { "entropy": 1.925628274679184, "epoch": 0.25298385953056474, "grad_norm": 7.4657206535339355, "learning_rate": 5.0297793483029445e-06, "loss": 0.5281, "mean_token_accuracy": 0.8374746307730675, "num_tokens": 98116953.0, "step": 81610 }, { "entropy": 1.8128420755267143, "epoch": 0.25301485865561446, "grad_norm": 8.32465934753418, "learning_rate": 5.029471213345367e-06, "loss": 0.4391, "mean_token_accuracy": 0.856282414495945, "num_tokens": 98129159.0, "step": 81620 }, { "entropy": 1.8045406460762023, "epoch": 0.25304585778066413, "grad_norm": 8.122908592224121, "learning_rate": 5.029163135011857e-06, "loss": 0.4565, "mean_token_accuracy": 0.8475161448121071, "num_tokens": 98142941.0, "step": 81630 }, { "entropy": 1.9032706007361413, "epoch": 0.25307685690571385, "grad_norm": 7.461269855499268, "learning_rate": 5.028855113285072e-06, "loss": 0.5337, "mean_token_accuracy": 0.8353031173348426, "num_tokens": 98154626.0, "step": 81640 }, { "entropy": 1.8948104843497275, "epoch": 0.2531078560307635, "grad_norm": 3.860278606414795, "learning_rate": 5.02854714814768e-06, "loss": 0.4937, "mean_token_accuracy": 0.8362452149391174, "num_tokens": 98166557.0, "step": 81650 }, { "entropy": 1.9045625925064087, "epoch": 0.25313885515581325, "grad_norm": 8.716605186462402, "learning_rate": 5.028239239582357e-06, "loss": 0.4543, "mean_token_accuracy": 0.8562442421913147, "num_tokens": 98177738.0, "step": 81660 }, { "entropy": 1.7965163722634316, "epoch": 0.2531698542808629, "grad_norm": 7.601503849029541, "learning_rate": 5.027931387571784e-06, "loss": 0.4912, "mean_token_accuracy": 0.8531433448195458, "num_tokens": 98190832.0, "step": 81670 }, { "entropy": 1.9607613369822503, "epoch": 0.25320085340591264, "grad_norm": 10.182971000671387, "learning_rate": 5.0276235920986505e-06, "loss": 0.5278, "mean_token_accuracy": 0.8343812167644501, "num_tokens": 98202527.0, "step": 81680 }, { "entropy": 1.8433429718017578, "epoch": 0.2532318525309623, "grad_norm": 10.645054817199707, "learning_rate": 5.027315853145653e-06, "loss": 0.5052, "mean_token_accuracy": 0.8503381237387657, "num_tokens": 98215514.0, "step": 81690 }, { "entropy": 1.8184980183839798, "epoch": 0.25326285165601203, "grad_norm": 8.565644264221191, "learning_rate": 5.0270081706954955e-06, "loss": 0.4284, "mean_token_accuracy": 0.8563423335552216, "num_tokens": 98228449.0, "step": 81700 }, { "entropy": 1.9226618602871894, "epoch": 0.2532938507810617, "grad_norm": 8.709757804870605, "learning_rate": 5.02670054473089e-06, "loss": 0.5179, "mean_token_accuracy": 0.841064678132534, "num_tokens": 98240774.0, "step": 81710 }, { "entropy": 1.8866528853774072, "epoch": 0.2533248499061114, "grad_norm": 8.839398384094238, "learning_rate": 5.0263929752345564e-06, "loss": 0.4838, "mean_token_accuracy": 0.8352084383368492, "num_tokens": 98253172.0, "step": 81720 }, { "entropy": 1.9001985654234885, "epoch": 0.2533558490311611, "grad_norm": 7.948025226593018, "learning_rate": 5.0260854621892196e-06, "loss": 0.4723, "mean_token_accuracy": 0.8508359596133233, "num_tokens": 98265024.0, "step": 81730 }, { "entropy": 1.9735320836305619, "epoch": 0.25338684815621076, "grad_norm": 8.330632209777832, "learning_rate": 5.0257780055776154e-06, "loss": 0.5311, "mean_token_accuracy": 0.8348188728094101, "num_tokens": 98275695.0, "step": 81740 }, { "entropy": 1.852024681866169, "epoch": 0.2534178472812605, "grad_norm": 7.069023132324219, "learning_rate": 5.025470605382483e-06, "loss": 0.4779, "mean_token_accuracy": 0.8471727296710014, "num_tokens": 98288269.0, "step": 81750 }, { "entropy": 1.8877501636743546, "epoch": 0.25344884640631016, "grad_norm": 7.931328296661377, "learning_rate": 5.0251632615865705e-06, "loss": 0.4818, "mean_token_accuracy": 0.8467486530542374, "num_tokens": 98299962.0, "step": 81760 }, { "entropy": 1.9214854270219803, "epoch": 0.2534798455313599, "grad_norm": 8.707962989807129, "learning_rate": 5.024855974172638e-06, "loss": 0.5115, "mean_token_accuracy": 0.8390247240662575, "num_tokens": 98311678.0, "step": 81770 }, { "entropy": 1.88422030210495, "epoch": 0.25351084465640955, "grad_norm": 8.946457862854004, "learning_rate": 5.024548743123444e-06, "loss": 0.5201, "mean_token_accuracy": 0.8396681264042855, "num_tokens": 98324228.0, "step": 81780 }, { "entropy": 1.8994328901171684, "epoch": 0.2535418437814593, "grad_norm": 8.10692024230957, "learning_rate": 5.024241568421762e-06, "loss": 0.4675, "mean_token_accuracy": 0.8410519272089004, "num_tokens": 98335294.0, "step": 81790 }, { "entropy": 1.9097337901592255, "epoch": 0.25357284290650894, "grad_norm": 3.829202175140381, "learning_rate": 5.02393445005037e-06, "loss": 0.4827, "mean_token_accuracy": 0.8425879299640655, "num_tokens": 98347548.0, "step": 81800 }, { "entropy": 1.864240688085556, "epoch": 0.25360384203155867, "grad_norm": 4.576921463012695, "learning_rate": 5.0236273879920534e-06, "loss": 0.4262, "mean_token_accuracy": 0.8558398082852363, "num_tokens": 98359543.0, "step": 81810 }, { "entropy": 1.885629440844059, "epoch": 0.25363484115660834, "grad_norm": 8.460371017456055, "learning_rate": 5.023320382229604e-06, "loss": 0.4796, "mean_token_accuracy": 0.8493375033140182, "num_tokens": 98370400.0, "step": 81820 }, { "entropy": 1.953327089548111, "epoch": 0.25366584028165806, "grad_norm": 9.160079956054688, "learning_rate": 5.023013432745823e-06, "loss": 0.532, "mean_token_accuracy": 0.8363123252987862, "num_tokens": 98381007.0, "step": 81830 }, { "entropy": 1.9075039952993393, "epoch": 0.25369683940670773, "grad_norm": 8.570027351379395, "learning_rate": 5.022706539523518e-06, "loss": 0.5037, "mean_token_accuracy": 0.8331753626465798, "num_tokens": 98393129.0, "step": 81840 }, { "entropy": 1.9241829916834832, "epoch": 0.25372783853175745, "grad_norm": 3.9442126750946045, "learning_rate": 5.022399702545504e-06, "loss": 0.4657, "mean_token_accuracy": 0.8449306398630142, "num_tokens": 98405137.0, "step": 81850 }, { "entropy": 1.9484668985009193, "epoch": 0.2537588376568071, "grad_norm": 9.32621955871582, "learning_rate": 5.022092921794602e-06, "loss": 0.4975, "mean_token_accuracy": 0.8393065080046653, "num_tokens": 98416316.0, "step": 81860 }, { "entropy": 1.8186310246586799, "epoch": 0.25378983678185685, "grad_norm": 8.844595909118652, "learning_rate": 5.021786197253644e-06, "loss": 0.4889, "mean_token_accuracy": 0.84593296200037, "num_tokens": 98429654.0, "step": 81870 }, { "entropy": 1.8217352986335755, "epoch": 0.2538208359069065, "grad_norm": 3.006070137023926, "learning_rate": 5.021479528905465e-06, "loss": 0.4014, "mean_token_accuracy": 0.8578614339232444, "num_tokens": 98442831.0, "step": 81880 }, { "entropy": 1.9646976083517074, "epoch": 0.25385183503195624, "grad_norm": 8.035730361938477, "learning_rate": 5.02117291673291e-06, "loss": 0.5245, "mean_token_accuracy": 0.8446223840117455, "num_tokens": 98454107.0, "step": 81890 }, { "entropy": 1.8987499296665191, "epoch": 0.2538828341570059, "grad_norm": 7.47988748550415, "learning_rate": 5.02086636071883e-06, "loss": 0.493, "mean_token_accuracy": 0.8420893490314484, "num_tokens": 98466334.0, "step": 81900 }, { "entropy": 1.9272898733615875, "epoch": 0.25391383328205563, "grad_norm": 9.14543342590332, "learning_rate": 5.020559860846086e-06, "loss": 0.4936, "mean_token_accuracy": 0.841991500556469, "num_tokens": 98477479.0, "step": 81910 }, { "entropy": 1.929322722554207, "epoch": 0.2539448324071053, "grad_norm": 10.226633071899414, "learning_rate": 5.020253417097542e-06, "loss": 0.5184, "mean_token_accuracy": 0.8391476318240165, "num_tokens": 98489083.0, "step": 81920 }, { "entropy": 1.9254152104258537, "epoch": 0.253975831532155, "grad_norm": 4.747035026550293, "learning_rate": 5.019947029456072e-06, "loss": 0.5488, "mean_token_accuracy": 0.841267442703247, "num_tokens": 98500874.0, "step": 81930 }, { "entropy": 1.8812214568257333, "epoch": 0.2540068306572047, "grad_norm": 8.595871925354004, "learning_rate": 5.019640697904557e-06, "loss": 0.4644, "mean_token_accuracy": 0.8475135773420334, "num_tokens": 98513127.0, "step": 81940 }, { "entropy": 1.8378457948565483, "epoch": 0.2540378297822544, "grad_norm": 8.983741760253906, "learning_rate": 5.019334422425887e-06, "loss": 0.4303, "mean_token_accuracy": 0.8528070077300072, "num_tokens": 98526234.0, "step": 81950 }, { "entropy": 1.953259851038456, "epoch": 0.2540688289073041, "grad_norm": 8.572074890136719, "learning_rate": 5.019028203002956e-06, "loss": 0.5288, "mean_token_accuracy": 0.8342129945755005, "num_tokens": 98537899.0, "step": 81960 }, { "entropy": 1.9012918874621392, "epoch": 0.2540998280323538, "grad_norm": 7.741192817687988, "learning_rate": 5.018722039618667e-06, "loss": 0.5136, "mean_token_accuracy": 0.8414129927754402, "num_tokens": 98549927.0, "step": 81970 }, { "entropy": 1.890120567381382, "epoch": 0.2541308271574035, "grad_norm": 9.252531051635742, "learning_rate": 5.018415932255929e-06, "loss": 0.4762, "mean_token_accuracy": 0.8448996976017952, "num_tokens": 98562215.0, "step": 81980 }, { "entropy": 1.8428632244467735, "epoch": 0.25416182628245315, "grad_norm": 3.551053762435913, "learning_rate": 5.0181098808976615e-06, "loss": 0.5019, "mean_token_accuracy": 0.843362420797348, "num_tokens": 98574911.0, "step": 81990 }, { "entropy": 1.9683369904756547, "epoch": 0.2541928254075029, "grad_norm": 7.981224060058594, "learning_rate": 5.0178038855267885e-06, "loss": 0.5373, "mean_token_accuracy": 0.841914513707161, "num_tokens": 98585178.0, "step": 82000 }, { "entropy": 1.8377211585640907, "epoch": 0.25422382453255254, "grad_norm": 3.9185397624969482, "learning_rate": 5.017497946126241e-06, "loss": 0.3844, "mean_token_accuracy": 0.8608157888054848, "num_tokens": 98598026.0, "step": 82010 }, { "entropy": 1.882440300285816, "epoch": 0.25425482365760227, "grad_norm": 10.865336418151855, "learning_rate": 5.01719206267896e-06, "loss": 0.4595, "mean_token_accuracy": 0.8479091212153435, "num_tokens": 98609992.0, "step": 82020 }, { "entropy": 1.8768093585968018, "epoch": 0.25428582278265194, "grad_norm": 9.238245010375977, "learning_rate": 5.016886235167892e-06, "loss": 0.4918, "mean_token_accuracy": 0.8360858023166656, "num_tokens": 98622049.0, "step": 82030 }, { "entropy": 1.9196750313043593, "epoch": 0.25431682190770166, "grad_norm": 10.01637077331543, "learning_rate": 5.016580463575987e-06, "loss": 0.4847, "mean_token_accuracy": 0.8404786124825477, "num_tokens": 98633923.0, "step": 82040 }, { "entropy": 1.9348924160003662, "epoch": 0.25434782103275133, "grad_norm": 8.29347038269043, "learning_rate": 5.016274747886213e-06, "loss": 0.5139, "mean_token_accuracy": 0.8385466530919075, "num_tokens": 98646426.0, "step": 82050 }, { "entropy": 1.8654908925294875, "epoch": 0.25437882015780106, "grad_norm": 7.821435451507568, "learning_rate": 5.0159690880815324e-06, "loss": 0.4524, "mean_token_accuracy": 0.8516849011182785, "num_tokens": 98659280.0, "step": 82060 }, { "entropy": 1.9111699253320693, "epoch": 0.2544098192828507, "grad_norm": 7.601536273956299, "learning_rate": 5.015663484144925e-06, "loss": 0.5258, "mean_token_accuracy": 0.83255735039711, "num_tokens": 98671363.0, "step": 82070 }, { "entropy": 1.9060380011796951, "epoch": 0.25444081840790045, "grad_norm": 7.3169965744018555, "learning_rate": 5.0153579360593696e-06, "loss": 0.5096, "mean_token_accuracy": 0.8374595761299133, "num_tokens": 98683802.0, "step": 82080 }, { "entropy": 1.93545441031456, "epoch": 0.2544718175329501, "grad_norm": 4.195225238800049, "learning_rate": 5.015052443807861e-06, "loss": 0.4874, "mean_token_accuracy": 0.846612386405468, "num_tokens": 98695666.0, "step": 82090 }, { "entropy": 1.8932590246200562, "epoch": 0.25450281665799984, "grad_norm": 4.190147399902344, "learning_rate": 5.014747007373392e-06, "loss": 0.457, "mean_token_accuracy": 0.8346954330801963, "num_tokens": 98707862.0, "step": 82100 }, { "entropy": 1.8419014051556588, "epoch": 0.2545338157830495, "grad_norm": 9.007070541381836, "learning_rate": 5.014441626738971e-06, "loss": 0.4531, "mean_token_accuracy": 0.8538463622331619, "num_tokens": 98719963.0, "step": 82110 }, { "entropy": 1.9747213378548623, "epoch": 0.25456481490809924, "grad_norm": 7.8509297370910645, "learning_rate": 5.014136301887608e-06, "loss": 0.5592, "mean_token_accuracy": 0.8374496519565582, "num_tokens": 98731159.0, "step": 82120 }, { "entropy": 1.950367696583271, "epoch": 0.2545958140331489, "grad_norm": 9.41358470916748, "learning_rate": 5.013831032802323e-06, "loss": 0.5059, "mean_token_accuracy": 0.847783799469471, "num_tokens": 98742604.0, "step": 82130 }, { "entropy": 1.967973504960537, "epoch": 0.25462681315819863, "grad_norm": 8.383604049682617, "learning_rate": 5.013525819466142e-06, "loss": 0.51, "mean_token_accuracy": 0.8421026647090912, "num_tokens": 98754125.0, "step": 82140 }, { "entropy": 1.984065267443657, "epoch": 0.2546578122832483, "grad_norm": 8.877246856689453, "learning_rate": 5.013220661862098e-06, "loss": 0.5354, "mean_token_accuracy": 0.8417667001485825, "num_tokens": 98764605.0, "step": 82150 }, { "entropy": 1.878348208218813, "epoch": 0.254688811408298, "grad_norm": 3.9120850563049316, "learning_rate": 5.012915559973233e-06, "loss": 0.4629, "mean_token_accuracy": 0.8392788946628571, "num_tokens": 98777263.0, "step": 82160 }, { "entropy": 2.001929074525833, "epoch": 0.2547198105333477, "grad_norm": 8.413508415222168, "learning_rate": 5.012610513782595e-06, "loss": 0.5951, "mean_token_accuracy": 0.8199850216507911, "num_tokens": 98788877.0, "step": 82170 }, { "entropy": 1.9609450832009316, "epoch": 0.2547508096583974, "grad_norm": 9.111734390258789, "learning_rate": 5.01230552327324e-06, "loss": 0.4973, "mean_token_accuracy": 0.844335462152958, "num_tokens": 98800367.0, "step": 82180 }, { "entropy": 1.9912359118461609, "epoch": 0.2547818087834471, "grad_norm": 9.116207122802734, "learning_rate": 5.012000588428227e-06, "loss": 0.5586, "mean_token_accuracy": 0.8303110048174858, "num_tokens": 98811373.0, "step": 82190 }, { "entropy": 1.8973192408680917, "epoch": 0.2548128079084968, "grad_norm": 8.382734298706055, "learning_rate": 5.01169570923063e-06, "loss": 0.441, "mean_token_accuracy": 0.8509496867656707, "num_tokens": 98823915.0, "step": 82200 }, { "entropy": 1.888469786942005, "epoch": 0.2548438070335465, "grad_norm": 4.078553676605225, "learning_rate": 5.011390885663524e-06, "loss": 0.4514, "mean_token_accuracy": 0.8516004219651222, "num_tokens": 98835873.0, "step": 82210 }, { "entropy": 1.9228701919317246, "epoch": 0.2548748061585962, "grad_norm": 9.359138488769531, "learning_rate": 5.011086117709992e-06, "loss": 0.5104, "mean_token_accuracy": 0.8420409306883812, "num_tokens": 98846828.0, "step": 82220 }, { "entropy": 1.9370484218001365, "epoch": 0.25490580528364587, "grad_norm": 9.458020210266113, "learning_rate": 5.010781405353129e-06, "loss": 0.5082, "mean_token_accuracy": 0.8457821294665336, "num_tokens": 98858261.0, "step": 82230 }, { "entropy": 1.815031287074089, "epoch": 0.25493680440869554, "grad_norm": 7.948138236999512, "learning_rate": 5.010476748576029e-06, "loss": 0.4499, "mean_token_accuracy": 0.8458429276943207, "num_tokens": 98870539.0, "step": 82240 }, { "entropy": 1.9159539863467216, "epoch": 0.25496780353374526, "grad_norm": 4.097768306732178, "learning_rate": 5.010172147361801e-06, "loss": 0.542, "mean_token_accuracy": 0.8348632201552391, "num_tokens": 98882616.0, "step": 82250 }, { "entropy": 1.8219385787844657, "epoch": 0.25499880265879493, "grad_norm": 4.100628852844238, "learning_rate": 5.009867601693556e-06, "loss": 0.4851, "mean_token_accuracy": 0.8387814164161682, "num_tokens": 98895560.0, "step": 82260 }, { "entropy": 1.8934132039546967, "epoch": 0.25502980178384466, "grad_norm": 9.86856746673584, "learning_rate": 5.009563111554415e-06, "loss": 0.4734, "mean_token_accuracy": 0.8422671303153038, "num_tokens": 98906966.0, "step": 82270 }, { "entropy": 1.9088118925690651, "epoch": 0.2550608009088943, "grad_norm": 4.092050075531006, "learning_rate": 5.009258676927506e-06, "loss": 0.4658, "mean_token_accuracy": 0.845549589395523, "num_tokens": 98918825.0, "step": 82280 }, { "entropy": 1.9305264785885812, "epoch": 0.25509180003394405, "grad_norm": 9.015393257141113, "learning_rate": 5.008954297795962e-06, "loss": 0.5136, "mean_token_accuracy": 0.8419323205947876, "num_tokens": 98930506.0, "step": 82290 }, { "entropy": 1.9115617856383325, "epoch": 0.2551227991589937, "grad_norm": 9.485702514648438, "learning_rate": 5.008649974142926e-06, "loss": 0.4921, "mean_token_accuracy": 0.8377510070800781, "num_tokens": 98941230.0, "step": 82300 }, { "entropy": 1.8585439771413803, "epoch": 0.25515379828404344, "grad_norm": 8.97275447845459, "learning_rate": 5.0083457059515476e-06, "loss": 0.4594, "mean_token_accuracy": 0.8494518890976905, "num_tokens": 98953271.0, "step": 82310 }, { "entropy": 1.781735722720623, "epoch": 0.2551847974090931, "grad_norm": 3.6371991634368896, "learning_rate": 5.008041493204978e-06, "loss": 0.4071, "mean_token_accuracy": 0.8600785121321678, "num_tokens": 98966568.0, "step": 82320 }, { "entropy": 1.9445871606469154, "epoch": 0.25521579653414284, "grad_norm": 7.9240899085998535, "learning_rate": 5.007737335886387e-06, "loss": 0.4885, "mean_token_accuracy": 0.8479147583246232, "num_tokens": 98977867.0, "step": 82330 }, { "entropy": 1.9001877725124359, "epoch": 0.2552467956591925, "grad_norm": 9.06245231628418, "learning_rate": 5.007433233978939e-06, "loss": 0.4689, "mean_token_accuracy": 0.8466285720467568, "num_tokens": 98990050.0, "step": 82340 }, { "entropy": 1.96756109893322, "epoch": 0.25527779478424223, "grad_norm": 7.101724624633789, "learning_rate": 5.007129187465815e-06, "loss": 0.4917, "mean_token_accuracy": 0.8483997210860252, "num_tokens": 99001143.0, "step": 82350 }, { "entropy": 1.9655298113822937, "epoch": 0.2553087939092919, "grad_norm": 11.712712287902832, "learning_rate": 5.006825196330199e-06, "loss": 0.5492, "mean_token_accuracy": 0.8361737057566643, "num_tokens": 99012612.0, "step": 82360 }, { "entropy": 1.8864617615938186, "epoch": 0.2553397930343416, "grad_norm": 9.521370887756348, "learning_rate": 5.006521260555282e-06, "loss": 0.4869, "mean_token_accuracy": 0.8489149019122124, "num_tokens": 99024254.0, "step": 82370 }, { "entropy": 1.9336427330970765, "epoch": 0.2553707921593913, "grad_norm": 8.10055923461914, "learning_rate": 5.006217380124263e-06, "loss": 0.5025, "mean_token_accuracy": 0.8430611327290535, "num_tokens": 99035921.0, "step": 82380 }, { "entropy": 1.8473542481660843, "epoch": 0.255401791284441, "grad_norm": 3.776082992553711, "learning_rate": 5.005913555020348e-06, "loss": 0.4707, "mean_token_accuracy": 0.8381215184926987, "num_tokens": 99048817.0, "step": 82390 }, { "entropy": 1.9615378364920617, "epoch": 0.2554327904094907, "grad_norm": 4.9302215576171875, "learning_rate": 5.00560978522675e-06, "loss": 0.4888, "mean_token_accuracy": 0.8320355072617531, "num_tokens": 99060522.0, "step": 82400 }, { "entropy": 1.8505050733685493, "epoch": 0.2554637895345404, "grad_norm": 10.341068267822266, "learning_rate": 5.0053060707266894e-06, "loss": 0.4186, "mean_token_accuracy": 0.852343961596489, "num_tokens": 99073004.0, "step": 82410 }, { "entropy": 1.918088473379612, "epoch": 0.2554947886595901, "grad_norm": 7.087062835693359, "learning_rate": 5.005002411503397e-06, "loss": 0.4884, "mean_token_accuracy": 0.836826567351818, "num_tokens": 99084688.0, "step": 82420 }, { "entropy": 1.8801950231194495, "epoch": 0.2555257877846398, "grad_norm": 8.337331771850586, "learning_rate": 5.004698807540101e-06, "loss": 0.4613, "mean_token_accuracy": 0.8451901748776436, "num_tokens": 99097086.0, "step": 82430 }, { "entropy": 1.9792321264743804, "epoch": 0.25555678690968947, "grad_norm": 9.403802871704102, "learning_rate": 5.004395258820048e-06, "loss": 0.5651, "mean_token_accuracy": 0.8320185244083405, "num_tokens": 99108547.0, "step": 82440 }, { "entropy": 1.88391984552145, "epoch": 0.2555877860347392, "grad_norm": 7.921189308166504, "learning_rate": 5.004091765326484e-06, "loss": 0.4816, "mean_token_accuracy": 0.8392195641994477, "num_tokens": 99121168.0, "step": 82450 }, { "entropy": 1.936862115561962, "epoch": 0.25561878515978886, "grad_norm": 8.814833641052246, "learning_rate": 5.003788327042666e-06, "loss": 0.5028, "mean_token_accuracy": 0.842677928507328, "num_tokens": 99133107.0, "step": 82460 }, { "entropy": 1.900815899670124, "epoch": 0.25564978428483853, "grad_norm": 8.44135570526123, "learning_rate": 5.003484943951856e-06, "loss": 0.4759, "mean_token_accuracy": 0.8460040614008904, "num_tokens": 99145316.0, "step": 82470 }, { "entropy": 1.9658162996172905, "epoch": 0.25568078340988826, "grad_norm": 9.297836303710938, "learning_rate": 5.003181616037326e-06, "loss": 0.5277, "mean_token_accuracy": 0.842269380390644, "num_tokens": 99157003.0, "step": 82480 }, { "entropy": 1.8745719879865645, "epoch": 0.2557117825349379, "grad_norm": 6.9280805587768555, "learning_rate": 5.002878343282352e-06, "loss": 0.4638, "mean_token_accuracy": 0.8584736675024033, "num_tokens": 99168906.0, "step": 82490 }, { "entropy": 1.8955281019210815, "epoch": 0.25574278165998765, "grad_norm": 4.146218299865723, "learning_rate": 5.0025751256702195e-06, "loss": 0.4766, "mean_token_accuracy": 0.8466937229037285, "num_tokens": 99181020.0, "step": 82500 }, { "entropy": 1.9218576028943062, "epoch": 0.2557737807850373, "grad_norm": 7.757521629333496, "learning_rate": 5.0022719631842165e-06, "loss": 0.4913, "mean_token_accuracy": 0.8493802517652511, "num_tokens": 99192352.0, "step": 82510 }, { "entropy": 1.9906231686472893, "epoch": 0.25580477991008704, "grad_norm": 8.75942611694336, "learning_rate": 5.001968855807645e-06, "loss": 0.5567, "mean_token_accuracy": 0.8316405296325684, "num_tokens": 99203591.0, "step": 82520 }, { "entropy": 1.9479998797178268, "epoch": 0.2558357790351367, "grad_norm": 8.903733253479004, "learning_rate": 5.001665803523808e-06, "loss": 0.4778, "mean_token_accuracy": 0.8445803746581078, "num_tokens": 99214411.0, "step": 82530 }, { "entropy": 1.920227263867855, "epoch": 0.25586677816018644, "grad_norm": 8.966920852661133, "learning_rate": 5.001362806316021e-06, "loss": 0.5075, "mean_token_accuracy": 0.8462096214294433, "num_tokens": 99226223.0, "step": 82540 }, { "entropy": 1.914782066643238, "epoch": 0.2558977772852361, "grad_norm": 9.925724983215332, "learning_rate": 5.001059864167602e-06, "loss": 0.4852, "mean_token_accuracy": 0.8446723312139511, "num_tokens": 99238079.0, "step": 82550 }, { "entropy": 1.9496339425444602, "epoch": 0.25592877641028583, "grad_norm": 7.7270612716674805, "learning_rate": 5.000756977061877e-06, "loss": 0.4965, "mean_token_accuracy": 0.8507271915674209, "num_tokens": 99249265.0, "step": 82560 }, { "entropy": 1.9208646342158318, "epoch": 0.2559597755353355, "grad_norm": 7.940464019775391, "learning_rate": 5.000454144982181e-06, "loss": 0.4896, "mean_token_accuracy": 0.8384231016039848, "num_tokens": 99261128.0, "step": 82570 }, { "entropy": 1.9947969660162925, "epoch": 0.2559907746603852, "grad_norm": 8.898752212524414, "learning_rate": 5.000151367911854e-06, "loss": 0.5323, "mean_token_accuracy": 0.8397686898708343, "num_tokens": 99273022.0, "step": 82580 }, { "entropy": 2.0049765020608903, "epoch": 0.2560217737854349, "grad_norm": 8.215011596679688, "learning_rate": 4.999848645834245e-06, "loss": 0.5369, "mean_token_accuracy": 0.8358174741268158, "num_tokens": 99283993.0, "step": 82590 }, { "entropy": 1.9047711238265037, "epoch": 0.2560527729104846, "grad_norm": 7.696551322937012, "learning_rate": 4.999545978732709e-06, "loss": 0.4165, "mean_token_accuracy": 0.8554992079734802, "num_tokens": 99295756.0, "step": 82600 }, { "entropy": 1.7249569281935693, "epoch": 0.2560837720355343, "grad_norm": 3.8466684818267822, "learning_rate": 4.999243366590607e-06, "loss": 0.3453, "mean_token_accuracy": 0.8684180244803429, "num_tokens": 99310148.0, "step": 82610 }, { "entropy": 1.8993670761585235, "epoch": 0.256114771160584, "grad_norm": 4.150681972503662, "learning_rate": 4.998940809391308e-06, "loss": 0.4989, "mean_token_accuracy": 0.8339141398668289, "num_tokens": 99322579.0, "step": 82620 }, { "entropy": 1.8982570886611938, "epoch": 0.2561457702856337, "grad_norm": 7.722603797912598, "learning_rate": 4.998638307118189e-06, "loss": 0.4361, "mean_token_accuracy": 0.8633927822113037, "num_tokens": 99334404.0, "step": 82630 }, { "entropy": 1.8842903196811676, "epoch": 0.2561767694106834, "grad_norm": 8.307390213012695, "learning_rate": 4.998335859754634e-06, "loss": 0.4148, "mean_token_accuracy": 0.8525113880634307, "num_tokens": 99347056.0, "step": 82640 }, { "entropy": 1.9822999000549317, "epoch": 0.25620776853573307, "grad_norm": 3.5337090492248535, "learning_rate": 4.998033467284031e-06, "loss": 0.5451, "mean_token_accuracy": 0.8361595958471298, "num_tokens": 99359039.0, "step": 82650 }, { "entropy": 1.8225185006856919, "epoch": 0.2562387676607828, "grad_norm": 8.926518440246582, "learning_rate": 4.997731129689778e-06, "loss": 0.4907, "mean_token_accuracy": 0.8412432596087456, "num_tokens": 99373001.0, "step": 82660 }, { "entropy": 1.9116671606898308, "epoch": 0.25626976678583246, "grad_norm": 9.220154762268066, "learning_rate": 4.997428846955282e-06, "loss": 0.6017, "mean_token_accuracy": 0.8289077386260033, "num_tokens": 99385753.0, "step": 82670 }, { "entropy": 1.981253059208393, "epoch": 0.2563007659108822, "grad_norm": 9.764948844909668, "learning_rate": 4.99712661906395e-06, "loss": 0.5408, "mean_token_accuracy": 0.8350262567400932, "num_tokens": 99396793.0, "step": 82680 }, { "entropy": 1.922914719581604, "epoch": 0.25633176503593186, "grad_norm": 9.412257194519043, "learning_rate": 4.996824445999205e-06, "loss": 0.493, "mean_token_accuracy": 0.8538893148303032, "num_tokens": 99408399.0, "step": 82690 }, { "entropy": 1.9097683414816857, "epoch": 0.2563627641609816, "grad_norm": 3.221794366836548, "learning_rate": 4.996522327744468e-06, "loss": 0.4683, "mean_token_accuracy": 0.8507839366793633, "num_tokens": 99420256.0, "step": 82700 }, { "entropy": 1.8777303710579871, "epoch": 0.25639376328603125, "grad_norm": 8.001666069030762, "learning_rate": 4.996220264283173e-06, "loss": 0.4211, "mean_token_accuracy": 0.8601910769939423, "num_tokens": 99433400.0, "step": 82710 }, { "entropy": 1.9291005671024322, "epoch": 0.2564247624110809, "grad_norm": 10.794572830200195, "learning_rate": 4.99591825559876e-06, "loss": 0.5153, "mean_token_accuracy": 0.8337120041251183, "num_tokens": 99445875.0, "step": 82720 }, { "entropy": 1.8854432210326195, "epoch": 0.25645576153613064, "grad_norm": 3.7990822792053223, "learning_rate": 4.995616301674676e-06, "loss": 0.4319, "mean_token_accuracy": 0.8595590248703957, "num_tokens": 99458264.0, "step": 82730 }, { "entropy": 1.9592019245028496, "epoch": 0.2564867606611803, "grad_norm": 8.916022300720215, "learning_rate": 4.995314402494373e-06, "loss": 0.5263, "mean_token_accuracy": 0.8408660888671875, "num_tokens": 99469951.0, "step": 82740 }, { "entropy": 1.9431997925043105, "epoch": 0.25651775978623004, "grad_norm": 6.906679153442383, "learning_rate": 4.99501255804131e-06, "loss": 0.4109, "mean_token_accuracy": 0.8524601250886917, "num_tokens": 99482201.0, "step": 82750 }, { "entropy": 1.91104666441679, "epoch": 0.2565487589112797, "grad_norm": 7.423837184906006, "learning_rate": 4.9947107682989585e-06, "loss": 0.4479, "mean_token_accuracy": 0.8510598838329315, "num_tokens": 99494427.0, "step": 82760 }, { "entropy": 1.9561400279402732, "epoch": 0.25657975803632943, "grad_norm": 9.361892700195312, "learning_rate": 4.994409033250789e-06, "loss": 0.5083, "mean_token_accuracy": 0.8367945715785027, "num_tokens": 99506187.0, "step": 82770 }, { "entropy": 1.9289663940668107, "epoch": 0.2566107571613791, "grad_norm": 8.348969459533691, "learning_rate": 4.9941073528802855e-06, "loss": 0.4674, "mean_token_accuracy": 0.8491498351097106, "num_tokens": 99518185.0, "step": 82780 }, { "entropy": 1.9189318656921386, "epoch": 0.2566417562864288, "grad_norm": 10.50049877166748, "learning_rate": 4.993805727170934e-06, "loss": 0.4672, "mean_token_accuracy": 0.8476282939314842, "num_tokens": 99531191.0, "step": 82790 }, { "entropy": 1.9094542279839515, "epoch": 0.2566727554114785, "grad_norm": 8.101419448852539, "learning_rate": 4.993504156106232e-06, "loss": 0.4681, "mean_token_accuracy": 0.8485571637749671, "num_tokens": 99542937.0, "step": 82800 }, { "entropy": 1.880283573269844, "epoch": 0.2567037545365282, "grad_norm": 7.880818843841553, "learning_rate": 4.99320263966968e-06, "loss": 0.4746, "mean_token_accuracy": 0.8481839671730995, "num_tokens": 99555383.0, "step": 82810 }, { "entropy": 1.8960697516798972, "epoch": 0.2567347536615779, "grad_norm": Infinity, "learning_rate": 4.992901177844789e-06, "loss": 0.419, "mean_token_accuracy": 0.857147465646267, "num_tokens": 99567485.0, "step": 82820 }, { "entropy": 1.992298111319542, "epoch": 0.2567657527866276, "grad_norm": 7.422923564910889, "learning_rate": 4.992599770615074e-06, "loss": 0.492, "mean_token_accuracy": 0.8521854832768441, "num_tokens": 99578298.0, "step": 82830 }, { "entropy": 1.9098379969596864, "epoch": 0.2567967519116773, "grad_norm": 7.3638739585876465, "learning_rate": 4.992298417964059e-06, "loss": 0.4517, "mean_token_accuracy": 0.8458767265081406, "num_tokens": 99590734.0, "step": 82840 }, { "entropy": 1.8104907512664794, "epoch": 0.256827751036727, "grad_norm": 4.290040493011475, "learning_rate": 4.991997119875274e-06, "loss": 0.4357, "mean_token_accuracy": 0.8509851738810539, "num_tokens": 99603863.0, "step": 82850 }, { "entropy": 1.9496165543794632, "epoch": 0.25685875016177667, "grad_norm": 8.491985321044922, "learning_rate": 4.991695876332256e-06, "loss": 0.5497, "mean_token_accuracy": 0.8299289435148239, "num_tokens": 99614829.0, "step": 82860 }, { "entropy": 1.7824837416410446, "epoch": 0.2568897492868264, "grad_norm": 10.624833106994629, "learning_rate": 4.991394687318549e-06, "loss": 0.3994, "mean_token_accuracy": 0.8593706652522087, "num_tokens": 99627639.0, "step": 82870 }, { "entropy": 1.8935058623552323, "epoch": 0.25692074841187607, "grad_norm": 3.4996988773345947, "learning_rate": 4.9910935528177034e-06, "loss": 0.4734, "mean_token_accuracy": 0.8467808067798615, "num_tokens": 99639515.0, "step": 82880 }, { "entropy": 1.9544168338179588, "epoch": 0.2569517475369258, "grad_norm": 8.53878402709961, "learning_rate": 4.990792472813278e-06, "loss": 0.4862, "mean_token_accuracy": 0.846103484928608, "num_tokens": 99651187.0, "step": 82890 }, { "entropy": 1.9207098990678788, "epoch": 0.25698274666197546, "grad_norm": 9.49964427947998, "learning_rate": 4.990491447288838e-06, "loss": 0.5052, "mean_token_accuracy": 0.8317215725779533, "num_tokens": 99662564.0, "step": 82900 }, { "entropy": 1.876941241323948, "epoch": 0.2570137457870252, "grad_norm": 7.434007167816162, "learning_rate": 4.990190476227954e-06, "loss": 0.4525, "mean_token_accuracy": 0.8501386985182762, "num_tokens": 99674973.0, "step": 82910 }, { "entropy": 1.9188931852579116, "epoch": 0.25704474491207485, "grad_norm": 7.2484331130981445, "learning_rate": 4.989889559614206e-06, "loss": 0.4591, "mean_token_accuracy": 0.8477975860238075, "num_tokens": 99686447.0, "step": 82920 }, { "entropy": 1.8817449413239955, "epoch": 0.2570757440371246, "grad_norm": 4.548129081726074, "learning_rate": 4.98958869743118e-06, "loss": 0.4449, "mean_token_accuracy": 0.8410625770688057, "num_tokens": 99698742.0, "step": 82930 }, { "entropy": 1.8077333301305771, "epoch": 0.25710674316217424, "grad_norm": 8.342117309570312, "learning_rate": 4.989287889662468e-06, "loss": 0.3919, "mean_token_accuracy": 0.8545293301343918, "num_tokens": 99712362.0, "step": 82940 }, { "entropy": 1.9139455810189248, "epoch": 0.25713774228722397, "grad_norm": 5.589305400848389, "learning_rate": 4.988987136291668e-06, "loss": 0.4878, "mean_token_accuracy": 0.8507546544075012, "num_tokens": 99724409.0, "step": 82950 }, { "entropy": 1.8849102705717087, "epoch": 0.25716874141227364, "grad_norm": 8.964681625366211, "learning_rate": 4.98868643730239e-06, "loss": 0.5041, "mean_token_accuracy": 0.8342619195580483, "num_tokens": 99736094.0, "step": 82960 }, { "entropy": 1.9025137901306153, "epoch": 0.2571997405373233, "grad_norm": 4.793386936187744, "learning_rate": 4.9883857926782435e-06, "loss": 0.46, "mean_token_accuracy": 0.8469575837254524, "num_tokens": 99748673.0, "step": 82970 }, { "entropy": 1.9453041523694992, "epoch": 0.25723073966237303, "grad_norm": 6.930491924285889, "learning_rate": 4.988085202402852e-06, "loss": 0.4549, "mean_token_accuracy": 0.8533410027623176, "num_tokens": 99760599.0, "step": 82980 }, { "entropy": 1.9440397009253503, "epoch": 0.2572617387874227, "grad_norm": 10.45510196685791, "learning_rate": 4.987784666459842e-06, "loss": 0.5538, "mean_token_accuracy": 0.8280631914734841, "num_tokens": 99771974.0, "step": 82990 }, { "entropy": 1.9070080369710922, "epoch": 0.2572927379124724, "grad_norm": 10.232966423034668, "learning_rate": 4.987484184832846e-06, "loss": 0.4834, "mean_token_accuracy": 0.8449847355484963, "num_tokens": 99784534.0, "step": 83000 }, { "entropy": 1.9375251710414887, "epoch": 0.2573237370375221, "grad_norm": 3.9058806896209717, "learning_rate": 4.9871837575055064e-06, "loss": 0.4807, "mean_token_accuracy": 0.8462153524160385, "num_tokens": 99796298.0, "step": 83010 }, { "entropy": 1.9144327610731124, "epoch": 0.2573547361625718, "grad_norm": 10.11092472076416, "learning_rate": 4.9868833844614715e-06, "loss": 0.4911, "mean_token_accuracy": 0.8435170888900757, "num_tokens": 99807740.0, "step": 83020 }, { "entropy": 1.8731989122927188, "epoch": 0.2573857352876215, "grad_norm": 7.537945747375488, "learning_rate": 4.986583065684396e-06, "loss": 0.476, "mean_token_accuracy": 0.8457079976797104, "num_tokens": 99819888.0, "step": 83030 }, { "entropy": 1.9202357351779937, "epoch": 0.2574167344126712, "grad_norm": 8.85262393951416, "learning_rate": 4.986282801157941e-06, "loss": 0.4819, "mean_token_accuracy": 0.844878327846527, "num_tokens": 99831444.0, "step": 83040 }, { "entropy": 1.8694255024194717, "epoch": 0.2574477335377209, "grad_norm": 5.111952304840088, "learning_rate": 4.9859825908657735e-06, "loss": 0.512, "mean_token_accuracy": 0.835330268740654, "num_tokens": 99844184.0, "step": 83050 }, { "entropy": 1.9209684014320374, "epoch": 0.2574787326627706, "grad_norm": 6.903238773345947, "learning_rate": 4.985682434791573e-06, "loss": 0.5015, "mean_token_accuracy": 0.845658715069294, "num_tokens": 99856287.0, "step": 83060 }, { "entropy": 1.8327378258109093, "epoch": 0.2575097317878203, "grad_norm": 9.568243026733398, "learning_rate": 4.9853823329190185e-06, "loss": 0.4809, "mean_token_accuracy": 0.8382641449570656, "num_tokens": 99869468.0, "step": 83070 }, { "entropy": 1.9063044503331183, "epoch": 0.25754073091287, "grad_norm": 3.617058515548706, "learning_rate": 4.985082285231801e-06, "loss": 0.4922, "mean_token_accuracy": 0.8415888145565986, "num_tokens": 99881146.0, "step": 83080 }, { "entropy": 1.7789159893989563, "epoch": 0.25757173003791967, "grad_norm": 7.718278884887695, "learning_rate": 4.9847822917136154e-06, "loss": 0.4231, "mean_token_accuracy": 0.8528930768370628, "num_tokens": 99894684.0, "step": 83090 }, { "entropy": 1.946078921854496, "epoch": 0.2576027291629694, "grad_norm": 8.70456600189209, "learning_rate": 4.984482352348164e-06, "loss": 0.5013, "mean_token_accuracy": 0.8418860971927643, "num_tokens": 99905660.0, "step": 83100 }, { "entropy": 1.9768987134099008, "epoch": 0.25763372828801906, "grad_norm": 9.16089153289795, "learning_rate": 4.9841824671191594e-06, "loss": 0.5825, "mean_token_accuracy": 0.8343336895108223, "num_tokens": 99917120.0, "step": 83110 }, { "entropy": 1.889198412001133, "epoch": 0.2576647274130688, "grad_norm": 7.599856853485107, "learning_rate": 4.983882636010317e-06, "loss": 0.4276, "mean_token_accuracy": 0.8526261046528816, "num_tokens": 99928830.0, "step": 83120 }, { "entropy": 1.9562062606215478, "epoch": 0.25769572653811845, "grad_norm": 8.07655143737793, "learning_rate": 4.983582859005359e-06, "loss": 0.4726, "mean_token_accuracy": 0.831449817121029, "num_tokens": 99940719.0, "step": 83130 }, { "entropy": 1.896943534910679, "epoch": 0.2577267256631682, "grad_norm": 8.602059364318848, "learning_rate": 4.983283136088018e-06, "loss": 0.4598, "mean_token_accuracy": 0.8445844024419784, "num_tokens": 99953117.0, "step": 83140 }, { "entropy": 1.9443399429321289, "epoch": 0.25775772478821785, "grad_norm": 7.0254693031311035, "learning_rate": 4.982983467242029e-06, "loss": 0.5087, "mean_token_accuracy": 0.8422896787524223, "num_tokens": 99964325.0, "step": 83150 }, { "entropy": 1.9491709470748901, "epoch": 0.25778872391326757, "grad_norm": 11.82594108581543, "learning_rate": 4.982683852451138e-06, "loss": 0.5412, "mean_token_accuracy": 0.8369784370064736, "num_tokens": 99975197.0, "step": 83160 }, { "entropy": 1.8532451003789903, "epoch": 0.25781972303831724, "grad_norm": 3.9568448066711426, "learning_rate": 4.982384291699096e-06, "loss": 0.4711, "mean_token_accuracy": 0.8396353736519814, "num_tokens": 99987783.0, "step": 83170 }, { "entropy": 1.9464505165815353, "epoch": 0.25785072216336696, "grad_norm": 10.583782196044922, "learning_rate": 4.982084784969659e-06, "loss": 0.515, "mean_token_accuracy": 0.8377991825342178, "num_tokens": 100000020.0, "step": 83180 }, { "entropy": 1.9166669502854348, "epoch": 0.25788172128841663, "grad_norm": 9.20700740814209, "learning_rate": 4.981785332246592e-06, "loss": 0.4661, "mean_token_accuracy": 0.8458949193358422, "num_tokens": 100011762.0, "step": 83190 }, { "entropy": 1.9713429033756256, "epoch": 0.25791272041346636, "grad_norm": 9.420412063598633, "learning_rate": 4.981485933513668e-06, "loss": 0.4776, "mean_token_accuracy": 0.8544860139489174, "num_tokens": 100023022.0, "step": 83200 }, { "entropy": 1.9141755923628807, "epoch": 0.257943719538516, "grad_norm": 9.200664520263672, "learning_rate": 4.981186588754664e-06, "loss": 0.5146, "mean_token_accuracy": 0.8375491261482239, "num_tokens": 100034493.0, "step": 83210 }, { "entropy": 1.8373278394341468, "epoch": 0.2579747186635657, "grad_norm": 3.6070780754089355, "learning_rate": 4.980887297953366e-06, "loss": 0.4488, "mean_token_accuracy": 0.853793577849865, "num_tokens": 100047907.0, "step": 83220 }, { "entropy": 1.905658522248268, "epoch": 0.2580057177886154, "grad_norm": 33.18852996826172, "learning_rate": 4.980588061093565e-06, "loss": 0.4779, "mean_token_accuracy": 0.844824655354023, "num_tokens": 100059902.0, "step": 83230 }, { "entropy": 1.9518269658088685, "epoch": 0.2580367169136651, "grad_norm": 7.96235990524292, "learning_rate": 4.980288878159059e-06, "loss": 0.4883, "mean_token_accuracy": 0.8474616572260857, "num_tokens": 100070453.0, "step": 83240 }, { "entropy": 1.971319329738617, "epoch": 0.2580677160387148, "grad_norm": 8.494059562683105, "learning_rate": 4.9799897491336564e-06, "loss": 0.5307, "mean_token_accuracy": 0.8315307438373566, "num_tokens": 100082070.0, "step": 83250 }, { "entropy": 1.9625032275915146, "epoch": 0.2580987151637645, "grad_norm": 7.6323466300964355, "learning_rate": 4.979690674001167e-06, "loss": 0.5205, "mean_token_accuracy": 0.8371849060058594, "num_tokens": 100093397.0, "step": 83260 }, { "entropy": 1.8805483281612396, "epoch": 0.2581297142888142, "grad_norm": 8.261248588562012, "learning_rate": 4.979391652745411e-06, "loss": 0.4452, "mean_token_accuracy": 0.8561758771538734, "num_tokens": 100105997.0, "step": 83270 }, { "entropy": 1.8767956405878068, "epoch": 0.2581607134138639, "grad_norm": 3.6510441303253174, "learning_rate": 4.9790926853502125e-06, "loss": 0.4729, "mean_token_accuracy": 0.8472672030329704, "num_tokens": 100117793.0, "step": 83280 }, { "entropy": 1.7951404377818108, "epoch": 0.2581917125389136, "grad_norm": 4.375608921051025, "learning_rate": 4.978793771799407e-06, "loss": 0.3975, "mean_token_accuracy": 0.8583429366350174, "num_tokens": 100130835.0, "step": 83290 }, { "entropy": 1.971182020008564, "epoch": 0.25822271166396327, "grad_norm": 10.078766822814941, "learning_rate": 4.978494912076833e-06, "loss": 0.5194, "mean_token_accuracy": 0.8344864457845688, "num_tokens": 100141909.0, "step": 83300 }, { "entropy": 1.9942039638757705, "epoch": 0.258253710789013, "grad_norm": 8.207865715026855, "learning_rate": 4.978196106166336e-06, "loss": 0.574, "mean_token_accuracy": 0.828905712068081, "num_tokens": 100152944.0, "step": 83310 }, { "entropy": 1.8606795385479926, "epoch": 0.25828470991406266, "grad_norm": 4.853648662567139, "learning_rate": 4.97789735405177e-06, "loss": 0.4582, "mean_token_accuracy": 0.849544158577919, "num_tokens": 100166124.0, "step": 83320 }, { "entropy": 1.9509618058800697, "epoch": 0.2583157090391124, "grad_norm": 6.577378273010254, "learning_rate": 4.977598655716994e-06, "loss": 0.5034, "mean_token_accuracy": 0.8365138009190559, "num_tokens": 100177997.0, "step": 83330 }, { "entropy": 1.9413925766944886, "epoch": 0.25834670816416205, "grad_norm": 7.664201736450195, "learning_rate": 4.977300011145877e-06, "loss": 0.497, "mean_token_accuracy": 0.8374240696430206, "num_tokens": 100189432.0, "step": 83340 }, { "entropy": 1.9520146340131759, "epoch": 0.2583777072892118, "grad_norm": 3.7987093925476074, "learning_rate": 4.97700142032229e-06, "loss": 0.4852, "mean_token_accuracy": 0.8407413721084595, "num_tokens": 100201276.0, "step": 83350 }, { "entropy": 1.9646510049700736, "epoch": 0.25840870641426145, "grad_norm": 8.533371925354004, "learning_rate": 4.976702883230112e-06, "loss": 0.5696, "mean_token_accuracy": 0.8276545464992523, "num_tokens": 100212137.0, "step": 83360 }, { "entropy": 1.8566021844744682, "epoch": 0.25843970553931117, "grad_norm": 9.078433990478516, "learning_rate": 4.976404399853234e-06, "loss": 0.4749, "mean_token_accuracy": 0.8363482743501663, "num_tokens": 100224796.0, "step": 83370 }, { "entropy": 1.9266142144799232, "epoch": 0.25847070466436084, "grad_norm": 8.397682189941406, "learning_rate": 4.9761059701755475e-06, "loss": 0.4819, "mean_token_accuracy": 0.8483969137072563, "num_tokens": 100236575.0, "step": 83380 }, { "entropy": 1.9420603141188622, "epoch": 0.25850170378941056, "grad_norm": 8.181031227111816, "learning_rate": 4.975807594180953e-06, "loss": 0.5213, "mean_token_accuracy": 0.844786761701107, "num_tokens": 100248133.0, "step": 83390 }, { "entropy": 1.7950069807469844, "epoch": 0.25853270291446023, "grad_norm": 7.972817420959473, "learning_rate": 4.975509271853358e-06, "loss": 0.3808, "mean_token_accuracy": 0.8578445136547088, "num_tokens": 100261552.0, "step": 83400 }, { "entropy": 1.9505249321460725, "epoch": 0.25856370203950996, "grad_norm": 3.8303351402282715, "learning_rate": 4.975211003176676e-06, "loss": 0.5553, "mean_token_accuracy": 0.8300097927451133, "num_tokens": 100272407.0, "step": 83410 }, { "entropy": 1.9518933594226837, "epoch": 0.2585947011645596, "grad_norm": 8.778966903686523, "learning_rate": 4.97491278813483e-06, "loss": 0.4873, "mean_token_accuracy": 0.8443615302443505, "num_tokens": 100284483.0, "step": 83420 }, { "entropy": 1.825792530924082, "epoch": 0.25862570028960935, "grad_norm": 8.191307067871094, "learning_rate": 4.974614626711745e-06, "loss": 0.4156, "mean_token_accuracy": 0.8553913697600365, "num_tokens": 100298331.0, "step": 83430 }, { "entropy": 1.9290891095995903, "epoch": 0.258656699414659, "grad_norm": 8.639996528625488, "learning_rate": 4.9743165188913564e-06, "loss": 0.5654, "mean_token_accuracy": 0.8340361103415489, "num_tokens": 100309563.0, "step": 83440 }, { "entropy": 1.8901295900344848, "epoch": 0.25868769853970874, "grad_norm": 4.191493988037109, "learning_rate": 4.9740184646576036e-06, "loss": 0.4504, "mean_token_accuracy": 0.8442709073424339, "num_tokens": 100321894.0, "step": 83450 }, { "entropy": 1.8269127234816551, "epoch": 0.2587186976647584, "grad_norm": 4.574268817901611, "learning_rate": 4.9737204639944376e-06, "loss": 0.4119, "mean_token_accuracy": 0.8548773661255836, "num_tokens": 100334501.0, "step": 83460 }, { "entropy": 1.8532571867108345, "epoch": 0.2587496967898081, "grad_norm": 3.918750762939453, "learning_rate": 4.973422516885809e-06, "loss": 0.4792, "mean_token_accuracy": 0.846004131436348, "num_tokens": 100346656.0, "step": 83470 }, { "entropy": 1.8245261393487453, "epoch": 0.2587806959148578, "grad_norm": 3.927574872970581, "learning_rate": 4.973124623315682e-06, "loss": 0.4089, "mean_token_accuracy": 0.8500414654612541, "num_tokens": 100359699.0, "step": 83480 }, { "entropy": 1.77280533015728, "epoch": 0.2588116950399075, "grad_norm": 4.003964424133301, "learning_rate": 4.972826783268022e-06, "loss": 0.3831, "mean_token_accuracy": 0.8541926577687263, "num_tokens": 100373537.0, "step": 83490 }, { "entropy": 1.894656127691269, "epoch": 0.2588426941649572, "grad_norm": 8.42563247680664, "learning_rate": 4.972528996726807e-06, "loss": 0.494, "mean_token_accuracy": 0.8406502559781075, "num_tokens": 100385609.0, "step": 83500 }, { "entropy": 1.9357464522123338, "epoch": 0.25887369329000687, "grad_norm": 9.4671049118042, "learning_rate": 4.972231263676015e-06, "loss": 0.496, "mean_token_accuracy": 0.8405605420470238, "num_tokens": 100397032.0, "step": 83510 }, { "entropy": 1.9452675521373748, "epoch": 0.2589046924150566, "grad_norm": 12.05567455291748, "learning_rate": 4.971933584099637e-06, "loss": 0.5118, "mean_token_accuracy": 0.832793453335762, "num_tokens": 100408610.0, "step": 83520 }, { "entropy": 1.9751826629042626, "epoch": 0.25893569154010626, "grad_norm": 8.384647369384766, "learning_rate": 4.971635957981665e-06, "loss": 0.513, "mean_token_accuracy": 0.8382089316844941, "num_tokens": 100420251.0, "step": 83530 }, { "entropy": 1.911265040934086, "epoch": 0.258966690665156, "grad_norm": 8.947986602783203, "learning_rate": 4.971338385306102e-06, "loss": 0.4612, "mean_token_accuracy": 0.853412701189518, "num_tokens": 100432233.0, "step": 83540 }, { "entropy": 1.9375739723443985, "epoch": 0.25899768979020565, "grad_norm": 7.377249240875244, "learning_rate": 4.9710408660569555e-06, "loss": 0.4935, "mean_token_accuracy": 0.8441775634884834, "num_tokens": 100443781.0, "step": 83550 }, { "entropy": 1.9460734203457832, "epoch": 0.2590286889152554, "grad_norm": 8.327649116516113, "learning_rate": 4.970743400218241e-06, "loss": 0.5368, "mean_token_accuracy": 0.8340301439166069, "num_tokens": 100454515.0, "step": 83560 }, { "entropy": 1.9006448999047278, "epoch": 0.25905968804030505, "grad_norm": 4.181915760040283, "learning_rate": 4.9704459877739805e-06, "loss": 0.4663, "mean_token_accuracy": 0.8439620837569237, "num_tokens": 100466532.0, "step": 83570 }, { "entropy": 2.0110969811677935, "epoch": 0.25909068716535477, "grad_norm": 7.133867263793945, "learning_rate": 4.9701486287082e-06, "loss": 0.5219, "mean_token_accuracy": 0.84031240940094, "num_tokens": 100477071.0, "step": 83580 }, { "entropy": 1.8151050120592118, "epoch": 0.25912168629040444, "grad_norm": 4.122489929199219, "learning_rate": 4.9698513230049375e-06, "loss": 0.4686, "mean_token_accuracy": 0.8427058979868889, "num_tokens": 100490782.0, "step": 83590 }, { "entropy": 1.8606194391846658, "epoch": 0.25915268541545416, "grad_norm": 3.6981208324432373, "learning_rate": 4.969554070648234e-06, "loss": 0.456, "mean_token_accuracy": 0.8553460776805878, "num_tokens": 100503176.0, "step": 83600 }, { "entropy": 1.9358109071850778, "epoch": 0.25918368454050383, "grad_norm": 4.142949104309082, "learning_rate": 4.9692568716221355e-06, "loss": 0.5155, "mean_token_accuracy": 0.8340194925665856, "num_tokens": 100514798.0, "step": 83610 }, { "entropy": 1.8827938050031663, "epoch": 0.25921468366555356, "grad_norm": 8.898327827453613, "learning_rate": 4.968959725910699e-06, "loss": 0.4757, "mean_token_accuracy": 0.8450506746768951, "num_tokens": 100526985.0, "step": 83620 }, { "entropy": 1.9060676455497743, "epoch": 0.2592456827906032, "grad_norm": 9.307757377624512, "learning_rate": 4.968662633497986e-06, "loss": 0.4762, "mean_token_accuracy": 0.8441753014922142, "num_tokens": 100539529.0, "step": 83630 }, { "entropy": 1.8917318254709243, "epoch": 0.25927668191565295, "grad_norm": 7.30048942565918, "learning_rate": 4.968365594368065e-06, "loss": 0.4478, "mean_token_accuracy": 0.849764208495617, "num_tokens": 100551756.0, "step": 83640 }, { "entropy": 1.8924952074885368, "epoch": 0.2593076810407026, "grad_norm": 4.036615371704102, "learning_rate": 4.968068608505009e-06, "loss": 0.4507, "mean_token_accuracy": 0.8556442365050316, "num_tokens": 100564203.0, "step": 83650 }, { "entropy": 1.8785409778356552, "epoch": 0.25933868016575234, "grad_norm": 4.065882682800293, "learning_rate": 4.967771675892903e-06, "loss": 0.4462, "mean_token_accuracy": 0.8419948130846023, "num_tokens": 100575948.0, "step": 83660 }, { "entropy": 1.92302625477314, "epoch": 0.259369679290802, "grad_norm": 5.085048675537109, "learning_rate": 4.967474796515832e-06, "loss": 0.54, "mean_token_accuracy": 0.8281594708561897, "num_tokens": 100588049.0, "step": 83670 }, { "entropy": 1.8481286302208901, "epoch": 0.25940067841585174, "grad_norm": 10.417789459228516, "learning_rate": 4.9671779703578934e-06, "loss": 0.4989, "mean_token_accuracy": 0.848712483048439, "num_tokens": 100600632.0, "step": 83680 }, { "entropy": 1.8013351663947106, "epoch": 0.2594316775409014, "grad_norm": 8.973665237426758, "learning_rate": 4.966881197403189e-06, "loss": 0.4899, "mean_token_accuracy": 0.83788081407547, "num_tokens": 100613842.0, "step": 83690 }, { "entropy": 1.8483421131968498, "epoch": 0.25946267666595113, "grad_norm": 7.858611583709717, "learning_rate": 4.966584477635825e-06, "loss": 0.4966, "mean_token_accuracy": 0.8433775931596756, "num_tokens": 100626204.0, "step": 83700 }, { "entropy": 1.8308247201144696, "epoch": 0.2594936757910008, "grad_norm": 8.385130882263184, "learning_rate": 4.966287811039917e-06, "loss": 0.4519, "mean_token_accuracy": 0.8558708354830742, "num_tokens": 100639013.0, "step": 83710 }, { "entropy": 1.9155432254076004, "epoch": 0.25952467491605047, "grad_norm": 9.558300971984863, "learning_rate": 4.965991197599587e-06, "loss": 0.4991, "mean_token_accuracy": 0.8412322223186492, "num_tokens": 100649836.0, "step": 83720 }, { "entropy": 1.8477103784680367, "epoch": 0.2595556740411002, "grad_norm": 3.852271556854248, "learning_rate": 4.965694637298964e-06, "loss": 0.4422, "mean_token_accuracy": 0.848893304169178, "num_tokens": 100662993.0, "step": 83730 }, { "entropy": 1.9299596205353737, "epoch": 0.25958667316614986, "grad_norm": 8.642110824584961, "learning_rate": 4.9653981301221825e-06, "loss": 0.5333, "mean_token_accuracy": 0.8376413077116013, "num_tokens": 100674572.0, "step": 83740 }, { "entropy": 1.8460013434290885, "epoch": 0.2596176722911996, "grad_norm": 10.720597267150879, "learning_rate": 4.9651016760533816e-06, "loss": 0.4302, "mean_token_accuracy": 0.8525181710720062, "num_tokens": 100687154.0, "step": 83750 }, { "entropy": 1.8346559152007103, "epoch": 0.25964867141624925, "grad_norm": 7.259960651397705, "learning_rate": 4.964805275076713e-06, "loss": 0.4357, "mean_token_accuracy": 0.8434622973203659, "num_tokens": 100699665.0, "step": 83760 }, { "entropy": 1.9489260196685791, "epoch": 0.259679670541299, "grad_norm": 8.808201789855957, "learning_rate": 4.96450892717633e-06, "loss": 0.5478, "mean_token_accuracy": 0.8384017795324326, "num_tokens": 100710924.0, "step": 83770 }, { "entropy": 1.771421131491661, "epoch": 0.25971066966634865, "grad_norm": 7.782841682434082, "learning_rate": 4.964212632336392e-06, "loss": 0.4088, "mean_token_accuracy": 0.8582013815641403, "num_tokens": 100724256.0, "step": 83780 }, { "entropy": 1.8205000385642052, "epoch": 0.2597416687913984, "grad_norm": 7.884888648986816, "learning_rate": 4.963916390541071e-06, "loss": 0.4246, "mean_token_accuracy": 0.8426259756088257, "num_tokens": 100737066.0, "step": 83790 }, { "entropy": 1.9249374747276307, "epoch": 0.25977266791644804, "grad_norm": 8.629660606384277, "learning_rate": 4.963620201774537e-06, "loss": 0.5384, "mean_token_accuracy": 0.8426753237843514, "num_tokens": 100748371.0, "step": 83800 }, { "entropy": 1.869993396103382, "epoch": 0.25980366704149777, "grad_norm": 7.948463439941406, "learning_rate": 4.963324066020974e-06, "loss": 0.5011, "mean_token_accuracy": 0.8404388725757599, "num_tokens": 100760062.0, "step": 83810 }, { "entropy": 1.8810945719480514, "epoch": 0.25983466616654743, "grad_norm": 8.10258960723877, "learning_rate": 4.963027983264569e-06, "loss": 0.4583, "mean_token_accuracy": 0.85190070271492, "num_tokens": 100771799.0, "step": 83820 }, { "entropy": 1.932461032271385, "epoch": 0.25986566529159716, "grad_norm": 10.165177345275879, "learning_rate": 4.9627319534895166e-06, "loss": 0.5158, "mean_token_accuracy": 0.8437979429960251, "num_tokens": 100783029.0, "step": 83830 }, { "entropy": 1.8803840085864068, "epoch": 0.2598966644166468, "grad_norm": 3.2739675045013428, "learning_rate": 4.962435976680019e-06, "loss": 0.4717, "mean_token_accuracy": 0.853711499273777, "num_tokens": 100794075.0, "step": 83840 }, { "entropy": 1.879416285455227, "epoch": 0.25992766354169655, "grad_norm": 8.004817008972168, "learning_rate": 4.9621400528202814e-06, "loss": 0.5403, "mean_token_accuracy": 0.829861244559288, "num_tokens": 100805286.0, "step": 83850 }, { "entropy": 1.8929785311222076, "epoch": 0.2599586626667462, "grad_norm": 8.447615623474121, "learning_rate": 4.96184418189452e-06, "loss": 0.5056, "mean_token_accuracy": 0.8426691144704819, "num_tokens": 100816771.0, "step": 83860 }, { "entropy": 1.8302429109811782, "epoch": 0.25998966179179595, "grad_norm": 6.880725860595703, "learning_rate": 4.9615483638869545e-06, "loss": 0.4712, "mean_token_accuracy": 0.8440744116902351, "num_tokens": 100829800.0, "step": 83870 }, { "entropy": 1.947381141781807, "epoch": 0.2600206609168456, "grad_norm": 8.487883567810059, "learning_rate": 4.961252598781812e-06, "loss": 0.5444, "mean_token_accuracy": 0.8282511562108994, "num_tokens": 100840370.0, "step": 83880 }, { "entropy": 1.8844257071614265, "epoch": 0.26005166004189534, "grad_norm": 5.970224380493164, "learning_rate": 4.960956886563326e-06, "loss": 0.5086, "mean_token_accuracy": 0.8489205956459045, "num_tokens": 100852053.0, "step": 83890 }, { "entropy": 1.783471368253231, "epoch": 0.260082659166945, "grad_norm": 9.093459129333496, "learning_rate": 4.960661227215739e-06, "loss": 0.4188, "mean_token_accuracy": 0.858287438750267, "num_tokens": 100865934.0, "step": 83900 }, { "entropy": 1.9448846295475959, "epoch": 0.26011365829199473, "grad_norm": 4.641866683959961, "learning_rate": 4.960365620723297e-06, "loss": 0.5616, "mean_token_accuracy": 0.8277464538812638, "num_tokens": 100876883.0, "step": 83910 }, { "entropy": 1.8289760783314706, "epoch": 0.2601446574170444, "grad_norm": 4.197445392608643, "learning_rate": 4.960070067070255e-06, "loss": 0.4408, "mean_token_accuracy": 0.8453979283571244, "num_tokens": 100889433.0, "step": 83920 }, { "entropy": 1.7806072607636452, "epoch": 0.2601756565420941, "grad_norm": 3.8781611919403076, "learning_rate": 4.95977456624087e-06, "loss": 0.3753, "mean_token_accuracy": 0.85598254352808, "num_tokens": 100902238.0, "step": 83930 }, { "entropy": 1.8489570021629333, "epoch": 0.2602066556671438, "grad_norm": 4.011428356170654, "learning_rate": 4.95947911821941e-06, "loss": 0.447, "mean_token_accuracy": 0.8497495472431182, "num_tokens": 100914487.0, "step": 83940 }, { "entropy": 1.8972808972001076, "epoch": 0.2602376547921935, "grad_norm": 7.408459663391113, "learning_rate": 4.959183722990151e-06, "loss": 0.4737, "mean_token_accuracy": 0.8526468485593796, "num_tokens": 100925789.0, "step": 83950 }, { "entropy": 1.8424657888710498, "epoch": 0.2602686539172432, "grad_norm": 8.171313285827637, "learning_rate": 4.9588883805373686e-06, "loss": 0.3904, "mean_token_accuracy": 0.8596995994448662, "num_tokens": 100937994.0, "step": 83960 }, { "entropy": 1.8914167776703834, "epoch": 0.26029965304229286, "grad_norm": 7.714399814605713, "learning_rate": 4.958593090845352e-06, "loss": 0.4511, "mean_token_accuracy": 0.8574990868568421, "num_tokens": 100949553.0, "step": 83970 }, { "entropy": 1.8375475853681564, "epoch": 0.2603306521673426, "grad_norm": 9.123080253601074, "learning_rate": 4.958297853898395e-06, "loss": 0.4537, "mean_token_accuracy": 0.8516534611582756, "num_tokens": 100962264.0, "step": 83980 }, { "entropy": 1.9303360790014268, "epoch": 0.26036165129239225, "grad_norm": 9.462535858154297, "learning_rate": 4.958002669680794e-06, "loss": 0.4873, "mean_token_accuracy": 0.8448026522994041, "num_tokens": 100972790.0, "step": 83990 }, { "entropy": 1.9033367425203322, "epoch": 0.260392650417442, "grad_norm": 6.338131427764893, "learning_rate": 4.957707538176859e-06, "loss": 0.4897, "mean_token_accuracy": 0.837484510242939, "num_tokens": 100984286.0, "step": 84000 }, { "entropy": 1.9744462817907333, "epoch": 0.26042364954249164, "grad_norm": 8.549060821533203, "learning_rate": 4.9574124593708985e-06, "loss": 0.5128, "mean_token_accuracy": 0.8344295799732209, "num_tokens": 100995371.0, "step": 84010 }, { "entropy": 1.7369410261511802, "epoch": 0.26045464866754137, "grad_norm": 8.24365520477295, "learning_rate": 4.957117433247236e-06, "loss": 0.4083, "mean_token_accuracy": 0.8557371169328689, "num_tokens": 101009845.0, "step": 84020 }, { "entropy": 1.9079995438456536, "epoch": 0.26048564779259104, "grad_norm": 9.164064407348633, "learning_rate": 4.956822459790193e-06, "loss": 0.5175, "mean_token_accuracy": 0.8357024490833282, "num_tokens": 101021686.0, "step": 84030 }, { "entropy": 1.8953372284770011, "epoch": 0.26051664691764076, "grad_norm": 9.431413650512695, "learning_rate": 4.956527538984104e-06, "loss": 0.4786, "mean_token_accuracy": 0.8483018189668655, "num_tokens": 101033454.0, "step": 84040 }, { "entropy": 1.8954351365566253, "epoch": 0.26054764604269043, "grad_norm": 3.8915398120880127, "learning_rate": 4.956232670813308e-06, "loss": 0.4902, "mean_token_accuracy": 0.8407752260565757, "num_tokens": 101044798.0, "step": 84050 }, { "entropy": 1.8448843389749527, "epoch": 0.26057864516774015, "grad_norm": 4.189465045928955, "learning_rate": 4.955937855262149e-06, "loss": 0.436, "mean_token_accuracy": 0.85302524715662, "num_tokens": 101057618.0, "step": 84060 }, { "entropy": 1.832655143737793, "epoch": 0.2606096442927898, "grad_norm": 7.561342239379883, "learning_rate": 4.955643092314979e-06, "loss": 0.4504, "mean_token_accuracy": 0.8396886929869651, "num_tokens": 101069666.0, "step": 84070 }, { "entropy": 1.898691761493683, "epoch": 0.26064064341783955, "grad_norm": 8.714390754699707, "learning_rate": 4.955348381956157e-06, "loss": 0.5377, "mean_token_accuracy": 0.827970777451992, "num_tokens": 101081285.0, "step": 84080 }, { "entropy": 1.7665072850883008, "epoch": 0.2606716425428892, "grad_norm": 8.24121379852295, "learning_rate": 4.9550537241700474e-06, "loss": 0.4068, "mean_token_accuracy": 0.8553405046463013, "num_tokens": 101094537.0, "step": 84090 }, { "entropy": 1.8713910579681396, "epoch": 0.26070264166793894, "grad_norm": 9.43482780456543, "learning_rate": 4.954759118941024e-06, "loss": 0.4749, "mean_token_accuracy": 0.845010656118393, "num_tokens": 101106520.0, "step": 84100 }, { "entropy": 1.8151496931910516, "epoch": 0.2607336407929886, "grad_norm": 9.085859298706055, "learning_rate": 4.954464566253459e-06, "loss": 0.4454, "mean_token_accuracy": 0.8474389865994454, "num_tokens": 101119130.0, "step": 84110 }, { "entropy": 1.9014802396297454, "epoch": 0.26076463991803833, "grad_norm": 3.9436869621276855, "learning_rate": 4.95417006609174e-06, "loss": 0.5645, "mean_token_accuracy": 0.8273421004414558, "num_tokens": 101130816.0, "step": 84120 }, { "entropy": 1.9717437848448753, "epoch": 0.260795639043088, "grad_norm": 8.339731216430664, "learning_rate": 4.953875618440259e-06, "loss": 0.5301, "mean_token_accuracy": 0.8363761708140374, "num_tokens": 101142058.0, "step": 84130 }, { "entropy": 1.8874025538563728, "epoch": 0.2608266381681377, "grad_norm": 5.998051643371582, "learning_rate": 4.953581223283413e-06, "loss": 0.4681, "mean_token_accuracy": 0.8477589651942253, "num_tokens": 101153159.0, "step": 84140 }, { "entropy": 1.7967896267771721, "epoch": 0.2608576372931874, "grad_norm": 8.227131843566895, "learning_rate": 4.953286880605603e-06, "loss": 0.4636, "mean_token_accuracy": 0.8455402344465256, "num_tokens": 101165998.0, "step": 84150 }, { "entropy": 1.8968286886811256, "epoch": 0.2608886364182371, "grad_norm": 7.774280071258545, "learning_rate": 4.952992590391242e-06, "loss": 0.4362, "mean_token_accuracy": 0.8559000626206398, "num_tokens": 101177800.0, "step": 84160 }, { "entropy": 1.7744996875524521, "epoch": 0.2609196355432868, "grad_norm": 8.268373489379883, "learning_rate": 4.952698352624746e-06, "loss": 0.4205, "mean_token_accuracy": 0.8595765858888627, "num_tokens": 101190665.0, "step": 84170 }, { "entropy": 1.8480232998728752, "epoch": 0.2609506346683365, "grad_norm": 7.481622219085693, "learning_rate": 4.952404167290538e-06, "loss": 0.4727, "mean_token_accuracy": 0.849323433637619, "num_tokens": 101203178.0, "step": 84180 }, { "entropy": 1.8875873282551765, "epoch": 0.2609816337933862, "grad_norm": 4.315346717834473, "learning_rate": 4.952110034373047e-06, "loss": 0.462, "mean_token_accuracy": 0.8455449759960174, "num_tokens": 101214641.0, "step": 84190 }, { "entropy": 1.7977977305650712, "epoch": 0.26101263291843585, "grad_norm": 7.563590049743652, "learning_rate": 4.9518159538567115e-06, "loss": 0.4434, "mean_token_accuracy": 0.8515077039599419, "num_tokens": 101227190.0, "step": 84200 }, { "entropy": 1.8671275533735752, "epoch": 0.2610436320434856, "grad_norm": 8.546191215515137, "learning_rate": 4.951521925725971e-06, "loss": 0.4726, "mean_token_accuracy": 0.8378291219472885, "num_tokens": 101239940.0, "step": 84210 }, { "entropy": 1.9210672780871392, "epoch": 0.26107463116853524, "grad_norm": 8.786494255065918, "learning_rate": 4.951227949965277e-06, "loss": 0.5202, "mean_token_accuracy": 0.8403745800256729, "num_tokens": 101251411.0, "step": 84220 }, { "entropy": 1.870493806898594, "epoch": 0.26110563029358497, "grad_norm": 7.429269313812256, "learning_rate": 4.9509340265590865e-06, "loss": 0.4554, "mean_token_accuracy": 0.8402852207422257, "num_tokens": 101263421.0, "step": 84230 }, { "entropy": 1.8900667324662208, "epoch": 0.26113662941863464, "grad_norm": 9.321823120117188, "learning_rate": 4.950640155491857e-06, "loss": 0.4765, "mean_token_accuracy": 0.8420423567295074, "num_tokens": 101275836.0, "step": 84240 }, { "entropy": 1.6973878130316735, "epoch": 0.26116762854368436, "grad_norm": 7.488030910491943, "learning_rate": 4.95034633674806e-06, "loss": 0.3739, "mean_token_accuracy": 0.868793734908104, "num_tokens": 101289722.0, "step": 84250 }, { "entropy": 1.8956893503665924, "epoch": 0.26119862766873403, "grad_norm": 8.01175594329834, "learning_rate": 4.950052570312171e-06, "loss": 0.5072, "mean_token_accuracy": 0.8392576932907104, "num_tokens": 101301555.0, "step": 84260 }, { "entropy": 1.9166808992624282, "epoch": 0.26122962679378375, "grad_norm": 6.5919928550720215, "learning_rate": 4.94975885616867e-06, "loss": 0.5282, "mean_token_accuracy": 0.8426938772201538, "num_tokens": 101312392.0, "step": 84270 }, { "entropy": 1.9476675420999527, "epoch": 0.2612606259188334, "grad_norm": 9.677026748657227, "learning_rate": 4.9494651943020455e-06, "loss": 0.5275, "mean_token_accuracy": 0.8333592966198922, "num_tokens": 101323006.0, "step": 84280 }, { "entropy": 1.818124982714653, "epoch": 0.26129162504388315, "grad_norm": 4.290999889373779, "learning_rate": 4.94917158469679e-06, "loss": 0.4158, "mean_token_accuracy": 0.8480872571468353, "num_tokens": 101335938.0, "step": 84290 }, { "entropy": 1.8131288312375546, "epoch": 0.2613226241689328, "grad_norm": 8.009013175964355, "learning_rate": 4.948878027337407e-06, "loss": 0.4989, "mean_token_accuracy": 0.839995177090168, "num_tokens": 101349763.0, "step": 84300 }, { "entropy": 1.8972848400473594, "epoch": 0.26135362329398254, "grad_norm": 8.380783081054688, "learning_rate": 4.948584522208402e-06, "loss": 0.4657, "mean_token_accuracy": 0.8484173402190208, "num_tokens": 101361738.0, "step": 84310 }, { "entropy": 1.91866305321455, "epoch": 0.2613846224190322, "grad_norm": 5.500089645385742, "learning_rate": 4.948291069294289e-06, "loss": 0.5167, "mean_token_accuracy": 0.8474255546927452, "num_tokens": 101373866.0, "step": 84320 }, { "entropy": 1.777488799393177, "epoch": 0.26141562154408193, "grad_norm": 8.77873706817627, "learning_rate": 4.947997668579589e-06, "loss": 0.4046, "mean_token_accuracy": 0.8580469012260437, "num_tokens": 101387715.0, "step": 84330 }, { "entropy": 1.9092938348650932, "epoch": 0.2614466206691316, "grad_norm": 8.913098335266113, "learning_rate": 4.947704320048827e-06, "loss": 0.5127, "mean_token_accuracy": 0.8450995326042176, "num_tokens": 101399203.0, "step": 84340 }, { "entropy": 1.8070560455322267, "epoch": 0.2614776197941813, "grad_norm": 8.269847869873047, "learning_rate": 4.947411023686535e-06, "loss": 0.4214, "mean_token_accuracy": 0.8473980888724327, "num_tokens": 101411534.0, "step": 84350 }, { "entropy": 1.8068130150437356, "epoch": 0.261508618919231, "grad_norm": 4.175093173980713, "learning_rate": 4.9471177794772555e-06, "loss": 0.4111, "mean_token_accuracy": 0.8587175875902175, "num_tokens": 101425216.0, "step": 84360 }, { "entropy": 1.906698650121689, "epoch": 0.2615396180442807, "grad_norm": 9.216361999511719, "learning_rate": 4.946824587405532e-06, "loss": 0.5222, "mean_token_accuracy": 0.8331103935837746, "num_tokens": 101436502.0, "step": 84370 }, { "entropy": 1.7797369614243508, "epoch": 0.2615706171693304, "grad_norm": 2.6891257762908936, "learning_rate": 4.946531447455915e-06, "loss": 0.3698, "mean_token_accuracy": 0.8665702223777771, "num_tokens": 101449912.0, "step": 84380 }, { "entropy": 1.8645511791110039, "epoch": 0.2616016162943801, "grad_norm": 4.295123100280762, "learning_rate": 4.946238359612967e-06, "loss": 0.4511, "mean_token_accuracy": 0.8417528823018074, "num_tokens": 101463078.0, "step": 84390 }, { "entropy": 1.8012978717684747, "epoch": 0.2616326154194298, "grad_norm": 7.812502861022949, "learning_rate": 4.945945323861249e-06, "loss": 0.4191, "mean_token_accuracy": 0.8614039555191993, "num_tokens": 101475914.0, "step": 84400 }, { "entropy": 1.886396862566471, "epoch": 0.2616636145444795, "grad_norm": 7.294875144958496, "learning_rate": 4.945652340185336e-06, "loss": 0.4551, "mean_token_accuracy": 0.8441071733832359, "num_tokens": 101487890.0, "step": 84410 }, { "entropy": 1.7817871391773223, "epoch": 0.2616946136695292, "grad_norm": 10.357475280761719, "learning_rate": 4.9453594085698036e-06, "loss": 0.4135, "mean_token_accuracy": 0.8648449271917343, "num_tokens": 101501358.0, "step": 84420 }, { "entropy": 1.9272992476820945, "epoch": 0.2617256127945789, "grad_norm": 9.080680847167969, "learning_rate": 4.9450665289992355e-06, "loss": 0.5201, "mean_token_accuracy": 0.8325537592172623, "num_tokens": 101513086.0, "step": 84430 }, { "entropy": 1.9224831491708756, "epoch": 0.26175661191962857, "grad_norm": 8.535758972167969, "learning_rate": 4.9447737014582235e-06, "loss": 0.4723, "mean_token_accuracy": 0.8473838672041893, "num_tokens": 101524876.0, "step": 84440 }, { "entropy": 1.8453692942857742, "epoch": 0.26178761104467824, "grad_norm": 4.549447059631348, "learning_rate": 4.9444809259313635e-06, "loss": 0.4148, "mean_token_accuracy": 0.8549572035670281, "num_tokens": 101537098.0, "step": 84450 }, { "entropy": 1.9371949970722198, "epoch": 0.26181861016972796, "grad_norm": 8.131068229675293, "learning_rate": 4.944188202403261e-06, "loss": 0.5529, "mean_token_accuracy": 0.8304179921746254, "num_tokens": 101548246.0, "step": 84460 }, { "entropy": 1.8725028142333031, "epoch": 0.26184960929477763, "grad_norm": 7.878946781158447, "learning_rate": 4.943895530858521e-06, "loss": 0.4804, "mean_token_accuracy": 0.840125747025013, "num_tokens": 101560349.0, "step": 84470 }, { "entropy": 1.86340638846159, "epoch": 0.26188060841982735, "grad_norm": 7.6368818283081055, "learning_rate": 4.943602911281764e-06, "loss": 0.4558, "mean_token_accuracy": 0.8389788880944252, "num_tokens": 101572940.0, "step": 84480 }, { "entropy": 1.9475991562008859, "epoch": 0.261911607544877, "grad_norm": 8.88925552368164, "learning_rate": 4.943310343657611e-06, "loss": 0.5051, "mean_token_accuracy": 0.8385830983519554, "num_tokens": 101584643.0, "step": 84490 }, { "entropy": 1.9739305421710014, "epoch": 0.26194260666992675, "grad_norm": 6.695905685424805, "learning_rate": 4.943017827970689e-06, "loss": 0.5063, "mean_token_accuracy": 0.8335414931178093, "num_tokens": 101595908.0, "step": 84500 }, { "entropy": 1.9505710929632187, "epoch": 0.2619736057949764, "grad_norm": 8.216191291809082, "learning_rate": 4.942725364205635e-06, "loss": 0.5215, "mean_token_accuracy": 0.8390222921967506, "num_tokens": 101606676.0, "step": 84510 }, { "entropy": 1.932286986708641, "epoch": 0.26200460492002614, "grad_norm": 7.449299335479736, "learning_rate": 4.942432952347092e-06, "loss": 0.5658, "mean_token_accuracy": 0.8347490966320038, "num_tokens": 101618109.0, "step": 84520 }, { "entropy": 1.9106614410877227, "epoch": 0.2620356040450758, "grad_norm": 9.041545867919922, "learning_rate": 4.942140592379704e-06, "loss": 0.5185, "mean_token_accuracy": 0.8461478322744369, "num_tokens": 101629907.0, "step": 84530 }, { "entropy": 1.8061072051525116, "epoch": 0.26206660317012553, "grad_norm": 9.392251968383789, "learning_rate": 4.941848284288129e-06, "loss": 0.4095, "mean_token_accuracy": 0.8683505341410637, "num_tokens": 101642763.0, "step": 84540 }, { "entropy": 1.8147373288869857, "epoch": 0.2620976022951752, "grad_norm": 3.8025999069213867, "learning_rate": 4.941556028057024e-06, "loss": 0.3993, "mean_token_accuracy": 0.8555989354848862, "num_tokens": 101655606.0, "step": 84550 }, { "entropy": 1.9344567239284516, "epoch": 0.2621286014202249, "grad_norm": 9.05136489868164, "learning_rate": 4.9412638236710595e-06, "loss": 0.5001, "mean_token_accuracy": 0.8368930295109749, "num_tokens": 101667711.0, "step": 84560 }, { "entropy": 1.882543933391571, "epoch": 0.2621596005452746, "grad_norm": 7.798995018005371, "learning_rate": 4.940971671114905e-06, "loss": 0.4799, "mean_token_accuracy": 0.852934005856514, "num_tokens": 101678953.0, "step": 84570 }, { "entropy": 1.9046371474862098, "epoch": 0.2621905996703243, "grad_norm": 8.572905540466309, "learning_rate": 4.9406795703732436e-06, "loss": 0.4999, "mean_token_accuracy": 0.8400033757090568, "num_tokens": 101690994.0, "step": 84580 }, { "entropy": 1.955969288945198, "epoch": 0.262221598795374, "grad_norm": 6.911218166351318, "learning_rate": 4.94038752143076e-06, "loss": 0.5409, "mean_token_accuracy": 0.8287007570266723, "num_tokens": 101702443.0, "step": 84590 }, { "entropy": 1.8909879684448243, "epoch": 0.2622525979204237, "grad_norm": 9.11408519744873, "learning_rate": 4.940095524272145e-06, "loss": 0.4748, "mean_token_accuracy": 0.8456226110458374, "num_tokens": 101714218.0, "step": 84600 }, { "entropy": 1.9254190102219582, "epoch": 0.2622835970454734, "grad_norm": 6.650926113128662, "learning_rate": 4.939803578882099e-06, "loss": 0.446, "mean_token_accuracy": 0.8574258908629417, "num_tokens": 101725211.0, "step": 84610 }, { "entropy": 1.9131127685308456, "epoch": 0.2623145961705231, "grad_norm": 8.146158218383789, "learning_rate": 4.939511685245327e-06, "loss": 0.4687, "mean_token_accuracy": 0.8508924171328545, "num_tokens": 101736251.0, "step": 84620 }, { "entropy": 1.9253426373004914, "epoch": 0.2623455952955728, "grad_norm": 6.606905937194824, "learning_rate": 4.939219843346538e-06, "loss": 0.4778, "mean_token_accuracy": 0.8473613321781158, "num_tokens": 101748009.0, "step": 84630 }, { "entropy": 1.9077803820371628, "epoch": 0.2623765944206225, "grad_norm": 9.684782981872559, "learning_rate": 4.938928053170453e-06, "loss": 0.5242, "mean_token_accuracy": 0.8346043735742569, "num_tokens": 101759254.0, "step": 84640 }, { "entropy": 1.9003472611308099, "epoch": 0.26240759354567217, "grad_norm": 4.827385425567627, "learning_rate": 4.938636314701793e-06, "loss": 0.5056, "mean_token_accuracy": 0.8391556084156037, "num_tokens": 101770387.0, "step": 84650 }, { "entropy": 1.911148864030838, "epoch": 0.2624385926707219, "grad_norm": 8.909619331359863, "learning_rate": 4.9383446279252895e-06, "loss": 0.4964, "mean_token_accuracy": 0.8415949329733848, "num_tokens": 101781627.0, "step": 84660 }, { "entropy": 1.8494424894452095, "epoch": 0.26246959179577156, "grad_norm": 4.314811706542969, "learning_rate": 4.93805299282568e-06, "loss": 0.4077, "mean_token_accuracy": 0.8588860586285592, "num_tokens": 101794354.0, "step": 84670 }, { "entropy": 1.9854395300149918, "epoch": 0.2625005909208213, "grad_norm": 4.768241882324219, "learning_rate": 4.937761409387704e-06, "loss": 0.5498, "mean_token_accuracy": 0.8315106898546218, "num_tokens": 101805875.0, "step": 84680 }, { "entropy": 1.8694578632712364, "epoch": 0.26253159004587096, "grad_norm": 11.459022521972656, "learning_rate": 4.937469877596115e-06, "loss": 0.4507, "mean_token_accuracy": 0.8435337409377098, "num_tokens": 101818131.0, "step": 84690 }, { "entropy": 1.9157974675297738, "epoch": 0.2625625891709206, "grad_norm": 10.549239158630371, "learning_rate": 4.9371783974356665e-06, "loss": 0.4791, "mean_token_accuracy": 0.8447314709424972, "num_tokens": 101830440.0, "step": 84700 }, { "entropy": 1.9070345923304557, "epoch": 0.26259358829597035, "grad_norm": 7.496677398681641, "learning_rate": 4.93688696889112e-06, "loss": 0.4687, "mean_token_accuracy": 0.8451457783579827, "num_tokens": 101841964.0, "step": 84710 }, { "entropy": 1.8439010679721832, "epoch": 0.26262458742102, "grad_norm": 3.86055850982666, "learning_rate": 4.936595591947242e-06, "loss": 0.4291, "mean_token_accuracy": 0.8469921678304673, "num_tokens": 101854984.0, "step": 84720 }, { "entropy": 1.8208919912576675, "epoch": 0.26265558654606974, "grad_norm": 8.848628044128418, "learning_rate": 4.93630426658881e-06, "loss": 0.4266, "mean_token_accuracy": 0.8524587988853455, "num_tokens": 101868205.0, "step": 84730 }, { "entropy": 1.9341197982430458, "epoch": 0.2626865856711194, "grad_norm": 9.021467208862305, "learning_rate": 4.936012992800602e-06, "loss": 0.5118, "mean_token_accuracy": 0.8401493951678276, "num_tokens": 101879993.0, "step": 84740 }, { "entropy": 1.900348238646984, "epoch": 0.26271758479616913, "grad_norm": 9.046464920043945, "learning_rate": 4.935721770567406e-06, "loss": 0.4902, "mean_token_accuracy": 0.8420239523053169, "num_tokens": 101891699.0, "step": 84750 }, { "entropy": 2.011109836399555, "epoch": 0.2627485839212188, "grad_norm": 9.504515647888184, "learning_rate": 4.935430599874017e-06, "loss": 0.5426, "mean_token_accuracy": 0.8297847151756287, "num_tokens": 101902570.0, "step": 84760 }, { "entropy": 1.8008052319288255, "epoch": 0.26277958304626853, "grad_norm": 7.699909687042236, "learning_rate": 4.9351394807052325e-06, "loss": 0.3834, "mean_token_accuracy": 0.8641871735453606, "num_tokens": 101915635.0, "step": 84770 }, { "entropy": 1.8245060086250304, "epoch": 0.2628105821713182, "grad_norm": 8.3301362991333, "learning_rate": 4.934848413045857e-06, "loss": 0.4336, "mean_token_accuracy": 0.8514155328273774, "num_tokens": 101928385.0, "step": 84780 }, { "entropy": 1.8403597161173821, "epoch": 0.2628415812963679, "grad_norm": 3.8224892616271973, "learning_rate": 4.934557396880704e-06, "loss": 0.4186, "mean_token_accuracy": 0.8499826923012733, "num_tokens": 101941790.0, "step": 84790 }, { "entropy": 1.950501237809658, "epoch": 0.2628725804214176, "grad_norm": 6.607431411743164, "learning_rate": 4.934266432194593e-06, "loss": 0.5115, "mean_token_accuracy": 0.8484028398990631, "num_tokens": 101953387.0, "step": 84800 }, { "entropy": 1.913737191259861, "epoch": 0.2629035795464673, "grad_norm": 7.8601508140563965, "learning_rate": 4.933975518972347e-06, "loss": 0.485, "mean_token_accuracy": 0.850081168115139, "num_tokens": 101964983.0, "step": 84810 }, { "entropy": 1.9299700886011124, "epoch": 0.262934578671517, "grad_norm": 8.866097450256348, "learning_rate": 4.9336846571987965e-06, "loss": 0.4687, "mean_token_accuracy": 0.8501459851861, "num_tokens": 101976619.0, "step": 84820 }, { "entropy": 1.9162129878997802, "epoch": 0.2629655777965667, "grad_norm": 8.77447509765625, "learning_rate": 4.93339384685878e-06, "loss": 0.4388, "mean_token_accuracy": 0.8534630805253982, "num_tokens": 101988887.0, "step": 84830 }, { "entropy": 1.9051405146718026, "epoch": 0.2629965769216164, "grad_norm": 7.113283157348633, "learning_rate": 4.93310308793714e-06, "loss": 0.4795, "mean_token_accuracy": 0.8431653559207917, "num_tokens": 102000822.0, "step": 84840 }, { "entropy": 1.8918147072196008, "epoch": 0.2630275760466661, "grad_norm": 7.229731559753418, "learning_rate": 4.9328123804187265e-06, "loss": 0.42, "mean_token_accuracy": 0.8591841533780098, "num_tokens": 102012280.0, "step": 84850 }, { "entropy": 1.84690400660038, "epoch": 0.26305857517171577, "grad_norm": 4.94510555267334, "learning_rate": 4.932521724288395e-06, "loss": 0.4528, "mean_token_accuracy": 0.8478389665484428, "num_tokens": 102025734.0, "step": 84860 }, { "entropy": 1.8911878764629364, "epoch": 0.2630895742967655, "grad_norm": 4.1434783935546875, "learning_rate": 4.932231119531007e-06, "loss": 0.4728, "mean_token_accuracy": 0.8523555234074592, "num_tokens": 102038473.0, "step": 84870 }, { "entropy": 1.9033065840601922, "epoch": 0.26312057342181516, "grad_norm": 3.400432586669922, "learning_rate": 4.9319405661314326e-06, "loss": 0.4971, "mean_token_accuracy": 0.8344430983066559, "num_tokens": 102050229.0, "step": 84880 }, { "entropy": 1.9568065509200097, "epoch": 0.2631515725468649, "grad_norm": 10.253649711608887, "learning_rate": 4.931650064074543e-06, "loss": 0.5204, "mean_token_accuracy": 0.8353859558701515, "num_tokens": 102062301.0, "step": 84890 }, { "entropy": 1.8090133965015411, "epoch": 0.26318257167191456, "grad_norm": 9.690705299377441, "learning_rate": 4.931359613345223e-06, "loss": 0.409, "mean_token_accuracy": 0.8670252472162246, "num_tokens": 102076084.0, "step": 84900 }, { "entropy": 1.881265440583229, "epoch": 0.2632135707969643, "grad_norm": 8.040104866027832, "learning_rate": 4.9310692139283576e-06, "loss": 0.4399, "mean_token_accuracy": 0.848574922978878, "num_tokens": 102088089.0, "step": 84910 }, { "entropy": 1.823999959230423, "epoch": 0.26324456992201395, "grad_norm": 4.09398078918457, "learning_rate": 4.9307788658088396e-06, "loss": 0.4375, "mean_token_accuracy": 0.8424740388989449, "num_tokens": 102101088.0, "step": 84920 }, { "entropy": 1.8733570471405983, "epoch": 0.2632755690470637, "grad_norm": 7.2603230476379395, "learning_rate": 4.93048856897157e-06, "loss": 0.4377, "mean_token_accuracy": 0.8479010179638863, "num_tokens": 102113485.0, "step": 84930 }, { "entropy": 1.8576104506850242, "epoch": 0.26330656817211334, "grad_norm": 8.178943634033203, "learning_rate": 4.930198323401454e-06, "loss": 0.3913, "mean_token_accuracy": 0.8650277197360993, "num_tokens": 102126971.0, "step": 84940 }, { "entropy": 1.819041645526886, "epoch": 0.263337567297163, "grad_norm": 10.852204322814941, "learning_rate": 4.929908129083402e-06, "loss": 0.4542, "mean_token_accuracy": 0.84122084826231, "num_tokens": 102140395.0, "step": 84950 }, { "entropy": 1.8517862141132355, "epoch": 0.26336856642221274, "grad_norm": 7.924666404724121, "learning_rate": 4.929617986002334e-06, "loss": 0.5, "mean_token_accuracy": 0.841532975435257, "num_tokens": 102153419.0, "step": 84960 }, { "entropy": 1.9836448311805726, "epoch": 0.2633995655472624, "grad_norm": 7.1847004890441895, "learning_rate": 4.9293278941431724e-06, "loss": 0.5787, "mean_token_accuracy": 0.830291448533535, "num_tokens": 102164560.0, "step": 84970 }, { "entropy": 1.9570752799510955, "epoch": 0.26343056467231213, "grad_norm": 7.852522850036621, "learning_rate": 4.929037853490851e-06, "loss": 0.5222, "mean_token_accuracy": 0.8373702332377434, "num_tokens": 102176029.0, "step": 84980 }, { "entropy": 1.9419500917196273, "epoch": 0.2634615637973618, "grad_norm": 8.110418319702148, "learning_rate": 4.928747864030304e-06, "loss": 0.5139, "mean_token_accuracy": 0.8400467559695244, "num_tokens": 102188021.0, "step": 84990 }, { "entropy": 1.9458017632365228, "epoch": 0.2634925629224115, "grad_norm": 10.08768081665039, "learning_rate": 4.928457925746475e-06, "loss": 0.486, "mean_token_accuracy": 0.8457490563392639, "num_tokens": 102199697.0, "step": 85000 }, { "entropy": 1.9600411295890807, "epoch": 0.2635235620474612, "grad_norm": 8.410282135009766, "learning_rate": 4.928168038624313e-06, "loss": 0.483, "mean_token_accuracy": 0.8549587532877923, "num_tokens": 102210184.0, "step": 85010 }, { "entropy": 1.7980479046702385, "epoch": 0.2635545611725109, "grad_norm": 7.882757663726807, "learning_rate": 4.927878202648774e-06, "loss": 0.4294, "mean_token_accuracy": 0.8440237864851952, "num_tokens": 102223273.0, "step": 85020 }, { "entropy": 1.9615017265081405, "epoch": 0.2635855602975606, "grad_norm": 9.58923053741455, "learning_rate": 4.927588417804819e-06, "loss": 0.5578, "mean_token_accuracy": 0.8358608677983284, "num_tokens": 102234501.0, "step": 85030 }, { "entropy": 1.864892715215683, "epoch": 0.2636165594226103, "grad_norm": 7.730620384216309, "learning_rate": 4.9272986840774155e-06, "loss": 0.456, "mean_token_accuracy": 0.8360424354672432, "num_tokens": 102247359.0, "step": 85040 }, { "entropy": 1.8440838590264321, "epoch": 0.26364755854766, "grad_norm": 3.6796610355377197, "learning_rate": 4.927009001451538e-06, "loss": 0.4102, "mean_token_accuracy": 0.8569944024085998, "num_tokens": 102260021.0, "step": 85050 }, { "entropy": 1.905400250852108, "epoch": 0.2636785576727097, "grad_norm": 3.788116931915283, "learning_rate": 4.926719369912167e-06, "loss": 0.4323, "mean_token_accuracy": 0.8591458886861801, "num_tokens": 102271674.0, "step": 85060 }, { "entropy": 1.8443732798099517, "epoch": 0.26370955679775937, "grad_norm": 7.661864757537842, "learning_rate": 4.926429789444288e-06, "loss": 0.4706, "mean_token_accuracy": 0.8470209822058677, "num_tokens": 102284578.0, "step": 85070 }, { "entropy": 1.8414702624082566, "epoch": 0.2637405559228091, "grad_norm": 7.228357791900635, "learning_rate": 4.926140260032895e-06, "loss": 0.4743, "mean_token_accuracy": 0.8367054253816605, "num_tokens": 102296979.0, "step": 85080 }, { "entropy": 1.9466857939958573, "epoch": 0.26377155504785876, "grad_norm": 11.023140907287598, "learning_rate": 4.925850781662983e-06, "loss": 0.5557, "mean_token_accuracy": 0.8282995998859406, "num_tokens": 102308493.0, "step": 85090 }, { "entropy": 1.963821244239807, "epoch": 0.2638025541729085, "grad_norm": 11.524614334106445, "learning_rate": 4.92556135431956e-06, "loss": 0.5303, "mean_token_accuracy": 0.8317586749792099, "num_tokens": 102320319.0, "step": 85100 }, { "entropy": 1.9346794202923774, "epoch": 0.26383355329795816, "grad_norm": 7.5674214363098145, "learning_rate": 4.9252719779876374e-06, "loss": 0.5261, "mean_token_accuracy": 0.8399398401379585, "num_tokens": 102331510.0, "step": 85110 }, { "entropy": 1.8758284986019134, "epoch": 0.2638645524230079, "grad_norm": 3.6614625453948975, "learning_rate": 4.92498265265223e-06, "loss": 0.4775, "mean_token_accuracy": 0.8341593369841576, "num_tokens": 102344007.0, "step": 85120 }, { "entropy": 1.9138728231191635, "epoch": 0.26389555154805755, "grad_norm": 4.82459831237793, "learning_rate": 4.924693378298362e-06, "loss": 0.473, "mean_token_accuracy": 0.8473570510745049, "num_tokens": 102355863.0, "step": 85130 }, { "entropy": 1.9841463685035705, "epoch": 0.2639265506731073, "grad_norm": 7.99963903427124, "learning_rate": 4.924404154911063e-06, "loss": 0.5413, "mean_token_accuracy": 0.8389569863677024, "num_tokens": 102366684.0, "step": 85140 }, { "entropy": 1.8069503650069236, "epoch": 0.26395754979815694, "grad_norm": 7.8926568031311035, "learning_rate": 4.92411498247537e-06, "loss": 0.4926, "mean_token_accuracy": 0.8335721820592881, "num_tokens": 102380841.0, "step": 85150 }, { "entropy": 1.9148933947086335, "epoch": 0.26398854892320667, "grad_norm": 7.529796600341797, "learning_rate": 4.923825860976324e-06, "loss": 0.4734, "mean_token_accuracy": 0.8480879321694375, "num_tokens": 102392714.0, "step": 85160 }, { "entropy": 1.9118664294481278, "epoch": 0.26401954804825634, "grad_norm": 9.968255996704102, "learning_rate": 4.9235367903989705e-06, "loss": 0.4858, "mean_token_accuracy": 0.8447333693504333, "num_tokens": 102404766.0, "step": 85170 }, { "entropy": 1.9002967342734336, "epoch": 0.26405054717330606, "grad_norm": 9.95118236541748, "learning_rate": 4.923247770728366e-06, "loss": 0.4568, "mean_token_accuracy": 0.855803194642067, "num_tokens": 102416668.0, "step": 85180 }, { "entropy": 1.8763260886073112, "epoch": 0.26408154629835573, "grad_norm": 7.586075305938721, "learning_rate": 4.9229588019495714e-06, "loss": 0.4963, "mean_token_accuracy": 0.8425167426466942, "num_tokens": 102429365.0, "step": 85190 }, { "entropy": 2.0041319727897644, "epoch": 0.2641125454234054, "grad_norm": 9.366758346557617, "learning_rate": 4.922669884047651e-06, "loss": 0.5717, "mean_token_accuracy": 0.8297568678855896, "num_tokens": 102440584.0, "step": 85200 }, { "entropy": 1.9379279300570489, "epoch": 0.2641435445484551, "grad_norm": 8.975931167602539, "learning_rate": 4.922381017007679e-06, "loss": 0.5412, "mean_token_accuracy": 0.8403313905000687, "num_tokens": 102452027.0, "step": 85210 }, { "entropy": 1.9602735459804534, "epoch": 0.2641745436735048, "grad_norm": 8.235448837280273, "learning_rate": 4.9220922008147325e-06, "loss": 0.5294, "mean_token_accuracy": 0.8311275467276573, "num_tokens": 102463662.0, "step": 85220 }, { "entropy": 1.8260437846183777, "epoch": 0.2642055427985545, "grad_norm": 9.317481994628906, "learning_rate": 4.921803435453896e-06, "loss": 0.4387, "mean_token_accuracy": 0.8505254268646241, "num_tokens": 102476893.0, "step": 85230 }, { "entropy": 1.8744411259889602, "epoch": 0.2642365419236042, "grad_norm": 7.509456634521484, "learning_rate": 4.921514720910262e-06, "loss": 0.5191, "mean_token_accuracy": 0.8349858596920967, "num_tokens": 102489557.0, "step": 85240 }, { "entropy": 1.8670647412538528, "epoch": 0.2642675410486539, "grad_norm": 3.518993377685547, "learning_rate": 4.921226057168927e-06, "loss": 0.4244, "mean_token_accuracy": 0.8494436904788017, "num_tokens": 102502419.0, "step": 85250 }, { "entropy": 1.9050493866205216, "epoch": 0.2642985401737036, "grad_norm": 8.357398986816406, "learning_rate": 4.920937444214995e-06, "loss": 0.5004, "mean_token_accuracy": 0.8412919878959656, "num_tokens": 102514450.0, "step": 85260 }, { "entropy": 1.79098000228405, "epoch": 0.2643295392987533, "grad_norm": 4.370672702789307, "learning_rate": 4.920648882033572e-06, "loss": 0.4494, "mean_token_accuracy": 0.8489180520176888, "num_tokens": 102527984.0, "step": 85270 }, { "entropy": 1.9773102968931198, "epoch": 0.26436053842380297, "grad_norm": 7.729239463806152, "learning_rate": 4.920360370609777e-06, "loss": 0.4939, "mean_token_accuracy": 0.8391767382621765, "num_tokens": 102538749.0, "step": 85280 }, { "entropy": 1.935066755115986, "epoch": 0.2643915375488527, "grad_norm": 8.969949722290039, "learning_rate": 4.920071909928729e-06, "loss": 0.5233, "mean_token_accuracy": 0.842116117477417, "num_tokens": 102550262.0, "step": 85290 }, { "entropy": 1.8114179536700248, "epoch": 0.26442253667390236, "grad_norm": 7.595683574676514, "learning_rate": 4.919783499975556e-06, "loss": 0.3997, "mean_token_accuracy": 0.8623112186789512, "num_tokens": 102563124.0, "step": 85300 }, { "entropy": 1.8905057892203332, "epoch": 0.2644535357989521, "grad_norm": 7.8862199783325195, "learning_rate": 4.919495140735392e-06, "loss": 0.4715, "mean_token_accuracy": 0.8516170725226402, "num_tokens": 102575911.0, "step": 85310 }, { "entropy": 1.7555623829364777, "epoch": 0.26448453492400176, "grad_norm": 7.201220989227295, "learning_rate": 4.919206832193378e-06, "loss": 0.4096, "mean_token_accuracy": 0.8540243923664093, "num_tokens": 102589847.0, "step": 85320 }, { "entropy": 1.9273912906646729, "epoch": 0.2645155340490515, "grad_norm": 8.060796737670898, "learning_rate": 4.918918574334659e-06, "loss": 0.4865, "mean_token_accuracy": 0.848893666267395, "num_tokens": 102601212.0, "step": 85330 }, { "entropy": 1.8354052126407623, "epoch": 0.26454653317410115, "grad_norm": 8.087235450744629, "learning_rate": 4.918630367144384e-06, "loss": 0.543, "mean_token_accuracy": 0.8323055505752563, "num_tokens": 102614529.0, "step": 85340 }, { "entropy": 1.9022644311189651, "epoch": 0.2645775322991509, "grad_norm": 9.103813171386719, "learning_rate": 4.918342210607715e-06, "loss": 0.5306, "mean_token_accuracy": 0.828366307914257, "num_tokens": 102627163.0, "step": 85350 }, { "entropy": 1.8333250015974045, "epoch": 0.26460853142420054, "grad_norm": 9.28365707397461, "learning_rate": 4.918054104709815e-06, "loss": 0.4207, "mean_token_accuracy": 0.8535311058163643, "num_tokens": 102639926.0, "step": 85360 }, { "entropy": 1.8946420654654503, "epoch": 0.26463953054925027, "grad_norm": 7.266073703765869, "learning_rate": 4.917766049435854e-06, "loss": 0.457, "mean_token_accuracy": 0.8455268830060959, "num_tokens": 102652224.0, "step": 85370 }, { "entropy": 1.9376175180077553, "epoch": 0.26467052967429994, "grad_norm": 7.632312297821045, "learning_rate": 4.917478044771007e-06, "loss": 0.4726, "mean_token_accuracy": 0.8479833841323853, "num_tokens": 102664350.0, "step": 85380 }, { "entropy": 1.9229725405573845, "epoch": 0.26470152879934966, "grad_norm": 7.888539791107178, "learning_rate": 4.9171900907004585e-06, "loss": 0.5669, "mean_token_accuracy": 0.8408885851502419, "num_tokens": 102675841.0, "step": 85390 }, { "entropy": 1.8650186344981194, "epoch": 0.26473252792439933, "grad_norm": 7.847538471221924, "learning_rate": 4.916902187209395e-06, "loss": 0.4484, "mean_token_accuracy": 0.8481592908501625, "num_tokens": 102687826.0, "step": 85400 }, { "entropy": 1.8229721501469611, "epoch": 0.26476352704944905, "grad_norm": 7.471333026885986, "learning_rate": 4.916614334283012e-06, "loss": 0.406, "mean_token_accuracy": 0.8582251816987991, "num_tokens": 102700892.0, "step": 85410 }, { "entropy": 1.934105758368969, "epoch": 0.2647945261744987, "grad_norm": 3.659540891647339, "learning_rate": 4.91632653190651e-06, "loss": 0.4666, "mean_token_accuracy": 0.8533608466386795, "num_tokens": 102712704.0, "step": 85420 }, { "entropy": 1.9130233809351922, "epoch": 0.26482552529954845, "grad_norm": 9.22014331817627, "learning_rate": 4.916038780065096e-06, "loss": 0.4777, "mean_token_accuracy": 0.8448394879698753, "num_tokens": 102724684.0, "step": 85430 }, { "entropy": 1.9155945912003518, "epoch": 0.2648565244245981, "grad_norm": 8.854886054992676, "learning_rate": 4.9157510787439814e-06, "loss": 0.4884, "mean_token_accuracy": 0.8422875538468361, "num_tokens": 102736595.0, "step": 85440 }, { "entropy": 1.8779947102069854, "epoch": 0.2648875235496478, "grad_norm": 7.41634464263916, "learning_rate": 4.9154634279283864e-06, "loss": 0.505, "mean_token_accuracy": 0.8430365577340126, "num_tokens": 102749126.0, "step": 85450 }, { "entropy": 1.8960552558302879, "epoch": 0.2649185226746975, "grad_norm": 10.775057792663574, "learning_rate": 4.915175827603535e-06, "loss": 0.4752, "mean_token_accuracy": 0.8464867159724235, "num_tokens": 102761000.0, "step": 85460 }, { "entropy": 1.8219962686300277, "epoch": 0.2649495217997472, "grad_norm": 4.283982753753662, "learning_rate": 4.914888277754658e-06, "loss": 0.4364, "mean_token_accuracy": 0.8421214401721955, "num_tokens": 102774313.0, "step": 85470 }, { "entropy": 1.9417266234755517, "epoch": 0.2649805209247969, "grad_norm": 3.7757527828216553, "learning_rate": 4.914600778366993e-06, "loss": 0.5156, "mean_token_accuracy": 0.8345933735370636, "num_tokens": 102785961.0, "step": 85480 }, { "entropy": 1.9103323504328729, "epoch": 0.26501152004984657, "grad_norm": 3.362926721572876, "learning_rate": 4.9143133294257815e-06, "loss": 0.4855, "mean_token_accuracy": 0.8477786153554916, "num_tokens": 102798060.0, "step": 85490 }, { "entropy": 1.943329544365406, "epoch": 0.2650425191748963, "grad_norm": 9.327890396118164, "learning_rate": 4.914025930916273e-06, "loss": 0.5187, "mean_token_accuracy": 0.8373201444745064, "num_tokens": 102809320.0, "step": 85500 }, { "entropy": 1.881453277170658, "epoch": 0.26507351829994596, "grad_norm": 3.5227839946746826, "learning_rate": 4.913738582823723e-06, "loss": 0.4471, "mean_token_accuracy": 0.8572242721915245, "num_tokens": 102820907.0, "step": 85510 }, { "entropy": 1.8902042105793952, "epoch": 0.2651045174249957, "grad_norm": 7.871299743652344, "learning_rate": 4.913451285133394e-06, "loss": 0.4718, "mean_token_accuracy": 0.8486515626311302, "num_tokens": 102832171.0, "step": 85520 }, { "entropy": 1.9584900826215743, "epoch": 0.26513551655004536, "grad_norm": 3.7320501804351807, "learning_rate": 4.91316403783055e-06, "loss": 0.5608, "mean_token_accuracy": 0.8346157044172287, "num_tokens": 102843429.0, "step": 85530 }, { "entropy": 1.8431198254227639, "epoch": 0.2651665156750951, "grad_norm": 3.781135320663452, "learning_rate": 4.912876840900466e-06, "loss": 0.4362, "mean_token_accuracy": 0.8510622009634972, "num_tokens": 102855860.0, "step": 85540 }, { "entropy": 1.9608791798353196, "epoch": 0.26519751480014475, "grad_norm": 9.239676475524902, "learning_rate": 4.91258969432842e-06, "loss": 0.5309, "mean_token_accuracy": 0.8367750391364097, "num_tokens": 102867302.0, "step": 85550 }, { "entropy": 1.8789279222488404, "epoch": 0.2652285139251945, "grad_norm": 3.7677621841430664, "learning_rate": 4.912302598099698e-06, "loss": 0.4519, "mean_token_accuracy": 0.8507544234395027, "num_tokens": 102879643.0, "step": 85560 }, { "entropy": 1.9066937759518623, "epoch": 0.26525951305024414, "grad_norm": 10.09460163116455, "learning_rate": 4.9120155521995925e-06, "loss": 0.4615, "mean_token_accuracy": 0.8424730479717255, "num_tokens": 102892044.0, "step": 85570 }, { "entropy": 1.8171610802412033, "epoch": 0.26529051217529387, "grad_norm": 7.337011337280273, "learning_rate": 4.911728556613397e-06, "loss": 0.4325, "mean_token_accuracy": 0.8553290620446206, "num_tokens": 102904614.0, "step": 85580 }, { "entropy": 1.8738331109285356, "epoch": 0.26532151130034354, "grad_norm": 9.422406196594238, "learning_rate": 4.911441611326418e-06, "loss": 0.4839, "mean_token_accuracy": 0.8550050288438797, "num_tokens": 102916052.0, "step": 85590 }, { "entropy": 1.776879619061947, "epoch": 0.26535251042539326, "grad_norm": 7.145488262176514, "learning_rate": 4.911154716323966e-06, "loss": 0.3784, "mean_token_accuracy": 0.8612538367509842, "num_tokens": 102929553.0, "step": 85600 }, { "entropy": 1.9105664394795894, "epoch": 0.26538350955044293, "grad_norm": 12.720219612121582, "learning_rate": 4.91086787159135e-06, "loss": 0.5435, "mean_token_accuracy": 0.8275511726737023, "num_tokens": 102940839.0, "step": 85610 }, { "entropy": 1.8572928413748742, "epoch": 0.26541450867549266, "grad_norm": 2.5268802642822266, "learning_rate": 4.910581077113897e-06, "loss": 0.4746, "mean_token_accuracy": 0.8346114948391914, "num_tokens": 102953161.0, "step": 85620 }, { "entropy": 1.8211661458015442, "epoch": 0.2654455078005423, "grad_norm": 4.548563480377197, "learning_rate": 4.910294332876931e-06, "loss": 0.4697, "mean_token_accuracy": 0.8422662571072579, "num_tokens": 102966676.0, "step": 85630 }, { "entropy": 1.874080342054367, "epoch": 0.26547650692559205, "grad_norm": 9.284010887145996, "learning_rate": 4.910007638865787e-06, "loss": 0.4907, "mean_token_accuracy": 0.8418890178203583, "num_tokens": 102978834.0, "step": 85640 }, { "entropy": 1.9538467079401016, "epoch": 0.2655075060506417, "grad_norm": 7.954508304595947, "learning_rate": 4.909720995065805e-06, "loss": 0.4945, "mean_token_accuracy": 0.8483369365334511, "num_tokens": 102989479.0, "step": 85650 }, { "entropy": 1.9834106057882308, "epoch": 0.26553850517569144, "grad_norm": 9.530325889587402, "learning_rate": 4.909434401462327e-06, "loss": 0.5648, "mean_token_accuracy": 0.8310058429837227, "num_tokens": 103000295.0, "step": 85660 }, { "entropy": 1.8795513778924942, "epoch": 0.2655695043007411, "grad_norm": 9.067574501037598, "learning_rate": 4.9091478580407075e-06, "loss": 0.4291, "mean_token_accuracy": 0.8638965725898743, "num_tokens": 103012052.0, "step": 85670 }, { "entropy": 1.8716339603066445, "epoch": 0.2656005034257908, "grad_norm": 11.293997764587402, "learning_rate": 4.908861364786301e-06, "loss": 0.4986, "mean_token_accuracy": 0.8458243295550346, "num_tokens": 103023681.0, "step": 85680 }, { "entropy": 1.8574876859784126, "epoch": 0.2656315025508405, "grad_norm": 8.510407447814941, "learning_rate": 4.908574921684474e-06, "loss": 0.4554, "mean_token_accuracy": 0.8516736879944802, "num_tokens": 103035416.0, "step": 85690 }, { "entropy": 1.8569199055433274, "epoch": 0.2656625016758902, "grad_norm": 4.086944103240967, "learning_rate": 4.908288528720592e-06, "loss": 0.4604, "mean_token_accuracy": 0.8491792261600495, "num_tokens": 103046882.0, "step": 85700 }, { "entropy": 1.9134871244430542, "epoch": 0.2656935008009399, "grad_norm": 7.807006359100342, "learning_rate": 4.908002185880031e-06, "loss": 0.4602, "mean_token_accuracy": 0.847848904132843, "num_tokens": 103058705.0, "step": 85710 }, { "entropy": 1.911807608604431, "epoch": 0.26572449992598957, "grad_norm": 9.756507873535156, "learning_rate": 4.907715893148174e-06, "loss": 0.5384, "mean_token_accuracy": 0.8321962848305702, "num_tokens": 103071199.0, "step": 85720 }, { "entropy": 1.883806975185871, "epoch": 0.2657554990510393, "grad_norm": 6.826350212097168, "learning_rate": 4.9074296505104055e-06, "loss": 0.4805, "mean_token_accuracy": 0.8427292719483376, "num_tokens": 103082635.0, "step": 85730 }, { "entropy": 1.9495513945817948, "epoch": 0.26578649817608896, "grad_norm": 10.174969673156738, "learning_rate": 4.9071434579521205e-06, "loss": 0.5348, "mean_token_accuracy": 0.8444296821951867, "num_tokens": 103093525.0, "step": 85740 }, { "entropy": 1.934284047782421, "epoch": 0.2658174973011387, "grad_norm": 9.920119285583496, "learning_rate": 4.9068573154587165e-06, "loss": 0.5163, "mean_token_accuracy": 0.8450621247291565, "num_tokens": 103105373.0, "step": 85750 }, { "entropy": 1.7569790095090867, "epoch": 0.26584849642618835, "grad_norm": 7.43773078918457, "learning_rate": 4.9065712230156e-06, "loss": 0.3879, "mean_token_accuracy": 0.8663360059261322, "num_tokens": 103118573.0, "step": 85760 }, { "entropy": 1.8341956838965416, "epoch": 0.2658794955512381, "grad_norm": 7.597226619720459, "learning_rate": 4.906285180608181e-06, "loss": 0.4638, "mean_token_accuracy": 0.8481891751289368, "num_tokens": 103131304.0, "step": 85770 }, { "entropy": 1.9248315215110778, "epoch": 0.26591049467628775, "grad_norm": 8.323822021484375, "learning_rate": 4.905999188221875e-06, "loss": 0.519, "mean_token_accuracy": 0.8360444948077201, "num_tokens": 103142765.0, "step": 85780 }, { "entropy": 1.892122246325016, "epoch": 0.26594149380133747, "grad_norm": 8.809585571289062, "learning_rate": 4.905713245842107e-06, "loss": 0.5083, "mean_token_accuracy": 0.834871856868267, "num_tokens": 103154444.0, "step": 85790 }, { "entropy": 1.9127579972147941, "epoch": 0.26597249292638714, "grad_norm": 8.541977882385254, "learning_rate": 4.905427353454305e-06, "loss": 0.5018, "mean_token_accuracy": 0.839243420958519, "num_tokens": 103166217.0, "step": 85800 }, { "entropy": 1.8976189360022544, "epoch": 0.26600349205143686, "grad_norm": 8.723061561584473, "learning_rate": 4.905141511043905e-06, "loss": 0.4897, "mean_token_accuracy": 0.8352738916873932, "num_tokens": 103178533.0, "step": 85810 }, { "entropy": 1.9483844459056854, "epoch": 0.26603449117648653, "grad_norm": 8.283819198608398, "learning_rate": 4.904855718596345e-06, "loss": 0.5166, "mean_token_accuracy": 0.8419344946742058, "num_tokens": 103188784.0, "step": 85820 }, { "entropy": 1.8302007086575032, "epoch": 0.26606549030153626, "grad_norm": 7.834589004516602, "learning_rate": 4.9045699760970725e-06, "loss": 0.4607, "mean_token_accuracy": 0.8487353563308716, "num_tokens": 103201469.0, "step": 85830 }, { "entropy": 1.9522307723760606, "epoch": 0.2660964894265859, "grad_norm": 11.1272554397583, "learning_rate": 4.904284283531541e-06, "loss": 0.4868, "mean_token_accuracy": 0.8433923795819283, "num_tokens": 103212554.0, "step": 85840 }, { "entropy": 1.9034996896982193, "epoch": 0.26612748855163565, "grad_norm": 8.404441833496094, "learning_rate": 4.903998640885207e-06, "loss": 0.5074, "mean_token_accuracy": 0.8423371136188507, "num_tokens": 103223756.0, "step": 85850 }, { "entropy": 1.8009837806224822, "epoch": 0.2661584876766853, "grad_norm": 9.704791069030762, "learning_rate": 4.903713048143537e-06, "loss": 0.4614, "mean_token_accuracy": 0.851232835650444, "num_tokens": 103237242.0, "step": 85860 }, { "entropy": 1.8711876735091209, "epoch": 0.26618948680173504, "grad_norm": 8.211481094360352, "learning_rate": 4.903427505292001e-06, "loss": 0.4864, "mean_token_accuracy": 0.842264424264431, "num_tokens": 103249121.0, "step": 85870 }, { "entropy": 1.7938581064343453, "epoch": 0.2662204859267847, "grad_norm": 3.8567850589752197, "learning_rate": 4.903142012316073e-06, "loss": 0.4382, "mean_token_accuracy": 0.8489397838711739, "num_tokens": 103261025.0, "step": 85880 }, { "entropy": 1.9541205585002899, "epoch": 0.26625148505183444, "grad_norm": 10.45516300201416, "learning_rate": 4.902856569201237e-06, "loss": 0.5165, "mean_token_accuracy": 0.8429744258522988, "num_tokens": 103271985.0, "step": 85890 }, { "entropy": 1.7870109647512435, "epoch": 0.2662824841768841, "grad_norm": 2.5378193855285645, "learning_rate": 4.90257117593298e-06, "loss": 0.4031, "mean_token_accuracy": 0.8521895721554756, "num_tokens": 103285897.0, "step": 85900 }, { "entropy": 1.769122688472271, "epoch": 0.26631348330193383, "grad_norm": 8.693305969238281, "learning_rate": 4.902285832496798e-06, "loss": 0.4092, "mean_token_accuracy": 0.8523093193769455, "num_tokens": 103299434.0, "step": 85910 }, { "entropy": 1.8774210557341575, "epoch": 0.2663444824269835, "grad_norm": 7.8302507400512695, "learning_rate": 4.902000538878188e-06, "loss": 0.4501, "mean_token_accuracy": 0.8508623972535133, "num_tokens": 103312561.0, "step": 85920 }, { "entropy": 1.8622436970472336, "epoch": 0.26637548155203317, "grad_norm": 8.453393936157227, "learning_rate": 4.9017152950626585e-06, "loss": 0.4582, "mean_token_accuracy": 0.8456780537962914, "num_tokens": 103324471.0, "step": 85930 }, { "entropy": 1.9303288459777832, "epoch": 0.2664064806770829, "grad_norm": 9.193410873413086, "learning_rate": 4.901430101035719e-06, "loss": 0.494, "mean_token_accuracy": 0.8492061391472816, "num_tokens": 103336441.0, "step": 85940 }, { "entropy": 1.9568013072013855, "epoch": 0.26643747980213256, "grad_norm": 8.285205841064453, "learning_rate": 4.901144956782889e-06, "loss": 0.5306, "mean_token_accuracy": 0.8406900480389595, "num_tokens": 103347523.0, "step": 85950 }, { "entropy": 1.9028612434864045, "epoch": 0.2664684789271823, "grad_norm": 8.355502128601074, "learning_rate": 4.900859862289691e-06, "loss": 0.4835, "mean_token_accuracy": 0.8526147872209549, "num_tokens": 103358369.0, "step": 85960 }, { "entropy": 1.929163258522749, "epoch": 0.26649947805223195, "grad_norm": 8.101810455322266, "learning_rate": 4.900574817541653e-06, "loss": 0.5188, "mean_token_accuracy": 0.8357960835099221, "num_tokens": 103369659.0, "step": 85970 }, { "entropy": 1.910251635313034, "epoch": 0.2665304771772817, "grad_norm": 8.729976654052734, "learning_rate": 4.900289822524311e-06, "loss": 0.4835, "mean_token_accuracy": 0.8469779431819916, "num_tokens": 103381688.0, "step": 85980 }, { "entropy": 1.9128791213035583, "epoch": 0.26656147630233135, "grad_norm": 4.791084289550781, "learning_rate": 4.900004877223208e-06, "loss": 0.5225, "mean_token_accuracy": 0.8399335280060768, "num_tokens": 103393309.0, "step": 85990 }, { "entropy": 1.8070705935359002, "epoch": 0.26659247542738107, "grad_norm": 4.363813877105713, "learning_rate": 4.899719981623888e-06, "loss": 0.3931, "mean_token_accuracy": 0.8653892949223518, "num_tokens": 103406198.0, "step": 86000 }, { "entropy": 1.858493185043335, "epoch": 0.26662347455243074, "grad_norm": 8.77523422241211, "learning_rate": 4.899435135711908e-06, "loss": 0.4392, "mean_token_accuracy": 0.853272658586502, "num_tokens": 103418547.0, "step": 86010 }, { "entropy": 1.8741918861865998, "epoch": 0.26665447367748046, "grad_norm": 8.620582580566406, "learning_rate": 4.899150339472823e-06, "loss": 0.4967, "mean_token_accuracy": 0.8357633009552956, "num_tokens": 103430557.0, "step": 86020 }, { "entropy": 1.7336426332592965, "epoch": 0.26668547280253013, "grad_norm": 3.6474039554595947, "learning_rate": 4.898865592892199e-06, "loss": 0.4074, "mean_token_accuracy": 0.8620437294244766, "num_tokens": 103444908.0, "step": 86030 }, { "entropy": 1.8742715016007423, "epoch": 0.26671647192757986, "grad_norm": 9.908348083496094, "learning_rate": 4.8985808959556055e-06, "loss": 0.4596, "mean_token_accuracy": 0.8436573952436447, "num_tokens": 103457439.0, "step": 86040 }, { "entropy": 1.8521887600421905, "epoch": 0.2667474710526295, "grad_norm": 9.044089317321777, "learning_rate": 4.8982962486486215e-06, "loss": 0.5119, "mean_token_accuracy": 0.8412206932902336, "num_tokens": 103469444.0, "step": 86050 }, { "entropy": 1.892989605665207, "epoch": 0.26677847017767925, "grad_norm": 9.907114028930664, "learning_rate": 4.898011650956826e-06, "loss": 0.5118, "mean_token_accuracy": 0.8311768040060997, "num_tokens": 103480820.0, "step": 86060 }, { "entropy": 1.9290689766407012, "epoch": 0.2668094693027289, "grad_norm": 9.498516082763672, "learning_rate": 4.897727102865811e-06, "loss": 0.5325, "mean_token_accuracy": 0.8411960631608963, "num_tokens": 103491988.0, "step": 86070 }, { "entropy": 1.857162345945835, "epoch": 0.26684046842777864, "grad_norm": 9.090229988098145, "learning_rate": 4.897442604361166e-06, "loss": 0.4517, "mean_token_accuracy": 0.8425956487655639, "num_tokens": 103504063.0, "step": 86080 }, { "entropy": 1.8890363931655885, "epoch": 0.2668714675528283, "grad_norm": 7.7313947677612305, "learning_rate": 4.8971581554284956e-06, "loss": 0.5048, "mean_token_accuracy": 0.8431884482502937, "num_tokens": 103515289.0, "step": 86090 }, { "entropy": 1.7703155070543288, "epoch": 0.26690246667787804, "grad_norm": 9.580257415771484, "learning_rate": 4.896873756053401e-06, "loss": 0.4185, "mean_token_accuracy": 0.8482003748416901, "num_tokens": 103528699.0, "step": 86100 }, { "entropy": 1.8586272314190864, "epoch": 0.2669334658029277, "grad_norm": 8.929677963256836, "learning_rate": 4.8965894062214955e-06, "loss": 0.4653, "mean_token_accuracy": 0.8518487945199013, "num_tokens": 103540418.0, "step": 86110 }, { "entropy": 1.7248114220798016, "epoch": 0.26696446492797743, "grad_norm": 9.42658805847168, "learning_rate": 4.896305105918398e-06, "loss": 0.3427, "mean_token_accuracy": 0.8690110579133034, "num_tokens": 103554470.0, "step": 86120 }, { "entropy": 1.8832508057355881, "epoch": 0.2669954640530271, "grad_norm": 8.39008903503418, "learning_rate": 4.89602085512973e-06, "loss": 0.5325, "mean_token_accuracy": 0.8384171515703202, "num_tokens": 103566005.0, "step": 86130 }, { "entropy": 1.88730780929327, "epoch": 0.2670264631780768, "grad_norm": 7.347865104675293, "learning_rate": 4.895736653841122e-06, "loss": 0.5064, "mean_token_accuracy": 0.8440589159727097, "num_tokens": 103577436.0, "step": 86140 }, { "entropy": 1.87120311409235, "epoch": 0.2670574623031265, "grad_norm": 10.774239540100098, "learning_rate": 4.895452502038206e-06, "loss": 0.4719, "mean_token_accuracy": 0.8394436612725258, "num_tokens": 103589660.0, "step": 86150 }, { "entropy": 1.8455160409212112, "epoch": 0.2670884614281762, "grad_norm": 4.862356662750244, "learning_rate": 4.895168399706626e-06, "loss": 0.4755, "mean_token_accuracy": 0.8532706961035729, "num_tokens": 103601065.0, "step": 86160 }, { "entropy": 1.8480889692902565, "epoch": 0.2671194605532259, "grad_norm": 7.0500688552856445, "learning_rate": 4.894884346832027e-06, "loss": 0.4816, "mean_token_accuracy": 0.8402373760938644, "num_tokens": 103613385.0, "step": 86170 }, { "entropy": 1.8862942337989808, "epoch": 0.26715045967827555, "grad_norm": 8.611431121826172, "learning_rate": 4.894600343400061e-06, "loss": 0.4786, "mean_token_accuracy": 0.8492077127099037, "num_tokens": 103625382.0, "step": 86180 }, { "entropy": 1.8930604338645936, "epoch": 0.2671814588033253, "grad_norm": 7.554574966430664, "learning_rate": 4.894316389396388e-06, "loss": 0.4678, "mean_token_accuracy": 0.8508794024586678, "num_tokens": 103637191.0, "step": 86190 }, { "entropy": 1.921440924704075, "epoch": 0.26721245792837495, "grad_norm": 8.372522354125977, "learning_rate": 4.894032484806671e-06, "loss": 0.485, "mean_token_accuracy": 0.8466569393873214, "num_tokens": 103648987.0, "step": 86200 }, { "entropy": 1.9750257551670074, "epoch": 0.26724345705342467, "grad_norm": 6.741246700286865, "learning_rate": 4.893748629616579e-06, "loss": 0.5252, "mean_token_accuracy": 0.83692237585783, "num_tokens": 103659952.0, "step": 86210 }, { "entropy": 1.8079271107912063, "epoch": 0.26727445617847434, "grad_norm": 4.520543098449707, "learning_rate": 4.89346482381179e-06, "loss": 0.4497, "mean_token_accuracy": 0.851072619855404, "num_tokens": 103672555.0, "step": 86220 }, { "entropy": 1.9372185349464417, "epoch": 0.26730545530352406, "grad_norm": 8.379465103149414, "learning_rate": 4.8931810673779826e-06, "loss": 0.5333, "mean_token_accuracy": 0.8367386668920517, "num_tokens": 103683329.0, "step": 86230 }, { "entropy": 1.8972872629761697, "epoch": 0.26733645442857373, "grad_norm": 8.481284141540527, "learning_rate": 4.8928973603008466e-06, "loss": 0.5154, "mean_token_accuracy": 0.836910292506218, "num_tokens": 103694708.0, "step": 86240 }, { "entropy": 1.8982973158359528, "epoch": 0.26736745355362346, "grad_norm": 9.208887100219727, "learning_rate": 4.892613702566074e-06, "loss": 0.528, "mean_token_accuracy": 0.8313050597906113, "num_tokens": 103707234.0, "step": 86250 }, { "entropy": 1.9922932714223862, "epoch": 0.2673984526786731, "grad_norm": 8.469325065612793, "learning_rate": 4.892330094159364e-06, "loss": 0.5287, "mean_token_accuracy": 0.8502902209758758, "num_tokens": 103717727.0, "step": 86260 }, { "entropy": 1.8680451080203055, "epoch": 0.26742945180372285, "grad_norm": 9.319469451904297, "learning_rate": 4.892046535066422e-06, "loss": 0.4582, "mean_token_accuracy": 0.844627107679844, "num_tokens": 103729854.0, "step": 86270 }, { "entropy": 1.9178496971726418, "epoch": 0.2674604509287725, "grad_norm": 3.9577114582061768, "learning_rate": 4.891763025272957e-06, "loss": 0.4889, "mean_token_accuracy": 0.8466630190610885, "num_tokens": 103742121.0, "step": 86280 }, { "entropy": 1.9498088628053665, "epoch": 0.26749145005382224, "grad_norm": 7.915012359619141, "learning_rate": 4.891479564764686e-06, "loss": 0.5296, "mean_token_accuracy": 0.8389876633882523, "num_tokens": 103753077.0, "step": 86290 }, { "entropy": 1.8598709747195243, "epoch": 0.2675224491788719, "grad_norm": 5.166788101196289, "learning_rate": 4.891196153527332e-06, "loss": 0.4362, "mean_token_accuracy": 0.8542357131838798, "num_tokens": 103765572.0, "step": 86300 }, { "entropy": 1.87834425419569, "epoch": 0.26755344830392164, "grad_norm": 5.004603862762451, "learning_rate": 4.890912791546621e-06, "loss": 0.4865, "mean_token_accuracy": 0.8426024809479713, "num_tokens": 103777759.0, "step": 86310 }, { "entropy": 1.9219666391611099, "epoch": 0.2675844474289713, "grad_norm": 7.858938694000244, "learning_rate": 4.8906294788082895e-06, "loss": 0.4515, "mean_token_accuracy": 0.8506090447306633, "num_tokens": 103790518.0, "step": 86320 }, { "entropy": 1.9264658272266388, "epoch": 0.26761544655402103, "grad_norm": 8.828166007995605, "learning_rate": 4.890346215298074e-06, "loss": 0.4848, "mean_token_accuracy": 0.8476721942424774, "num_tokens": 103801617.0, "step": 86330 }, { "entropy": 1.8663738921284676, "epoch": 0.2676464456790707, "grad_norm": 8.534181594848633, "learning_rate": 4.890063001001723e-06, "loss": 0.4735, "mean_token_accuracy": 0.8441844269633293, "num_tokens": 103814001.0, "step": 86340 }, { "entropy": 1.9115709066390991, "epoch": 0.2676774448041204, "grad_norm": 3.800837278366089, "learning_rate": 4.889779835904984e-06, "loss": 0.4427, "mean_token_accuracy": 0.8440373882651329, "num_tokens": 103826239.0, "step": 86350 }, { "entropy": 1.9562127739191055, "epoch": 0.2677084439291701, "grad_norm": 9.03986930847168, "learning_rate": 4.889496719993616e-06, "loss": 0.5581, "mean_token_accuracy": 0.8341035321354866, "num_tokens": 103837216.0, "step": 86360 }, { "entropy": 1.8933446779847145, "epoch": 0.2677394430542198, "grad_norm": 8.497458457946777, "learning_rate": 4.889213653253382e-06, "loss": 0.4572, "mean_token_accuracy": 0.8447739273309708, "num_tokens": 103849512.0, "step": 86370 }, { "entropy": 1.940016995370388, "epoch": 0.2677704421792695, "grad_norm": 9.415840148925781, "learning_rate": 4.88893063567005e-06, "loss": 0.4701, "mean_token_accuracy": 0.8498956128954888, "num_tokens": 103860985.0, "step": 86380 }, { "entropy": 1.9313148841261865, "epoch": 0.2678014413043192, "grad_norm": 8.689431190490723, "learning_rate": 4.888647667229392e-06, "loss": 0.5064, "mean_token_accuracy": 0.8414283052086831, "num_tokens": 103872780.0, "step": 86390 }, { "entropy": 1.9360795393586159, "epoch": 0.2678324404293689, "grad_norm": 9.888842582702637, "learning_rate": 4.888364747917191e-06, "loss": 0.4754, "mean_token_accuracy": 0.8488156452775002, "num_tokens": 103884056.0, "step": 86400 }, { "entropy": 1.8884512677788734, "epoch": 0.2678634395544186, "grad_norm": 9.365774154663086, "learning_rate": 4.888081877719231e-06, "loss": 0.489, "mean_token_accuracy": 0.8477799639105796, "num_tokens": 103895788.0, "step": 86410 }, { "entropy": 1.8175974920392037, "epoch": 0.26789443867946827, "grad_norm": 8.061196327209473, "learning_rate": 4.887799056621303e-06, "loss": 0.3801, "mean_token_accuracy": 0.8709731310606003, "num_tokens": 103908055.0, "step": 86420 }, { "entropy": 1.815473848581314, "epoch": 0.26792543780451794, "grad_norm": 8.550416946411133, "learning_rate": 4.887516284609206e-06, "loss": 0.4819, "mean_token_accuracy": 0.8482015028595924, "num_tokens": 103921019.0, "step": 86430 }, { "entropy": 1.8964615538716316, "epoch": 0.26795643692956767, "grad_norm": 7.224480152130127, "learning_rate": 4.887233561668741e-06, "loss": 0.5041, "mean_token_accuracy": 0.8473669067025185, "num_tokens": 103932376.0, "step": 86440 }, { "entropy": 1.8842689141631126, "epoch": 0.26798743605461733, "grad_norm": 3.8930001258850098, "learning_rate": 4.886950887785717e-06, "loss": 0.4318, "mean_token_accuracy": 0.8496074840426445, "num_tokens": 103944476.0, "step": 86450 }, { "entropy": 1.946980032324791, "epoch": 0.26801843517966706, "grad_norm": 9.502989768981934, "learning_rate": 4.886668262945951e-06, "loss": 0.5331, "mean_token_accuracy": 0.8362909764051437, "num_tokens": 103955653.0, "step": 86460 }, { "entropy": 1.9322603926062585, "epoch": 0.2680494343047167, "grad_norm": 8.219178199768066, "learning_rate": 4.886385687135257e-06, "loss": 0.5317, "mean_token_accuracy": 0.8409213215112686, "num_tokens": 103967278.0, "step": 86470 }, { "entropy": 1.9176155909895898, "epoch": 0.26808043342976645, "grad_norm": 6.983169078826904, "learning_rate": 4.886103160339469e-06, "loss": 0.4947, "mean_token_accuracy": 0.8338638916611671, "num_tokens": 103979747.0, "step": 86480 }, { "entropy": 1.89949888586998, "epoch": 0.2681114325548161, "grad_norm": 8.026739120483398, "learning_rate": 4.885820682544414e-06, "loss": 0.4411, "mean_token_accuracy": 0.8553324237465858, "num_tokens": 103991452.0, "step": 86490 }, { "entropy": 1.9148534148931504, "epoch": 0.26814243167986584, "grad_norm": 8.056224822998047, "learning_rate": 4.885538253735928e-06, "loss": 0.4862, "mean_token_accuracy": 0.8458040565252304, "num_tokens": 104003229.0, "step": 86500 }, { "entropy": 1.9580560460686685, "epoch": 0.2681734308049155, "grad_norm": 9.577115058898926, "learning_rate": 4.885255873899857e-06, "loss": 0.5021, "mean_token_accuracy": 0.8413164436817169, "num_tokens": 104014372.0, "step": 86510 }, { "entropy": 1.8256202667951584, "epoch": 0.26820442992996524, "grad_norm": 4.1934733390808105, "learning_rate": 4.884973543022048e-06, "loss": 0.5344, "mean_token_accuracy": 0.8329330936074257, "num_tokens": 104027131.0, "step": 86520 }, { "entropy": 1.9278512462973594, "epoch": 0.2682354290550149, "grad_norm": 6.770676612854004, "learning_rate": 4.884691261088359e-06, "loss": 0.5187, "mean_token_accuracy": 0.8434366106986999, "num_tokens": 104038489.0, "step": 86530 }, { "entropy": 1.7697638273239136, "epoch": 0.26826642818006463, "grad_norm": 9.314817428588867, "learning_rate": 4.884409028084645e-06, "loss": 0.4188, "mean_token_accuracy": 0.8589093953371048, "num_tokens": 104051697.0, "step": 86540 }, { "entropy": 1.8212177708745003, "epoch": 0.2682974273051143, "grad_norm": 8.558985710144043, "learning_rate": 4.8841268439967744e-06, "loss": 0.3656, "mean_token_accuracy": 0.8659472689032555, "num_tokens": 104064685.0, "step": 86550 }, { "entropy": 1.8339843198657035, "epoch": 0.268328426430164, "grad_norm": 5.08605432510376, "learning_rate": 4.883844708810621e-06, "loss": 0.4406, "mean_token_accuracy": 0.8505037307739258, "num_tokens": 104077583.0, "step": 86560 }, { "entropy": 1.9253122627735137, "epoch": 0.2683594255552137, "grad_norm": 7.690542697906494, "learning_rate": 4.883562622512059e-06, "loss": 0.522, "mean_token_accuracy": 0.8420386493206025, "num_tokens": 104088267.0, "step": 86570 }, { "entropy": 1.8858140379190445, "epoch": 0.2683904246802634, "grad_norm": 8.237641334533691, "learning_rate": 4.883280585086974e-06, "loss": 0.4529, "mean_token_accuracy": 0.8476707622408867, "num_tokens": 104099930.0, "step": 86580 }, { "entropy": 1.858066162467003, "epoch": 0.2684214238053131, "grad_norm": 8.798260688781738, "learning_rate": 4.882998596521253e-06, "loss": 0.4129, "mean_token_accuracy": 0.8484656035900116, "num_tokens": 104111901.0, "step": 86590 }, { "entropy": 1.92180934548378, "epoch": 0.2684524229303628, "grad_norm": 9.742270469665527, "learning_rate": 4.882716656800792e-06, "loss": 0.5152, "mean_token_accuracy": 0.8452163085341453, "num_tokens": 104122977.0, "step": 86600 }, { "entropy": 1.867686577141285, "epoch": 0.2684834220554125, "grad_norm": 10.891302108764648, "learning_rate": 4.882434765911489e-06, "loss": 0.4698, "mean_token_accuracy": 0.8573295056819916, "num_tokens": 104135141.0, "step": 86610 }, { "entropy": 1.9070853009819984, "epoch": 0.2685144211804622, "grad_norm": 6.908443450927734, "learning_rate": 4.882152923839252e-06, "loss": 0.5018, "mean_token_accuracy": 0.8457126155495643, "num_tokens": 104146361.0, "step": 86620 }, { "entropy": 1.9306767612695694, "epoch": 0.2685454203055119, "grad_norm": 8.809544563293457, "learning_rate": 4.881871130569993e-06, "loss": 0.4938, "mean_token_accuracy": 0.831186157464981, "num_tokens": 104158055.0, "step": 86630 }, { "entropy": 1.7939895704388618, "epoch": 0.2685764194305616, "grad_norm": 6.0357184410095215, "learning_rate": 4.8815893860896265e-06, "loss": 0.3657, "mean_token_accuracy": 0.8565925478935241, "num_tokens": 104171215.0, "step": 86640 }, { "entropy": 1.8932900041341783, "epoch": 0.26860741855561127, "grad_norm": 9.490089416503906, "learning_rate": 4.881307690384079e-06, "loss": 0.4687, "mean_token_accuracy": 0.851920661330223, "num_tokens": 104183346.0, "step": 86650 }, { "entropy": 1.7981626734137535, "epoch": 0.268638417680661, "grad_norm": 4.065007209777832, "learning_rate": 4.881026043439277e-06, "loss": 0.4316, "mean_token_accuracy": 0.850606782734394, "num_tokens": 104197135.0, "step": 86660 }, { "entropy": 1.9110074192285538, "epoch": 0.26866941680571066, "grad_norm": 8.759778022766113, "learning_rate": 4.880744445241155e-06, "loss": 0.4975, "mean_token_accuracy": 0.8564579889178277, "num_tokens": 104207822.0, "step": 86670 }, { "entropy": 1.980569313466549, "epoch": 0.26870041593076033, "grad_norm": 8.87386703491211, "learning_rate": 4.880462895775654e-06, "loss": 0.5313, "mean_token_accuracy": 0.8343950539827347, "num_tokens": 104219081.0, "step": 86680 }, { "entropy": 1.8610002383589745, "epoch": 0.26873141505581005, "grad_norm": 8.677059173583984, "learning_rate": 4.880181395028719e-06, "loss": 0.4443, "mean_token_accuracy": 0.8492666438221932, "num_tokens": 104231630.0, "step": 86690 }, { "entropy": 1.899726065993309, "epoch": 0.2687624141808597, "grad_norm": 8.745160102844238, "learning_rate": 4.879899942986303e-06, "loss": 0.5213, "mean_token_accuracy": 0.8413066044449806, "num_tokens": 104242889.0, "step": 86700 }, { "entropy": 1.8926833271980286, "epoch": 0.26879341330590945, "grad_norm": 3.9941442012786865, "learning_rate": 4.87961853963436e-06, "loss": 0.4936, "mean_token_accuracy": 0.8299556851387024, "num_tokens": 104254389.0, "step": 86710 }, { "entropy": 1.875252665579319, "epoch": 0.2688244124309591, "grad_norm": 8.266586303710938, "learning_rate": 4.879337184958854e-06, "loss": 0.5124, "mean_token_accuracy": 0.8451120510697365, "num_tokens": 104266592.0, "step": 86720 }, { "entropy": 1.8953094065189362, "epoch": 0.26885541155600884, "grad_norm": 8.299905776977539, "learning_rate": 4.8790558789457545e-06, "loss": 0.4767, "mean_token_accuracy": 0.8386603578925133, "num_tokens": 104278272.0, "step": 86730 }, { "entropy": 1.8657784268260003, "epoch": 0.2688864106810585, "grad_norm": 8.039691925048828, "learning_rate": 4.878774621581035e-06, "loss": 0.4618, "mean_token_accuracy": 0.8498806357383728, "num_tokens": 104290052.0, "step": 86740 }, { "entropy": 1.924768103659153, "epoch": 0.26891740980610823, "grad_norm": 8.882854461669922, "learning_rate": 4.878493412850675e-06, "loss": 0.5499, "mean_token_accuracy": 0.8329790845513344, "num_tokens": 104301303.0, "step": 86750 }, { "entropy": 1.8597337052226066, "epoch": 0.2689484089311579, "grad_norm": 8.194012641906738, "learning_rate": 4.878212252740661e-06, "loss": 0.4851, "mean_token_accuracy": 0.8411440491676331, "num_tokens": 104313725.0, "step": 86760 }, { "entropy": 1.955818921327591, "epoch": 0.2689794080562076, "grad_norm": 8.100213050842285, "learning_rate": 4.877931141236982e-06, "loss": 0.5445, "mean_token_accuracy": 0.8334660112857819, "num_tokens": 104325362.0, "step": 86770 }, { "entropy": 1.8957918226718902, "epoch": 0.2690104071812573, "grad_norm": 5.112435340881348, "learning_rate": 4.877650078325635e-06, "loss": 0.4801, "mean_token_accuracy": 0.8381680518388748, "num_tokens": 104337583.0, "step": 86780 }, { "entropy": 1.9326987490057945, "epoch": 0.269041406306307, "grad_norm": 8.216394424438477, "learning_rate": 4.8773690639926246e-06, "loss": 0.4806, "mean_token_accuracy": 0.8478187531232834, "num_tokens": 104349160.0, "step": 86790 }, { "entropy": 1.8107408866286279, "epoch": 0.2690724054313567, "grad_norm": 8.589932441711426, "learning_rate": 4.8770880982239565e-06, "loss": 0.4214, "mean_token_accuracy": 0.8569507896900177, "num_tokens": 104362032.0, "step": 86800 }, { "entropy": 1.9010547578334809, "epoch": 0.2691034045564064, "grad_norm": 7.747185707092285, "learning_rate": 4.876807181005645e-06, "loss": 0.4862, "mean_token_accuracy": 0.8419987246394157, "num_tokens": 104374183.0, "step": 86810 }, { "entropy": 1.9327505372464657, "epoch": 0.2691344036814561, "grad_norm": 7.296648025512695, "learning_rate": 4.87652631232371e-06, "loss": 0.5457, "mean_token_accuracy": 0.8373480141162872, "num_tokens": 104385704.0, "step": 86820 }, { "entropy": 1.9187294945120812, "epoch": 0.2691654028065058, "grad_norm": 9.021369934082031, "learning_rate": 4.876245492164175e-06, "loss": 0.5099, "mean_token_accuracy": 0.8350897148251534, "num_tokens": 104397570.0, "step": 86830 }, { "entropy": 1.8674516141414643, "epoch": 0.2691964019315555, "grad_norm": 8.63692855834961, "learning_rate": 4.875964720513072e-06, "loss": 0.5416, "mean_token_accuracy": 0.8315334841609001, "num_tokens": 104409275.0, "step": 86840 }, { "entropy": 1.8427454948425293, "epoch": 0.2692274010566052, "grad_norm": 3.408676862716675, "learning_rate": 4.875683997356437e-06, "loss": 0.4465, "mean_token_accuracy": 0.85171507447958, "num_tokens": 104421885.0, "step": 86850 }, { "entropy": 1.9858239054679871, "epoch": 0.26925840018165487, "grad_norm": 8.626346588134766, "learning_rate": 4.87540332268031e-06, "loss": 0.6016, "mean_token_accuracy": 0.8261340126395226, "num_tokens": 104433208.0, "step": 86860 }, { "entropy": 1.8474789157509803, "epoch": 0.2692893993067046, "grad_norm": 3.6940155029296875, "learning_rate": 4.87512269647074e-06, "loss": 0.4441, "mean_token_accuracy": 0.8468289896845818, "num_tokens": 104445950.0, "step": 86870 }, { "entropy": 1.7919086948037148, "epoch": 0.26932039843175426, "grad_norm": 7.589000225067139, "learning_rate": 4.8748421187137786e-06, "loss": 0.4006, "mean_token_accuracy": 0.8585388869047165, "num_tokens": 104459408.0, "step": 86880 }, { "entropy": 1.817798225581646, "epoch": 0.269351397556804, "grad_norm": 2.4566121101379395, "learning_rate": 4.8745615893954875e-06, "loss": 0.4488, "mean_token_accuracy": 0.856153316795826, "num_tokens": 104472083.0, "step": 86890 }, { "entropy": 1.829006166756153, "epoch": 0.26938239668185365, "grad_norm": 3.47737455368042, "learning_rate": 4.8742811085019294e-06, "loss": 0.4355, "mean_token_accuracy": 0.8479893788695335, "num_tokens": 104485313.0, "step": 86900 }, { "entropy": 1.824818679690361, "epoch": 0.2694133958069034, "grad_norm": 10.421551704406738, "learning_rate": 4.8740006760191715e-06, "loss": 0.4166, "mean_token_accuracy": 0.8599545955657959, "num_tokens": 104498290.0, "step": 86910 }, { "entropy": 1.8766320884227752, "epoch": 0.26944439493195305, "grad_norm": 8.567687034606934, "learning_rate": 4.873720291933294e-06, "loss": 0.4632, "mean_token_accuracy": 0.8494315207004547, "num_tokens": 104510307.0, "step": 86920 }, { "entropy": 1.8442302539944648, "epoch": 0.2694753940570027, "grad_norm": 3.7030580043792725, "learning_rate": 4.873439956230375e-06, "loss": 0.4602, "mean_token_accuracy": 0.848169319331646, "num_tokens": 104522429.0, "step": 86930 }, { "entropy": 1.890008282661438, "epoch": 0.26950639318205244, "grad_norm": 8.97164249420166, "learning_rate": 4.873159668896501e-06, "loss": 0.5014, "mean_token_accuracy": 0.8344573676586151, "num_tokens": 104534058.0, "step": 86940 }, { "entropy": 1.818005445599556, "epoch": 0.2695373923071021, "grad_norm": 8.09427547454834, "learning_rate": 4.8728794299177655e-06, "loss": 0.4666, "mean_token_accuracy": 0.8522487103939056, "num_tokens": 104546415.0, "step": 86950 }, { "entropy": 1.838931292295456, "epoch": 0.26956839143215183, "grad_norm": 7.319267272949219, "learning_rate": 4.8725992392802655e-06, "loss": 0.504, "mean_token_accuracy": 0.8448920026421547, "num_tokens": 104558731.0, "step": 86960 }, { "entropy": 1.8411687076091767, "epoch": 0.2695993905572015, "grad_norm": 3.573763847351074, "learning_rate": 4.872319096970106e-06, "loss": 0.439, "mean_token_accuracy": 0.8461699530482292, "num_tokens": 104571213.0, "step": 86970 }, { "entropy": 1.8673091575503349, "epoch": 0.2696303896822512, "grad_norm": 4.311919689178467, "learning_rate": 4.872039002973394e-06, "loss": 0.5261, "mean_token_accuracy": 0.8393310904502869, "num_tokens": 104583049.0, "step": 86980 }, { "entropy": 1.8796320587396622, "epoch": 0.2696613888073009, "grad_norm": 9.332188606262207, "learning_rate": 4.871758957276246e-06, "loss": 0.5274, "mean_token_accuracy": 0.8384488835930825, "num_tokens": 104595379.0, "step": 86990 }, { "entropy": 1.8900238052010536, "epoch": 0.2696923879323506, "grad_norm": 7.770622253417969, "learning_rate": 4.871478959864781e-06, "loss": 0.4843, "mean_token_accuracy": 0.8497345179319382, "num_tokens": 104606394.0, "step": 87000 }, { "entropy": 1.839431057870388, "epoch": 0.2697233870574003, "grad_norm": 8.380780220031738, "learning_rate": 4.871199010725126e-06, "loss": 0.5151, "mean_token_accuracy": 0.8396735802292824, "num_tokens": 104619650.0, "step": 87010 }, { "entropy": 1.827296996116638, "epoch": 0.26975438618245, "grad_norm": 8.714934349060059, "learning_rate": 4.870919109843412e-06, "loss": 0.509, "mean_token_accuracy": 0.8432153165340424, "num_tokens": 104632127.0, "step": 87020 }, { "entropy": 1.8935890957713126, "epoch": 0.2697853853074997, "grad_norm": 9.051390647888184, "learning_rate": 4.870639257205774e-06, "loss": 0.5075, "mean_token_accuracy": 0.8443873509764671, "num_tokens": 104643478.0, "step": 87030 }, { "entropy": 1.8204589426517486, "epoch": 0.2698163844325494, "grad_norm": 7.4907379150390625, "learning_rate": 4.870359452798357e-06, "loss": 0.4349, "mean_token_accuracy": 0.8611741289496422, "num_tokens": 104655632.0, "step": 87040 }, { "entropy": 1.8853262856602668, "epoch": 0.2698473835575991, "grad_norm": 8.072641372680664, "learning_rate": 4.8700796966073084e-06, "loss": 0.4742, "mean_token_accuracy": 0.8454017639160156, "num_tokens": 104667018.0, "step": 87050 }, { "entropy": 1.8324878126382829, "epoch": 0.2698783826826488, "grad_norm": 8.132426261901855, "learning_rate": 4.869799988618784e-06, "loss": 0.4688, "mean_token_accuracy": 0.8415536895394325, "num_tokens": 104679052.0, "step": 87060 }, { "entropy": 1.884521934390068, "epoch": 0.26990938180769847, "grad_norm": 9.089740753173828, "learning_rate": 4.869520328818938e-06, "loss": 0.4913, "mean_token_accuracy": 0.8491548255085946, "num_tokens": 104690472.0, "step": 87070 }, { "entropy": 1.7616024523973466, "epoch": 0.2699403809327482, "grad_norm": 5.119749546051025, "learning_rate": 4.86924071719394e-06, "loss": 0.4834, "mean_token_accuracy": 0.8421557962894439, "num_tokens": 104703935.0, "step": 87080 }, { "entropy": 1.8466105580329895, "epoch": 0.26997138005779786, "grad_norm": 6.891587734222412, "learning_rate": 4.868961153729958e-06, "loss": 0.5664, "mean_token_accuracy": 0.8311922624707222, "num_tokens": 104715762.0, "step": 87090 }, { "entropy": 1.830435362458229, "epoch": 0.2700023791828476, "grad_norm": 9.212553024291992, "learning_rate": 4.86868163841317e-06, "loss": 0.5407, "mean_token_accuracy": 0.8380068346858025, "num_tokens": 104727912.0, "step": 87100 }, { "entropy": 1.8718600660562514, "epoch": 0.27003337830789725, "grad_norm": 8.763904571533203, "learning_rate": 4.8684021712297545e-06, "loss": 0.485, "mean_token_accuracy": 0.8420531466603279, "num_tokens": 104740144.0, "step": 87110 }, { "entropy": 1.8686859756708145, "epoch": 0.270064377432947, "grad_norm": 4.998740196228027, "learning_rate": 4.868122752165901e-06, "loss": 0.5043, "mean_token_accuracy": 0.8439597055315972, "num_tokens": 104751419.0, "step": 87120 }, { "entropy": 1.821141104400158, "epoch": 0.27009537655799665, "grad_norm": 8.166898727416992, "learning_rate": 4.867843381207802e-06, "loss": 0.4898, "mean_token_accuracy": 0.8448257312178612, "num_tokens": 104764014.0, "step": 87130 }, { "entropy": 1.8268358632922173, "epoch": 0.27012637568304637, "grad_norm": 9.196722984313965, "learning_rate": 4.867564058341654e-06, "loss": 0.4653, "mean_token_accuracy": 0.8561849161982537, "num_tokens": 104776029.0, "step": 87140 }, { "entropy": 1.8853852570056915, "epoch": 0.27015737480809604, "grad_norm": 7.830870151519775, "learning_rate": 4.867284783553663e-06, "loss": 0.5312, "mean_token_accuracy": 0.8442418292164803, "num_tokens": 104787139.0, "step": 87150 }, { "entropy": 1.7887479558587074, "epoch": 0.27018837393314576, "grad_norm": 8.097960472106934, "learning_rate": 4.867005556830035e-06, "loss": 0.3924, "mean_token_accuracy": 0.8651418328285218, "num_tokens": 104799482.0, "step": 87160 }, { "entropy": 1.8204817980527879, "epoch": 0.27021937305819543, "grad_norm": 8.041654586791992, "learning_rate": 4.8667263781569875e-06, "loss": 0.4393, "mean_token_accuracy": 0.8555315598845482, "num_tokens": 104811350.0, "step": 87170 }, { "entropy": 1.7814991921186447, "epoch": 0.2702503721832451, "grad_norm": 7.963624477386475, "learning_rate": 4.86644724752074e-06, "loss": 0.4126, "mean_token_accuracy": 0.8552462846040726, "num_tokens": 104824216.0, "step": 87180 }, { "entropy": 1.9131552398204803, "epoch": 0.2702813713082948, "grad_norm": 9.447894096374512, "learning_rate": 4.86616816490752e-06, "loss": 0.5184, "mean_token_accuracy": 0.8396668806672096, "num_tokens": 104835560.0, "step": 87190 }, { "entropy": 1.824287013709545, "epoch": 0.2703123704333445, "grad_norm": 9.329700469970703, "learning_rate": 4.865889130303556e-06, "loss": 0.4567, "mean_token_accuracy": 0.8538261905312539, "num_tokens": 104847449.0, "step": 87200 }, { "entropy": 1.8483013391494751, "epoch": 0.2703433695583942, "grad_norm": 8.532864570617676, "learning_rate": 4.865610143695086e-06, "loss": 0.4943, "mean_token_accuracy": 0.8494655951857567, "num_tokens": 104859220.0, "step": 87210 }, { "entropy": 1.8401647135615349, "epoch": 0.2703743686834439, "grad_norm": 7.677708148956299, "learning_rate": 4.8653312050683524e-06, "loss": 0.5101, "mean_token_accuracy": 0.8437712222337723, "num_tokens": 104870632.0, "step": 87220 }, { "entropy": 1.8975766867399215, "epoch": 0.2704053678084936, "grad_norm": 7.6379780769348145, "learning_rate": 4.865052314409605e-06, "loss": 0.5677, "mean_token_accuracy": 0.8343770399689674, "num_tokens": 104881139.0, "step": 87230 }, { "entropy": 1.842423902451992, "epoch": 0.2704363669335433, "grad_norm": 8.821908950805664, "learning_rate": 4.864773471705094e-06, "loss": 0.4859, "mean_token_accuracy": 0.8355495795607567, "num_tokens": 104893189.0, "step": 87240 }, { "entropy": 1.8596946865320205, "epoch": 0.270467366058593, "grad_norm": 8.274313926696777, "learning_rate": 4.86449467694108e-06, "loss": 0.4714, "mean_token_accuracy": 0.8518381923437118, "num_tokens": 104904842.0, "step": 87250 }, { "entropy": 1.7499984815716743, "epoch": 0.2704983651836427, "grad_norm": 8.799256324768066, "learning_rate": 4.864215930103828e-06, "loss": 0.367, "mean_token_accuracy": 0.8650006785988807, "num_tokens": 104918602.0, "step": 87260 }, { "entropy": 1.8299191161990165, "epoch": 0.2705293643086924, "grad_norm": 8.756028175354004, "learning_rate": 4.863937231179608e-06, "loss": 0.4585, "mean_token_accuracy": 0.8430431827902793, "num_tokens": 104930274.0, "step": 87270 }, { "entropy": 1.8168650731444358, "epoch": 0.27056036343374207, "grad_norm": 8.896665573120117, "learning_rate": 4.863658580154694e-06, "loss": 0.4216, "mean_token_accuracy": 0.8473124504089355, "num_tokens": 104943125.0, "step": 87280 }, { "entropy": 1.8909821808338165, "epoch": 0.2705913625587918, "grad_norm": 11.013216018676758, "learning_rate": 4.863379977015369e-06, "loss": 0.5405, "mean_token_accuracy": 0.8345570892095566, "num_tokens": 104954178.0, "step": 87290 }, { "entropy": 1.8781416594982148, "epoch": 0.27062236168384146, "grad_norm": 8.503053665161133, "learning_rate": 4.863101421747918e-06, "loss": 0.5134, "mean_token_accuracy": 0.8471224218606949, "num_tokens": 104965323.0, "step": 87300 }, { "entropy": 1.7377824038267136, "epoch": 0.2706533608088912, "grad_norm": 3.2067818641662598, "learning_rate": 4.862822914338635e-06, "loss": 0.3778, "mean_token_accuracy": 0.8619665712118149, "num_tokens": 104979529.0, "step": 87310 }, { "entropy": 1.8459437713027, "epoch": 0.27068435993394085, "grad_norm": 9.494819641113281, "learning_rate": 4.862544454773815e-06, "loss": 0.4917, "mean_token_accuracy": 0.8475456729531288, "num_tokens": 104991778.0, "step": 87320 }, { "entropy": 1.8079747796058654, "epoch": 0.2707153590589906, "grad_norm": 4.102128505706787, "learning_rate": 4.86226604303976e-06, "loss": 0.495, "mean_token_accuracy": 0.8489979475736618, "num_tokens": 105004887.0, "step": 87330 }, { "entropy": 1.8566383227705956, "epoch": 0.27074635818404025, "grad_norm": 9.459664344787598, "learning_rate": 4.8619876791227834e-06, "loss": 0.463, "mean_token_accuracy": 0.8472496911883354, "num_tokens": 105015787.0, "step": 87340 }, { "entropy": 1.8375443920493126, "epoch": 0.27077735730908997, "grad_norm": 8.758172988891602, "learning_rate": 4.861709363009195e-06, "loss": 0.4461, "mean_token_accuracy": 0.8570204228162766, "num_tokens": 105027235.0, "step": 87350 }, { "entropy": 1.8649962782859801, "epoch": 0.27080835643413964, "grad_norm": 9.725914001464844, "learning_rate": 4.861431094685316e-06, "loss": 0.5095, "mean_token_accuracy": 0.8399304628372193, "num_tokens": 105038434.0, "step": 87360 }, { "entropy": 1.7663211345672607, "epoch": 0.27083935555918937, "grad_norm": 7.783099174499512, "learning_rate": 4.86115287413747e-06, "loss": 0.4074, "mean_token_accuracy": 0.8482378587126732, "num_tokens": 105050998.0, "step": 87370 }, { "entropy": 1.8368576869368554, "epoch": 0.27087035468423903, "grad_norm": 9.434002876281738, "learning_rate": 4.8608747013519896e-06, "loss": 0.4833, "mean_token_accuracy": 0.8487260073423386, "num_tokens": 105062505.0, "step": 87380 }, { "entropy": 1.7946744754910469, "epoch": 0.27090135380928876, "grad_norm": 8.645105361938477, "learning_rate": 4.860596576315209e-06, "loss": 0.4614, "mean_token_accuracy": 0.8437353923916817, "num_tokens": 105075561.0, "step": 87390 }, { "entropy": 1.8670944333076478, "epoch": 0.2709323529343384, "grad_norm": 8.151835441589355, "learning_rate": 4.860318499013468e-06, "loss": 0.5036, "mean_token_accuracy": 0.8455032199621201, "num_tokens": 105087761.0, "step": 87400 }, { "entropy": 1.8563807129859924, "epoch": 0.2709633520593881, "grad_norm": 8.382278442382812, "learning_rate": 4.860040469433119e-06, "loss": 0.4837, "mean_token_accuracy": 0.8437006428837777, "num_tokens": 105099328.0, "step": 87410 }, { "entropy": 1.8392608642578125, "epoch": 0.2709943511844378, "grad_norm": 6.8433098793029785, "learning_rate": 4.8597624875605076e-06, "loss": 0.4492, "mean_token_accuracy": 0.8475878596305847, "num_tokens": 105111492.0, "step": 87420 }, { "entropy": 1.896904969215393, "epoch": 0.2710253503094875, "grad_norm": 9.722173690795898, "learning_rate": 4.859484553381996e-06, "loss": 0.5157, "mean_token_accuracy": 0.8329481184482574, "num_tokens": 105123552.0, "step": 87430 }, { "entropy": 1.8754973500967025, "epoch": 0.2710563494345372, "grad_norm": 3.6622695922851562, "learning_rate": 4.859206666883946e-06, "loss": 0.4939, "mean_token_accuracy": 0.8413586497306824, "num_tokens": 105135820.0, "step": 87440 }, { "entropy": 1.8094146370887756, "epoch": 0.2710873485595869, "grad_norm": 9.525456428527832, "learning_rate": 4.858928828052725e-06, "loss": 0.4518, "mean_token_accuracy": 0.8486891463398933, "num_tokens": 105147391.0, "step": 87450 }, { "entropy": 1.9204721599817276, "epoch": 0.2711183476846366, "grad_norm": 8.702652931213379, "learning_rate": 4.858651036874711e-06, "loss": 0.5346, "mean_token_accuracy": 0.837224043905735, "num_tokens": 105159245.0, "step": 87460 }, { "entropy": 1.9499018788337708, "epoch": 0.2711493468096863, "grad_norm": 9.13373851776123, "learning_rate": 4.858373293336278e-06, "loss": 0.5221, "mean_token_accuracy": 0.8495534911751748, "num_tokens": 105169906.0, "step": 87470 }, { "entropy": 1.865763219445944, "epoch": 0.271180345934736, "grad_norm": 8.000102043151855, "learning_rate": 4.858095597423816e-06, "loss": 0.4663, "mean_token_accuracy": 0.8466974958777428, "num_tokens": 105182696.0, "step": 87480 }, { "entropy": 1.8530535832047463, "epoch": 0.27121134505978567, "grad_norm": 9.345272064208984, "learning_rate": 4.8578179491237135e-06, "loss": 0.5162, "mean_token_accuracy": 0.8383201539516449, "num_tokens": 105194697.0, "step": 87490 }, { "entropy": 1.9272264629602431, "epoch": 0.2712423441848354, "grad_norm": 8.332130432128906, "learning_rate": 4.857540348422365e-06, "loss": 0.4812, "mean_token_accuracy": 0.8568297758698463, "num_tokens": 105205400.0, "step": 87500 }, { "entropy": 1.8942699432373047, "epoch": 0.27127334330988506, "grad_norm": 8.65912914276123, "learning_rate": 4.857262795306176e-06, "loss": 0.4641, "mean_token_accuracy": 0.8520406141877175, "num_tokens": 105217304.0, "step": 87510 }, { "entropy": 1.7813436336815358, "epoch": 0.2713043424349348, "grad_norm": 4.422085762023926, "learning_rate": 4.8569852897615476e-06, "loss": 0.4322, "mean_token_accuracy": 0.8593872472643852, "num_tokens": 105230048.0, "step": 87520 }, { "entropy": 1.9077577859163284, "epoch": 0.27133534155998446, "grad_norm": 8.263802528381348, "learning_rate": 4.856707831774897e-06, "loss": 0.4897, "mean_token_accuracy": 0.8396870777010917, "num_tokens": 105241231.0, "step": 87530 }, { "entropy": 1.8345076471567154, "epoch": 0.2713663406850342, "grad_norm": 8.313085556030273, "learning_rate": 4.856430421332639e-06, "loss": 0.4701, "mean_token_accuracy": 0.8465406790375709, "num_tokens": 105253356.0, "step": 87540 }, { "entropy": 1.8547306582331657, "epoch": 0.27139733981008385, "grad_norm": 7.723386287689209, "learning_rate": 4.856153058421199e-06, "loss": 0.4872, "mean_token_accuracy": 0.8484765499830246, "num_tokens": 105265696.0, "step": 87550 }, { "entropy": 1.8216210514307023, "epoch": 0.2714283389351336, "grad_norm": 7.835054874420166, "learning_rate": 4.855875743027003e-06, "loss": 0.4493, "mean_token_accuracy": 0.8526194721460343, "num_tokens": 105277875.0, "step": 87560 }, { "entropy": 1.8399008169770241, "epoch": 0.27145933806018324, "grad_norm": 7.611944198608398, "learning_rate": 4.855598475136486e-06, "loss": 0.4977, "mean_token_accuracy": 0.8431843638420105, "num_tokens": 105290058.0, "step": 87570 }, { "entropy": 1.8355013683438302, "epoch": 0.27149033718523297, "grad_norm": 8.615690231323242, "learning_rate": 4.855321254736087e-06, "loss": 0.4688, "mean_token_accuracy": 0.8420334681868553, "num_tokens": 105302569.0, "step": 87580 }, { "entropy": 1.8711734786629677, "epoch": 0.27152133631028263, "grad_norm": 8.991096496582031, "learning_rate": 4.855044081812253e-06, "loss": 0.5087, "mean_token_accuracy": 0.8353176578879357, "num_tokens": 105314367.0, "step": 87590 }, { "entropy": 1.8946691662073136, "epoch": 0.27155233543533236, "grad_norm": 7.5710577964782715, "learning_rate": 4.854766956351432e-06, "loss": 0.5232, "mean_token_accuracy": 0.8443698287010193, "num_tokens": 105325617.0, "step": 87600 }, { "entropy": 1.8058865994215012, "epoch": 0.27158333456038203, "grad_norm": 2.2330517768859863, "learning_rate": 4.854489878340079e-06, "loss": 0.4736, "mean_token_accuracy": 0.8525756880640983, "num_tokens": 105338254.0, "step": 87610 }, { "entropy": 1.9165008813142776, "epoch": 0.27161433368543175, "grad_norm": 8.385638236999512, "learning_rate": 4.854212847764657e-06, "loss": 0.5176, "mean_token_accuracy": 0.8384706929326058, "num_tokens": 105349177.0, "step": 87620 }, { "entropy": 1.906524208188057, "epoch": 0.2716453328104814, "grad_norm": 8.768911361694336, "learning_rate": 4.853935864611632e-06, "loss": 0.5187, "mean_token_accuracy": 0.8458487808704376, "num_tokens": 105360116.0, "step": 87630 }, { "entropy": 1.8195328041911125, "epoch": 0.27167633193553115, "grad_norm": 4.211945056915283, "learning_rate": 4.853658928867475e-06, "loss": 0.4673, "mean_token_accuracy": 0.8428912833333015, "num_tokens": 105372704.0, "step": 87640 }, { "entropy": 1.8345939561724662, "epoch": 0.2717073310605808, "grad_norm": 10.063714981079102, "learning_rate": 4.853382040518665e-06, "loss": 0.4395, "mean_token_accuracy": 0.8555810198187828, "num_tokens": 105384756.0, "step": 87650 }, { "entropy": 1.8712797194719315, "epoch": 0.2717383301856305, "grad_norm": 7.05558967590332, "learning_rate": 4.853105199551681e-06, "loss": 0.5203, "mean_token_accuracy": 0.8413851290941239, "num_tokens": 105396711.0, "step": 87660 }, { "entropy": 1.8855430006980896, "epoch": 0.2717693293106802, "grad_norm": 9.500574111938477, "learning_rate": 4.8528284059530145e-06, "loss": 0.507, "mean_token_accuracy": 0.841337351500988, "num_tokens": 105407862.0, "step": 87670 }, { "entropy": 1.7173856884241103, "epoch": 0.2718003284357299, "grad_norm": 4.654793739318848, "learning_rate": 4.852551659709158e-06, "loss": 0.3756, "mean_token_accuracy": 0.8598586186766625, "num_tokens": 105422391.0, "step": 87680 }, { "entropy": 1.9247530341148376, "epoch": 0.2718313275607796, "grad_norm": 9.287614822387695, "learning_rate": 4.85227496080661e-06, "loss": 0.5294, "mean_token_accuracy": 0.8380204871296882, "num_tokens": 105433845.0, "step": 87690 }, { "entropy": 1.9517512962222099, "epoch": 0.27186232668582927, "grad_norm": 6.864774227142334, "learning_rate": 4.851998309231874e-06, "loss": 0.5121, "mean_token_accuracy": 0.8360002964735032, "num_tokens": 105445129.0, "step": 87700 }, { "entropy": 1.9424969971179962, "epoch": 0.271893325810879, "grad_norm": 8.023564338684082, "learning_rate": 4.8517217049714625e-06, "loss": 0.5175, "mean_token_accuracy": 0.8406699001789093, "num_tokens": 105457210.0, "step": 87710 }, { "entropy": 1.8285282671451568, "epoch": 0.27192432493592866, "grad_norm": 9.891963958740234, "learning_rate": 4.851445148011887e-06, "loss": 0.454, "mean_token_accuracy": 0.853777602314949, "num_tokens": 105469532.0, "step": 87720 }, { "entropy": 1.825080545246601, "epoch": 0.2719553240609784, "grad_norm": 6.351779460906982, "learning_rate": 4.8511686383396706e-06, "loss": 0.4105, "mean_token_accuracy": 0.858852119743824, "num_tokens": 105482237.0, "step": 87730 }, { "entropy": 1.9200027763843537, "epoch": 0.27198632318602806, "grad_norm": 8.065184593200684, "learning_rate": 4.850892175941337e-06, "loss": 0.5068, "mean_token_accuracy": 0.8401974439620972, "num_tokens": 105493906.0, "step": 87740 }, { "entropy": 1.8792470768094063, "epoch": 0.2720173223110778, "grad_norm": 8.423270225524902, "learning_rate": 4.8506157608034186e-06, "loss": 0.534, "mean_token_accuracy": 0.8373197540640831, "num_tokens": 105505414.0, "step": 87750 }, { "entropy": 1.900409395992756, "epoch": 0.27204832143612745, "grad_norm": 7.148952007293701, "learning_rate": 4.850339392912451e-06, "loss": 0.4789, "mean_token_accuracy": 0.8442846789956093, "num_tokens": 105516469.0, "step": 87760 }, { "entropy": 1.8958803132176398, "epoch": 0.2720793205611772, "grad_norm": 8.72636604309082, "learning_rate": 4.850063072254976e-06, "loss": 0.5062, "mean_token_accuracy": 0.8429311379790306, "num_tokens": 105528375.0, "step": 87770 }, { "entropy": 1.8233585372567176, "epoch": 0.27211031968622684, "grad_norm": 8.03514289855957, "learning_rate": 4.849786798817542e-06, "loss": 0.478, "mean_token_accuracy": 0.8540633916854858, "num_tokens": 105540482.0, "step": 87780 }, { "entropy": 1.8922078132629394, "epoch": 0.27214131881127657, "grad_norm": 8.679289817810059, "learning_rate": 4.8495105725867e-06, "loss": 0.4951, "mean_token_accuracy": 0.8497767493128776, "num_tokens": 105551735.0, "step": 87790 }, { "entropy": 1.809168304502964, "epoch": 0.27217231793632624, "grad_norm": 9.962617874145508, "learning_rate": 4.84923439354901e-06, "loss": 0.4717, "mean_token_accuracy": 0.8472581446170807, "num_tokens": 105564314.0, "step": 87800 }, { "entropy": 1.843416164815426, "epoch": 0.27220331706137596, "grad_norm": 7.68436861038208, "learning_rate": 4.848958261691033e-06, "loss": 0.5533, "mean_token_accuracy": 0.8343981161713601, "num_tokens": 105576827.0, "step": 87810 }, { "entropy": 1.8259743750095367, "epoch": 0.27223431618642563, "grad_norm": 7.573136329650879, "learning_rate": 4.84868217699934e-06, "loss": 0.4568, "mean_token_accuracy": 0.852935828268528, "num_tokens": 105589417.0, "step": 87820 }, { "entropy": 1.8123556807637216, "epoch": 0.27226531531147535, "grad_norm": 3.7434639930725098, "learning_rate": 4.848406139460503e-06, "loss": 0.4605, "mean_token_accuracy": 0.8481271311640739, "num_tokens": 105602368.0, "step": 87830 }, { "entropy": 1.8485308945178986, "epoch": 0.272296314436525, "grad_norm": 7.756927490234375, "learning_rate": 4.848130149061103e-06, "loss": 0.448, "mean_token_accuracy": 0.8554665118455886, "num_tokens": 105613811.0, "step": 87840 }, { "entropy": 1.8490839540958404, "epoch": 0.27232731356157475, "grad_norm": 6.909817695617676, "learning_rate": 4.847854205787724e-06, "loss": 0.4426, "mean_token_accuracy": 0.8522930264472961, "num_tokens": 105625880.0, "step": 87850 }, { "entropy": 1.882150113582611, "epoch": 0.2723583126866244, "grad_norm": 9.245566368103027, "learning_rate": 4.847578309626954e-06, "loss": 0.4931, "mean_token_accuracy": 0.8421803861856461, "num_tokens": 105638135.0, "step": 87860 }, { "entropy": 1.8645110860466958, "epoch": 0.27238931181167414, "grad_norm": 8.556130409240723, "learning_rate": 4.847302460565392e-06, "loss": 0.5045, "mean_token_accuracy": 0.8380250200629235, "num_tokens": 105649997.0, "step": 87870 }, { "entropy": 1.9121182456612587, "epoch": 0.2724203109367238, "grad_norm": 6.9276862144470215, "learning_rate": 4.847026658589637e-06, "loss": 0.4903, "mean_token_accuracy": 0.8434757009148598, "num_tokens": 105661149.0, "step": 87880 }, { "entropy": 1.8295169189572333, "epoch": 0.27245131006177353, "grad_norm": 9.222490310668945, "learning_rate": 4.846750903686295e-06, "loss": 0.4377, "mean_token_accuracy": 0.8513430684804917, "num_tokens": 105673234.0, "step": 87890 }, { "entropy": 1.9055490285158156, "epoch": 0.2724823091868232, "grad_norm": 8.43136215209961, "learning_rate": 4.846475195841978e-06, "loss": 0.5092, "mean_token_accuracy": 0.849004752933979, "num_tokens": 105684180.0, "step": 87900 }, { "entropy": 1.7300630405545234, "epoch": 0.27251330831187287, "grad_norm": 4.451493740081787, "learning_rate": 4.846199535043302e-06, "loss": 0.453, "mean_token_accuracy": 0.8522528484463692, "num_tokens": 105698528.0, "step": 87910 }, { "entropy": 1.855320343375206, "epoch": 0.2725443074369226, "grad_norm": 7.741222381591797, "learning_rate": 4.845923921276889e-06, "loss": 0.4827, "mean_token_accuracy": 0.843564510345459, "num_tokens": 105710686.0, "step": 87920 }, { "entropy": 1.8979840464890003, "epoch": 0.27257530656197226, "grad_norm": Infinity, "learning_rate": 4.845648354529367e-06, "loss": 0.4751, "mean_token_accuracy": 0.8410883545875549, "num_tokens": 105722636.0, "step": 87930 }, { "entropy": 1.9001470863819123, "epoch": 0.272606305687022, "grad_norm": 7.072505474090576, "learning_rate": 4.845372834787369e-06, "loss": 0.5521, "mean_token_accuracy": 0.8388607785105705, "num_tokens": 105733839.0, "step": 87940 }, { "entropy": 1.8749860867857933, "epoch": 0.27263730481207166, "grad_norm": 9.139892578125, "learning_rate": 4.845097362037533e-06, "loss": 0.4707, "mean_token_accuracy": 0.8396959900856018, "num_tokens": 105746121.0, "step": 87950 }, { "entropy": 1.8686931714415551, "epoch": 0.2726683039371214, "grad_norm": 9.034399032592773, "learning_rate": 4.844821936266501e-06, "loss": 0.4891, "mean_token_accuracy": 0.8411045849323273, "num_tokens": 105758031.0, "step": 87960 }, { "entropy": 1.9092384189367295, "epoch": 0.27269930306217105, "grad_norm": 4.113952159881592, "learning_rate": 4.844546557460922e-06, "loss": 0.4858, "mean_token_accuracy": 0.8454341217875481, "num_tokens": 105769453.0, "step": 87970 }, { "entropy": 1.8871904879808425, "epoch": 0.2727303021872208, "grad_norm": 7.443089962005615, "learning_rate": 4.844271225607452e-06, "loss": 0.4942, "mean_token_accuracy": 0.8477641880512238, "num_tokens": 105781468.0, "step": 87980 }, { "entropy": 1.887850184738636, "epoch": 0.27276130131227044, "grad_norm": 7.757199287414551, "learning_rate": 4.843995940692748e-06, "loss": 0.4486, "mean_token_accuracy": 0.8484135344624519, "num_tokens": 105793191.0, "step": 87990 }, { "entropy": 1.9092398703098297, "epoch": 0.27279230043732017, "grad_norm": 8.517298698425293, "learning_rate": 4.843720702703475e-06, "loss": 0.5326, "mean_token_accuracy": 0.8369758918881416, "num_tokens": 105804532.0, "step": 88000 }, { "entropy": 1.7902554288506507, "epoch": 0.27282329956236984, "grad_norm": 4.456912517547607, "learning_rate": 4.843445511626304e-06, "loss": 0.4211, "mean_token_accuracy": 0.8474618881940842, "num_tokens": 105817217.0, "step": 88010 }, { "entropy": 1.8608660578727723, "epoch": 0.27285429868741956, "grad_norm": 8.066084861755371, "learning_rate": 4.843170367447909e-06, "loss": 0.4997, "mean_token_accuracy": 0.8490109965205193, "num_tokens": 105828552.0, "step": 88020 }, { "entropy": 1.89578920006752, "epoch": 0.27288529781246923, "grad_norm": 6.772432327270508, "learning_rate": 4.842895270154972e-06, "loss": 0.5289, "mean_token_accuracy": 0.8335303783416748, "num_tokens": 105840037.0, "step": 88030 }, { "entropy": 1.7588407546281815, "epoch": 0.27291629693751895, "grad_norm": 1.8999171257019043, "learning_rate": 4.842620219734178e-06, "loss": 0.4357, "mean_token_accuracy": 0.8491491839289665, "num_tokens": 105854237.0, "step": 88040 }, { "entropy": 1.7782690718770027, "epoch": 0.2729472960625686, "grad_norm": 8.12587833404541, "learning_rate": 4.842345216172217e-06, "loss": 0.4459, "mean_token_accuracy": 0.8528685718774796, "num_tokens": 105866817.0, "step": 88050 }, { "entropy": 1.8298373103141785, "epoch": 0.27297829518761835, "grad_norm": 7.805505752563477, "learning_rate": 4.8420702594557855e-06, "loss": 0.4965, "mean_token_accuracy": 0.8397832661867142, "num_tokens": 105878717.0, "step": 88060 }, { "entropy": 1.8683532044291495, "epoch": 0.273009294312668, "grad_norm": 3.9877965450286865, "learning_rate": 4.841795349571587e-06, "loss": 0.4879, "mean_token_accuracy": 0.8447675094008446, "num_tokens": 105890128.0, "step": 88070 }, { "entropy": 1.8671092882752418, "epoch": 0.27304029343771774, "grad_norm": 8.055480003356934, "learning_rate": 4.841520486506328e-06, "loss": 0.4736, "mean_token_accuracy": 0.8493238598108291, "num_tokens": 105901651.0, "step": 88080 }, { "entropy": 1.9300762385129928, "epoch": 0.2730712925627674, "grad_norm": 8.968958854675293, "learning_rate": 4.841245670246719e-06, "loss": 0.5404, "mean_token_accuracy": 0.8325337320566177, "num_tokens": 105912601.0, "step": 88090 }, { "entropy": 1.7842349156737327, "epoch": 0.27310229168781713, "grad_norm": 2.736703634262085, "learning_rate": 4.840970900779478e-06, "loss": 0.4399, "mean_token_accuracy": 0.8453866109251976, "num_tokens": 105926027.0, "step": 88100 }, { "entropy": 1.8227722927927972, "epoch": 0.2731332908128668, "grad_norm": 7.752776622772217, "learning_rate": 4.840696178091329e-06, "loss": 0.4375, "mean_token_accuracy": 0.8525062575936317, "num_tokens": 105938989.0, "step": 88110 }, { "entropy": 1.85889712870121, "epoch": 0.2731642899379165, "grad_norm": 7.539937973022461, "learning_rate": 4.840421502169e-06, "loss": 0.5062, "mean_token_accuracy": 0.8374249458312988, "num_tokens": 105950886.0, "step": 88120 }, { "entropy": 1.846862156689167, "epoch": 0.2731952890629662, "grad_norm": 8.776389122009277, "learning_rate": 4.840146872999224e-06, "loss": 0.458, "mean_token_accuracy": 0.8551399633288383, "num_tokens": 105962835.0, "step": 88130 }, { "entropy": 1.7878411993384362, "epoch": 0.2732262881880159, "grad_norm": 7.770561695098877, "learning_rate": 4.839872290568737e-06, "loss": 0.4268, "mean_token_accuracy": 0.8593245849013329, "num_tokens": 105975339.0, "step": 88140 }, { "entropy": 1.8643562525510788, "epoch": 0.2732572873130656, "grad_norm": 12.23235034942627, "learning_rate": 4.839597754864288e-06, "loss": 0.4731, "mean_token_accuracy": 0.8453767746686935, "num_tokens": 105987199.0, "step": 88150 }, { "entropy": 1.887287637591362, "epoch": 0.27328828643811526, "grad_norm": 7.948936462402344, "learning_rate": 4.839323265872622e-06, "loss": 0.4833, "mean_token_accuracy": 0.8448623090982437, "num_tokens": 105998566.0, "step": 88160 }, { "entropy": 1.831418040394783, "epoch": 0.273319285563165, "grad_norm": 9.216192245483398, "learning_rate": 4.839048823580495e-06, "loss": 0.482, "mean_token_accuracy": 0.8426416292786598, "num_tokens": 106011036.0, "step": 88170 }, { "entropy": 1.8624485075473785, "epoch": 0.27335028468821465, "grad_norm": 8.75737476348877, "learning_rate": 4.838774427974665e-06, "loss": 0.5206, "mean_token_accuracy": 0.837440450489521, "num_tokens": 106022579.0, "step": 88180 }, { "entropy": 1.87282153069973, "epoch": 0.2733812838132644, "grad_norm": 3.783078670501709, "learning_rate": 4.8385000790419005e-06, "loss": 0.4822, "mean_token_accuracy": 0.8478828489780426, "num_tokens": 106033958.0, "step": 88190 }, { "entropy": 1.8366525799036026, "epoch": 0.27341228293831404, "grad_norm": 4.001052379608154, "learning_rate": 4.8382257767689696e-06, "loss": 0.4476, "mean_token_accuracy": 0.8528647780418396, "num_tokens": 106046043.0, "step": 88200 }, { "entropy": 1.9092376694083213, "epoch": 0.27344328206336377, "grad_norm": 8.00796127319336, "learning_rate": 4.837951521142646e-06, "loss": 0.5147, "mean_token_accuracy": 0.8405633881688118, "num_tokens": 106057385.0, "step": 88210 }, { "entropy": 1.8548787474632262, "epoch": 0.27347428118841344, "grad_norm": 6.67507791519165, "learning_rate": 4.837677312149712e-06, "loss": 0.4822, "mean_token_accuracy": 0.8439633920788765, "num_tokens": 106069738.0, "step": 88220 }, { "entropy": 1.7936237141489983, "epoch": 0.27350528031346316, "grad_norm": 10.650125503540039, "learning_rate": 4.837403149776953e-06, "loss": 0.5144, "mean_token_accuracy": 0.8396651700139046, "num_tokens": 106083549.0, "step": 88230 }, { "entropy": 1.8829040303826332, "epoch": 0.27353627943851283, "grad_norm": 8.387964248657227, "learning_rate": 4.837129034011162e-06, "loss": 0.4883, "mean_token_accuracy": 0.8502602845430374, "num_tokens": 106094686.0, "step": 88240 }, { "entropy": 1.8475438334047793, "epoch": 0.27356727856356255, "grad_norm": 4.180426597595215, "learning_rate": 4.836854964839133e-06, "loss": 0.4467, "mean_token_accuracy": 0.8456549108028412, "num_tokens": 106108422.0, "step": 88250 }, { "entropy": 1.9390867114067079, "epoch": 0.2735982776886122, "grad_norm": 7.525297164916992, "learning_rate": 4.836580942247668e-06, "loss": 0.5079, "mean_token_accuracy": 0.8467183902859687, "num_tokens": 106119386.0, "step": 88260 }, { "entropy": 1.8454492062330246, "epoch": 0.27362927681366195, "grad_norm": 7.91515588760376, "learning_rate": 4.836306966223574e-06, "loss": 0.5361, "mean_token_accuracy": 0.84375009983778, "num_tokens": 106131751.0, "step": 88270 }, { "entropy": 1.8392127811908723, "epoch": 0.2736602759387116, "grad_norm": 4.0122270584106445, "learning_rate": 4.8360330367536644e-06, "loss": 0.4702, "mean_token_accuracy": 0.8450425997376442, "num_tokens": 106144785.0, "step": 88280 }, { "entropy": 1.902709110081196, "epoch": 0.27369127506376134, "grad_norm": 9.007641792297363, "learning_rate": 4.835759153824755e-06, "loss": 0.5141, "mean_token_accuracy": 0.8378954946994781, "num_tokens": 106156151.0, "step": 88290 }, { "entropy": 1.8917675152420999, "epoch": 0.273722274188811, "grad_norm": 8.450139045715332, "learning_rate": 4.835485317423669e-06, "loss": 0.5173, "mean_token_accuracy": 0.8372221887111664, "num_tokens": 106167862.0, "step": 88300 }, { "entropy": 1.7971489533782006, "epoch": 0.27375327331386073, "grad_norm": 8.281867980957031, "learning_rate": 4.835211527537234e-06, "loss": 0.419, "mean_token_accuracy": 0.8558110296726227, "num_tokens": 106181012.0, "step": 88310 }, { "entropy": 1.9332615464925766, "epoch": 0.2737842724389104, "grad_norm": 7.937662124633789, "learning_rate": 4.834937784152283e-06, "loss": 0.5597, "mean_token_accuracy": 0.8360219776630402, "num_tokens": 106192536.0, "step": 88320 }, { "entropy": 1.8722572714090346, "epoch": 0.2738152715639601, "grad_norm": 9.418374061584473, "learning_rate": 4.834664087255653e-06, "loss": 0.4561, "mean_token_accuracy": 0.8445107057690621, "num_tokens": 106204886.0, "step": 88330 }, { "entropy": 1.8897190272808075, "epoch": 0.2738462706890098, "grad_norm": 7.282384872436523, "learning_rate": 4.83439043683419e-06, "loss": 0.4265, "mean_token_accuracy": 0.8582799851894378, "num_tokens": 106216856.0, "step": 88340 }, { "entropy": 1.8999308928847314, "epoch": 0.2738772698140595, "grad_norm": 9.030301094055176, "learning_rate": 4.8341168328747395e-06, "loss": 0.496, "mean_token_accuracy": 0.8379582807421684, "num_tokens": 106228905.0, "step": 88350 }, { "entropy": 1.8722223863005638, "epoch": 0.2739082689391092, "grad_norm": 7.225384712219238, "learning_rate": 4.833843275364157e-06, "loss": 0.4883, "mean_token_accuracy": 0.8510954797267913, "num_tokens": 106240136.0, "step": 88360 }, { "entropy": 1.7975994154810906, "epoch": 0.2739392680641589, "grad_norm": 7.694894790649414, "learning_rate": 4.833569764289303e-06, "loss": 0.3527, "mean_token_accuracy": 0.8697395831346512, "num_tokens": 106253669.0, "step": 88370 }, { "entropy": 1.7122723177075385, "epoch": 0.2739702671892086, "grad_norm": 4.0345540046691895, "learning_rate": 4.833296299637038e-06, "loss": 0.3687, "mean_token_accuracy": 0.8615933701395988, "num_tokens": 106268120.0, "step": 88380 }, { "entropy": 1.7913119062781333, "epoch": 0.2740012663142583, "grad_norm": 3.695681095123291, "learning_rate": 4.833022881394236e-06, "loss": 0.4178, "mean_token_accuracy": 0.8548486337065697, "num_tokens": 106281871.0, "step": 88390 }, { "entropy": 1.8856089636683464, "epoch": 0.274032265439308, "grad_norm": 7.232779026031494, "learning_rate": 4.832749509547768e-06, "loss": 0.4807, "mean_token_accuracy": 0.8429864302277565, "num_tokens": 106293484.0, "step": 88400 }, { "entropy": 1.9002991870045662, "epoch": 0.27406326456435764, "grad_norm": 4.058182239532471, "learning_rate": 4.832476184084515e-06, "loss": 0.5214, "mean_token_accuracy": 0.8347552672028542, "num_tokens": 106305347.0, "step": 88410 }, { "entropy": 1.8636735692620277, "epoch": 0.27409426368940737, "grad_norm": 7.725419998168945, "learning_rate": 4.832202904991362e-06, "loss": 0.4471, "mean_token_accuracy": 0.8470685452222824, "num_tokens": 106317379.0, "step": 88420 }, { "entropy": 1.8224295750260353, "epoch": 0.27412526281445704, "grad_norm": 7.4432148933410645, "learning_rate": 4.8319296722552e-06, "loss": 0.4332, "mean_token_accuracy": 0.8520729199051857, "num_tokens": 106329618.0, "step": 88430 }, { "entropy": 1.9166969522833823, "epoch": 0.27415626193950676, "grad_norm": 8.651660919189453, "learning_rate": 4.8316564858629236e-06, "loss": 0.4738, "mean_token_accuracy": 0.843331104516983, "num_tokens": 106341322.0, "step": 88440 }, { "entropy": 1.900239697098732, "epoch": 0.27418726106455643, "grad_norm": 9.139606475830078, "learning_rate": 4.831383345801432e-06, "loss": 0.4642, "mean_token_accuracy": 0.8463894948363304, "num_tokens": 106352704.0, "step": 88450 }, { "entropy": 1.8953612327575684, "epoch": 0.27421826018960616, "grad_norm": 7.59618616104126, "learning_rate": 4.831110252057634e-06, "loss": 0.4812, "mean_token_accuracy": 0.8437128469347954, "num_tokens": 106364147.0, "step": 88460 }, { "entropy": 1.8952149584889413, "epoch": 0.2742492593146558, "grad_norm": 7.970698356628418, "learning_rate": 4.830837204618439e-06, "loss": 0.4564, "mean_token_accuracy": 0.856081509590149, "num_tokens": 106375733.0, "step": 88470 }, { "entropy": 1.8306088283658029, "epoch": 0.27428025843970555, "grad_norm": 8.874154090881348, "learning_rate": 4.830564203470762e-06, "loss": 0.4212, "mean_token_accuracy": 0.8551815196871757, "num_tokens": 106388345.0, "step": 88480 }, { "entropy": 1.8102877527475356, "epoch": 0.2743112575647552, "grad_norm": 5.486161231994629, "learning_rate": 4.830291248601526e-06, "loss": 0.4662, "mean_token_accuracy": 0.8472759440541268, "num_tokens": 106401837.0, "step": 88490 }, { "entropy": 1.9017505258321763, "epoch": 0.27434225668980494, "grad_norm": 4.231834888458252, "learning_rate": 4.830018339997658e-06, "loss": 0.4669, "mean_token_accuracy": 0.8477098569273949, "num_tokens": 106414196.0, "step": 88500 }, { "entropy": 1.763375386595726, "epoch": 0.2743732558148546, "grad_norm": 3.8429412841796875, "learning_rate": 4.829745477646087e-06, "loss": 0.3896, "mean_token_accuracy": 0.8600920364260674, "num_tokens": 106427418.0, "step": 88510 }, { "entropy": 1.8455632477998734, "epoch": 0.27440425493990434, "grad_norm": 9.410290718078613, "learning_rate": 4.829472661533753e-06, "loss": 0.4572, "mean_token_accuracy": 0.8498677432537078, "num_tokens": 106439679.0, "step": 88520 }, { "entropy": 1.9347635477781295, "epoch": 0.274435254064954, "grad_norm": 8.736660957336426, "learning_rate": 4.829199891647595e-06, "loss": 0.5427, "mean_token_accuracy": 0.834366361796856, "num_tokens": 106450622.0, "step": 88530 }, { "entropy": 1.7944782301783562, "epoch": 0.27446625319000373, "grad_norm": 3.962170362472534, "learning_rate": 4.828927167974562e-06, "loss": 0.4312, "mean_token_accuracy": 0.84948940128088, "num_tokens": 106464014.0, "step": 88540 }, { "entropy": 1.911988915503025, "epoch": 0.2744972523150534, "grad_norm": 8.544610977172852, "learning_rate": 4.828654490501605e-06, "loss": 0.4954, "mean_token_accuracy": 0.8315621390938759, "num_tokens": 106475443.0, "step": 88550 }, { "entropy": 1.8577186211943626, "epoch": 0.2745282514401031, "grad_norm": 9.083486557006836, "learning_rate": 4.828381859215683e-06, "loss": 0.5021, "mean_token_accuracy": 0.8416240692138672, "num_tokens": 106487802.0, "step": 88560 }, { "entropy": 1.8315035477280617, "epoch": 0.2745592505651528, "grad_norm": 8.581526756286621, "learning_rate": 4.828109274103759e-06, "loss": 0.4935, "mean_token_accuracy": 0.8431025877594948, "num_tokens": 106501150.0, "step": 88570 }, { "entropy": 1.8865200936794282, "epoch": 0.2745902496902025, "grad_norm": 11.377765655517578, "learning_rate": 4.8278367351527985e-06, "loss": 0.4927, "mean_token_accuracy": 0.8460037097334862, "num_tokens": 106513566.0, "step": 88580 }, { "entropy": 1.7892251804471015, "epoch": 0.2746212488152522, "grad_norm": 6.965874671936035, "learning_rate": 4.8275642423497745e-06, "loss": 0.3847, "mean_token_accuracy": 0.8588668003678321, "num_tokens": 106526831.0, "step": 88590 }, { "entropy": 1.8974862158298493, "epoch": 0.2746522479403019, "grad_norm": 7.9539265632629395, "learning_rate": 4.827291795681668e-06, "loss": 0.4423, "mean_token_accuracy": 0.8509656980633735, "num_tokens": 106538001.0, "step": 88600 }, { "entropy": 1.8913881599903106, "epoch": 0.2746832470653516, "grad_norm": 9.2141752243042, "learning_rate": 4.827019395135459e-06, "loss": 0.5634, "mean_token_accuracy": 0.8275657877326011, "num_tokens": 106549809.0, "step": 88610 }, { "entropy": 1.7867820590734482, "epoch": 0.2747142461904013, "grad_norm": 7.718073844909668, "learning_rate": 4.8267470406981375e-06, "loss": 0.4228, "mean_token_accuracy": 0.8527751803398133, "num_tokens": 106563083.0, "step": 88620 }, { "entropy": 1.838895745575428, "epoch": 0.27474524531545097, "grad_norm": 8.962431907653809, "learning_rate": 4.826474732356697e-06, "loss": 0.4731, "mean_token_accuracy": 0.8443746566772461, "num_tokens": 106575815.0, "step": 88630 }, { "entropy": 1.9002109482884406, "epoch": 0.2747762444405007, "grad_norm": 7.583240509033203, "learning_rate": 4.826202470098135e-06, "loss": 0.4828, "mean_token_accuracy": 0.8451993703842163, "num_tokens": 106587141.0, "step": 88640 }, { "entropy": 1.7970835909247398, "epoch": 0.27480724356555036, "grad_norm": 3.407457113265991, "learning_rate": 4.825930253909458e-06, "loss": 0.4077, "mean_token_accuracy": 0.8588675916194916, "num_tokens": 106600173.0, "step": 88650 }, { "entropy": 1.8548505648970603, "epoch": 0.27483824269060003, "grad_norm": 7.602267742156982, "learning_rate": 4.825658083777671e-06, "loss": 0.5061, "mean_token_accuracy": 0.8400174841284752, "num_tokens": 106611441.0, "step": 88660 }, { "entropy": 1.8951510965824128, "epoch": 0.27486924181564976, "grad_norm": 9.224246978759766, "learning_rate": 4.82538595968979e-06, "loss": 0.5088, "mean_token_accuracy": 0.8430482760071755, "num_tokens": 106622973.0, "step": 88670 }, { "entropy": 1.8475442111492157, "epoch": 0.2749002409406994, "grad_norm": 7.481077671051025, "learning_rate": 4.825113881632835e-06, "loss": 0.5015, "mean_token_accuracy": 0.8386109799146653, "num_tokens": 106635332.0, "step": 88680 }, { "entropy": 1.8375971369445323, "epoch": 0.27493124006574915, "grad_norm": 6.685962200164795, "learning_rate": 4.824841849593828e-06, "loss": 0.4501, "mean_token_accuracy": 0.85401571393013, "num_tokens": 106647486.0, "step": 88690 }, { "entropy": 1.8508135929703713, "epoch": 0.2749622391907988, "grad_norm": 9.494312286376953, "learning_rate": 4.824569863559801e-06, "loss": 0.5245, "mean_token_accuracy": 0.8352826073765754, "num_tokens": 106659227.0, "step": 88700 }, { "entropy": 1.8618362814188003, "epoch": 0.27499323831584854, "grad_norm": 8.583227157592773, "learning_rate": 4.824297923517787e-06, "loss": 0.4741, "mean_token_accuracy": 0.8454991012811661, "num_tokens": 106671337.0, "step": 88710 }, { "entropy": 1.8632221952080728, "epoch": 0.2750242374408982, "grad_norm": 4.273068428039551, "learning_rate": 4.824026029454825e-06, "loss": 0.477, "mean_token_accuracy": 0.8438972979784012, "num_tokens": 106683266.0, "step": 88720 }, { "entropy": 1.8759027153253556, "epoch": 0.27505523656594794, "grad_norm": 6.9859538078308105, "learning_rate": 4.823754181357961e-06, "loss": 0.5663, "mean_token_accuracy": 0.8387471958994865, "num_tokens": 106695075.0, "step": 88730 }, { "entropy": 1.9251735389232636, "epoch": 0.2750862356909976, "grad_norm": 8.672323226928711, "learning_rate": 4.823482379214244e-06, "loss": 0.5499, "mean_token_accuracy": 0.8304734826087952, "num_tokens": 106706762.0, "step": 88740 }, { "entropy": 1.8503682538866997, "epoch": 0.27511723481604733, "grad_norm": 9.310576438903809, "learning_rate": 4.8232106230107285e-06, "loss": 0.4709, "mean_token_accuracy": 0.8472747296094895, "num_tokens": 106718287.0, "step": 88750 }, { "entropy": 1.9084057167172432, "epoch": 0.275148233941097, "grad_norm": 8.345101356506348, "learning_rate": 4.822938912734476e-06, "loss": 0.5446, "mean_token_accuracy": 0.8372975274920463, "num_tokens": 106729907.0, "step": 88760 }, { "entropy": 1.8601343676447868, "epoch": 0.2751792330661467, "grad_norm": 3.5940239429473877, "learning_rate": 4.822667248372551e-06, "loss": 0.4276, "mean_token_accuracy": 0.8547569274902344, "num_tokens": 106742034.0, "step": 88770 }, { "entropy": 1.9169348627328873, "epoch": 0.2752102321911964, "grad_norm": 8.337597846984863, "learning_rate": 4.822395629912025e-06, "loss": 0.5174, "mean_token_accuracy": 0.8388430058956147, "num_tokens": 106753068.0, "step": 88780 }, { "entropy": 1.872619953751564, "epoch": 0.2752412313162461, "grad_norm": 10.032551765441895, "learning_rate": 4.8221240573399705e-06, "loss": 0.4984, "mean_token_accuracy": 0.8413626179099083, "num_tokens": 106764665.0, "step": 88790 }, { "entropy": 1.8718515574932098, "epoch": 0.2752722304412958, "grad_norm": 4.433212757110596, "learning_rate": 4.82185253064347e-06, "loss": 0.4435, "mean_token_accuracy": 0.8516853898763657, "num_tokens": 106776221.0, "step": 88800 }, { "entropy": 1.8270509555935859, "epoch": 0.2753032295663455, "grad_norm": 8.920626640319824, "learning_rate": 4.821581049809608e-06, "loss": 0.436, "mean_token_accuracy": 0.8488035008311272, "num_tokens": 106789268.0, "step": 88810 }, { "entropy": 1.8341209158301353, "epoch": 0.2753342286913952, "grad_norm": 9.364352226257324, "learning_rate": 4.821309614825477e-06, "loss": 0.4889, "mean_token_accuracy": 0.8486865177750588, "num_tokens": 106800990.0, "step": 88820 }, { "entropy": 1.7747258245944977, "epoch": 0.2753652278164449, "grad_norm": 3.747847557067871, "learning_rate": 4.82103822567817e-06, "loss": 0.4707, "mean_token_accuracy": 0.8474699661135674, "num_tokens": 106814661.0, "step": 88830 }, { "entropy": 1.8344976738095284, "epoch": 0.27539622694149457, "grad_norm": 3.948411226272583, "learning_rate": 4.8207668823547895e-06, "loss": 0.4755, "mean_token_accuracy": 0.8421635374426841, "num_tokens": 106827568.0, "step": 88840 }, { "entropy": 1.873189702630043, "epoch": 0.2754272260665443, "grad_norm": 7.882256507873535, "learning_rate": 4.82049558484244e-06, "loss": 0.4877, "mean_token_accuracy": 0.8454801499843597, "num_tokens": 106839436.0, "step": 88850 }, { "entropy": 1.8929407209157945, "epoch": 0.27545822519159396, "grad_norm": 6.915173530578613, "learning_rate": 4.820224333128236e-06, "loss": 0.4984, "mean_token_accuracy": 0.8434187933802605, "num_tokens": 106850474.0, "step": 88860 }, { "entropy": 1.8293378427624702, "epoch": 0.2754892243166437, "grad_norm": 4.267940521240234, "learning_rate": 4.819953127199289e-06, "loss": 0.4759, "mean_token_accuracy": 0.8467979103326797, "num_tokens": 106862337.0, "step": 88870 }, { "entropy": 1.8841822102665902, "epoch": 0.27552022344169336, "grad_norm": 7.245059013366699, "learning_rate": 4.819681967042724e-06, "loss": 0.513, "mean_token_accuracy": 0.8383611172437668, "num_tokens": 106874237.0, "step": 88880 }, { "entropy": 1.8861842527985573, "epoch": 0.275551222566743, "grad_norm": 7.553530693054199, "learning_rate": 4.819410852645663e-06, "loss": 0.4855, "mean_token_accuracy": 0.8369044825434685, "num_tokens": 106886243.0, "step": 88890 }, { "entropy": 1.8612857922911643, "epoch": 0.27558222169179275, "grad_norm": 5.01759147644043, "learning_rate": 4.81913978399524e-06, "loss": 0.523, "mean_token_accuracy": 0.8347546473145485, "num_tokens": 106898922.0, "step": 88900 }, { "entropy": 1.789707398414612, "epoch": 0.2756132208168424, "grad_norm": 8.906004905700684, "learning_rate": 4.818868761078591e-06, "loss": 0.4355, "mean_token_accuracy": 0.8516422912478447, "num_tokens": 106911823.0, "step": 88910 }, { "entropy": 1.8937145173549652, "epoch": 0.27564421994189214, "grad_norm": 7.709479331970215, "learning_rate": 4.818597783882858e-06, "loss": 0.476, "mean_token_accuracy": 0.8450556769967079, "num_tokens": 106923627.0, "step": 88920 }, { "entropy": 1.8715748369693757, "epoch": 0.2756752190669418, "grad_norm": 8.210858345031738, "learning_rate": 4.818326852395186e-06, "loss": 0.5139, "mean_token_accuracy": 0.843447645008564, "num_tokens": 106935011.0, "step": 88930 }, { "entropy": 1.8829000025987626, "epoch": 0.27570621819199154, "grad_norm": 9.785741806030273, "learning_rate": 4.818055966602728e-06, "loss": 0.4917, "mean_token_accuracy": 0.8352604553103447, "num_tokens": 106947124.0, "step": 88940 }, { "entropy": 1.9164473339915276, "epoch": 0.2757372173170412, "grad_norm": 8.271210670471191, "learning_rate": 4.817785126492638e-06, "loss": 0.5146, "mean_token_accuracy": 0.8394878327846527, "num_tokens": 106957935.0, "step": 88950 }, { "entropy": 1.890079266577959, "epoch": 0.27576821644209093, "grad_norm": 8.78956127166748, "learning_rate": 4.817514332052081e-06, "loss": 0.5374, "mean_token_accuracy": 0.8357309713959694, "num_tokens": 106970004.0, "step": 88960 }, { "entropy": 1.9482278615236281, "epoch": 0.2757992155671406, "grad_norm": 8.85912799835205, "learning_rate": 4.817243583268221e-06, "loss": 0.5326, "mean_token_accuracy": 0.8404009014368057, "num_tokens": 106980930.0, "step": 88970 }, { "entropy": 1.8766920641064644, "epoch": 0.2758302146921903, "grad_norm": 7.513118743896484, "learning_rate": 4.8169728801282294e-06, "loss": 0.4737, "mean_token_accuracy": 0.8502911329269409, "num_tokens": 106992422.0, "step": 88980 }, { "entropy": 1.8944596245884895, "epoch": 0.27586121381724, "grad_norm": 7.3327460289001465, "learning_rate": 4.816702222619286e-06, "loss": 0.4609, "mean_token_accuracy": 0.8509733945131301, "num_tokens": 107004273.0, "step": 88990 }, { "entropy": 1.847128589451313, "epoch": 0.2758922129422897, "grad_norm": 4.272797584533691, "learning_rate": 4.816431610728571e-06, "loss": 0.462, "mean_token_accuracy": 0.8441042378544807, "num_tokens": 107016497.0, "step": 89000 }, { "entropy": 1.855496746301651, "epoch": 0.2759232120673394, "grad_norm": 4.222333908081055, "learning_rate": 4.816161044443269e-06, "loss": 0.4865, "mean_token_accuracy": 0.8440208032727241, "num_tokens": 107029107.0, "step": 89010 }, { "entropy": 1.9156458109617234, "epoch": 0.2759542111923891, "grad_norm": 8.503098487854004, "learning_rate": 4.815890523750575e-06, "loss": 0.525, "mean_token_accuracy": 0.8486156970262527, "num_tokens": 107039998.0, "step": 89020 }, { "entropy": 1.8888219490647316, "epoch": 0.2759852103174388, "grad_norm": 2.975604772567749, "learning_rate": 4.8156200486376845e-06, "loss": 0.5026, "mean_token_accuracy": 0.8388027891516685, "num_tokens": 107052621.0, "step": 89030 }, { "entropy": 1.9181819319725038, "epoch": 0.2760162094424885, "grad_norm": 3.860818386077881, "learning_rate": 4.8153496190918e-06, "loss": 0.5216, "mean_token_accuracy": 0.8331426337361336, "num_tokens": 107063730.0, "step": 89040 }, { "entropy": 1.832422287762165, "epoch": 0.27604720856753817, "grad_norm": 7.345708847045898, "learning_rate": 4.815079235100127e-06, "loss": 0.4354, "mean_token_accuracy": 0.854414065182209, "num_tokens": 107076862.0, "step": 89050 }, { "entropy": 1.9153432667255401, "epoch": 0.2760782076925879, "grad_norm": 9.088801383972168, "learning_rate": 4.814808896649879e-06, "loss": 0.4634, "mean_token_accuracy": 0.853677150607109, "num_tokens": 107088395.0, "step": 89060 }, { "entropy": 1.8690629690885543, "epoch": 0.27610920681763756, "grad_norm": 7.586027145385742, "learning_rate": 4.814538603728274e-06, "loss": 0.4119, "mean_token_accuracy": 0.8534372985363007, "num_tokens": 107100834.0, "step": 89070 }, { "entropy": 1.8856920048594474, "epoch": 0.2761402059426873, "grad_norm": 4.5250043869018555, "learning_rate": 4.814268356322531e-06, "loss": 0.4513, "mean_token_accuracy": 0.8607599556446075, "num_tokens": 107112389.0, "step": 89080 }, { "entropy": 1.9317937284708022, "epoch": 0.27617120506773696, "grad_norm": 8.948545455932617, "learning_rate": 4.813998154419879e-06, "loss": 0.5167, "mean_token_accuracy": 0.8472296670079231, "num_tokens": 107122894.0, "step": 89090 }, { "entropy": 1.8248413413763047, "epoch": 0.2762022041927867, "grad_norm": 11.74347972869873, "learning_rate": 4.813727998007552e-06, "loss": 0.4519, "mean_token_accuracy": 0.8507599249482155, "num_tokens": 107134891.0, "step": 89100 }, { "entropy": 1.8651644140481949, "epoch": 0.27623320331783635, "grad_norm": 8.039405822753906, "learning_rate": 4.813457887072781e-06, "loss": 0.478, "mean_token_accuracy": 0.8547470927238464, "num_tokens": 107146700.0, "step": 89110 }, { "entropy": 1.8344761043787003, "epoch": 0.2762642024428861, "grad_norm": 8.795267105102539, "learning_rate": 4.813187821602815e-06, "loss": 0.5152, "mean_token_accuracy": 0.8472502484917641, "num_tokens": 107159722.0, "step": 89120 }, { "entropy": 1.909415753185749, "epoch": 0.27629520156793574, "grad_norm": 9.062956809997559, "learning_rate": 4.812917801584898e-06, "loss": 0.48, "mean_token_accuracy": 0.850268816947937, "num_tokens": 107171135.0, "step": 89130 }, { "entropy": 1.886960855126381, "epoch": 0.2763262006929854, "grad_norm": 8.55114459991455, "learning_rate": 4.812647827006282e-06, "loss": 0.4605, "mean_token_accuracy": 0.8514637067914009, "num_tokens": 107182863.0, "step": 89140 }, { "entropy": 1.862102809548378, "epoch": 0.27635719981803514, "grad_norm": 9.735430717468262, "learning_rate": 4.812377897854223e-06, "loss": 0.5274, "mean_token_accuracy": 0.8368513882160187, "num_tokens": 107195661.0, "step": 89150 }, { "entropy": 1.832503044605255, "epoch": 0.2763881989430848, "grad_norm": 7.541407108306885, "learning_rate": 4.812108014115985e-06, "loss": 0.4209, "mean_token_accuracy": 0.8562299177050591, "num_tokens": 107208363.0, "step": 89160 }, { "entropy": 1.884467676281929, "epoch": 0.27641919806813453, "grad_norm": 9.188913345336914, "learning_rate": 4.811838175778836e-06, "loss": 0.4936, "mean_token_accuracy": 0.8432751014828682, "num_tokens": 107219939.0, "step": 89170 }, { "entropy": 1.8959971502423287, "epoch": 0.2764501971931842, "grad_norm": 4.789566993713379, "learning_rate": 4.8115683828300445e-06, "loss": 0.4505, "mean_token_accuracy": 0.8498433887958526, "num_tokens": 107231619.0, "step": 89180 }, { "entropy": 1.8323383823037147, "epoch": 0.2764811963182339, "grad_norm": 2.805758237838745, "learning_rate": 4.811298635256891e-06, "loss": 0.4482, "mean_token_accuracy": 0.8528842240571975, "num_tokens": 107244259.0, "step": 89190 }, { "entropy": 1.7991958245635034, "epoch": 0.2765121954432836, "grad_norm": 6.88820219039917, "learning_rate": 4.811028933046656e-06, "loss": 0.3596, "mean_token_accuracy": 0.872551740705967, "num_tokens": 107257190.0, "step": 89200 }, { "entropy": 1.8357098802924157, "epoch": 0.2765431945683333, "grad_norm": 4.887908935546875, "learning_rate": 4.810759276186628e-06, "loss": 0.4842, "mean_token_accuracy": 0.8493363171815872, "num_tokens": 107269231.0, "step": 89210 }, { "entropy": 1.8650715343654156, "epoch": 0.276574193693383, "grad_norm": 8.015203475952148, "learning_rate": 4.810489664664098e-06, "loss": 0.4808, "mean_token_accuracy": 0.8378633111715317, "num_tokens": 107282063.0, "step": 89220 }, { "entropy": 1.8511658303439618, "epoch": 0.2766051928184327, "grad_norm": 8.239190101623535, "learning_rate": 4.810220098466364e-06, "loss": 0.4276, "mean_token_accuracy": 0.8476231142878532, "num_tokens": 107294849.0, "step": 89230 }, { "entropy": 1.8353752836585044, "epoch": 0.2766361919434824, "grad_norm": 11.316750526428223, "learning_rate": 4.809950577580724e-06, "loss": 0.4393, "mean_token_accuracy": 0.8560484200716019, "num_tokens": 107306875.0, "step": 89240 }, { "entropy": 1.8877992361783982, "epoch": 0.2766671910685321, "grad_norm": 10.842957496643066, "learning_rate": 4.809681101994492e-06, "loss": 0.4919, "mean_token_accuracy": 0.8417460203170777, "num_tokens": 107318449.0, "step": 89250 }, { "entropy": 1.798448894917965, "epoch": 0.2766981901935818, "grad_norm": 7.134692192077637, "learning_rate": 4.809411671694974e-06, "loss": 0.4513, "mean_token_accuracy": 0.8509971871972084, "num_tokens": 107331888.0, "step": 89260 }, { "entropy": 1.8195422321558, "epoch": 0.2767291893186315, "grad_norm": 10.8891019821167, "learning_rate": 4.809142286669492e-06, "loss": 0.4896, "mean_token_accuracy": 0.8366403177380561, "num_tokens": 107344418.0, "step": 89270 }, { "entropy": 1.857719998061657, "epoch": 0.27676018844368117, "grad_norm": 7.889041423797607, "learning_rate": 4.808872946905363e-06, "loss": 0.4521, "mean_token_accuracy": 0.8492889553308487, "num_tokens": 107356113.0, "step": 89280 }, { "entropy": 1.7993003293871879, "epoch": 0.2767911875687309, "grad_norm": 7.292881011962891, "learning_rate": 4.808603652389917e-06, "loss": 0.4368, "mean_token_accuracy": 0.8571058794856071, "num_tokens": 107369163.0, "step": 89290 }, { "entropy": 1.8662144735455513, "epoch": 0.27682218669378056, "grad_norm": 10.856431007385254, "learning_rate": 4.808334403110485e-06, "loss": 0.5662, "mean_token_accuracy": 0.8313690677285195, "num_tokens": 107381585.0, "step": 89300 }, { "entropy": 1.8663518592715262, "epoch": 0.2768531858188303, "grad_norm": 7.93809700012207, "learning_rate": 4.808065199054404e-06, "loss": 0.4681, "mean_token_accuracy": 0.8461432874202728, "num_tokens": 107393815.0, "step": 89310 }, { "entropy": 1.9202445238828658, "epoch": 0.27688418494387995, "grad_norm": 8.869173049926758, "learning_rate": 4.8077960402090155e-06, "loss": 0.5183, "mean_token_accuracy": 0.8392524033784866, "num_tokens": 107404393.0, "step": 89320 }, { "entropy": 1.9343400806188584, "epoch": 0.2769151840689297, "grad_norm": 8.072264671325684, "learning_rate": 4.807526926561667e-06, "loss": 0.509, "mean_token_accuracy": 0.8465714007616043, "num_tokens": 107414917.0, "step": 89330 }, { "entropy": 1.9184037640690803, "epoch": 0.27694618319397935, "grad_norm": 7.905839443206787, "learning_rate": 4.80725785809971e-06, "loss": 0.5024, "mean_token_accuracy": 0.8361783891916275, "num_tokens": 107425889.0, "step": 89340 }, { "entropy": 1.9122958168387414, "epoch": 0.27697718231902907, "grad_norm": 3.9599266052246094, "learning_rate": 4.806988834810501e-06, "loss": 0.4686, "mean_token_accuracy": 0.8507964372634887, "num_tokens": 107437118.0, "step": 89350 }, { "entropy": 1.7936722189188004, "epoch": 0.27700818144407874, "grad_norm": 9.577056884765625, "learning_rate": 4.806719856681402e-06, "loss": 0.4249, "mean_token_accuracy": 0.8579995214939118, "num_tokens": 107450413.0, "step": 89360 }, { "entropy": 1.8381581105291844, "epoch": 0.27703918056912846, "grad_norm": 7.809806823730469, "learning_rate": 4.806450923699778e-06, "loss": 0.4659, "mean_token_accuracy": 0.8484640643000603, "num_tokens": 107462904.0, "step": 89370 }, { "entropy": 1.878132027387619, "epoch": 0.27707017969417813, "grad_norm": 8.047992706298828, "learning_rate": 4.806182035853004e-06, "loss": 0.4695, "mean_token_accuracy": 0.8498531639575958, "num_tokens": 107474624.0, "step": 89380 }, { "entropy": 1.9003929272294044, "epoch": 0.2771011788192278, "grad_norm": 7.774439334869385, "learning_rate": 4.805913193128452e-06, "loss": 0.4197, "mean_token_accuracy": 0.864824341237545, "num_tokens": 107486478.0, "step": 89390 }, { "entropy": 1.885640236735344, "epoch": 0.2771321779442775, "grad_norm": 8.628144264221191, "learning_rate": 4.805644395513508e-06, "loss": 0.5054, "mean_token_accuracy": 0.8425728008151054, "num_tokens": 107498133.0, "step": 89400 }, { "entropy": 1.9598473072052003, "epoch": 0.2771631770693272, "grad_norm": 9.253849983215332, "learning_rate": 4.805375642995554e-06, "loss": 0.5666, "mean_token_accuracy": 0.8246390670537949, "num_tokens": 107509154.0, "step": 89410 }, { "entropy": 1.8804704681038857, "epoch": 0.2771941761943769, "grad_norm": 3.7883799076080322, "learning_rate": 4.8051069355619846e-06, "loss": 0.4571, "mean_token_accuracy": 0.8492748379707337, "num_tokens": 107521568.0, "step": 89420 }, { "entropy": 1.9423058658838273, "epoch": 0.2772251753194266, "grad_norm": 7.831295490264893, "learning_rate": 4.804838273200196e-06, "loss": 0.5115, "mean_token_accuracy": 0.8353771314024925, "num_tokens": 107532573.0, "step": 89430 }, { "entropy": 1.9429254934191704, "epoch": 0.2772561744444763, "grad_norm": 7.812403202056885, "learning_rate": 4.804569655897587e-06, "loss": 0.514, "mean_token_accuracy": 0.8479711428284645, "num_tokens": 107544075.0, "step": 89440 }, { "entropy": 1.8462449744343759, "epoch": 0.277287173569526, "grad_norm": 7.212341785430908, "learning_rate": 4.8043010836415645e-06, "loss": 0.4447, "mean_token_accuracy": 0.8527239561080933, "num_tokens": 107556555.0, "step": 89450 }, { "entropy": 1.8698409616947174, "epoch": 0.2773181726945757, "grad_norm": 7.4296698570251465, "learning_rate": 4.804032556419541e-06, "loss": 0.4619, "mean_token_accuracy": 0.8424155786633492, "num_tokens": 107569186.0, "step": 89460 }, { "entropy": 1.8821595564484597, "epoch": 0.2773491718196254, "grad_norm": 7.037583351135254, "learning_rate": 4.803764074218931e-06, "loss": 0.4709, "mean_token_accuracy": 0.8456917703151703, "num_tokens": 107580496.0, "step": 89470 }, { "entropy": 1.8518904522061348, "epoch": 0.2773801709446751, "grad_norm": 4.142475128173828, "learning_rate": 4.803495637027156e-06, "loss": 0.4833, "mean_token_accuracy": 0.8462300166487694, "num_tokens": 107592528.0, "step": 89480 }, { "entropy": 1.9444121688604354, "epoch": 0.27741117006972477, "grad_norm": 8.443082809448242, "learning_rate": 4.803227244831642e-06, "loss": 0.5032, "mean_token_accuracy": 0.8497670993208886, "num_tokens": 107603347.0, "step": 89490 }, { "entropy": 1.9082912877202034, "epoch": 0.2774421691947745, "grad_norm": 8.889103889465332, "learning_rate": 4.80295889761982e-06, "loss": 0.4889, "mean_token_accuracy": 0.8479152485728264, "num_tokens": 107614464.0, "step": 89500 }, { "entropy": 1.9622692078351975, "epoch": 0.27747316831982416, "grad_norm": 9.245258331298828, "learning_rate": 4.802690595379124e-06, "loss": 0.5951, "mean_token_accuracy": 0.8277513101696968, "num_tokens": 107625283.0, "step": 89510 }, { "entropy": 1.8577041417360305, "epoch": 0.2775041674448739, "grad_norm": 8.274160385131836, "learning_rate": 4.802422338096995e-06, "loss": 0.4768, "mean_token_accuracy": 0.8520587861537934, "num_tokens": 107637211.0, "step": 89520 }, { "entropy": 1.8055953189730645, "epoch": 0.27753516656992355, "grad_norm": 9.126919746398926, "learning_rate": 4.80215412576088e-06, "loss": 0.5178, "mean_token_accuracy": 0.8396184176206589, "num_tokens": 107650513.0, "step": 89530 }, { "entropy": 1.9063657209277154, "epoch": 0.2775661656949733, "grad_norm": 7.300408840179443, "learning_rate": 4.801885958358229e-06, "loss": 0.4567, "mean_token_accuracy": 0.853947177529335, "num_tokens": 107662885.0, "step": 89540 }, { "entropy": 2.0144054174423216, "epoch": 0.27759716482002295, "grad_norm": 8.333145141601562, "learning_rate": 4.801617835876496e-06, "loss": 0.5785, "mean_token_accuracy": 0.8276149451732635, "num_tokens": 107673365.0, "step": 89550 }, { "entropy": 1.8120394110679627, "epoch": 0.27762816394507267, "grad_norm": 5.053866863250732, "learning_rate": 4.801349758303142e-06, "loss": 0.4466, "mean_token_accuracy": 0.8445941239595414, "num_tokens": 107686729.0, "step": 89560 }, { "entropy": 1.8656949549913406, "epoch": 0.27765916307012234, "grad_norm": 10.746817588806152, "learning_rate": 4.801081725625631e-06, "loss": 0.5, "mean_token_accuracy": 0.8389295890927315, "num_tokens": 107699283.0, "step": 89570 }, { "entropy": 1.8939693599939347, "epoch": 0.27769016219517206, "grad_norm": 7.9915971755981445, "learning_rate": 4.800813737831435e-06, "loss": 0.4607, "mean_token_accuracy": 0.836301201581955, "num_tokens": 107710971.0, "step": 89580 }, { "entropy": 1.8993390202522278, "epoch": 0.27772116132022173, "grad_norm": 3.7865583896636963, "learning_rate": 4.800545794908028e-06, "loss": 0.5149, "mean_token_accuracy": 0.8455944269895553, "num_tokens": 107721957.0, "step": 89590 }, { "entropy": 1.9103139862418175, "epoch": 0.27775216044527146, "grad_norm": 7.072427749633789, "learning_rate": 4.800277896842888e-06, "loss": 0.5299, "mean_token_accuracy": 0.8366509795188903, "num_tokens": 107733436.0, "step": 89600 }, { "entropy": 1.916232281923294, "epoch": 0.2777831595703211, "grad_norm": 7.880809307098389, "learning_rate": 4.8000100436235025e-06, "loss": 0.4882, "mean_token_accuracy": 0.846810282766819, "num_tokens": 107744724.0, "step": 89610 }, { "entropy": 1.8565625533461572, "epoch": 0.27781415869537085, "grad_norm": 9.63576889038086, "learning_rate": 4.799742235237359e-06, "loss": 0.4176, "mean_token_accuracy": 0.8563435986638069, "num_tokens": 107756737.0, "step": 89620 }, { "entropy": 1.8952525824308395, "epoch": 0.2778451578204205, "grad_norm": 7.304356575012207, "learning_rate": 4.799474471671954e-06, "loss": 0.4555, "mean_token_accuracy": 0.855880931019783, "num_tokens": 107767777.0, "step": 89630 }, { "entropy": 1.8737752437591553, "epoch": 0.2778761569454702, "grad_norm": 8.607854843139648, "learning_rate": 4.799206752914784e-06, "loss": 0.4669, "mean_token_accuracy": 0.8448594495654106, "num_tokens": 107780521.0, "step": 89640 }, { "entropy": 1.8830969855189323, "epoch": 0.2779071560705199, "grad_norm": 9.414170265197754, "learning_rate": 4.798939078953355e-06, "loss": 0.4986, "mean_token_accuracy": 0.8453325614333153, "num_tokens": 107792404.0, "step": 89650 }, { "entropy": 1.8424068495631218, "epoch": 0.2779381551955696, "grad_norm": 3.329559087753296, "learning_rate": 4.798671449775176e-06, "loss": 0.4264, "mean_token_accuracy": 0.851707661151886, "num_tokens": 107804701.0, "step": 89660 }, { "entropy": 1.8513332203030586, "epoch": 0.2779691543206193, "grad_norm": 9.445657730102539, "learning_rate": 4.798403865367761e-06, "loss": 0.4731, "mean_token_accuracy": 0.8407017186284065, "num_tokens": 107817031.0, "step": 89670 }, { "entropy": 1.861585983633995, "epoch": 0.278000153445669, "grad_norm": 8.74657154083252, "learning_rate": 4.798136325718627e-06, "loss": 0.4754, "mean_token_accuracy": 0.8404981315135955, "num_tokens": 107829450.0, "step": 89680 }, { "entropy": 1.8812636777758598, "epoch": 0.2780311525707187, "grad_norm": 8.068127632141113, "learning_rate": 4.797868830815301e-06, "loss": 0.5182, "mean_token_accuracy": 0.8442062169313431, "num_tokens": 107842211.0, "step": 89690 }, { "entropy": 1.8865378215909003, "epoch": 0.27806215169576837, "grad_norm": 8.662693977355957, "learning_rate": 4.797601380645308e-06, "loss": 0.5025, "mean_token_accuracy": 0.8458723932504654, "num_tokens": 107854429.0, "step": 89700 }, { "entropy": 1.9377282798290252, "epoch": 0.2780931508208181, "grad_norm": 7.293258190155029, "learning_rate": 4.797333975196185e-06, "loss": 0.5247, "mean_token_accuracy": 0.845015498995781, "num_tokens": 107866326.0, "step": 89710 }, { "entropy": 1.8403261929750443, "epoch": 0.27812414994586776, "grad_norm": 4.629557132720947, "learning_rate": 4.797066614455466e-06, "loss": 0.4569, "mean_token_accuracy": 0.8479184404015541, "num_tokens": 107879453.0, "step": 89720 }, { "entropy": 1.8934699580073358, "epoch": 0.2781551490709175, "grad_norm": 3.9676856994628906, "learning_rate": 4.796799298410698e-06, "loss": 0.4826, "mean_token_accuracy": 0.8484136417508126, "num_tokens": 107891153.0, "step": 89730 }, { "entropy": 1.954687887430191, "epoch": 0.27818614819596715, "grad_norm": 7.991013526916504, "learning_rate": 4.796532027049428e-06, "loss": 0.4937, "mean_token_accuracy": 0.85145123898983, "num_tokens": 107902323.0, "step": 89740 }, { "entropy": 1.9269502833485603, "epoch": 0.2782171473210169, "grad_norm": 10.174466133117676, "learning_rate": 4.796264800359207e-06, "loss": 0.4912, "mean_token_accuracy": 0.8483559593558312, "num_tokens": 107913752.0, "step": 89750 }, { "entropy": 1.9451028525829315, "epoch": 0.27824814644606655, "grad_norm": 8.776559829711914, "learning_rate": 4.795997618327595e-06, "loss": 0.5428, "mean_token_accuracy": 0.8392796277999878, "num_tokens": 107925468.0, "step": 89760 }, { "entropy": 1.9637413799762726, "epoch": 0.27827914557111627, "grad_norm": 8.385177612304688, "learning_rate": 4.795730480942153e-06, "loss": 0.5066, "mean_token_accuracy": 0.8446138560771942, "num_tokens": 107936686.0, "step": 89770 }, { "entropy": 1.9453602582216263, "epoch": 0.27831014469616594, "grad_norm": 9.379023551940918, "learning_rate": 4.795463388190449e-06, "loss": 0.4917, "mean_token_accuracy": 0.8378261864185333, "num_tokens": 107947735.0, "step": 89780 }, { "entropy": 1.8662060409784318, "epoch": 0.27834114382121566, "grad_norm": 7.810781478881836, "learning_rate": 4.7951963400600565e-06, "loss": 0.4533, "mean_token_accuracy": 0.8564840778708458, "num_tokens": 107958863.0, "step": 89790 }, { "entropy": 1.8019395358860493, "epoch": 0.27837214294626533, "grad_norm": 8.415993690490723, "learning_rate": 4.7949293365385505e-06, "loss": 0.4145, "mean_token_accuracy": 0.8503817662596702, "num_tokens": 107972692.0, "step": 89800 }, { "entropy": 1.7698960989713668, "epoch": 0.27840314207131506, "grad_norm": 8.861882209777832, "learning_rate": 4.794662377613515e-06, "loss": 0.3761, "mean_token_accuracy": 0.8597157940268516, "num_tokens": 107985556.0, "step": 89810 }, { "entropy": 1.7953774243593217, "epoch": 0.2784341411963647, "grad_norm": 7.687840461730957, "learning_rate": 4.794395463272534e-06, "loss": 0.4189, "mean_token_accuracy": 0.854545010626316, "num_tokens": 107998951.0, "step": 89820 }, { "entropy": 1.9417035043239594, "epoch": 0.27846514032141445, "grad_norm": 9.26181697845459, "learning_rate": 4.794128593503201e-06, "loss": 0.5209, "mean_token_accuracy": 0.8420970395207406, "num_tokens": 108010519.0, "step": 89830 }, { "entropy": 1.8701779007911683, "epoch": 0.2784961394464641, "grad_norm": 8.857370376586914, "learning_rate": 4.793861768293114e-06, "loss": 0.4894, "mean_token_accuracy": 0.8434204265475274, "num_tokens": 108022433.0, "step": 89840 }, { "entropy": 1.9472114413976669, "epoch": 0.27852713857151384, "grad_norm": 8.817475318908691, "learning_rate": 4.793594987629871e-06, "loss": 0.5206, "mean_token_accuracy": 0.8399725124239922, "num_tokens": 108033484.0, "step": 89850 }, { "entropy": 1.9352742165327073, "epoch": 0.2785581376965635, "grad_norm": 7.381696701049805, "learning_rate": 4.7933282515010806e-06, "loss": 0.5144, "mean_token_accuracy": 0.8374552443623543, "num_tokens": 108045222.0, "step": 89860 }, { "entropy": 1.8853474691510201, "epoch": 0.27858913682161324, "grad_norm": 8.79345417022705, "learning_rate": 4.793061559894352e-06, "loss": 0.4711, "mean_token_accuracy": 0.8477419853210449, "num_tokens": 108057330.0, "step": 89870 }, { "entropy": 1.8697724029421807, "epoch": 0.2786201359466629, "grad_norm": 8.972064018249512, "learning_rate": 4.792794912797302e-06, "loss": 0.4377, "mean_token_accuracy": 0.8470319598913193, "num_tokens": 108069573.0, "step": 89880 }, { "entropy": 1.8288535490632056, "epoch": 0.2786511350717126, "grad_norm": 3.9921863079071045, "learning_rate": 4.792528310197551e-06, "loss": 0.465, "mean_token_accuracy": 0.8418003395199776, "num_tokens": 108083028.0, "step": 89890 }, { "entropy": 1.9048464432358743, "epoch": 0.2786821341967623, "grad_norm": 8.971819877624512, "learning_rate": 4.792261752082724e-06, "loss": 0.4933, "mean_token_accuracy": 0.8390678346157074, "num_tokens": 108094239.0, "step": 89900 }, { "entropy": 1.893374653160572, "epoch": 0.27871313332181197, "grad_norm": 8.717069625854492, "learning_rate": 4.791995238440452e-06, "loss": 0.4751, "mean_token_accuracy": 0.8402533918619156, "num_tokens": 108106085.0, "step": 89910 }, { "entropy": 1.9398944050073623, "epoch": 0.2787441324468617, "grad_norm": 9.565119743347168, "learning_rate": 4.79172876925837e-06, "loss": 0.5399, "mean_token_accuracy": 0.8393180221319199, "num_tokens": 108116682.0, "step": 89920 }, { "entropy": 1.8258964017033577, "epoch": 0.27877513157191136, "grad_norm": 2.663538694381714, "learning_rate": 4.791462344524116e-06, "loss": 0.4645, "mean_token_accuracy": 0.8499773174524308, "num_tokens": 108128639.0, "step": 89930 }, { "entropy": 1.833346499502659, "epoch": 0.2788061306969611, "grad_norm": 8.475390434265137, "learning_rate": 4.791195964225338e-06, "loss": 0.4454, "mean_token_accuracy": 0.8522562757134438, "num_tokens": 108141077.0, "step": 89940 }, { "entropy": 1.9464040279388428, "epoch": 0.27883712982201075, "grad_norm": 7.769806861877441, "learning_rate": 4.790929628349683e-06, "loss": 0.5211, "mean_token_accuracy": 0.838797104358673, "num_tokens": 108152453.0, "step": 89950 }, { "entropy": 1.8917251348495483, "epoch": 0.2788681289470605, "grad_norm": 7.752605438232422, "learning_rate": 4.790663336884804e-06, "loss": 0.459, "mean_token_accuracy": 0.8592374518513679, "num_tokens": 108164272.0, "step": 89960 }, { "entropy": 1.8999314427375793, "epoch": 0.27889912807211015, "grad_norm": 7.312995910644531, "learning_rate": 4.790397089818365e-06, "loss": 0.4661, "mean_token_accuracy": 0.8518079608678818, "num_tokens": 108175602.0, "step": 89970 }, { "entropy": 1.8947380736470223, "epoch": 0.27893012719715987, "grad_norm": 8.715499877929688, "learning_rate": 4.790130887138025e-06, "loss": 0.4957, "mean_token_accuracy": 0.8417610317468643, "num_tokens": 108187295.0, "step": 89980 }, { "entropy": 1.8655477941036225, "epoch": 0.27896112632220954, "grad_norm": 8.097354888916016, "learning_rate": 4.789864728831455e-06, "loss": 0.4866, "mean_token_accuracy": 0.8457545340061188, "num_tokens": 108199656.0, "step": 89990 }, { "entropy": 1.7254057943820953, "epoch": 0.27899212544725926, "grad_norm": 4.202314853668213, "learning_rate": 4.789598614886327e-06, "loss": 0.3796, "mean_token_accuracy": 0.8670785754919053, "num_tokens": 108213424.0, "step": 90000 }, { "entropy": 1.935379645228386, "epoch": 0.27902312457230893, "grad_norm": 4.619598865509033, "learning_rate": 4.789332545290321e-06, "loss": 0.5594, "mean_token_accuracy": 0.8315211609005928, "num_tokens": 108224748.0, "step": 90010 }, { "entropy": 1.9281780689954757, "epoch": 0.27905412369735866, "grad_norm": 8.387505531311035, "learning_rate": 4.789066520031119e-06, "loss": 0.5284, "mean_token_accuracy": 0.8352978438138962, "num_tokens": 108236160.0, "step": 90020 }, { "entropy": 1.9002047255635262, "epoch": 0.2790851228224083, "grad_norm": 8.676763534545898, "learning_rate": 4.7888005390964094e-06, "loss": 0.5236, "mean_token_accuracy": 0.834555535018444, "num_tokens": 108248149.0, "step": 90030 }, { "entropy": 1.8989999890327454, "epoch": 0.27911612194745805, "grad_norm": 9.32200813293457, "learning_rate": 4.788534602473885e-06, "loss": 0.4808, "mean_token_accuracy": 0.8412574380636215, "num_tokens": 108259859.0, "step": 90040 }, { "entropy": 1.8991701990365981, "epoch": 0.2791471210725077, "grad_norm": 7.9609694480896, "learning_rate": 4.788268710151243e-06, "loss": 0.4405, "mean_token_accuracy": 0.857192724943161, "num_tokens": 108271236.0, "step": 90050 }, { "entropy": 1.7937731251120568, "epoch": 0.27917812019755744, "grad_norm": 7.316855430603027, "learning_rate": 4.788002862116185e-06, "loss": 0.4172, "mean_token_accuracy": 0.8526657864451408, "num_tokens": 108284781.0, "step": 90060 }, { "entropy": 1.9361607402563095, "epoch": 0.2792091193226071, "grad_norm": 7.928420066833496, "learning_rate": 4.787737058356419e-06, "loss": 0.4876, "mean_token_accuracy": 0.8461529463529587, "num_tokens": 108295737.0, "step": 90070 }, { "entropy": 1.9357156157493591, "epoch": 0.27924011844765684, "grad_norm": 9.841268539428711, "learning_rate": 4.787471298859655e-06, "loss": 0.4987, "mean_token_accuracy": 0.8431805863976478, "num_tokens": 108307828.0, "step": 90080 }, { "entropy": 1.8373299419879914, "epoch": 0.2792711175727065, "grad_norm": 8.356464385986328, "learning_rate": 4.78720558361361e-06, "loss": 0.4569, "mean_token_accuracy": 0.8455582305788993, "num_tokens": 108320724.0, "step": 90090 }, { "entropy": 1.9154756784439086, "epoch": 0.27930211669775623, "grad_norm": 3.6140196323394775, "learning_rate": 4.786939912606008e-06, "loss": 0.4679, "mean_token_accuracy": 0.8417788296937943, "num_tokens": 108332507.0, "step": 90100 }, { "entropy": 1.8971551463007927, "epoch": 0.2793331158228059, "grad_norm": 8.835823059082031, "learning_rate": 4.786674285824571e-06, "loss": 0.484, "mean_token_accuracy": 0.8474727541208267, "num_tokens": 108343885.0, "step": 90110 }, { "entropy": 1.7721475332975387, "epoch": 0.2793641149478556, "grad_norm": 8.436137199401855, "learning_rate": 4.786408703257034e-06, "loss": 0.4507, "mean_token_accuracy": 0.8569958195090294, "num_tokens": 108356767.0, "step": 90120 }, { "entropy": 1.8786687865853309, "epoch": 0.2793951140729053, "grad_norm": 7.322561740875244, "learning_rate": 4.78614316489113e-06, "loss": 0.474, "mean_token_accuracy": 0.8418064162135124, "num_tokens": 108369550.0, "step": 90130 }, { "entropy": 1.8602668032050134, "epoch": 0.27942611319795496, "grad_norm": 4.303462505340576, "learning_rate": 4.785877670714598e-06, "loss": 0.4326, "mean_token_accuracy": 0.8566737055778504, "num_tokens": 108381931.0, "step": 90140 }, { "entropy": 1.869055911898613, "epoch": 0.2794571123230047, "grad_norm": 8.936867713928223, "learning_rate": 4.7856122207151874e-06, "loss": 0.4597, "mean_token_accuracy": 0.8521290734410286, "num_tokens": 108393434.0, "step": 90150 }, { "entropy": 1.9146001234650611, "epoch": 0.27948811144805435, "grad_norm": 10.076147079467773, "learning_rate": 4.7853468148806436e-06, "loss": 0.4827, "mean_token_accuracy": 0.8482694402337074, "num_tokens": 108404678.0, "step": 90160 }, { "entropy": 1.8209770336747169, "epoch": 0.2795191105731041, "grad_norm": 9.587752342224121, "learning_rate": 4.785081453198724e-06, "loss": 0.4182, "mean_token_accuracy": 0.8516710668802261, "num_tokens": 108416663.0, "step": 90170 }, { "entropy": 1.844451193511486, "epoch": 0.27955010969815375, "grad_norm": 10.955329895019531, "learning_rate": 4.784816135657187e-06, "loss": 0.4488, "mean_token_accuracy": 0.8469073712825775, "num_tokens": 108429075.0, "step": 90180 }, { "entropy": 1.8783362239599228, "epoch": 0.2795811088232035, "grad_norm": 8.352615356445312, "learning_rate": 4.784550862243798e-06, "loss": 0.4653, "mean_token_accuracy": 0.8481233865022659, "num_tokens": 108441399.0, "step": 90190 }, { "entropy": 1.9300066709518433, "epoch": 0.27961210794825314, "grad_norm": 7.3091206550598145, "learning_rate": 4.784285632946324e-06, "loss": 0.4898, "mean_token_accuracy": 0.8453634783625603, "num_tokens": 108452967.0, "step": 90200 }, { "entropy": 1.8706892415881158, "epoch": 0.27964310707330287, "grad_norm": 7.95927095413208, "learning_rate": 4.784020447752539e-06, "loss": 0.4524, "mean_token_accuracy": 0.8540063112974167, "num_tokens": 108464604.0, "step": 90210 }, { "entropy": 1.8064016848802567, "epoch": 0.27967410619835253, "grad_norm": 12.208187103271484, "learning_rate": 4.783755306650223e-06, "loss": 0.4148, "mean_token_accuracy": 0.8536765992641449, "num_tokens": 108477370.0, "step": 90220 }, { "entropy": 1.891729509830475, "epoch": 0.27970510532340226, "grad_norm": 7.544349193572998, "learning_rate": 4.783490209627159e-06, "loss": 0.503, "mean_token_accuracy": 0.8449078306555748, "num_tokens": 108488578.0, "step": 90230 }, { "entropy": 1.839923305809498, "epoch": 0.2797361044484519, "grad_norm": 7.284799098968506, "learning_rate": 4.783225156671132e-06, "loss": 0.4207, "mean_token_accuracy": 0.8685827806591988, "num_tokens": 108500874.0, "step": 90240 }, { "entropy": 1.9023682996630669, "epoch": 0.27976710357350165, "grad_norm": 7.970053195953369, "learning_rate": 4.782960147769936e-06, "loss": 0.4684, "mean_token_accuracy": 0.8523323282599449, "num_tokens": 108512936.0, "step": 90250 }, { "entropy": 1.8668055430054664, "epoch": 0.2797981026985513, "grad_norm": 7.000694751739502, "learning_rate": 4.78269518291137e-06, "loss": 0.4485, "mean_token_accuracy": 0.8535307124257088, "num_tokens": 108524895.0, "step": 90260 }, { "entropy": 1.8698514148592948, "epoch": 0.27982910182360105, "grad_norm": 7.908920764923096, "learning_rate": 4.782430262083234e-06, "loss": 0.4849, "mean_token_accuracy": 0.8330338954925537, "num_tokens": 108537045.0, "step": 90270 }, { "entropy": 1.9352901756763459, "epoch": 0.2798601009486507, "grad_norm": 9.200328826904297, "learning_rate": 4.7821653852733365e-06, "loss": 0.4976, "mean_token_accuracy": 0.8495378881692887, "num_tokens": 108547879.0, "step": 90280 }, { "entropy": 1.8811449840664864, "epoch": 0.27989110007370044, "grad_norm": 8.855972290039062, "learning_rate": 4.781900552469487e-06, "loss": 0.4601, "mean_token_accuracy": 0.8424475163221359, "num_tokens": 108560224.0, "step": 90290 }, { "entropy": 1.7924820497632026, "epoch": 0.2799220991987501, "grad_norm": 3.9103078842163086, "learning_rate": 4.7816357636595036e-06, "loss": 0.4031, "mean_token_accuracy": 0.856936690211296, "num_tokens": 108573946.0, "step": 90300 }, { "entropy": 1.9183641135692597, "epoch": 0.27995309832379983, "grad_norm": 4.683595180511475, "learning_rate": 4.781371018831206e-06, "loss": 0.5028, "mean_token_accuracy": 0.8507445871829986, "num_tokens": 108585889.0, "step": 90310 }, { "entropy": 1.882660059630871, "epoch": 0.2799840974488495, "grad_norm": 7.647256374359131, "learning_rate": 4.781106317972421e-06, "loss": 0.4848, "mean_token_accuracy": 0.8541607797145844, "num_tokens": 108597763.0, "step": 90320 }, { "entropy": 1.899367219209671, "epoch": 0.2800150965738992, "grad_norm": 7.681816577911377, "learning_rate": 4.780841661070978e-06, "loss": 0.4933, "mean_token_accuracy": 0.8502333298325538, "num_tokens": 108609237.0, "step": 90330 }, { "entropy": 1.9607916057109833, "epoch": 0.2800460956989489, "grad_norm": 8.942265510559082, "learning_rate": 4.780577048114713e-06, "loss": 0.4883, "mean_token_accuracy": 0.8450431019067765, "num_tokens": 108619807.0, "step": 90340 }, { "entropy": 1.925535424053669, "epoch": 0.2800770948239986, "grad_norm": 9.137941360473633, "learning_rate": 4.780312479091465e-06, "loss": 0.5373, "mean_token_accuracy": 0.8320911303162575, "num_tokens": 108630896.0, "step": 90350 }, { "entropy": 1.9415040105581283, "epoch": 0.2801080939490483, "grad_norm": 7.471439361572266, "learning_rate": 4.780047953989079e-06, "loss": 0.476, "mean_token_accuracy": 0.848981736600399, "num_tokens": 108642366.0, "step": 90360 }, { "entropy": 1.9614702731370925, "epoch": 0.280139093074098, "grad_norm": 9.385625839233398, "learning_rate": 4.779783472795404e-06, "loss": 0.5708, "mean_token_accuracy": 0.8339747205376625, "num_tokens": 108653416.0, "step": 90370 }, { "entropy": 1.7262531116604805, "epoch": 0.2801700921991477, "grad_norm": 8.864653587341309, "learning_rate": 4.779519035498294e-06, "loss": 0.3797, "mean_token_accuracy": 0.8626728132367134, "num_tokens": 108667678.0, "step": 90380 }, { "entropy": 1.9948803067207337, "epoch": 0.28020109132419735, "grad_norm": 8.669200897216797, "learning_rate": 4.779254642085608e-06, "loss": 0.5147, "mean_token_accuracy": 0.8431035861372947, "num_tokens": 108678385.0, "step": 90390 }, { "entropy": 1.9449063792824746, "epoch": 0.2802320904492471, "grad_norm": 9.006505966186523, "learning_rate": 4.778990292545207e-06, "loss": 0.5286, "mean_token_accuracy": 0.829691307246685, "num_tokens": 108689327.0, "step": 90400 }, { "entropy": 1.916630421578884, "epoch": 0.28026308957429674, "grad_norm": 8.61928653717041, "learning_rate": 4.7787259868649635e-06, "loss": 0.535, "mean_token_accuracy": 0.8399298146367074, "num_tokens": 108700905.0, "step": 90410 }, { "entropy": 1.9122631967067718, "epoch": 0.28029408869934647, "grad_norm": 7.261351108551025, "learning_rate": 4.778461725032747e-06, "loss": 0.5228, "mean_token_accuracy": 0.8412989899516106, "num_tokens": 108713415.0, "step": 90420 }, { "entropy": 1.9562030732631683, "epoch": 0.28032508782439614, "grad_norm": 7.638638496398926, "learning_rate": 4.7781975070364375e-06, "loss": 0.5237, "mean_token_accuracy": 0.8463377922773361, "num_tokens": 108723813.0, "step": 90430 }, { "entropy": 1.7865042075514794, "epoch": 0.28035608694944586, "grad_norm": 2.5158307552337646, "learning_rate": 4.7779333328639124e-06, "loss": 0.3875, "mean_token_accuracy": 0.8637947380542755, "num_tokens": 108737608.0, "step": 90440 }, { "entropy": 1.6872775062918663, "epoch": 0.28038708607449553, "grad_norm": 2.7983152866363525, "learning_rate": 4.777669202503063e-06, "loss": 0.3223, "mean_token_accuracy": 0.872826486825943, "num_tokens": 108752944.0, "step": 90450 }, { "entropy": 1.9655809059739113, "epoch": 0.28041808519954525, "grad_norm": 9.844002723693848, "learning_rate": 4.77740511594178e-06, "loss": 0.5496, "mean_token_accuracy": 0.8349737733602524, "num_tokens": 108764826.0, "step": 90460 }, { "entropy": 1.8685831755399704, "epoch": 0.2804490843245949, "grad_norm": 4.095294952392578, "learning_rate": 4.777141073167958e-06, "loss": 0.4645, "mean_token_accuracy": 0.8481494188308716, "num_tokens": 108777223.0, "step": 90470 }, { "entropy": 1.9378087937831878, "epoch": 0.28048008344964465, "grad_norm": 3.9913816452026367, "learning_rate": 4.7768770741694985e-06, "loss": 0.521, "mean_token_accuracy": 0.8388440117239953, "num_tokens": 108788749.0, "step": 90480 }, { "entropy": 1.9008531831204891, "epoch": 0.2805110825746943, "grad_norm": 7.451270580291748, "learning_rate": 4.7766131189343075e-06, "loss": 0.4134, "mean_token_accuracy": 0.8565253868699074, "num_tokens": 108801607.0, "step": 90490 }, { "entropy": 1.960809737443924, "epoch": 0.28054208169974404, "grad_norm": 7.793551921844482, "learning_rate": 4.776349207450297e-06, "loss": 0.4888, "mean_token_accuracy": 0.8530623838305473, "num_tokens": 108812575.0, "step": 90500 }, { "entropy": 1.9836690306663514, "epoch": 0.2805730808247937, "grad_norm": 9.390997886657715, "learning_rate": 4.776085339705378e-06, "loss": 0.527, "mean_token_accuracy": 0.8447514802217484, "num_tokens": 108823707.0, "step": 90510 }, { "entropy": 1.8805051818490028, "epoch": 0.28060407994984343, "grad_norm": 6.977400302886963, "learning_rate": 4.775821515687472e-06, "loss": 0.4779, "mean_token_accuracy": 0.8450083822011948, "num_tokens": 108835768.0, "step": 90520 }, { "entropy": 1.8927241086959838, "epoch": 0.2806350790748931, "grad_norm": 8.844595909118652, "learning_rate": 4.775557735384503e-06, "loss": 0.483, "mean_token_accuracy": 0.8455369025468826, "num_tokens": 108847447.0, "step": 90530 }, { "entropy": 1.9194593280553818, "epoch": 0.2806660781999428, "grad_norm": 9.10633659362793, "learning_rate": 4.775293998784402e-06, "loss": 0.4761, "mean_token_accuracy": 0.8458576589822769, "num_tokens": 108859245.0, "step": 90540 }, { "entropy": 1.9746450453996658, "epoch": 0.2806970773249925, "grad_norm": 8.2658052444458, "learning_rate": 4.775030305875099e-06, "loss": 0.5594, "mean_token_accuracy": 0.8276672974228859, "num_tokens": 108869967.0, "step": 90550 }, { "entropy": 1.8919481202960013, "epoch": 0.2807280764500422, "grad_norm": 6.916478157043457, "learning_rate": 4.774766656644536e-06, "loss": 0.4377, "mean_token_accuracy": 0.8547657161951066, "num_tokens": 108881653.0, "step": 90560 }, { "entropy": 1.8619554951786994, "epoch": 0.2807590755750919, "grad_norm": 7.139795780181885, "learning_rate": 4.774503051080653e-06, "loss": 0.4589, "mean_token_accuracy": 0.845686687529087, "num_tokens": 108893797.0, "step": 90570 }, { "entropy": 1.8943834751844406, "epoch": 0.2807900747001416, "grad_norm": 7.6240339279174805, "learning_rate": 4.7742394891713975e-06, "loss": 0.515, "mean_token_accuracy": 0.8450383573770524, "num_tokens": 108905569.0, "step": 90580 }, { "entropy": 1.8711055159568786, "epoch": 0.2808210738251913, "grad_norm": 7.671294689178467, "learning_rate": 4.773975970904725e-06, "loss": 0.4446, "mean_token_accuracy": 0.8534323275089264, "num_tokens": 108918072.0, "step": 90590 }, { "entropy": 1.9521177858114243, "epoch": 0.280852072950241, "grad_norm": 9.559345245361328, "learning_rate": 4.773712496268588e-06, "loss": 0.5296, "mean_token_accuracy": 0.8375743925571442, "num_tokens": 108929589.0, "step": 90600 }, { "entropy": 1.8415945529937745, "epoch": 0.2808830720752907, "grad_norm": 4.66655969619751, "learning_rate": 4.773449065250952e-06, "loss": 0.46, "mean_token_accuracy": 0.8464484736323357, "num_tokens": 108942319.0, "step": 90610 }, { "entropy": 1.8111029013991355, "epoch": 0.28091407120034034, "grad_norm": 7.22979211807251, "learning_rate": 4.77318567783978e-06, "loss": 0.4028, "mean_token_accuracy": 0.8576018720865249, "num_tokens": 108955141.0, "step": 90620 }, { "entropy": 1.910294608771801, "epoch": 0.28094507032539007, "grad_norm": 8.751194953918457, "learning_rate": 4.772922334023044e-06, "loss": 0.4734, "mean_token_accuracy": 0.8544377252459526, "num_tokens": 108966874.0, "step": 90630 }, { "entropy": 1.8671040296554566, "epoch": 0.28097606945043974, "grad_norm": 3.474283218383789, "learning_rate": 4.7726590337887215e-06, "loss": 0.4495, "mean_token_accuracy": 0.8581783235073089, "num_tokens": 108978553.0, "step": 90640 }, { "entropy": 1.8678077682852745, "epoch": 0.28100706857548946, "grad_norm": 10.035735130310059, "learning_rate": 4.772395777124789e-06, "loss": 0.5148, "mean_token_accuracy": 0.8406292900443078, "num_tokens": 108990475.0, "step": 90650 }, { "entropy": 1.9179120391607285, "epoch": 0.28103806770053913, "grad_norm": 8.567628860473633, "learning_rate": 4.772132564019233e-06, "loss": 0.4866, "mean_token_accuracy": 0.8443030267953873, "num_tokens": 109002069.0, "step": 90660 }, { "entropy": 1.9361174017190934, "epoch": 0.28106906682558885, "grad_norm": 8.406739234924316, "learning_rate": 4.7718693944600445e-06, "loss": 0.4704, "mean_token_accuracy": 0.8500792890787124, "num_tokens": 109013697.0, "step": 90670 }, { "entropy": 1.8635503351688385, "epoch": 0.2811000659506385, "grad_norm": 9.684234619140625, "learning_rate": 4.771606268435215e-06, "loss": 0.4781, "mean_token_accuracy": 0.8503406763076782, "num_tokens": 109025070.0, "step": 90680 }, { "entropy": 1.773262333869934, "epoch": 0.28113106507568825, "grad_norm": 4.093050003051758, "learning_rate": 4.771343185932744e-06, "loss": 0.4471, "mean_token_accuracy": 0.8514026969671249, "num_tokens": 109038769.0, "step": 90690 }, { "entropy": 1.8920626237988472, "epoch": 0.2811620642007379, "grad_norm": 8.63742446899414, "learning_rate": 4.771080146940636e-06, "loss": 0.4507, "mean_token_accuracy": 0.8509817168116569, "num_tokens": 109050600.0, "step": 90700 }, { "entropy": 1.9327398404479026, "epoch": 0.28119306332578764, "grad_norm": 10.020679473876953, "learning_rate": 4.7708171514468965e-06, "loss": 0.528, "mean_token_accuracy": 0.842743456363678, "num_tokens": 109061795.0, "step": 90710 }, { "entropy": 1.8716965742409228, "epoch": 0.2812240624508373, "grad_norm": 9.19456958770752, "learning_rate": 4.770554199439541e-06, "loss": 0.4618, "mean_token_accuracy": 0.8475604027509689, "num_tokens": 109073841.0, "step": 90720 }, { "entropy": 1.9363275811076164, "epoch": 0.28125506157588703, "grad_norm": 9.567742347717285, "learning_rate": 4.770291290906584e-06, "loss": 0.5366, "mean_token_accuracy": 0.8288316577672958, "num_tokens": 109084988.0, "step": 90730 }, { "entropy": 1.8694113805890082, "epoch": 0.2812860607009367, "grad_norm": 10.170321464538574, "learning_rate": 4.770028425836049e-06, "loss": 0.4493, "mean_token_accuracy": 0.854419095814228, "num_tokens": 109097065.0, "step": 90740 }, { "entropy": 1.893789705634117, "epoch": 0.2813170598259864, "grad_norm": 8.507575035095215, "learning_rate": 4.769765604215961e-06, "loss": 0.4775, "mean_token_accuracy": 0.846349011361599, "num_tokens": 109108931.0, "step": 90750 }, { "entropy": 1.9919794470071792, "epoch": 0.2813480589510361, "grad_norm": 8.848701477050781, "learning_rate": 4.769502826034352e-06, "loss": 0.5132, "mean_token_accuracy": 0.8403985217213631, "num_tokens": 109119893.0, "step": 90760 }, { "entropy": 1.826089233160019, "epoch": 0.2813790580760858, "grad_norm": 8.253718376159668, "learning_rate": 4.769240091279257e-06, "loss": 0.436, "mean_token_accuracy": 0.8538560375571251, "num_tokens": 109132512.0, "step": 90770 }, { "entropy": 1.8694834753870964, "epoch": 0.2814100572011355, "grad_norm": 3.41013240814209, "learning_rate": 4.768977399938718e-06, "loss": 0.4813, "mean_token_accuracy": 0.8431166216731072, "num_tokens": 109145787.0, "step": 90780 }, { "entropy": 1.7938583612442016, "epoch": 0.2814410563261852, "grad_norm": 3.963634729385376, "learning_rate": 4.768714752000778e-06, "loss": 0.4336, "mean_token_accuracy": 0.8582982420921326, "num_tokens": 109158903.0, "step": 90790 }, { "entropy": 1.9309082627296448, "epoch": 0.2814720554512349, "grad_norm": 9.012663841247559, "learning_rate": 4.768452147453487e-06, "loss": 0.5303, "mean_token_accuracy": 0.8420499324798584, "num_tokens": 109169777.0, "step": 90800 }, { "entropy": 1.8647458493709563, "epoch": 0.2815030545762846, "grad_norm": 8.505784034729004, "learning_rate": 4.7681895862849e-06, "loss": 0.4703, "mean_token_accuracy": 0.8408839210867882, "num_tokens": 109182314.0, "step": 90810 }, { "entropy": 1.8462204396724702, "epoch": 0.2815340537013343, "grad_norm": 8.286667823791504, "learning_rate": 4.767927068483076e-06, "loss": 0.4418, "mean_token_accuracy": 0.8553971752524376, "num_tokens": 109195725.0, "step": 90820 }, { "entropy": 1.8974308609962462, "epoch": 0.281565052826384, "grad_norm": 9.34967041015625, "learning_rate": 4.767664594036074e-06, "loss": 0.4852, "mean_token_accuracy": 0.8490856289863586, "num_tokens": 109207842.0, "step": 90830 }, { "entropy": 1.8527883812785149, "epoch": 0.28159605195143367, "grad_norm": 3.996321439743042, "learning_rate": 4.767402162931967e-06, "loss": 0.4555, "mean_token_accuracy": 0.8528962567448616, "num_tokens": 109220691.0, "step": 90840 }, { "entropy": 1.9433230310678482, "epoch": 0.2816270510764834, "grad_norm": 7.619417667388916, "learning_rate": 4.767139775158826e-06, "loss": 0.5239, "mean_token_accuracy": 0.843589824438095, "num_tokens": 109231579.0, "step": 90850 }, { "entropy": 1.8327447712421416, "epoch": 0.28165805020153306, "grad_norm": 8.732206344604492, "learning_rate": 4.766877430704727e-06, "loss": 0.4369, "mean_token_accuracy": 0.8554098963737488, "num_tokens": 109244327.0, "step": 90860 }, { "entropy": 1.9131013810634614, "epoch": 0.28168904932658273, "grad_norm": 8.691461563110352, "learning_rate": 4.766615129557752e-06, "loss": 0.48, "mean_token_accuracy": 0.8473145887255669, "num_tokens": 109255888.0, "step": 90870 }, { "entropy": 1.9245838135480882, "epoch": 0.28172004845163245, "grad_norm": 7.54715633392334, "learning_rate": 4.766352871705987e-06, "loss": 0.5234, "mean_token_accuracy": 0.8493126124143601, "num_tokens": 109266869.0, "step": 90880 }, { "entropy": 1.917286041378975, "epoch": 0.2817510475766821, "grad_norm": 4.127012252807617, "learning_rate": 4.7660906571375246e-06, "loss": 0.5033, "mean_token_accuracy": 0.8439315423369408, "num_tokens": 109278743.0, "step": 90890 }, { "entropy": 1.897385112941265, "epoch": 0.28178204670173185, "grad_norm": 8.57779312133789, "learning_rate": 4.76582848584046e-06, "loss": 0.4876, "mean_token_accuracy": 0.8471187829971314, "num_tokens": 109290444.0, "step": 90900 }, { "entropy": 1.9781205475330352, "epoch": 0.2818130458267815, "grad_norm": 9.53032398223877, "learning_rate": 4.765566357802891e-06, "loss": 0.5853, "mean_token_accuracy": 0.8293624818325043, "num_tokens": 109301356.0, "step": 90910 }, { "entropy": 1.9003041684627533, "epoch": 0.28184404495183124, "grad_norm": 8.813244819641113, "learning_rate": 4.765304273012924e-06, "loss": 0.4625, "mean_token_accuracy": 0.8380188256502151, "num_tokens": 109313531.0, "step": 90920 }, { "entropy": 1.8687111303210258, "epoch": 0.2818750440768809, "grad_norm": 8.043350219726562, "learning_rate": 4.765042231458668e-06, "loss": 0.4412, "mean_token_accuracy": 0.8497692689299583, "num_tokens": 109326158.0, "step": 90930 }, { "entropy": 1.9804033279418944, "epoch": 0.28190604320193063, "grad_norm": 8.524834632873535, "learning_rate": 4.764780233128236e-06, "loss": 0.5462, "mean_token_accuracy": 0.8371674284338951, "num_tokens": 109337187.0, "step": 90940 }, { "entropy": 1.813350136578083, "epoch": 0.2819370423269803, "grad_norm": 7.912545204162598, "learning_rate": 4.764518278009748e-06, "loss": 0.402, "mean_token_accuracy": 0.8572453141212464, "num_tokens": 109350343.0, "step": 90950 }, { "entropy": 1.9478640288114548, "epoch": 0.28196804145203, "grad_norm": 7.623589515686035, "learning_rate": 4.764256366091324e-06, "loss": 0.5357, "mean_token_accuracy": 0.8418220117688179, "num_tokens": 109361694.0, "step": 90960 }, { "entropy": 1.8554902136325837, "epoch": 0.2819990405770797, "grad_norm": 7.590473651885986, "learning_rate": 4.763994497361095e-06, "loss": 0.435, "mean_token_accuracy": 0.8496796250343323, "num_tokens": 109374068.0, "step": 90970 }, { "entropy": 1.9092994675040245, "epoch": 0.2820300397021294, "grad_norm": 7.852284908294678, "learning_rate": 4.7637326718071905e-06, "loss": 0.5231, "mean_token_accuracy": 0.8353525906801224, "num_tokens": 109385364.0, "step": 90980 }, { "entropy": 1.8934466361999511, "epoch": 0.2820610388271791, "grad_norm": 9.18950080871582, "learning_rate": 4.763470889417748e-06, "loss": 0.4483, "mean_token_accuracy": 0.8570297732949257, "num_tokens": 109397364.0, "step": 90990 }, { "entropy": 1.771867723762989, "epoch": 0.2820920379522288, "grad_norm": 9.84312629699707, "learning_rate": 4.763209150180908e-06, "loss": 0.3863, "mean_token_accuracy": 0.8599537044763566, "num_tokens": 109410971.0, "step": 91000 }, { "entropy": 1.9751540750265122, "epoch": 0.2821230370772785, "grad_norm": 10.806888580322266, "learning_rate": 4.762947454084818e-06, "loss": 0.5376, "mean_token_accuracy": 0.8323762461543083, "num_tokens": 109421894.0, "step": 91010 }, { "entropy": 1.863322387635708, "epoch": 0.2821540362023282, "grad_norm": 9.505192756652832, "learning_rate": 4.7626858011176256e-06, "loss": 0.4577, "mean_token_accuracy": 0.8461381018161773, "num_tokens": 109433667.0, "step": 91020 }, { "entropy": 1.9372095853090285, "epoch": 0.2821850353273779, "grad_norm": 9.51073169708252, "learning_rate": 4.7624241912674885e-06, "loss": 0.4814, "mean_token_accuracy": 0.8425448566675187, "num_tokens": 109445506.0, "step": 91030 }, { "entropy": 1.8736591801047324, "epoch": 0.2822160344524276, "grad_norm": 9.874513626098633, "learning_rate": 4.762162624522564e-06, "loss": 0.4579, "mean_token_accuracy": 0.8544930562376976, "num_tokens": 109456672.0, "step": 91040 }, { "entropy": 1.9017417326569557, "epoch": 0.28224703357747727, "grad_norm": 8.883088111877441, "learning_rate": 4.761901100871018e-06, "loss": 0.4956, "mean_token_accuracy": 0.8446088701486587, "num_tokens": 109468440.0, "step": 91050 }, { "entropy": 1.8893210887908936, "epoch": 0.282278032702527, "grad_norm": 10.458946228027344, "learning_rate": 4.7616396203010165e-06, "loss": 0.4411, "mean_token_accuracy": 0.8509502872824669, "num_tokens": 109480575.0, "step": 91060 }, { "entropy": 1.9682177215814591, "epoch": 0.28230903182757666, "grad_norm": 7.0361151695251465, "learning_rate": 4.761378182800733e-06, "loss": 0.5089, "mean_token_accuracy": 0.8456061512231827, "num_tokens": 109491254.0, "step": 91070 }, { "entropy": 1.9273748084902764, "epoch": 0.2823400309526264, "grad_norm": 7.263545989990234, "learning_rate": 4.761116788358349e-06, "loss": 0.501, "mean_token_accuracy": 0.8323819488286972, "num_tokens": 109503011.0, "step": 91080 }, { "entropy": 1.9739017739892006, "epoch": 0.28237103007767606, "grad_norm": 6.576552391052246, "learning_rate": 4.760855436962041e-06, "loss": 0.5487, "mean_token_accuracy": 0.8396146357059479, "num_tokens": 109514103.0, "step": 91090 }, { "entropy": 1.9086976170539856, "epoch": 0.2824020292027258, "grad_norm": 8.7454833984375, "learning_rate": 4.760594128599999e-06, "loss": 0.4661, "mean_token_accuracy": 0.848183062672615, "num_tokens": 109526563.0, "step": 91100 }, { "entropy": 1.9339544415473937, "epoch": 0.28243302832777545, "grad_norm": 9.544751167297363, "learning_rate": 4.760332863260414e-06, "loss": 0.5061, "mean_token_accuracy": 0.8470303192734718, "num_tokens": 109538093.0, "step": 91110 }, { "entropy": 1.916434782743454, "epoch": 0.2824640274528251, "grad_norm": 4.14872407913208, "learning_rate": 4.7600716409314804e-06, "loss": 0.4853, "mean_token_accuracy": 0.8473877355456352, "num_tokens": 109549378.0, "step": 91120 }, { "entropy": 1.766272282600403, "epoch": 0.28249502657787484, "grad_norm": 8.772634506225586, "learning_rate": 4.7598104616014005e-06, "loss": 0.3786, "mean_token_accuracy": 0.8664262443780899, "num_tokens": 109563346.0, "step": 91130 }, { "entropy": 1.8620783016085625, "epoch": 0.2825260257029245, "grad_norm": 5.1654887199401855, "learning_rate": 4.759549325258377e-06, "loss": 0.4479, "mean_token_accuracy": 0.8528469145298004, "num_tokens": 109575000.0, "step": 91140 }, { "entropy": 1.827536989748478, "epoch": 0.28255702482797423, "grad_norm": 4.431619644165039, "learning_rate": 4.759288231890621e-06, "loss": 0.4768, "mean_token_accuracy": 0.849473150074482, "num_tokens": 109587804.0, "step": 91150 }, { "entropy": 1.865426352620125, "epoch": 0.2825880239530239, "grad_norm": 7.909317493438721, "learning_rate": 4.759027181486346e-06, "loss": 0.4475, "mean_token_accuracy": 0.8499869540333748, "num_tokens": 109600294.0, "step": 91160 }, { "entropy": 1.9297534614801406, "epoch": 0.28261902307807363, "grad_norm": 9.111772537231445, "learning_rate": 4.758766174033769e-06, "loss": 0.5103, "mean_token_accuracy": 0.8378744632005691, "num_tokens": 109611503.0, "step": 91170 }, { "entropy": 1.8704712957143783, "epoch": 0.2826500222031233, "grad_norm": 4.173186779022217, "learning_rate": 4.758505209521114e-06, "loss": 0.4431, "mean_token_accuracy": 0.855632272362709, "num_tokens": 109624455.0, "step": 91180 }, { "entropy": 1.9280048042535782, "epoch": 0.282681021328173, "grad_norm": 8.439362525939941, "learning_rate": 4.758244287936609e-06, "loss": 0.5147, "mean_token_accuracy": 0.8422107562422753, "num_tokens": 109635564.0, "step": 91190 }, { "entropy": 1.8654653757810593, "epoch": 0.2827120204532227, "grad_norm": 4.882840156555176, "learning_rate": 4.757983409268485e-06, "loss": 0.47, "mean_token_accuracy": 0.8396533340215683, "num_tokens": 109648550.0, "step": 91200 }, { "entropy": 1.8784069836139679, "epoch": 0.2827430195782724, "grad_norm": 7.505212783813477, "learning_rate": 4.757722573504979e-06, "loss": 0.4466, "mean_token_accuracy": 0.8495945394039154, "num_tokens": 109660589.0, "step": 91210 }, { "entropy": 1.9778222680091857, "epoch": 0.2827740187033221, "grad_norm": 10.172992706298828, "learning_rate": 4.757461780634332e-06, "loss": 0.5444, "mean_token_accuracy": 0.8417175248265266, "num_tokens": 109671973.0, "step": 91220 }, { "entropy": 1.8196046486496926, "epoch": 0.2828050178283718, "grad_norm": 8.086063385009766, "learning_rate": 4.757201030644789e-06, "loss": 0.3953, "mean_token_accuracy": 0.8687186688184738, "num_tokens": 109684750.0, "step": 91230 }, { "entropy": 1.8343065902590752, "epoch": 0.2828360169534215, "grad_norm": 9.45622730255127, "learning_rate": 4.7569403235246005e-06, "loss": 0.4139, "mean_token_accuracy": 0.8519006326794625, "num_tokens": 109697003.0, "step": 91240 }, { "entropy": 1.838625229895115, "epoch": 0.2828670160784712, "grad_norm": 3.578289270401001, "learning_rate": 4.756679659262021e-06, "loss": 0.3938, "mean_token_accuracy": 0.8600620925426483, "num_tokens": 109709959.0, "step": 91250 }, { "entropy": 1.9180777609348296, "epoch": 0.28289801520352087, "grad_norm": 9.219250679016113, "learning_rate": 4.756419037845309e-06, "loss": 0.4797, "mean_token_accuracy": 0.8345475569367409, "num_tokens": 109721437.0, "step": 91260 }, { "entropy": 1.914302496612072, "epoch": 0.2829290143285706, "grad_norm": 3.8321692943573, "learning_rate": 4.756158459262729e-06, "loss": 0.4678, "mean_token_accuracy": 0.8463889390230179, "num_tokens": 109733406.0, "step": 91270 }, { "entropy": 1.841091763973236, "epoch": 0.28296001345362026, "grad_norm": 8.805869102478027, "learning_rate": 4.755897923502547e-06, "loss": 0.4353, "mean_token_accuracy": 0.8465237930417061, "num_tokens": 109745414.0, "step": 91280 }, { "entropy": 1.9549955561757089, "epoch": 0.28299101257867, "grad_norm": 4.771459102630615, "learning_rate": 4.755637430553038e-06, "loss": 0.5449, "mean_token_accuracy": 0.8281950682401658, "num_tokens": 109756473.0, "step": 91290 }, { "entropy": 1.9814363867044449, "epoch": 0.28302201170371966, "grad_norm": 9.486200332641602, "learning_rate": 4.755376980402479e-06, "loss": 0.5201, "mean_token_accuracy": 0.8435119092464447, "num_tokens": 109767033.0, "step": 91300 }, { "entropy": 1.8801933750510216, "epoch": 0.2830530108287694, "grad_norm": 8.066265106201172, "learning_rate": 4.755116573039149e-06, "loss": 0.4795, "mean_token_accuracy": 0.8474684238433838, "num_tokens": 109779009.0, "step": 91310 }, { "entropy": 1.8336575701832771, "epoch": 0.28308400995381905, "grad_norm": 9.574748039245605, "learning_rate": 4.754856208451337e-06, "loss": 0.4312, "mean_token_accuracy": 0.8556732803583145, "num_tokens": 109791334.0, "step": 91320 }, { "entropy": 1.9349465638399124, "epoch": 0.2831150090788688, "grad_norm": 8.522025108337402, "learning_rate": 4.75459588662733e-06, "loss": 0.4795, "mean_token_accuracy": 0.851339441537857, "num_tokens": 109802925.0, "step": 91330 }, { "entropy": 1.8921233609318733, "epoch": 0.28314600820391844, "grad_norm": 8.379258155822754, "learning_rate": 4.754335607555427e-06, "loss": 0.4995, "mean_token_accuracy": 0.8478038221597671, "num_tokens": 109814130.0, "step": 91340 }, { "entropy": 1.8570956602692603, "epoch": 0.28317700732896817, "grad_norm": 9.117257118225098, "learning_rate": 4.754075371223925e-06, "loss": 0.478, "mean_token_accuracy": 0.8518408805131912, "num_tokens": 109826200.0, "step": 91350 }, { "entropy": 1.91699261367321, "epoch": 0.28320800645401784, "grad_norm": 8.286026000976562, "learning_rate": 4.753815177621128e-06, "loss": 0.539, "mean_token_accuracy": 0.8423243030905724, "num_tokens": 109837290.0, "step": 91360 }, { "entropy": 1.940002153813839, "epoch": 0.2832390055790675, "grad_norm": 9.196231842041016, "learning_rate": 4.753555026735344e-06, "loss": 0.5067, "mean_token_accuracy": 0.8430562511086463, "num_tokens": 109848408.0, "step": 91370 }, { "entropy": 1.9086173102259636, "epoch": 0.28327000470411723, "grad_norm": 7.4503560066223145, "learning_rate": 4.753294918554887e-06, "loss": 0.4555, "mean_token_accuracy": 0.8556687757372856, "num_tokens": 109859863.0, "step": 91380 }, { "entropy": 1.8716721430420875, "epoch": 0.2833010038291669, "grad_norm": 7.7145795822143555, "learning_rate": 4.753034853068076e-06, "loss": 0.5203, "mean_token_accuracy": 0.8355464920401573, "num_tokens": 109872862.0, "step": 91390 }, { "entropy": 1.959967464208603, "epoch": 0.2833320029542166, "grad_norm": 8.618791580200195, "learning_rate": 4.752774830263229e-06, "loss": 0.4941, "mean_token_accuracy": 0.8505872398614883, "num_tokens": 109884502.0, "step": 91400 }, { "entropy": 1.8670057207345963, "epoch": 0.2833630020792663, "grad_norm": 2.768303155899048, "learning_rate": 4.7525148501286754e-06, "loss": 0.514, "mean_token_accuracy": 0.841192239522934, "num_tokens": 109896997.0, "step": 91410 }, { "entropy": 1.942854182422161, "epoch": 0.283394001204316, "grad_norm": 9.517045974731445, "learning_rate": 4.752254912652746e-06, "loss": 0.5407, "mean_token_accuracy": 0.8302263349294663, "num_tokens": 109908513.0, "step": 91420 }, { "entropy": 1.8470983251929283, "epoch": 0.2834250003293657, "grad_norm": 3.9292147159576416, "learning_rate": 4.751995017823772e-06, "loss": 0.4478, "mean_token_accuracy": 0.847145140171051, "num_tokens": 109921038.0, "step": 91430 }, { "entropy": 1.889025342464447, "epoch": 0.2834559994544154, "grad_norm": 7.8180952072143555, "learning_rate": 4.751735165630099e-06, "loss": 0.4536, "mean_token_accuracy": 0.8388582825660705, "num_tokens": 109933509.0, "step": 91440 }, { "entropy": 1.926556906104088, "epoch": 0.2834869985794651, "grad_norm": 9.13294506072998, "learning_rate": 4.751475356060067e-06, "loss": 0.502, "mean_token_accuracy": 0.8347544342279434, "num_tokens": 109945589.0, "step": 91450 }, { "entropy": 1.8450331330299377, "epoch": 0.2835179977045148, "grad_norm": 4.0597100257873535, "learning_rate": 4.751215589102026e-06, "loss": 0.424, "mean_token_accuracy": 0.8560441970825196, "num_tokens": 109958238.0, "step": 91460 }, { "entropy": 1.8735743075609208, "epoch": 0.28354899682956447, "grad_norm": 9.06523323059082, "learning_rate": 4.75095586474433e-06, "loss": 0.4176, "mean_token_accuracy": 0.8512053444981575, "num_tokens": 109970897.0, "step": 91470 }, { "entropy": 1.9218522995710372, "epoch": 0.2835799959546142, "grad_norm": 8.556892395019531, "learning_rate": 4.750696182975335e-06, "loss": 0.5451, "mean_token_accuracy": 0.8374419540166855, "num_tokens": 109982852.0, "step": 91480 }, { "entropy": 1.8760737299919128, "epoch": 0.28361099507966386, "grad_norm": 7.687764644622803, "learning_rate": 4.750436543783403e-06, "loss": 0.4736, "mean_token_accuracy": 0.8507323205471039, "num_tokens": 109993887.0, "step": 91490 }, { "entropy": 1.8525379657745362, "epoch": 0.2836419942047136, "grad_norm": 8.57304573059082, "learning_rate": 4.750176947156903e-06, "loss": 0.4597, "mean_token_accuracy": 0.8570681139826775, "num_tokens": 110005619.0, "step": 91500 }, { "entropy": 1.9060514703392983, "epoch": 0.28367299332976326, "grad_norm": 6.858503818511963, "learning_rate": 4.749917393084203e-06, "loss": 0.4772, "mean_token_accuracy": 0.8408376634120941, "num_tokens": 110018405.0, "step": 91510 }, { "entropy": 1.8777115240693092, "epoch": 0.283703992454813, "grad_norm": 7.576192855834961, "learning_rate": 4.74965788155368e-06, "loss": 0.4717, "mean_token_accuracy": 0.8432316944003105, "num_tokens": 110030683.0, "step": 91520 }, { "entropy": 1.6769248962402343, "epoch": 0.28373499157986265, "grad_norm": 8.123210906982422, "learning_rate": 4.749398412553713e-06, "loss": 0.3271, "mean_token_accuracy": 0.8673816755414009, "num_tokens": 110045730.0, "step": 91530 }, { "entropy": 1.8561891838908195, "epoch": 0.2837659907049124, "grad_norm": 4.347743511199951, "learning_rate": 4.749138986072685e-06, "loss": 0.4555, "mean_token_accuracy": 0.849002268910408, "num_tokens": 110058293.0, "step": 91540 }, { "entropy": 1.87572433501482, "epoch": 0.28379698982996204, "grad_norm": 8.649298667907715, "learning_rate": 4.748879602098988e-06, "loss": 0.4649, "mean_token_accuracy": 0.8493537470698357, "num_tokens": 110070569.0, "step": 91550 }, { "entropy": 1.8268972262740135, "epoch": 0.28382798895501177, "grad_norm": 3.4365265369415283, "learning_rate": 4.748620260621013e-06, "loss": 0.4031, "mean_token_accuracy": 0.8539086386561394, "num_tokens": 110083590.0, "step": 91560 }, { "entropy": 1.8347914576530457, "epoch": 0.28385898808006144, "grad_norm": 7.83189582824707, "learning_rate": 4.748360961627159e-06, "loss": 0.4167, "mean_token_accuracy": 0.8629049167037011, "num_tokens": 110096116.0, "step": 91570 }, { "entropy": 1.9295135840773583, "epoch": 0.28388998720511116, "grad_norm": 10.846009254455566, "learning_rate": 4.748101705105827e-06, "loss": 0.537, "mean_token_accuracy": 0.8269555777311325, "num_tokens": 110107595.0, "step": 91580 }, { "entropy": 1.9408522367477417, "epoch": 0.28392098633016083, "grad_norm": 7.8314056396484375, "learning_rate": 4.747842491045421e-06, "loss": 0.5202, "mean_token_accuracy": 0.8457581490278244, "num_tokens": 110118966.0, "step": 91590 }, { "entropy": 1.8445065826177598, "epoch": 0.28395198545521055, "grad_norm": 9.18549633026123, "learning_rate": 4.747583319434357e-06, "loss": 0.4766, "mean_token_accuracy": 0.8425332620739937, "num_tokens": 110131790.0, "step": 91600 }, { "entropy": 1.8247267931699753, "epoch": 0.2839829845802602, "grad_norm": 9.114663124084473, "learning_rate": 4.747324190261046e-06, "loss": 0.491, "mean_token_accuracy": 0.8539474830031395, "num_tokens": 110144807.0, "step": 91610 }, { "entropy": 1.859029544889927, "epoch": 0.2840139837053099, "grad_norm": 9.042014122009277, "learning_rate": 4.74706510351391e-06, "loss": 0.5174, "mean_token_accuracy": 0.8466860115528106, "num_tokens": 110157391.0, "step": 91620 }, { "entropy": 1.8583563596010209, "epoch": 0.2840449828303596, "grad_norm": 8.334424018859863, "learning_rate": 4.746806059181373e-06, "loss": 0.5017, "mean_token_accuracy": 0.8355572551488877, "num_tokens": 110170333.0, "step": 91630 }, { "entropy": 1.8378588289022446, "epoch": 0.2840759819554093, "grad_norm": 8.690698623657227, "learning_rate": 4.746547057251862e-06, "loss": 0.5242, "mean_token_accuracy": 0.8404997318983078, "num_tokens": 110183049.0, "step": 91640 }, { "entropy": 1.8678978830575943, "epoch": 0.284106981080459, "grad_norm": 4.2940473556518555, "learning_rate": 4.7462880977138126e-06, "loss": 0.4802, "mean_token_accuracy": 0.8339786469936371, "num_tokens": 110196329.0, "step": 91650 }, { "entropy": 1.7718224942684173, "epoch": 0.2841379802055087, "grad_norm": 8.046103477478027, "learning_rate": 4.74602918055566e-06, "loss": 0.4232, "mean_token_accuracy": 0.8618276312947273, "num_tokens": 110209525.0, "step": 91660 }, { "entropy": 1.9796247810125351, "epoch": 0.2841689793305584, "grad_norm": 6.6610870361328125, "learning_rate": 4.745770305765847e-06, "loss": 0.5604, "mean_token_accuracy": 0.8357976496219635, "num_tokens": 110220480.0, "step": 91670 }, { "entropy": 1.9403215855360032, "epoch": 0.28419997845560807, "grad_norm": 8.975007057189941, "learning_rate": 4.745511473332818e-06, "loss": 0.5478, "mean_token_accuracy": 0.8363938242197037, "num_tokens": 110231525.0, "step": 91680 }, { "entropy": 1.891119834780693, "epoch": 0.2842309775806578, "grad_norm": 9.538240432739258, "learning_rate": 4.745252683245027e-06, "loss": 0.4886, "mean_token_accuracy": 0.8425269886851311, "num_tokens": 110243198.0, "step": 91690 }, { "entropy": 1.8871520176529883, "epoch": 0.28426197670570746, "grad_norm": 9.343374252319336, "learning_rate": 4.744993935490928e-06, "loss": 0.4459, "mean_token_accuracy": 0.8517936706542969, "num_tokens": 110255593.0, "step": 91700 }, { "entropy": 1.8104426577687263, "epoch": 0.2842929758307572, "grad_norm": 8.839599609375, "learning_rate": 4.744735230058977e-06, "loss": 0.4135, "mean_token_accuracy": 0.8551136195659638, "num_tokens": 110268721.0, "step": 91710 }, { "entropy": 1.9426888212561608, "epoch": 0.28432397495580686, "grad_norm": 8.519753456115723, "learning_rate": 4.744476566937642e-06, "loss": 0.4967, "mean_token_accuracy": 0.8359502226114273, "num_tokens": 110280087.0, "step": 91720 }, { "entropy": 1.9278660222887993, "epoch": 0.2843549740808566, "grad_norm": 8.69306468963623, "learning_rate": 4.74421794611539e-06, "loss": 0.497, "mean_token_accuracy": 0.8463233426213265, "num_tokens": 110291541.0, "step": 91730 }, { "entropy": 1.9058746635913848, "epoch": 0.28438597320590625, "grad_norm": 4.05926513671875, "learning_rate": 4.743959367580693e-06, "loss": 0.4611, "mean_token_accuracy": 0.8437731295824051, "num_tokens": 110303659.0, "step": 91740 }, { "entropy": 1.9244377925992011, "epoch": 0.284416972330956, "grad_norm": 8.101446151733398, "learning_rate": 4.743700831322029e-06, "loss": 0.444, "mean_token_accuracy": 0.8459056586027145, "num_tokens": 110315031.0, "step": 91750 }, { "entropy": 1.9023540601134301, "epoch": 0.28444797145600564, "grad_norm": 6.504104137420654, "learning_rate": 4.74344233732788e-06, "loss": 0.4976, "mean_token_accuracy": 0.8424684196710587, "num_tokens": 110327386.0, "step": 91760 }, { "entropy": 1.9569046169519424, "epoch": 0.28447897058105537, "grad_norm": 9.210738182067871, "learning_rate": 4.743183885586729e-06, "loss": 0.537, "mean_token_accuracy": 0.8266186848282814, "num_tokens": 110338648.0, "step": 91770 }, { "entropy": 1.8510024085640908, "epoch": 0.28450996970610504, "grad_norm": 8.181108474731445, "learning_rate": 4.74292547608707e-06, "loss": 0.4109, "mean_token_accuracy": 0.8562486320734024, "num_tokens": 110350736.0, "step": 91780 }, { "entropy": 1.9186692774295806, "epoch": 0.28454096883115476, "grad_norm": 9.964049339294434, "learning_rate": 4.7426671088173945e-06, "loss": 0.4583, "mean_token_accuracy": 0.8494500458240509, "num_tokens": 110362275.0, "step": 91790 }, { "entropy": 1.7676223665475845, "epoch": 0.28457196795620443, "grad_norm": 2.426920175552368, "learning_rate": 4.742408783766203e-06, "loss": 0.3842, "mean_token_accuracy": 0.863847254216671, "num_tokens": 110375617.0, "step": 91800 }, { "entropy": 1.8544678494334221, "epoch": 0.28460296708125415, "grad_norm": 9.724370956420898, "learning_rate": 4.742150500922e-06, "loss": 0.484, "mean_token_accuracy": 0.8445950224995613, "num_tokens": 110387605.0, "step": 91810 }, { "entropy": 1.8677785605192185, "epoch": 0.2846339662063038, "grad_norm": 8.32010269165039, "learning_rate": 4.741892260273291e-06, "loss": 0.4568, "mean_token_accuracy": 0.845884545147419, "num_tokens": 110399589.0, "step": 91820 }, { "entropy": 1.8281821206212043, "epoch": 0.28466496533135355, "grad_norm": 8.002693176269531, "learning_rate": 4.741634061808588e-06, "loss": 0.4257, "mean_token_accuracy": 0.8603383213281631, "num_tokens": 110412247.0, "step": 91830 }, { "entropy": 1.8158651649951936, "epoch": 0.2846959644564032, "grad_norm": 8.582942008972168, "learning_rate": 4.741375905516411e-06, "loss": 0.4644, "mean_token_accuracy": 0.846869707107544, "num_tokens": 110424355.0, "step": 91840 }, { "entropy": 1.7574502289295197, "epoch": 0.28472696358145294, "grad_norm": 4.905912399291992, "learning_rate": 4.741117791385276e-06, "loss": 0.3859, "mean_token_accuracy": 0.8498795077204704, "num_tokens": 110437595.0, "step": 91850 }, { "entropy": 1.898454374074936, "epoch": 0.2847579627065026, "grad_norm": 7.610432147979736, "learning_rate": 4.740859719403713e-06, "loss": 0.492, "mean_token_accuracy": 0.8487632632255554, "num_tokens": 110448902.0, "step": 91860 }, { "entropy": 1.7625425659120082, "epoch": 0.2847889618315523, "grad_norm": 4.429518699645996, "learning_rate": 4.740601689560249e-06, "loss": 0.3973, "mean_token_accuracy": 0.8575651466846466, "num_tokens": 110461975.0, "step": 91870 }, { "entropy": 1.8556936159729958, "epoch": 0.284819960956602, "grad_norm": 9.066593170166016, "learning_rate": 4.74034370184342e-06, "loss": 0.4796, "mean_token_accuracy": 0.847280016541481, "num_tokens": 110473152.0, "step": 91880 }, { "entropy": 1.880302868783474, "epoch": 0.28485096008165167, "grad_norm": 9.606684684753418, "learning_rate": 4.740085756241761e-06, "loss": 0.5205, "mean_token_accuracy": 0.831180626153946, "num_tokens": 110484515.0, "step": 91890 }, { "entropy": 1.768574671447277, "epoch": 0.2848819592067014, "grad_norm": 3.897165298461914, "learning_rate": 4.7398278527438175e-06, "loss": 0.4161, "mean_token_accuracy": 0.8571337282657623, "num_tokens": 110496957.0, "step": 91900 }, { "entropy": 1.791210974752903, "epoch": 0.28491295833175106, "grad_norm": 9.069717407226562, "learning_rate": 4.739569991338137e-06, "loss": 0.4322, "mean_token_accuracy": 0.851265873014927, "num_tokens": 110509726.0, "step": 91910 }, { "entropy": 1.807637719810009, "epoch": 0.2849439574568008, "grad_norm": 7.900302886962891, "learning_rate": 4.739312172013269e-06, "loss": 0.4362, "mean_token_accuracy": 0.8464860022068024, "num_tokens": 110522419.0, "step": 91920 }, { "entropy": 1.8903899610042572, "epoch": 0.28497495658185046, "grad_norm": 6.663649559020996, "learning_rate": 4.7390543947577705e-06, "loss": 0.4893, "mean_token_accuracy": 0.8373034983873368, "num_tokens": 110533897.0, "step": 91930 }, { "entropy": 1.8422768160700798, "epoch": 0.2850059557069002, "grad_norm": 7.8179826736450195, "learning_rate": 4.7387966595602014e-06, "loss": 0.4492, "mean_token_accuracy": 0.8479554176330566, "num_tokens": 110546623.0, "step": 91940 }, { "entropy": 1.7804616317152977, "epoch": 0.28503695483194985, "grad_norm": 8.71053409576416, "learning_rate": 4.738538966409126e-06, "loss": 0.4473, "mean_token_accuracy": 0.8510273039340973, "num_tokens": 110559478.0, "step": 91950 }, { "entropy": 1.859451201558113, "epoch": 0.2850679539569996, "grad_norm": 4.298933029174805, "learning_rate": 4.738281315293114e-06, "loss": 0.4617, "mean_token_accuracy": 0.8425625443458558, "num_tokens": 110571704.0, "step": 91960 }, { "entropy": 1.850043423473835, "epoch": 0.28509895308204924, "grad_norm": 7.2485480308532715, "learning_rate": 4.738023706200738e-06, "loss": 0.4499, "mean_token_accuracy": 0.851256474852562, "num_tokens": 110583945.0, "step": 91970 }, { "entropy": 1.9050755083560944, "epoch": 0.28512995220709897, "grad_norm": 9.636146545410156, "learning_rate": 4.737766139120575e-06, "loss": 0.5271, "mean_token_accuracy": 0.832217988371849, "num_tokens": 110595560.0, "step": 91980 }, { "entropy": 1.9206005334854126, "epoch": 0.28516095133214864, "grad_norm": 8.515999794006348, "learning_rate": 4.73750861404121e-06, "loss": 0.5102, "mean_token_accuracy": 0.8435544535517693, "num_tokens": 110606577.0, "step": 91990 }, { "entropy": 1.869492068886757, "epoch": 0.28519195045719836, "grad_norm": 8.025008201599121, "learning_rate": 4.737251130951226e-06, "loss": 0.4956, "mean_token_accuracy": 0.8449269160628319, "num_tokens": 110618391.0, "step": 92000 }, { "entropy": 1.845357683300972, "epoch": 0.28522294958224803, "grad_norm": 8.134385108947754, "learning_rate": 4.736993689839216e-06, "loss": 0.4576, "mean_token_accuracy": 0.8439294084906578, "num_tokens": 110631021.0, "step": 92010 }, { "entropy": 1.8097378730773925, "epoch": 0.28525394870729776, "grad_norm": 10.888206481933594, "learning_rate": 4.736736290693772e-06, "loss": 0.4697, "mean_token_accuracy": 0.8568091303110122, "num_tokens": 110643427.0, "step": 92020 }, { "entropy": 1.9278713300824166, "epoch": 0.2852849478323474, "grad_norm": 8.461037635803223, "learning_rate": 4.736478933503496e-06, "loss": 0.524, "mean_token_accuracy": 0.8417813435196877, "num_tokens": 110654675.0, "step": 92030 }, { "entropy": 1.8513173662126063, "epoch": 0.28531594695739715, "grad_norm": 8.097729682922363, "learning_rate": 4.7362216182569906e-06, "loss": 0.4856, "mean_token_accuracy": 0.8500329554080963, "num_tokens": 110667292.0, "step": 92040 }, { "entropy": 1.9195297732949257, "epoch": 0.2853469460824468, "grad_norm": 8.111653327941895, "learning_rate": 4.735964344942864e-06, "loss": 0.4854, "mean_token_accuracy": 0.8443229928612709, "num_tokens": 110678569.0, "step": 92050 }, { "entropy": 1.8122631691396236, "epoch": 0.28537794520749654, "grad_norm": 8.886815071105957, "learning_rate": 4.735707113549729e-06, "loss": 0.4137, "mean_token_accuracy": 0.8556604027748108, "num_tokens": 110691490.0, "step": 92060 }, { "entropy": 1.884895347058773, "epoch": 0.2854089443325462, "grad_norm": 10.424093246459961, "learning_rate": 4.735449924066201e-06, "loss": 0.4717, "mean_token_accuracy": 0.8439397796988487, "num_tokens": 110703884.0, "step": 92070 }, { "entropy": 1.9233110576868058, "epoch": 0.28543994345759593, "grad_norm": 9.017784118652344, "learning_rate": 4.735192776480902e-06, "loss": 0.4782, "mean_token_accuracy": 0.8422451555728913, "num_tokens": 110716070.0, "step": 92080 }, { "entropy": 1.8380617439746856, "epoch": 0.2854709425826456, "grad_norm": 8.3659086227417, "learning_rate": 4.734935670782457e-06, "loss": 0.4673, "mean_token_accuracy": 0.8482602387666702, "num_tokens": 110728016.0, "step": 92090 }, { "entropy": 1.9306314051151277, "epoch": 0.2855019417076953, "grad_norm": 9.452094078063965, "learning_rate": 4.7346786069594955e-06, "loss": 0.5179, "mean_token_accuracy": 0.8452840596437454, "num_tokens": 110739218.0, "step": 92100 }, { "entropy": 1.7319670930504798, "epoch": 0.285532940832745, "grad_norm": 7.807188034057617, "learning_rate": 4.734421585000652e-06, "loss": 0.4277, "mean_token_accuracy": 0.8550496265292168, "num_tokens": 110752633.0, "step": 92110 }, { "entropy": 1.9415203884243966, "epoch": 0.28556393995779467, "grad_norm": 8.066431999206543, "learning_rate": 4.7341646048945645e-06, "loss": 0.5107, "mean_token_accuracy": 0.8382659062743187, "num_tokens": 110764376.0, "step": 92120 }, { "entropy": 1.801632682979107, "epoch": 0.2855949390828444, "grad_norm": 6.095812797546387, "learning_rate": 4.733907666629874e-06, "loss": 0.438, "mean_token_accuracy": 0.8537136375904083, "num_tokens": 110776672.0, "step": 92130 }, { "entropy": 1.8623081862926483, "epoch": 0.28562593820789406, "grad_norm": 8.639974594116211, "learning_rate": 4.733650770195231e-06, "loss": 0.4526, "mean_token_accuracy": 0.848575672507286, "num_tokens": 110788872.0, "step": 92140 }, { "entropy": 1.7804166600108147, "epoch": 0.2856569373329438, "grad_norm": 3.511837959289551, "learning_rate": 4.733393915579283e-06, "loss": 0.4113, "mean_token_accuracy": 0.8584547877311707, "num_tokens": 110801677.0, "step": 92150 }, { "entropy": 1.8777981102466583, "epoch": 0.28568793645799345, "grad_norm": 8.046910285949707, "learning_rate": 4.733137102770687e-06, "loss": 0.4629, "mean_token_accuracy": 0.8518513917922974, "num_tokens": 110813470.0, "step": 92160 }, { "entropy": 1.9628359898924828, "epoch": 0.2857189355830432, "grad_norm": 9.675116539001465, "learning_rate": 4.732880331758104e-06, "loss": 0.5357, "mean_token_accuracy": 0.8325561985373497, "num_tokens": 110824903.0, "step": 92170 }, { "entropy": 1.9065320461988449, "epoch": 0.28574993470809285, "grad_norm": 9.257568359375, "learning_rate": 4.732623602530196e-06, "loss": 0.4646, "mean_token_accuracy": 0.8543298453092575, "num_tokens": 110836341.0, "step": 92180 }, { "entropy": 1.8834924966096878, "epoch": 0.28578093383314257, "grad_norm": 3.791853189468384, "learning_rate": 4.732366915075634e-06, "loss": 0.5072, "mean_token_accuracy": 0.8327832892537117, "num_tokens": 110848011.0, "step": 92190 }, { "entropy": 1.9523876518011094, "epoch": 0.28581193295819224, "grad_norm": 8.7767972946167, "learning_rate": 4.732110269383088e-06, "loss": 0.5044, "mean_token_accuracy": 0.8447451606392861, "num_tokens": 110858754.0, "step": 92200 }, { "entropy": 1.9425658136606216, "epoch": 0.28584293208324196, "grad_norm": 6.863429069519043, "learning_rate": 4.731853665441238e-06, "loss": 0.5456, "mean_token_accuracy": 0.835406644642353, "num_tokens": 110869747.0, "step": 92210 }, { "entropy": 1.8428900748491288, "epoch": 0.28587393120829163, "grad_norm": 8.834907531738281, "learning_rate": 4.731597103238762e-06, "loss": 0.4663, "mean_token_accuracy": 0.844575323164463, "num_tokens": 110881672.0, "step": 92220 }, { "entropy": 1.9306576699018478, "epoch": 0.28590493033334136, "grad_norm": 9.883462905883789, "learning_rate": 4.731340582764347e-06, "loss": 0.498, "mean_token_accuracy": 0.8402498573064804, "num_tokens": 110893015.0, "step": 92230 }, { "entropy": 1.9201725766062736, "epoch": 0.285935929458391, "grad_norm": 6.858789443969727, "learning_rate": 4.731084104006684e-06, "loss": 0.4643, "mean_token_accuracy": 0.841082276403904, "num_tokens": 110905179.0, "step": 92240 }, { "entropy": 1.8505833253264428, "epoch": 0.28596692858344075, "grad_norm": 7.575535297393799, "learning_rate": 4.730827666954467e-06, "loss": 0.4116, "mean_token_accuracy": 0.8532150328159332, "num_tokens": 110918892.0, "step": 92250 }, { "entropy": 1.9996703028678895, "epoch": 0.2859979277084904, "grad_norm": 9.380273818969727, "learning_rate": 4.730571271596393e-06, "loss": 0.547, "mean_token_accuracy": 0.8315587803721428, "num_tokens": 110929571.0, "step": 92260 }, { "entropy": 1.724966013431549, "epoch": 0.28602892683354014, "grad_norm": 8.106704711914062, "learning_rate": 4.730314917921165e-06, "loss": 0.3552, "mean_token_accuracy": 0.8658282339572907, "num_tokens": 110942972.0, "step": 92270 }, { "entropy": 1.912028570473194, "epoch": 0.2860599259585898, "grad_norm": 3.8293659687042236, "learning_rate": 4.730058605917492e-06, "loss": 0.489, "mean_token_accuracy": 0.847729179263115, "num_tokens": 110954086.0, "step": 92280 }, { "entropy": 1.9026774257421493, "epoch": 0.28609092508363954, "grad_norm": 3.5246219635009766, "learning_rate": 4.729802335574084e-06, "loss": 0.4629, "mean_token_accuracy": 0.8511073097586632, "num_tokens": 110965849.0, "step": 92290 }, { "entropy": 1.839532507956028, "epoch": 0.2861219242086892, "grad_norm": 7.714004039764404, "learning_rate": 4.729546106879656e-06, "loss": 0.4422, "mean_token_accuracy": 0.8525758549571038, "num_tokens": 110978775.0, "step": 92300 }, { "entropy": 1.8683712184429169, "epoch": 0.28615292333373893, "grad_norm": 4.254873752593994, "learning_rate": 4.729289919822929e-06, "loss": 0.4697, "mean_token_accuracy": 0.8314826056361199, "num_tokens": 110990837.0, "step": 92310 }, { "entropy": 1.9552190572023391, "epoch": 0.2861839224587886, "grad_norm": 8.179940223693848, "learning_rate": 4.729033774392628e-06, "loss": 0.4934, "mean_token_accuracy": 0.8429278552532196, "num_tokens": 111002090.0, "step": 92320 }, { "entropy": 1.8049072623252869, "epoch": 0.2862149215838383, "grad_norm": 3.7070064544677734, "learning_rate": 4.728777670577479e-06, "loss": 0.3892, "mean_token_accuracy": 0.8596838563680649, "num_tokens": 111014690.0, "step": 92330 }, { "entropy": 1.865850919485092, "epoch": 0.286245920708888, "grad_norm": 9.017657279968262, "learning_rate": 4.7285216083662165e-06, "loss": 0.4644, "mean_token_accuracy": 0.84141965508461, "num_tokens": 111026626.0, "step": 92340 }, { "entropy": 1.802014322578907, "epoch": 0.28627691983393766, "grad_norm": 4.565001010894775, "learning_rate": 4.728265587747578e-06, "loss": 0.408, "mean_token_accuracy": 0.8524199083447457, "num_tokens": 111039897.0, "step": 92350 }, { "entropy": 1.8838436871767044, "epoch": 0.2863079189589874, "grad_norm": 10.405354499816895, "learning_rate": 4.728009608710304e-06, "loss": 0.5053, "mean_token_accuracy": 0.8382696464657784, "num_tokens": 111052245.0, "step": 92360 }, { "entropy": 1.8829987928271295, "epoch": 0.28633891808403705, "grad_norm": 8.90793514251709, "learning_rate": 4.727753671243139e-06, "loss": 0.4678, "mean_token_accuracy": 0.8499641686677932, "num_tokens": 111063937.0, "step": 92370 }, { "entropy": 1.8485404312610627, "epoch": 0.2863699172090868, "grad_norm": 7.828176975250244, "learning_rate": 4.727497775334834e-06, "loss": 0.4965, "mean_token_accuracy": 0.8384749412536621, "num_tokens": 111076807.0, "step": 92380 }, { "entropy": 1.9129365399479865, "epoch": 0.28640091633413645, "grad_norm": 10.41478443145752, "learning_rate": 4.727241920974142e-06, "loss": 0.5466, "mean_token_accuracy": 0.8334106773138046, "num_tokens": 111089253.0, "step": 92390 }, { "entropy": 1.9112588971853257, "epoch": 0.28643191545918617, "grad_norm": 9.177438735961914, "learning_rate": 4.726986108149824e-06, "loss": 0.5223, "mean_token_accuracy": 0.8404966652393341, "num_tokens": 111100456.0, "step": 92400 }, { "entropy": 1.9098269432783126, "epoch": 0.28646291458423584, "grad_norm": 7.330765724182129, "learning_rate": 4.7267303368506395e-06, "loss": 0.4828, "mean_token_accuracy": 0.8523991569876671, "num_tokens": 111111456.0, "step": 92410 }, { "entropy": 1.9250774174928664, "epoch": 0.28649391370928556, "grad_norm": 8.76259708404541, "learning_rate": 4.726474607065357e-06, "loss": 0.4822, "mean_token_accuracy": 0.8517454132437706, "num_tokens": 111122408.0, "step": 92420 }, { "entropy": 1.8456337764859199, "epoch": 0.28652491283433523, "grad_norm": 8.982091903686523, "learning_rate": 4.726218918782747e-06, "loss": 0.4748, "mean_token_accuracy": 0.8434979751706123, "num_tokens": 111135014.0, "step": 92430 }, { "entropy": 1.8489980950951577, "epoch": 0.28655591195938496, "grad_norm": 8.749741554260254, "learning_rate": 4.725963271991586e-06, "loss": 0.4099, "mean_token_accuracy": 0.8554965123534203, "num_tokens": 111147294.0, "step": 92440 }, { "entropy": 1.9318351566791534, "epoch": 0.2865869110844346, "grad_norm": 8.458632469177246, "learning_rate": 4.725707666680653e-06, "loss": 0.5415, "mean_token_accuracy": 0.8387140676379203, "num_tokens": 111158516.0, "step": 92450 }, { "entropy": 1.8340228885412215, "epoch": 0.28661791020948435, "grad_norm": 3.4830410480499268, "learning_rate": 4.72545210283873e-06, "loss": 0.3714, "mean_token_accuracy": 0.8590243220329284, "num_tokens": 111171043.0, "step": 92460 }, { "entropy": 1.8585921421647071, "epoch": 0.286648909334534, "grad_norm": 10.120370864868164, "learning_rate": 4.725196580454608e-06, "loss": 0.4472, "mean_token_accuracy": 0.8484177976846695, "num_tokens": 111183642.0, "step": 92470 }, { "entropy": 1.924575427174568, "epoch": 0.28667990845958374, "grad_norm": 9.668998718261719, "learning_rate": 4.724941099517078e-06, "loss": 0.4975, "mean_token_accuracy": 0.8406669244170188, "num_tokens": 111195086.0, "step": 92480 }, { "entropy": 1.8797391682863236, "epoch": 0.2867109075846334, "grad_norm": 7.982089996337891, "learning_rate": 4.724685660014936e-06, "loss": 0.4961, "mean_token_accuracy": 0.8439215019345283, "num_tokens": 111206895.0, "step": 92490 }, { "entropy": 1.906739890575409, "epoch": 0.28674190670968314, "grad_norm": 8.40061092376709, "learning_rate": 4.724430261936984e-06, "loss": 0.4778, "mean_token_accuracy": 0.8415932491421699, "num_tokens": 111218419.0, "step": 92500 }, { "entropy": 1.7755264952778815, "epoch": 0.2867729058347328, "grad_norm": 8.411188125610352, "learning_rate": 4.724174905272025e-06, "loss": 0.3824, "mean_token_accuracy": 0.8556288599967956, "num_tokens": 111232066.0, "step": 92510 }, { "entropy": 1.8620268389582635, "epoch": 0.28680390495978253, "grad_norm": 3.8714070320129395, "learning_rate": 4.72391959000887e-06, "loss": 0.4188, "mean_token_accuracy": 0.8555803820490837, "num_tokens": 111244082.0, "step": 92520 }, { "entropy": 1.925992988049984, "epoch": 0.2868349040848322, "grad_norm": 8.029500007629395, "learning_rate": 4.723664316136334e-06, "loss": 0.5205, "mean_token_accuracy": 0.8394767045974731, "num_tokens": 111255877.0, "step": 92530 }, { "entropy": 1.972803682088852, "epoch": 0.2868659032098819, "grad_norm": 9.798595428466797, "learning_rate": 4.723409083643231e-06, "loss": 0.5374, "mean_token_accuracy": 0.831108058989048, "num_tokens": 111266990.0, "step": 92540 }, { "entropy": 1.9300013601779937, "epoch": 0.2868969023349316, "grad_norm": 8.934462547302246, "learning_rate": 4.7231538925183875e-06, "loss": 0.5097, "mean_token_accuracy": 0.8401208460330963, "num_tokens": 111277932.0, "step": 92550 }, { "entropy": 1.8719871819019318, "epoch": 0.2869279014599813, "grad_norm": 8.938404083251953, "learning_rate": 4.722898742750625e-06, "loss": 0.4707, "mean_token_accuracy": 0.8495993599295616, "num_tokens": 111290308.0, "step": 92560 }, { "entropy": 1.9398197084665298, "epoch": 0.286958900585031, "grad_norm": 9.505714416503906, "learning_rate": 4.7226436343287775e-06, "loss": 0.5289, "mean_token_accuracy": 0.8359267324209213, "num_tokens": 111301797.0, "step": 92570 }, { "entropy": 1.8432279601693153, "epoch": 0.2869898997100807, "grad_norm": 3.82196307182312, "learning_rate": 4.7223885672416784e-06, "loss": 0.469, "mean_token_accuracy": 0.850963968038559, "num_tokens": 111314214.0, "step": 92580 }, { "entropy": 1.844231453537941, "epoch": 0.2870208988351304, "grad_norm": 8.75102710723877, "learning_rate": 4.722133541478166e-06, "loss": 0.4347, "mean_token_accuracy": 0.8547771289944649, "num_tokens": 111326679.0, "step": 92590 }, { "entropy": 1.9384073466062546, "epoch": 0.28705189796018005, "grad_norm": 7.945824146270752, "learning_rate": 4.721878557027084e-06, "loss": 0.524, "mean_token_accuracy": 0.835817402601242, "num_tokens": 111338404.0, "step": 92600 }, { "entropy": 1.9293118342757225, "epoch": 0.28708289708522977, "grad_norm": 8.206449508666992, "learning_rate": 4.7216236138772795e-06, "loss": 0.4973, "mean_token_accuracy": 0.8453877314925193, "num_tokens": 111350168.0, "step": 92610 }, { "entropy": 1.8235325343906879, "epoch": 0.28711389621027944, "grad_norm": 3.6279773712158203, "learning_rate": 4.721368712017605e-06, "loss": 0.4432, "mean_token_accuracy": 0.8457371458411217, "num_tokens": 111363248.0, "step": 92620 }, { "entropy": 1.9850926041603087, "epoch": 0.28714489533532916, "grad_norm": 9.338789939880371, "learning_rate": 4.721113851436916e-06, "loss": 0.5428, "mean_token_accuracy": 0.8441228061914444, "num_tokens": 111374112.0, "step": 92630 }, { "entropy": 1.8502737820148467, "epoch": 0.28717589446037883, "grad_norm": 8.026445388793945, "learning_rate": 4.72085903212407e-06, "loss": 0.423, "mean_token_accuracy": 0.8572929382324219, "num_tokens": 111386542.0, "step": 92640 }, { "entropy": 1.8364259883761407, "epoch": 0.28720689358542856, "grad_norm": 7.70156717300415, "learning_rate": 4.7206042540679335e-06, "loss": 0.4655, "mean_token_accuracy": 0.8466950342059135, "num_tokens": 111399115.0, "step": 92650 }, { "entropy": 1.8671451389789582, "epoch": 0.2872378927104782, "grad_norm": 9.978074073791504, "learning_rate": 4.720349517257375e-06, "loss": 0.4699, "mean_token_accuracy": 0.8522609934210778, "num_tokens": 111411335.0, "step": 92660 }, { "entropy": 1.8687150657176972, "epoch": 0.28726889183552795, "grad_norm": 4.219553470611572, "learning_rate": 4.720094821681266e-06, "loss": 0.4339, "mean_token_accuracy": 0.8571690008044243, "num_tokens": 111423197.0, "step": 92670 }, { "entropy": 1.839407466351986, "epoch": 0.2872998909605776, "grad_norm": 7.242602348327637, "learning_rate": 4.719840167328485e-06, "loss": 0.4644, "mean_token_accuracy": 0.8519873261451721, "num_tokens": 111436358.0, "step": 92680 }, { "entropy": 1.8508578151464463, "epoch": 0.28733089008562734, "grad_norm": 4.504565238952637, "learning_rate": 4.719585554187911e-06, "loss": 0.455, "mean_token_accuracy": 0.8483609542250633, "num_tokens": 111448749.0, "step": 92690 }, { "entropy": 1.8885752364993096, "epoch": 0.287361889210677, "grad_norm": 8.104568481445312, "learning_rate": 4.7193309822484295e-06, "loss": 0.4459, "mean_token_accuracy": 0.8618633911013603, "num_tokens": 111460055.0, "step": 92700 }, { "entropy": 1.8163581773638726, "epoch": 0.28739288833572674, "grad_norm": 7.430144786834717, "learning_rate": 4.719076451498931e-06, "loss": 0.4375, "mean_token_accuracy": 0.8476424887776375, "num_tokens": 111473204.0, "step": 92710 }, { "entropy": 1.9194232299923897, "epoch": 0.2874238874607764, "grad_norm": 8.560006141662598, "learning_rate": 4.718821961928308e-06, "loss": 0.551, "mean_token_accuracy": 0.825740373134613, "num_tokens": 111484969.0, "step": 92720 }, { "entropy": 1.8660444170236588, "epoch": 0.28745488658582613, "grad_norm": 8.867291450500488, "learning_rate": 4.71856751352546e-06, "loss": 0.4686, "mean_token_accuracy": 0.8471433222293854, "num_tokens": 111497653.0, "step": 92730 }, { "entropy": 1.8844849050045014, "epoch": 0.2874858857108758, "grad_norm": 8.284875869750977, "learning_rate": 4.7183131062792855e-06, "loss": 0.4806, "mean_token_accuracy": 0.8477038741111755, "num_tokens": 111510030.0, "step": 92740 }, { "entropy": 1.945139628648758, "epoch": 0.2875168848359255, "grad_norm": 10.998967170715332, "learning_rate": 4.718058740178694e-06, "loss": 0.5958, "mean_token_accuracy": 0.8271935939788818, "num_tokens": 111521480.0, "step": 92750 }, { "entropy": 1.9299291223287582, "epoch": 0.2875478839609752, "grad_norm": 8.494784355163574, "learning_rate": 4.717804415212594e-06, "loss": 0.533, "mean_token_accuracy": 0.8403868332505227, "num_tokens": 111532281.0, "step": 92760 }, { "entropy": 1.8992675706744193, "epoch": 0.2875788830860249, "grad_norm": 3.8736727237701416, "learning_rate": 4.717550131369901e-06, "loss": 0.4972, "mean_token_accuracy": 0.843301497399807, "num_tokens": 111544377.0, "step": 92770 }, { "entropy": 1.9680778548121451, "epoch": 0.2876098822110746, "grad_norm": 7.67625093460083, "learning_rate": 4.717295888639533e-06, "loss": 0.5604, "mean_token_accuracy": 0.8245472773909569, "num_tokens": 111555728.0, "step": 92780 }, { "entropy": 1.8295420020818711, "epoch": 0.2876408813361243, "grad_norm": 6.913825988769531, "learning_rate": 4.717041687010413e-06, "loss": 0.454, "mean_token_accuracy": 0.8417760416865349, "num_tokens": 111569369.0, "step": 92790 }, { "entropy": 1.8156803011894227, "epoch": 0.287671880461174, "grad_norm": 8.341958045959473, "learning_rate": 4.716787526471468e-06, "loss": 0.4361, "mean_token_accuracy": 0.846878944337368, "num_tokens": 111582611.0, "step": 92800 }, { "entropy": 1.9245857551693917, "epoch": 0.2877028795862237, "grad_norm": 8.205559730529785, "learning_rate": 4.716533407011631e-06, "loss": 0.5094, "mean_token_accuracy": 0.8327091008424758, "num_tokens": 111594361.0, "step": 92810 }, { "entropy": 1.818833366036415, "epoch": 0.28773387871127337, "grad_norm": 3.7494218349456787, "learning_rate": 4.716279328619835e-06, "loss": 0.414, "mean_token_accuracy": 0.8558287933468819, "num_tokens": 111607793.0, "step": 92820 }, { "entropy": 1.8438904002308845, "epoch": 0.2877648778363231, "grad_norm": 8.092823028564453, "learning_rate": 4.716025291285019e-06, "loss": 0.496, "mean_token_accuracy": 0.8389999002218247, "num_tokens": 111620382.0, "step": 92830 }, { "entropy": 1.8870263323187828, "epoch": 0.28779587696137277, "grad_norm": 2.7017006874084473, "learning_rate": 4.715771294996129e-06, "loss": 0.4943, "mean_token_accuracy": 0.8476079568266869, "num_tokens": 111632680.0, "step": 92840 }, { "entropy": 1.841055366396904, "epoch": 0.28782687608642243, "grad_norm": 7.183150768280029, "learning_rate": 4.715517339742112e-06, "loss": 0.4525, "mean_token_accuracy": 0.8533366426825524, "num_tokens": 111644869.0, "step": 92850 }, { "entropy": 1.763474926352501, "epoch": 0.28785787521147216, "grad_norm": 3.5942344665527344, "learning_rate": 4.7152634255119215e-06, "loss": 0.3912, "mean_token_accuracy": 0.8623644202947617, "num_tokens": 111658019.0, "step": 92860 }, { "entropy": 1.9290682673454285, "epoch": 0.2878888743365218, "grad_norm": 11.06643295288086, "learning_rate": 4.71500955229451e-06, "loss": 0.5342, "mean_token_accuracy": 0.8362339481711387, "num_tokens": 111669612.0, "step": 92870 }, { "entropy": 1.9505492269992828, "epoch": 0.28791987346157155, "grad_norm": 7.899135112762451, "learning_rate": 4.7147557200788414e-06, "loss": 0.5255, "mean_token_accuracy": 0.8352016389369965, "num_tokens": 111680635.0, "step": 92880 }, { "entropy": 1.8193707883358001, "epoch": 0.2879508725866212, "grad_norm": 5.0817975997924805, "learning_rate": 4.714501928853879e-06, "loss": 0.4464, "mean_token_accuracy": 0.849019393324852, "num_tokens": 111693588.0, "step": 92890 }, { "entropy": 1.8536582559347152, "epoch": 0.28798187171167094, "grad_norm": 3.8260529041290283, "learning_rate": 4.714248178608591e-06, "loss": 0.438, "mean_token_accuracy": 0.8521668612957001, "num_tokens": 111705649.0, "step": 92900 }, { "entropy": 1.9044562801718712, "epoch": 0.2880128708367206, "grad_norm": 4.923748016357422, "learning_rate": 4.713994469331952e-06, "loss": 0.471, "mean_token_accuracy": 0.8511299833655357, "num_tokens": 111717116.0, "step": 92910 }, { "entropy": 1.8554534062743187, "epoch": 0.28804386996177034, "grad_norm": 3.183706521987915, "learning_rate": 4.713740801012937e-06, "loss": 0.3813, "mean_token_accuracy": 0.8666115805506707, "num_tokens": 111729411.0, "step": 92920 }, { "entropy": 1.8407445654273034, "epoch": 0.28807486908682, "grad_norm": 9.04386043548584, "learning_rate": 4.713487173640529e-06, "loss": 0.4396, "mean_token_accuracy": 0.848953865468502, "num_tokens": 111742147.0, "step": 92930 }, { "entropy": 1.8394930571317674, "epoch": 0.28810586821186973, "grad_norm": 3.4830808639526367, "learning_rate": 4.7132335872037114e-06, "loss": 0.4433, "mean_token_accuracy": 0.8509487017989159, "num_tokens": 111754776.0, "step": 92940 }, { "entropy": 1.8994437158107758, "epoch": 0.2881368673369194, "grad_norm": 9.240650177001953, "learning_rate": 4.712980041691476e-06, "loss": 0.5243, "mean_token_accuracy": 0.843148159980774, "num_tokens": 111765930.0, "step": 92950 }, { "entropy": 1.8866792187094688, "epoch": 0.2881678664619691, "grad_norm": 7.509947299957275, "learning_rate": 4.7127265370928134e-06, "loss": 0.4576, "mean_token_accuracy": 0.8502578973770142, "num_tokens": 111777482.0, "step": 92960 }, { "entropy": 1.8861197009682655, "epoch": 0.2881988655870188, "grad_norm": 7.608978271484375, "learning_rate": 4.712473073396724e-06, "loss": 0.4883, "mean_token_accuracy": 0.8533033058047295, "num_tokens": 111789308.0, "step": 92970 }, { "entropy": 1.8014571815729141, "epoch": 0.2882298647120685, "grad_norm": 5.268444061279297, "learning_rate": 4.7122196505922085e-06, "loss": 0.444, "mean_token_accuracy": 0.8505001902580261, "num_tokens": 111803076.0, "step": 92980 }, { "entropy": 1.9110738933086395, "epoch": 0.2882608638371182, "grad_norm": 7.389989376068115, "learning_rate": 4.711966268668274e-06, "loss": 0.5182, "mean_token_accuracy": 0.8408216923475266, "num_tokens": 111814444.0, "step": 92990 }, { "entropy": 1.8892789036035538, "epoch": 0.2882918629621679, "grad_norm": 8.411653518676758, "learning_rate": 4.711712927613929e-06, "loss": 0.4976, "mean_token_accuracy": 0.8434664875268936, "num_tokens": 111826241.0, "step": 93000 }, { "entropy": 1.929236751794815, "epoch": 0.2883228620872176, "grad_norm": 9.464624404907227, "learning_rate": 4.711459627418189e-06, "loss": 0.5093, "mean_token_accuracy": 0.8482060879468918, "num_tokens": 111838666.0, "step": 93010 }, { "entropy": 1.8663489505648614, "epoch": 0.2883538612122673, "grad_norm": 7.735058784484863, "learning_rate": 4.711206368070072e-06, "loss": 0.4294, "mean_token_accuracy": 0.8607504948973655, "num_tokens": 111851236.0, "step": 93020 }, { "entropy": 1.8760851591825485, "epoch": 0.288384860337317, "grad_norm": 7.6005754470825195, "learning_rate": 4.710953149558602e-06, "loss": 0.477, "mean_token_accuracy": 0.8493797600269317, "num_tokens": 111863060.0, "step": 93030 }, { "entropy": 1.8153146281838417, "epoch": 0.2884158594623667, "grad_norm": 8.263833999633789, "learning_rate": 4.710699971872803e-06, "loss": 0.4379, "mean_token_accuracy": 0.851483790576458, "num_tokens": 111876458.0, "step": 93040 }, { "entropy": 1.8655553236603737, "epoch": 0.28844685858741637, "grad_norm": 5.136788368225098, "learning_rate": 4.710446835001707e-06, "loss": 0.4657, "mean_token_accuracy": 0.8393679440021515, "num_tokens": 111889068.0, "step": 93050 }, { "entropy": 1.8693054020404816, "epoch": 0.2884778577124661, "grad_norm": 6.886144638061523, "learning_rate": 4.71019373893435e-06, "loss": 0.4437, "mean_token_accuracy": 0.8552120968699455, "num_tokens": 111901836.0, "step": 93060 }, { "entropy": 1.8734416976571082, "epoch": 0.28850885683751576, "grad_norm": 10.7568941116333, "learning_rate": 4.709940683659771e-06, "loss": 0.4433, "mean_token_accuracy": 0.8552261263132095, "num_tokens": 111913647.0, "step": 93070 }, { "entropy": 1.868347629904747, "epoch": 0.2885398559625655, "grad_norm": 4.4855852127075195, "learning_rate": 4.709687669167011e-06, "loss": 0.4386, "mean_token_accuracy": 0.8452133953571319, "num_tokens": 111926603.0, "step": 93080 }, { "entropy": 1.9437709406018258, "epoch": 0.28857085508761515, "grad_norm": 9.099634170532227, "learning_rate": 4.7094346954451196e-06, "loss": 0.5133, "mean_token_accuracy": 0.8310608744621277, "num_tokens": 111937928.0, "step": 93090 }, { "entropy": 1.8436076626181603, "epoch": 0.2886018542126648, "grad_norm": 4.898632526397705, "learning_rate": 4.709181762483149e-06, "loss": 0.4675, "mean_token_accuracy": 0.8579155907034874, "num_tokens": 111950393.0, "step": 93100 }, { "entropy": 1.8545200631022454, "epoch": 0.28863285333771455, "grad_norm": 6.226985931396484, "learning_rate": 4.708928870270152e-06, "loss": 0.4087, "mean_token_accuracy": 0.8614425033330917, "num_tokens": 111962194.0, "step": 93110 }, { "entropy": 1.9305305495858192, "epoch": 0.2886638524627642, "grad_norm": 8.529313087463379, "learning_rate": 4.70867601879519e-06, "loss": 0.5062, "mean_token_accuracy": 0.8401212841272354, "num_tokens": 111973375.0, "step": 93120 }, { "entropy": 1.8925403907895089, "epoch": 0.28869485158781394, "grad_norm": 5.547452449798584, "learning_rate": 4.7084232080473254e-06, "loss": 0.5184, "mean_token_accuracy": 0.8335786327719689, "num_tokens": 111985430.0, "step": 93130 }, { "entropy": 1.9314084231853486, "epoch": 0.2887258507128636, "grad_norm": 8.339520454406738, "learning_rate": 4.7081704380156275e-06, "loss": 0.5239, "mean_token_accuracy": 0.8429270505905151, "num_tokens": 111996483.0, "step": 93140 }, { "entropy": 1.8602096036076545, "epoch": 0.28875684983791333, "grad_norm": 7.6653523445129395, "learning_rate": 4.7079177086891694e-06, "loss": 0.4984, "mean_token_accuracy": 0.8460811406373978, "num_tokens": 112008286.0, "step": 93150 }, { "entropy": 1.8911155819892884, "epoch": 0.288787848962963, "grad_norm": 8.817441940307617, "learning_rate": 4.7076650200570235e-06, "loss": 0.4631, "mean_token_accuracy": 0.8515992224216461, "num_tokens": 112019993.0, "step": 93160 }, { "entropy": 1.9293202444911004, "epoch": 0.2888188480880127, "grad_norm": 7.830452919006348, "learning_rate": 4.707412372108274e-06, "loss": 0.5404, "mean_token_accuracy": 0.8322738707065582, "num_tokens": 112030697.0, "step": 93170 }, { "entropy": 1.9798065185546876, "epoch": 0.2888498472130624, "grad_norm": 8.459895133972168, "learning_rate": 4.707159764832003e-06, "loss": 0.5198, "mean_token_accuracy": 0.8400536015629768, "num_tokens": 112041497.0, "step": 93180 }, { "entropy": 1.879485437273979, "epoch": 0.2888808463381121, "grad_norm": 6.744016647338867, "learning_rate": 4.7069071982172985e-06, "loss": 0.4375, "mean_token_accuracy": 0.8491860061883927, "num_tokens": 112054220.0, "step": 93190 }, { "entropy": 1.9080306202173234, "epoch": 0.2889118454631618, "grad_norm": 7.247501373291016, "learning_rate": 4.706654672253255e-06, "loss": 0.4684, "mean_token_accuracy": 0.8472570210695267, "num_tokens": 112066105.0, "step": 93200 }, { "entropy": 1.8720626473426818, "epoch": 0.2889428445882115, "grad_norm": 7.93571138381958, "learning_rate": 4.706402186928967e-06, "loss": 0.461, "mean_token_accuracy": 0.8489278420805931, "num_tokens": 112078575.0, "step": 93210 }, { "entropy": 1.8863236621022224, "epoch": 0.2889738437132612, "grad_norm": 8.153715133666992, "learning_rate": 4.706149742233537e-06, "loss": 0.5219, "mean_token_accuracy": 0.8377825424075127, "num_tokens": 112090038.0, "step": 93220 }, { "entropy": 1.8599166497588158, "epoch": 0.2890048428383109, "grad_norm": 8.849479675292969, "learning_rate": 4.705897338156069e-06, "loss": 0.479, "mean_token_accuracy": 0.8481295198202133, "num_tokens": 112101547.0, "step": 93230 }, { "entropy": 1.7532717436552048, "epoch": 0.2890358419633606, "grad_norm": 8.861629486083984, "learning_rate": 4.705644974685672e-06, "loss": 0.3496, "mean_token_accuracy": 0.8671473681926727, "num_tokens": 112114998.0, "step": 93240 }, { "entropy": 1.8831738129258155, "epoch": 0.2890668410884103, "grad_norm": 7.922811031341553, "learning_rate": 4.705392651811459e-06, "loss": 0.5045, "mean_token_accuracy": 0.8377567693591118, "num_tokens": 112126554.0, "step": 93250 }, { "entropy": 1.8818488538265228, "epoch": 0.28909784021345997, "grad_norm": 8.488920211791992, "learning_rate": 4.705140369522546e-06, "loss": 0.4977, "mean_token_accuracy": 0.8422826811671257, "num_tokens": 112138514.0, "step": 93260 }, { "entropy": 1.9045387908816338, "epoch": 0.2891288393385097, "grad_norm": 7.4267425537109375, "learning_rate": 4.704888127808055e-06, "loss": 0.4912, "mean_token_accuracy": 0.8422675371170044, "num_tokens": 112149688.0, "step": 93270 }, { "entropy": 1.9745795860886575, "epoch": 0.28915983846355936, "grad_norm": 7.358890056610107, "learning_rate": 4.704635926657112e-06, "loss": 0.5901, "mean_token_accuracy": 0.8308152720332146, "num_tokens": 112161678.0, "step": 93280 }, { "entropy": 1.8890698000788688, "epoch": 0.2891908375886091, "grad_norm": 7.141902446746826, "learning_rate": 4.704383766058845e-06, "loss": 0.4607, "mean_token_accuracy": 0.8586556881666183, "num_tokens": 112173914.0, "step": 93290 }, { "entropy": 1.9566715627908706, "epoch": 0.28922183671365875, "grad_norm": 8.638976097106934, "learning_rate": 4.70413164600239e-06, "loss": 0.5518, "mean_token_accuracy": 0.8295210391283036, "num_tokens": 112185080.0, "step": 93300 }, { "entropy": 1.8873969689011574, "epoch": 0.2892528358387085, "grad_norm": 3.9027388095855713, "learning_rate": 4.70387956647688e-06, "loss": 0.4504, "mean_token_accuracy": 0.8503093495965004, "num_tokens": 112197377.0, "step": 93310 }, { "entropy": 1.8474943891167641, "epoch": 0.28928383496375815, "grad_norm": 4.076306343078613, "learning_rate": 4.703627527471461e-06, "loss": 0.4205, "mean_token_accuracy": 0.8605047658085823, "num_tokens": 112209297.0, "step": 93320 }, { "entropy": 1.9313654646277427, "epoch": 0.28931483408880787, "grad_norm": 11.894732475280762, "learning_rate": 4.703375528975276e-06, "loss": 0.4935, "mean_token_accuracy": 0.8404849201440812, "num_tokens": 112220394.0, "step": 93330 }, { "entropy": 1.9135547280311584, "epoch": 0.28934583321385754, "grad_norm": 7.225008487701416, "learning_rate": 4.703123570977474e-06, "loss": 0.4937, "mean_token_accuracy": 0.8464812904596328, "num_tokens": 112231383.0, "step": 93340 }, { "entropy": 1.9131829939782619, "epoch": 0.2893768323389072, "grad_norm": 10.179394721984863, "learning_rate": 4.702871653467211e-06, "loss": 0.4887, "mean_token_accuracy": 0.8461228340864182, "num_tokens": 112243480.0, "step": 93350 }, { "entropy": 1.9151297047734261, "epoch": 0.28940783146395693, "grad_norm": 7.998013019561768, "learning_rate": 4.702619776433645e-06, "loss": 0.4861, "mean_token_accuracy": 0.8424739927053452, "num_tokens": 112254846.0, "step": 93360 }, { "entropy": 1.8562763080000877, "epoch": 0.2894388305890066, "grad_norm": 3.337523937225342, "learning_rate": 4.702367939865935e-06, "loss": 0.4565, "mean_token_accuracy": 0.8496193781495094, "num_tokens": 112266394.0, "step": 93370 }, { "entropy": 1.9171319857239724, "epoch": 0.2894698297140563, "grad_norm": 8.044168472290039, "learning_rate": 4.70211614375325e-06, "loss": 0.4958, "mean_token_accuracy": 0.8436752587556839, "num_tokens": 112277740.0, "step": 93380 }, { "entropy": 1.8200967207551002, "epoch": 0.289500828839106, "grad_norm": 7.529294013977051, "learning_rate": 4.701864388084757e-06, "loss": 0.4502, "mean_token_accuracy": 0.8523975953459739, "num_tokens": 112290792.0, "step": 93390 }, { "entropy": 1.8970361724495888, "epoch": 0.2895318279641557, "grad_norm": 10.518641471862793, "learning_rate": 4.701612672849634e-06, "loss": 0.5585, "mean_token_accuracy": 0.8354102879762649, "num_tokens": 112302522.0, "step": 93400 }, { "entropy": 1.827714842557907, "epoch": 0.2895628270892054, "grad_norm": 8.352113723754883, "learning_rate": 4.701360998037056e-06, "loss": 0.4356, "mean_token_accuracy": 0.849641677737236, "num_tokens": 112315336.0, "step": 93410 }, { "entropy": 1.9102312728762627, "epoch": 0.2895938262142551, "grad_norm": 7.807504653930664, "learning_rate": 4.701109363636205e-06, "loss": 0.5247, "mean_token_accuracy": 0.8295806169509887, "num_tokens": 112326917.0, "step": 93420 }, { "entropy": 1.850624245405197, "epoch": 0.2896248253393048, "grad_norm": 4.632590293884277, "learning_rate": 4.70085776963627e-06, "loss": 0.4633, "mean_token_accuracy": 0.8438147455453873, "num_tokens": 112339561.0, "step": 93430 }, { "entropy": 1.909179501235485, "epoch": 0.2896558244643545, "grad_norm": 8.325363159179688, "learning_rate": 4.700606216026438e-06, "loss": 0.4844, "mean_token_accuracy": 0.8498807772994041, "num_tokens": 112350456.0, "step": 93440 }, { "entropy": 1.8641536325216292, "epoch": 0.2896868235894042, "grad_norm": 10.412588119506836, "learning_rate": 4.700354702795905e-06, "loss": 0.4899, "mean_token_accuracy": 0.8456828027963639, "num_tokens": 112363019.0, "step": 93450 }, { "entropy": 1.92139028608799, "epoch": 0.2897178227144539, "grad_norm": 10.26781940460205, "learning_rate": 4.700103229933871e-06, "loss": 0.5355, "mean_token_accuracy": 0.8238134995102883, "num_tokens": 112374578.0, "step": 93460 }, { "entropy": 1.9097017407417298, "epoch": 0.28974882183950357, "grad_norm": 6.589956283569336, "learning_rate": 4.699851797429535e-06, "loss": 0.4802, "mean_token_accuracy": 0.8451741203665734, "num_tokens": 112385717.0, "step": 93470 }, { "entropy": 1.9064988225698472, "epoch": 0.2897798209645533, "grad_norm": 7.078739643096924, "learning_rate": 4.6996004052721055e-06, "loss": 0.5205, "mean_token_accuracy": 0.832171306014061, "num_tokens": 112397934.0, "step": 93480 }, { "entropy": 1.8057002767920494, "epoch": 0.28981082008960296, "grad_norm": 8.572505950927734, "learning_rate": 4.699349053450793e-06, "loss": 0.3998, "mean_token_accuracy": 0.858871404826641, "num_tokens": 112411952.0, "step": 93490 }, { "entropy": 1.9524729505181313, "epoch": 0.2898418192146527, "grad_norm": 8.818181037902832, "learning_rate": 4.699097741954811e-06, "loss": 0.5257, "mean_token_accuracy": 0.8438670292496682, "num_tokens": 112423702.0, "step": 93500 }, { "entropy": 1.9189194455742835, "epoch": 0.28987281833970235, "grad_norm": 3.635896682739258, "learning_rate": 4.698846470773379e-06, "loss": 0.4516, "mean_token_accuracy": 0.8519726365804672, "num_tokens": 112435673.0, "step": 93510 }, { "entropy": 1.9499738737940788, "epoch": 0.2899038174647521, "grad_norm": 8.249286651611328, "learning_rate": 4.698595239895718e-06, "loss": 0.4861, "mean_token_accuracy": 0.844916258752346, "num_tokens": 112446773.0, "step": 93520 }, { "entropy": 1.948981523513794, "epoch": 0.28993481658980175, "grad_norm": 8.473828315734863, "learning_rate": 4.698344049311058e-06, "loss": 0.5087, "mean_token_accuracy": 0.8404761984944343, "num_tokens": 112457606.0, "step": 93530 }, { "entropy": 1.8598048985004425, "epoch": 0.28996581571485147, "grad_norm": 7.43870735168457, "learning_rate": 4.698092899008628e-06, "loss": 0.475, "mean_token_accuracy": 0.8440364077687263, "num_tokens": 112470335.0, "step": 93540 }, { "entropy": 1.871220625936985, "epoch": 0.28999681483990114, "grad_norm": 7.426569938659668, "learning_rate": 4.69784178897766e-06, "loss": 0.4636, "mean_token_accuracy": 0.8520672276616097, "num_tokens": 112482248.0, "step": 93550 }, { "entropy": 1.86535192579031, "epoch": 0.29002781396495086, "grad_norm": 4.510183811187744, "learning_rate": 4.697590719207397e-06, "loss": 0.4548, "mean_token_accuracy": 0.8448464289307595, "num_tokens": 112494530.0, "step": 93560 }, { "entropy": 1.8161170035600662, "epoch": 0.29005881309000053, "grad_norm": 8.694151878356934, "learning_rate": 4.69733968968708e-06, "loss": 0.4765, "mean_token_accuracy": 0.8455665096640587, "num_tokens": 112508367.0, "step": 93570 }, { "entropy": 1.9283171832561492, "epoch": 0.29008981221505026, "grad_norm": 8.558956146240234, "learning_rate": 4.697088700405954e-06, "loss": 0.567, "mean_token_accuracy": 0.8320746123790741, "num_tokens": 112519148.0, "step": 93580 }, { "entropy": 1.9475929975509643, "epoch": 0.2901208113400999, "grad_norm": 7.119383811950684, "learning_rate": 4.696837751353273e-06, "loss": 0.5031, "mean_token_accuracy": 0.8410884723067283, "num_tokens": 112529839.0, "step": 93590 }, { "entropy": 1.8843889951705932, "epoch": 0.2901518104651496, "grad_norm": 6.53165340423584, "learning_rate": 4.69658684251829e-06, "loss": 0.4745, "mean_token_accuracy": 0.8552757307887078, "num_tokens": 112541392.0, "step": 93600 }, { "entropy": 1.9115852415561676, "epoch": 0.2901828095901993, "grad_norm": 8.958046913146973, "learning_rate": 4.696335973890263e-06, "loss": 0.4723, "mean_token_accuracy": 0.8533832281827927, "num_tokens": 112551686.0, "step": 93610 }, { "entropy": 1.9201652556657791, "epoch": 0.290213808715249, "grad_norm": 9.570830345153809, "learning_rate": 4.696085145458457e-06, "loss": 0.5081, "mean_token_accuracy": 0.8437472805380821, "num_tokens": 112562743.0, "step": 93620 }, { "entropy": 1.909856851398945, "epoch": 0.2902448078402987, "grad_norm": 3.7993521690368652, "learning_rate": 4.695834357212138e-06, "loss": 0.5146, "mean_token_accuracy": 0.8377780944108963, "num_tokens": 112574467.0, "step": 93630 }, { "entropy": 1.8987789198756218, "epoch": 0.2902758069653484, "grad_norm": 10.340612411499023, "learning_rate": 4.695583609140576e-06, "loss": 0.5368, "mean_token_accuracy": 0.8381225094199181, "num_tokens": 112587453.0, "step": 93640 }, { "entropy": 1.767358809709549, "epoch": 0.2903068060903981, "grad_norm": 4.561580181121826, "learning_rate": 4.695332901233046e-06, "loss": 0.4904, "mean_token_accuracy": 0.845879316329956, "num_tokens": 112601795.0, "step": 93650 }, { "entropy": 1.8162215173244476, "epoch": 0.2903378052154478, "grad_norm": 6.31105899810791, "learning_rate": 4.695082233478828e-06, "loss": 0.4, "mean_token_accuracy": 0.8538815006613731, "num_tokens": 112614333.0, "step": 93660 }, { "entropy": 1.865521389245987, "epoch": 0.2903688043404975, "grad_norm": 8.672292709350586, "learning_rate": 4.694831605867206e-06, "loss": 0.4737, "mean_token_accuracy": 0.8480850234627724, "num_tokens": 112627252.0, "step": 93670 }, { "entropy": 1.7771051108837128, "epoch": 0.29039980346554717, "grad_norm": 9.864136695861816, "learning_rate": 4.694581018387463e-06, "loss": 0.3924, "mean_token_accuracy": 0.8620004311203957, "num_tokens": 112640242.0, "step": 93680 }, { "entropy": 1.8782151356339454, "epoch": 0.2904308025905969, "grad_norm": 6.314033031463623, "learning_rate": 4.694330471028893e-06, "loss": 0.4644, "mean_token_accuracy": 0.8508936673402786, "num_tokens": 112652406.0, "step": 93690 }, { "entropy": 1.9099466919898986, "epoch": 0.29046180171564656, "grad_norm": 4.040074348449707, "learning_rate": 4.694079963780791e-06, "loss": 0.5005, "mean_token_accuracy": 0.8430052191019058, "num_tokens": 112664170.0, "step": 93700 }, { "entropy": 1.863590781390667, "epoch": 0.2904928008406963, "grad_norm": 8.13239860534668, "learning_rate": 4.693829496632454e-06, "loss": 0.503, "mean_token_accuracy": 0.8332739755511284, "num_tokens": 112676541.0, "step": 93710 }, { "entropy": 1.8954721093177795, "epoch": 0.29052379996574595, "grad_norm": 9.620070457458496, "learning_rate": 4.693579069573186e-06, "loss": 0.5071, "mean_token_accuracy": 0.8386843577027321, "num_tokens": 112689006.0, "step": 93720 }, { "entropy": 1.8437098309397697, "epoch": 0.2905547990907957, "grad_norm": 8.268927574157715, "learning_rate": 4.693328682592294e-06, "loss": 0.4478, "mean_token_accuracy": 0.860004435479641, "num_tokens": 112701042.0, "step": 93730 }, { "entropy": 1.8573900401592254, "epoch": 0.29058579821584535, "grad_norm": 3.780688524246216, "learning_rate": 4.693078335679089e-06, "loss": 0.4399, "mean_token_accuracy": 0.8458506971597671, "num_tokens": 112713236.0, "step": 93740 }, { "entropy": 1.81138436794281, "epoch": 0.29061679734089507, "grad_norm": 9.589875221252441, "learning_rate": 4.692828028822885e-06, "loss": 0.4301, "mean_token_accuracy": 0.8497409135103225, "num_tokens": 112725893.0, "step": 93750 }, { "entropy": 1.835260456800461, "epoch": 0.29064779646594474, "grad_norm": 9.189959526062012, "learning_rate": 4.692577762013002e-06, "loss": 0.5272, "mean_token_accuracy": 0.8347838670015335, "num_tokens": 112738315.0, "step": 93760 }, { "entropy": 1.8794139876961709, "epoch": 0.29067879559099447, "grad_norm": 3.8947536945343018, "learning_rate": 4.692327535238763e-06, "loss": 0.4627, "mean_token_accuracy": 0.851905120909214, "num_tokens": 112750430.0, "step": 93770 }, { "entropy": 1.7996224954724311, "epoch": 0.29070979471604413, "grad_norm": 9.7765474319458, "learning_rate": 4.6920773484894935e-06, "loss": 0.4734, "mean_token_accuracy": 0.8449171632528305, "num_tokens": 112763272.0, "step": 93780 }, { "entropy": 1.8897378101944924, "epoch": 0.29074079384109386, "grad_norm": 9.47298526763916, "learning_rate": 4.6918272017545255e-06, "loss": 0.4834, "mean_token_accuracy": 0.8373202964663505, "num_tokens": 112775724.0, "step": 93790 }, { "entropy": 1.9278543174266816, "epoch": 0.2907717929661435, "grad_norm": 7.359293460845947, "learning_rate": 4.691577095023192e-06, "loss": 0.5133, "mean_token_accuracy": 0.8393516659736633, "num_tokens": 112787066.0, "step": 93800 }, { "entropy": 1.9417521178722381, "epoch": 0.29080279209119325, "grad_norm": 8.523365020751953, "learning_rate": 4.691327028284835e-06, "loss": 0.5202, "mean_token_accuracy": 0.835085253417492, "num_tokens": 112798938.0, "step": 93810 }, { "entropy": 1.830255390703678, "epoch": 0.2908337912162429, "grad_norm": 8.126928329467773, "learning_rate": 4.691077001528794e-06, "loss": 0.4632, "mean_token_accuracy": 0.8412661746144294, "num_tokens": 112812321.0, "step": 93820 }, { "entropy": 1.8606264024972916, "epoch": 0.2908647903412926, "grad_norm": 10.298184394836426, "learning_rate": 4.690827014744417e-06, "loss": 0.5009, "mean_token_accuracy": 0.840786500275135, "num_tokens": 112824184.0, "step": 93830 }, { "entropy": 1.9569880202412606, "epoch": 0.2908957894663423, "grad_norm": 10.95293140411377, "learning_rate": 4.690577067921055e-06, "loss": 0.5302, "mean_token_accuracy": 0.8427381262183189, "num_tokens": 112834958.0, "step": 93840 }, { "entropy": 1.879898366332054, "epoch": 0.290926788591392, "grad_norm": 8.868535041809082, "learning_rate": 4.690327161048064e-06, "loss": 0.4585, "mean_token_accuracy": 0.8543467596173286, "num_tokens": 112847253.0, "step": 93850 }, { "entropy": 1.884432803094387, "epoch": 0.2909577877164417, "grad_norm": 3.7411601543426514, "learning_rate": 4.6900772941147994e-06, "loss": 0.4685, "mean_token_accuracy": 0.8459440425038338, "num_tokens": 112859034.0, "step": 93860 }, { "entropy": 1.8857886865735054, "epoch": 0.2909887868414914, "grad_norm": 4.058228492736816, "learning_rate": 4.689827467110626e-06, "loss": 0.4782, "mean_token_accuracy": 0.8447174549102783, "num_tokens": 112870746.0, "step": 93870 }, { "entropy": 1.8584589600563048, "epoch": 0.2910197859665411, "grad_norm": 3.87026309967041, "learning_rate": 4.689577680024911e-06, "loss": 0.4473, "mean_token_accuracy": 0.846294179558754, "num_tokens": 112883210.0, "step": 93880 }, { "entropy": 1.867951761186123, "epoch": 0.29105078509159077, "grad_norm": 8.24725341796875, "learning_rate": 4.689327932847024e-06, "loss": 0.4867, "mean_token_accuracy": 0.8442766860127449, "num_tokens": 112895841.0, "step": 93890 }, { "entropy": 1.8113203912973403, "epoch": 0.2910817842166405, "grad_norm": 3.6465940475463867, "learning_rate": 4.689078225566338e-06, "loss": 0.442, "mean_token_accuracy": 0.8473143368959427, "num_tokens": 112909141.0, "step": 93900 }, { "entropy": 1.929970356822014, "epoch": 0.29111278334169016, "grad_norm": 8.883091926574707, "learning_rate": 4.688828558172234e-06, "loss": 0.4968, "mean_token_accuracy": 0.8561759814620018, "num_tokens": 112919787.0, "step": 93910 }, { "entropy": 1.9370476379990578, "epoch": 0.2911437824667399, "grad_norm": 7.68314790725708, "learning_rate": 4.688578930654094e-06, "loss": 0.5672, "mean_token_accuracy": 0.8174238324165344, "num_tokens": 112931094.0, "step": 93920 }, { "entropy": 1.8187593132257462, "epoch": 0.29117478159178956, "grad_norm": 9.00848388671875, "learning_rate": 4.688329343001302e-06, "loss": 0.4596, "mean_token_accuracy": 0.8468037515878677, "num_tokens": 112944183.0, "step": 93930 }, { "entropy": 1.7754915788769723, "epoch": 0.2912057807168393, "grad_norm": 8.143677711486816, "learning_rate": 4.688079795203251e-06, "loss": 0.3993, "mean_token_accuracy": 0.8577578738331795, "num_tokens": 112957991.0, "step": 93940 }, { "entropy": 1.8825334414839745, "epoch": 0.29123677984188895, "grad_norm": 4.188181400299072, "learning_rate": 4.687830287249335e-06, "loss": 0.4675, "mean_token_accuracy": 0.8422155871987342, "num_tokens": 112970170.0, "step": 93950 }, { "entropy": 1.8940382510423661, "epoch": 0.2912677789669387, "grad_norm": 9.72235107421875, "learning_rate": 4.68758081912895e-06, "loss": 0.5222, "mean_token_accuracy": 0.8427134841680527, "num_tokens": 112981192.0, "step": 93960 }, { "entropy": 1.8888707652688026, "epoch": 0.29129877809198834, "grad_norm": 8.664020538330078, "learning_rate": 4.6873313908315015e-06, "loss": 0.5094, "mean_token_accuracy": 0.8318000584840775, "num_tokens": 112992870.0, "step": 93970 }, { "entropy": 1.850681571662426, "epoch": 0.29132977721703807, "grad_norm": 4.301815986633301, "learning_rate": 4.687082002346394e-06, "loss": 0.4519, "mean_token_accuracy": 0.8496690198779107, "num_tokens": 113005123.0, "step": 93980 }, { "entropy": 1.7844605684280395, "epoch": 0.29136077634208774, "grad_norm": 8.043193817138672, "learning_rate": 4.686832653663037e-06, "loss": 0.4085, "mean_token_accuracy": 0.8557795941829681, "num_tokens": 113017593.0, "step": 93990 }, { "entropy": 1.7445304661989212, "epoch": 0.29139177546713746, "grad_norm": 3.7523953914642334, "learning_rate": 4.686583344770846e-06, "loss": 0.4032, "mean_token_accuracy": 0.8624411836266518, "num_tokens": 113031950.0, "step": 94000 }, { "entropy": 1.7966215670108796, "epoch": 0.29142277459218713, "grad_norm": 7.984848499298096, "learning_rate": 4.686334075659238e-06, "loss": 0.3961, "mean_token_accuracy": 0.8521535471081734, "num_tokens": 113045309.0, "step": 94010 }, { "entropy": 1.8039371758699416, "epoch": 0.29145377371723685, "grad_norm": 4.514188766479492, "learning_rate": 4.686084846317634e-06, "loss": 0.4388, "mean_token_accuracy": 0.8542758151888847, "num_tokens": 113058068.0, "step": 94020 }, { "entropy": 1.8654557079076768, "epoch": 0.2914847728422865, "grad_norm": 8.273473739624023, "learning_rate": 4.685835656735462e-06, "loss": 0.508, "mean_token_accuracy": 0.8452908799052239, "num_tokens": 113069283.0, "step": 94030 }, { "entropy": 1.8588725805282593, "epoch": 0.29151577196733625, "grad_norm": 4.237758636474609, "learning_rate": 4.685586506902148e-06, "loss": 0.5063, "mean_token_accuracy": 0.8419048935174942, "num_tokens": 113081319.0, "step": 94040 }, { "entropy": 1.8301612615585328, "epoch": 0.2915467710923859, "grad_norm": 8.443975448608398, "learning_rate": 4.685337396807132e-06, "loss": 0.4933, "mean_token_accuracy": 0.8360993027687073, "num_tokens": 113093388.0, "step": 94050 }, { "entropy": 1.8223912194371223, "epoch": 0.29157777021743564, "grad_norm": 8.578900337219238, "learning_rate": 4.685088326439846e-06, "loss": 0.4068, "mean_token_accuracy": 0.856349365413189, "num_tokens": 113106077.0, "step": 94060 }, { "entropy": 1.8360855415463448, "epoch": 0.2916087693424853, "grad_norm": 8.660734176635742, "learning_rate": 4.684839295789734e-06, "loss": 0.4765, "mean_token_accuracy": 0.848674775660038, "num_tokens": 113118493.0, "step": 94070 }, { "entropy": 1.8793357729911804, "epoch": 0.291639768467535, "grad_norm": 6.9419403076171875, "learning_rate": 4.684590304846241e-06, "loss": 0.4598, "mean_token_accuracy": 0.8568961501121521, "num_tokens": 113129937.0, "step": 94080 }, { "entropy": 1.909253677725792, "epoch": 0.2916707675925847, "grad_norm": 8.138631820678711, "learning_rate": 4.684341353598818e-06, "loss": 0.5175, "mean_token_accuracy": 0.8358252078294754, "num_tokens": 113140848.0, "step": 94090 }, { "entropy": 1.8075340047478676, "epoch": 0.29170176671763437, "grad_norm": 8.938015937805176, "learning_rate": 4.684092442036915e-06, "loss": 0.4432, "mean_token_accuracy": 0.8573233857750893, "num_tokens": 113153007.0, "step": 94100 }, { "entropy": 1.8430316671729088, "epoch": 0.2917327658426841, "grad_norm": 7.955845355987549, "learning_rate": 4.683843570149992e-06, "loss": 0.4929, "mean_token_accuracy": 0.8404422610998153, "num_tokens": 113165004.0, "step": 94110 }, { "entropy": 1.8731530100107192, "epoch": 0.29176376496773376, "grad_norm": 9.946492195129395, "learning_rate": 4.683594737927509e-06, "loss": 0.4666, "mean_token_accuracy": 0.8477850437164307, "num_tokens": 113176838.0, "step": 94120 }, { "entropy": 1.8446904599666596, "epoch": 0.2917947640927835, "grad_norm": 4.911070823669434, "learning_rate": 4.683345945358933e-06, "loss": 0.4723, "mean_token_accuracy": 0.8407913163304329, "num_tokens": 113189520.0, "step": 94130 }, { "entropy": 1.8978979125618936, "epoch": 0.29182576321783316, "grad_norm": 8.811211585998535, "learning_rate": 4.683097192433731e-06, "loss": 0.4674, "mean_token_accuracy": 0.8498449578881264, "num_tokens": 113201050.0, "step": 94140 }, { "entropy": 1.868384449183941, "epoch": 0.2918567623428829, "grad_norm": 8.619186401367188, "learning_rate": 4.682848479141376e-06, "loss": 0.48, "mean_token_accuracy": 0.8416146129369736, "num_tokens": 113213345.0, "step": 94150 }, { "entropy": 1.8393750131130218, "epoch": 0.29188776146793255, "grad_norm": 2.398193836212158, "learning_rate": 4.682599805471346e-06, "loss": 0.4774, "mean_token_accuracy": 0.8392178237438201, "num_tokens": 113226489.0, "step": 94160 }, { "entropy": 1.9002157673239708, "epoch": 0.2919187605929823, "grad_norm": 9.483999252319336, "learning_rate": 4.68235117141312e-06, "loss": 0.4923, "mean_token_accuracy": 0.8522425264120101, "num_tokens": 113238608.0, "step": 94170 }, { "entropy": 1.8459337919950485, "epoch": 0.29194975971803194, "grad_norm": 7.897007942199707, "learning_rate": 4.6821025769561855e-06, "loss": 0.4421, "mean_token_accuracy": 0.8493504419922828, "num_tokens": 113250584.0, "step": 94180 }, { "entropy": 1.9223339349031447, "epoch": 0.29198075884308167, "grad_norm": 8.27923583984375, "learning_rate": 4.681854022090028e-06, "loss": 0.4856, "mean_token_accuracy": 0.8487838566303253, "num_tokens": 113261767.0, "step": 94190 }, { "entropy": 1.9111609309911728, "epoch": 0.29201175796813134, "grad_norm": 8.224406242370605, "learning_rate": 4.681605506804143e-06, "loss": 0.498, "mean_token_accuracy": 0.839901152253151, "num_tokens": 113273424.0, "step": 94200 }, { "entropy": 1.8766125679016112, "epoch": 0.29204275709318106, "grad_norm": 8.163731575012207, "learning_rate": 4.681357031088025e-06, "loss": 0.4346, "mean_token_accuracy": 0.8469099998474121, "num_tokens": 113285663.0, "step": 94210 }, { "entropy": 1.815425206720829, "epoch": 0.29207375621823073, "grad_norm": 8.935981750488281, "learning_rate": 4.681108594931173e-06, "loss": 0.4639, "mean_token_accuracy": 0.8499750107526779, "num_tokens": 113298461.0, "step": 94220 }, { "entropy": 1.6602164059877396, "epoch": 0.29210475534328045, "grad_norm": 8.308954238891602, "learning_rate": 4.680860198323094e-06, "loss": 0.383, "mean_token_accuracy": 0.8591342657804489, "num_tokens": 113314031.0, "step": 94230 }, { "entropy": 1.838196623325348, "epoch": 0.2921357544683301, "grad_norm": 3.9183874130249023, "learning_rate": 4.6806118412532965e-06, "loss": 0.4273, "mean_token_accuracy": 0.8539181277155876, "num_tokens": 113326110.0, "step": 94240 }, { "entropy": 1.8387087047100068, "epoch": 0.29216675359337985, "grad_norm": 7.87742280960083, "learning_rate": 4.680363523711289e-06, "loss": 0.451, "mean_token_accuracy": 0.8537355214357376, "num_tokens": 113338311.0, "step": 94250 }, { "entropy": 1.8915387392044067, "epoch": 0.2921977527184295, "grad_norm": 8.182762145996094, "learning_rate": 4.680115245686591e-06, "loss": 0.4672, "mean_token_accuracy": 0.8396169424057007, "num_tokens": 113350148.0, "step": 94260 }, { "entropy": 1.8612817063927651, "epoch": 0.29222875184347924, "grad_norm": 6.411753177642822, "learning_rate": 4.679867007168719e-06, "loss": 0.4455, "mean_token_accuracy": 0.8560265928506852, "num_tokens": 113362183.0, "step": 94270 }, { "entropy": 1.8689087167382241, "epoch": 0.2922597509685289, "grad_norm": 8.272517204284668, "learning_rate": 4.6796188081472e-06, "loss": 0.4405, "mean_token_accuracy": 0.8516158699989319, "num_tokens": 113374575.0, "step": 94280 }, { "entropy": 1.9062586069107055, "epoch": 0.29229075009357863, "grad_norm": 10.613982200622559, "learning_rate": 4.679370648611559e-06, "loss": 0.5072, "mean_token_accuracy": 0.8422037899494171, "num_tokens": 113386126.0, "step": 94290 }, { "entropy": 1.9051846981048584, "epoch": 0.2923217492186283, "grad_norm": 8.695990562438965, "learning_rate": 4.679122528551329e-06, "loss": 0.5411, "mean_token_accuracy": 0.841190955042839, "num_tokens": 113397175.0, "step": 94300 }, { "entropy": 1.7903152495622634, "epoch": 0.292352748343678, "grad_norm": 4.608640670776367, "learning_rate": 4.678874447956044e-06, "loss": 0.391, "mean_token_accuracy": 0.8582356154918671, "num_tokens": 113410086.0, "step": 94310 }, { "entropy": 1.8944632783532143, "epoch": 0.2923837474687277, "grad_norm": 8.895879745483398, "learning_rate": 4.6786264068152445e-06, "loss": 0.47, "mean_token_accuracy": 0.8449664607644081, "num_tokens": 113422204.0, "step": 94320 }, { "entropy": 1.9663603961467744, "epoch": 0.29241474659377736, "grad_norm": 10.93663215637207, "learning_rate": 4.678378405118473e-06, "loss": 0.5461, "mean_token_accuracy": 0.8348477125167847, "num_tokens": 113432962.0, "step": 94330 }, { "entropy": 1.8723286166787148, "epoch": 0.2924457457188271, "grad_norm": 8.900105476379395, "learning_rate": 4.6781304428552765e-06, "loss": 0.4489, "mean_token_accuracy": 0.8530125021934509, "num_tokens": 113444695.0, "step": 94340 }, { "entropy": 1.7696161583065986, "epoch": 0.29247674484387676, "grad_norm": 7.942965030670166, "learning_rate": 4.677882520015207e-06, "loss": 0.4524, "mean_token_accuracy": 0.857183714210987, "num_tokens": 113458974.0, "step": 94350 }, { "entropy": 1.8979527205228806, "epoch": 0.2925077439689265, "grad_norm": 10.76806354522705, "learning_rate": 4.677634636587817e-06, "loss": 0.4853, "mean_token_accuracy": 0.8429912239313125, "num_tokens": 113470746.0, "step": 94360 }, { "entropy": 1.8394412383437158, "epoch": 0.29253874309397615, "grad_norm": 7.812624931335449, "learning_rate": 4.677386792562667e-06, "loss": 0.4536, "mean_token_accuracy": 0.8541227161884308, "num_tokens": 113482575.0, "step": 94370 }, { "entropy": 1.8561811774969101, "epoch": 0.2925697422190259, "grad_norm": 7.494784832000732, "learning_rate": 4.6771389879293185e-06, "loss": 0.4912, "mean_token_accuracy": 0.8553539365530014, "num_tokens": 113493656.0, "step": 94380 }, { "entropy": 1.877823382616043, "epoch": 0.29260074134407554, "grad_norm": 7.919898986816406, "learning_rate": 4.676891222677338e-06, "loss": 0.4551, "mean_token_accuracy": 0.8565265223383903, "num_tokens": 113504994.0, "step": 94390 }, { "entropy": 1.8452934324741364, "epoch": 0.29263174046912527, "grad_norm": 9.235838890075684, "learning_rate": 4.6766434967962945e-06, "loss": 0.4938, "mean_token_accuracy": 0.8403481170535088, "num_tokens": 113517235.0, "step": 94400 }, { "entropy": 1.8997774094343185, "epoch": 0.29266273959417494, "grad_norm": 8.089444160461426, "learning_rate": 4.6763958102757665e-06, "loss": 0.4971, "mean_token_accuracy": 0.8421216562390328, "num_tokens": 113528646.0, "step": 94410 }, { "entropy": 1.6933333344757557, "epoch": 0.29269373871922466, "grad_norm": 9.213237762451172, "learning_rate": 4.676148163105327e-06, "loss": 0.499, "mean_token_accuracy": 0.8442223116755485, "num_tokens": 113543849.0, "step": 94420 }, { "entropy": 1.74602971971035, "epoch": 0.29272473784427433, "grad_norm": 8.171969413757324, "learning_rate": 4.67590055527456e-06, "loss": 0.388, "mean_token_accuracy": 0.8524969726800918, "num_tokens": 113556535.0, "step": 94430 }, { "entropy": 1.899616888165474, "epoch": 0.29275573696932405, "grad_norm": 9.324505805969238, "learning_rate": 4.675652986773051e-06, "loss": 0.5368, "mean_token_accuracy": 0.8389844998717308, "num_tokens": 113567827.0, "step": 94440 }, { "entropy": 1.85470499843359, "epoch": 0.2927867360943737, "grad_norm": 7.886104106903076, "learning_rate": 4.675405457590389e-06, "loss": 0.4989, "mean_token_accuracy": 0.8475810378789902, "num_tokens": 113578947.0, "step": 94450 }, { "entropy": 1.9117970570921898, "epoch": 0.29281773521942345, "grad_norm": 8.605646133422852, "learning_rate": 4.675157967716168e-06, "loss": 0.4978, "mean_token_accuracy": 0.8469821259379386, "num_tokens": 113590112.0, "step": 94460 }, { "entropy": 1.850995273888111, "epoch": 0.2928487343444731, "grad_norm": 9.236472129821777, "learning_rate": 4.6749105171399864e-06, "loss": 0.4171, "mean_token_accuracy": 0.8592243269085884, "num_tokens": 113602127.0, "step": 94470 }, { "entropy": 1.894811224937439, "epoch": 0.29287973346952284, "grad_norm": 7.549960136413574, "learning_rate": 4.674663105851442e-06, "loss": 0.5144, "mean_token_accuracy": 0.8472552940249443, "num_tokens": 113613991.0, "step": 94480 }, { "entropy": 1.8211253330111503, "epoch": 0.2929107325945725, "grad_norm": 10.19444751739502, "learning_rate": 4.674415733840143e-06, "loss": 0.5209, "mean_token_accuracy": 0.8452879965305329, "num_tokens": 113626732.0, "step": 94490 }, { "entropy": 1.870133863389492, "epoch": 0.29294173171962223, "grad_norm": 5.497763633728027, "learning_rate": 4.674168401095697e-06, "loss": 0.5169, "mean_token_accuracy": 0.8333605661988258, "num_tokens": 113638260.0, "step": 94500 }, { "entropy": 1.881983858346939, "epoch": 0.2929727308446719, "grad_norm": 7.503384590148926, "learning_rate": 4.673921107607716e-06, "loss": 0.4997, "mean_token_accuracy": 0.8480863809585572, "num_tokens": 113649729.0, "step": 94510 }, { "entropy": 1.7449950769543647, "epoch": 0.2930037299697216, "grad_norm": 6.960958003997803, "learning_rate": 4.673673853365818e-06, "loss": 0.4333, "mean_token_accuracy": 0.8576125666499138, "num_tokens": 113662822.0, "step": 94520 }, { "entropy": 1.6805919706821442, "epoch": 0.2930347290947713, "grad_norm": 7.576868534088135, "learning_rate": 4.673426638359622e-06, "loss": 0.3747, "mean_token_accuracy": 0.8681085333228111, "num_tokens": 113677064.0, "step": 94530 }, { "entropy": 1.8898225530982018, "epoch": 0.293065728219821, "grad_norm": 8.339954376220703, "learning_rate": 4.673179462578754e-06, "loss": 0.5254, "mean_token_accuracy": 0.834310744702816, "num_tokens": 113688785.0, "step": 94540 }, { "entropy": 1.822121039032936, "epoch": 0.2930967273448707, "grad_norm": 2.9981343746185303, "learning_rate": 4.672932326012839e-06, "loss": 0.4196, "mean_token_accuracy": 0.8623607367277145, "num_tokens": 113700761.0, "step": 94550 }, { "entropy": 1.8925957545638084, "epoch": 0.2931277264699204, "grad_norm": 8.26031494140625, "learning_rate": 4.672685228651511e-06, "loss": 0.4731, "mean_token_accuracy": 0.8499511271715164, "num_tokens": 113712498.0, "step": 94560 }, { "entropy": 1.92208162099123, "epoch": 0.2931587255949701, "grad_norm": 9.060463905334473, "learning_rate": 4.672438170484405e-06, "loss": 0.5613, "mean_token_accuracy": 0.8314491465687752, "num_tokens": 113724082.0, "step": 94570 }, { "entropy": 1.8770214468240738, "epoch": 0.29318972472001975, "grad_norm": 8.743376731872559, "learning_rate": 4.672191151501161e-06, "loss": 0.4541, "mean_token_accuracy": 0.8547257348895073, "num_tokens": 113736315.0, "step": 94580 }, { "entropy": 1.837056641280651, "epoch": 0.2932207238450695, "grad_norm": 4.969115257263184, "learning_rate": 4.671944171691422e-06, "loss": 0.464, "mean_token_accuracy": 0.8457030534744263, "num_tokens": 113748972.0, "step": 94590 }, { "entropy": 1.9361342743039132, "epoch": 0.29325172297011914, "grad_norm": 8.590668678283691, "learning_rate": 4.671697231044837e-06, "loss": 0.4903, "mean_token_accuracy": 0.8455305263400078, "num_tokens": 113760795.0, "step": 94600 }, { "entropy": 1.775714547932148, "epoch": 0.29328272209516887, "grad_norm": 3.653806209564209, "learning_rate": 4.671450329551054e-06, "loss": 0.4358, "mean_token_accuracy": 0.8520709335803985, "num_tokens": 113774218.0, "step": 94610 }, { "entropy": 1.9447847425937652, "epoch": 0.29331372122021854, "grad_norm": 8.897464752197266, "learning_rate": 4.671203467199731e-06, "loss": 0.5127, "mean_token_accuracy": 0.8491786658763886, "num_tokens": 113785173.0, "step": 94620 }, { "entropy": 1.8407832726836204, "epoch": 0.29334472034526826, "grad_norm": 6.721490383148193, "learning_rate": 4.670956643980524e-06, "loss": 0.4677, "mean_token_accuracy": 0.8475279733538628, "num_tokens": 113797753.0, "step": 94630 }, { "entropy": 1.7925242587924004, "epoch": 0.29337571947031793, "grad_norm": 8.641847610473633, "learning_rate": 4.670709859883096e-06, "loss": 0.4577, "mean_token_accuracy": 0.8569319665431976, "num_tokens": 113809733.0, "step": 94640 }, { "entropy": 1.9358911573886872, "epoch": 0.29340671859536765, "grad_norm": 8.337167739868164, "learning_rate": 4.670463114897114e-06, "loss": 0.5519, "mean_token_accuracy": 0.8383975982666015, "num_tokens": 113820542.0, "step": 94650 }, { "entropy": 1.8865822792053222, "epoch": 0.2934377177204173, "grad_norm": 8.994181632995605, "learning_rate": 4.670216409012248e-06, "loss": 0.49, "mean_token_accuracy": 0.8425557687878609, "num_tokens": 113832371.0, "step": 94660 }, { "entropy": 1.7770501509308816, "epoch": 0.29346871684546705, "grad_norm": 6.771418571472168, "learning_rate": 4.669969742218173e-06, "loss": 0.431, "mean_token_accuracy": 0.8530333042144775, "num_tokens": 113845284.0, "step": 94670 }, { "entropy": 1.7958509385585786, "epoch": 0.2934997159705167, "grad_norm": 7.999570369720459, "learning_rate": 4.669723114504565e-06, "loss": 0.4007, "mean_token_accuracy": 0.8648405924439431, "num_tokens": 113858418.0, "step": 94680 }, { "entropy": 1.9169925883412362, "epoch": 0.29353071509556644, "grad_norm": 8.945980072021484, "learning_rate": 4.669476525861107e-06, "loss": 0.5081, "mean_token_accuracy": 0.8366316080093383, "num_tokens": 113869941.0, "step": 94690 }, { "entropy": 1.804078498482704, "epoch": 0.2935617142206161, "grad_norm": 8.323823928833008, "learning_rate": 4.669229976277483e-06, "loss": 0.3992, "mean_token_accuracy": 0.8675397202372551, "num_tokens": 113882565.0, "step": 94700 }, { "entropy": 1.9035957649350166, "epoch": 0.29359271334566583, "grad_norm": 9.34669017791748, "learning_rate": 4.668983465743385e-06, "loss": 0.5124, "mean_token_accuracy": 0.8365237683057785, "num_tokens": 113893898.0, "step": 94710 }, { "entropy": 1.9729483425617218, "epoch": 0.2936237124707155, "grad_norm": 6.514955043792725, "learning_rate": 4.668736994248504e-06, "loss": 0.5382, "mean_token_accuracy": 0.8372229024767875, "num_tokens": 113904833.0, "step": 94720 }, { "entropy": 1.9153319388628005, "epoch": 0.2936547115957652, "grad_norm": 11.098676681518555, "learning_rate": 4.668490561782535e-06, "loss": 0.5469, "mean_token_accuracy": 0.8335372865200043, "num_tokens": 113915799.0, "step": 94730 }, { "entropy": 1.9642977565526962, "epoch": 0.2936857107208149, "grad_norm": 8.632896423339844, "learning_rate": 4.668244168335182e-06, "loss": 0.5027, "mean_token_accuracy": 0.8496646553277969, "num_tokens": 113926831.0, "step": 94740 }, { "entropy": 1.8451596662402152, "epoch": 0.2937167098458646, "grad_norm": 8.402206420898438, "learning_rate": 4.667997813896149e-06, "loss": 0.4127, "mean_token_accuracy": 0.8657459333539009, "num_tokens": 113938570.0, "step": 94750 }, { "entropy": 1.8689325533807277, "epoch": 0.2937477089709143, "grad_norm": 9.82726001739502, "learning_rate": 4.667751498455142e-06, "loss": 0.5013, "mean_token_accuracy": 0.8491661697626114, "num_tokens": 113950581.0, "step": 94760 }, { "entropy": 1.81126269698143, "epoch": 0.293778708095964, "grad_norm": 2.5253586769104004, "learning_rate": 4.667505222001875e-06, "loss": 0.4055, "mean_token_accuracy": 0.8603689581155777, "num_tokens": 113963539.0, "step": 94770 }, { "entropy": 1.8925784215331078, "epoch": 0.2938097072210137, "grad_norm": 7.797617435455322, "learning_rate": 4.667258984526063e-06, "loss": 0.4787, "mean_token_accuracy": 0.8523267209529877, "num_tokens": 113975014.0, "step": 94780 }, { "entropy": 1.8433985903859138, "epoch": 0.2938407063460634, "grad_norm": 10.589468002319336, "learning_rate": 4.667012786017426e-06, "loss": 0.4875, "mean_token_accuracy": 0.8405422657728195, "num_tokens": 113987158.0, "step": 94790 }, { "entropy": 1.8784984156489373, "epoch": 0.2938717054711131, "grad_norm": 3.9485971927642822, "learning_rate": 4.666766626465689e-06, "loss": 0.4409, "mean_token_accuracy": 0.8542310744524002, "num_tokens": 113998286.0, "step": 94800 }, { "entropy": 1.8718909628689289, "epoch": 0.2939027045961628, "grad_norm": 8.77500057220459, "learning_rate": 4.666520505860577e-06, "loss": 0.461, "mean_token_accuracy": 0.8534493952989578, "num_tokens": 114010433.0, "step": 94810 }, { "entropy": 1.9200672134757042, "epoch": 0.29393370372121247, "grad_norm": 8.568684577941895, "learning_rate": 4.666274424191821e-06, "loss": 0.4846, "mean_token_accuracy": 0.8516236767172813, "num_tokens": 114021482.0, "step": 94820 }, { "entropy": 1.8536953687667848, "epoch": 0.29396470284626214, "grad_norm": 7.200621128082275, "learning_rate": 4.666028381449159e-06, "loss": 0.4337, "mean_token_accuracy": 0.8599602654576302, "num_tokens": 114033985.0, "step": 94830 }, { "entropy": 1.894244983792305, "epoch": 0.29399570197131186, "grad_norm": 10.332306861877441, "learning_rate": 4.6657823776223255e-06, "loss": 0.4844, "mean_token_accuracy": 0.8414657115936279, "num_tokens": 114045285.0, "step": 94840 }, { "entropy": 1.863986374437809, "epoch": 0.29402670109636153, "grad_norm": 4.247035980224609, "learning_rate": 4.6655364127010655e-06, "loss": 0.4657, "mean_token_accuracy": 0.8416002795100213, "num_tokens": 114057674.0, "step": 94850 }, { "entropy": 1.9077302902936935, "epoch": 0.29405770022141126, "grad_norm": 7.60083532333374, "learning_rate": 4.665290486675124e-06, "loss": 0.4954, "mean_token_accuracy": 0.8370777696371079, "num_tokens": 114069600.0, "step": 94860 }, { "entropy": 1.8785944551229476, "epoch": 0.2940886993464609, "grad_norm": 7.80269718170166, "learning_rate": 4.665044599534251e-06, "loss": 0.4277, "mean_token_accuracy": 0.852102880179882, "num_tokens": 114081686.0, "step": 94870 }, { "entropy": 1.8737225562334061, "epoch": 0.29411969847151065, "grad_norm": 8.264700889587402, "learning_rate": 4.664798751268201e-06, "loss": 0.4726, "mean_token_accuracy": 0.8497600093483925, "num_tokens": 114093863.0, "step": 94880 }, { "entropy": 1.9434197053313256, "epoch": 0.2941506975965603, "grad_norm": 8.022650718688965, "learning_rate": 4.664552941866732e-06, "loss": 0.482, "mean_token_accuracy": 0.8454866543412208, "num_tokens": 114105246.0, "step": 94890 }, { "entropy": 1.7929591804742813, "epoch": 0.29418169672161004, "grad_norm": 4.1385908126831055, "learning_rate": 4.664307171319604e-06, "loss": 0.4195, "mean_token_accuracy": 0.84492729306221, "num_tokens": 114118876.0, "step": 94900 }, { "entropy": 1.9504995405673982, "epoch": 0.2942126958466597, "grad_norm": 7.590723037719727, "learning_rate": 4.664061439616583e-06, "loss": 0.4788, "mean_token_accuracy": 0.8555523350834846, "num_tokens": 114129331.0, "step": 94910 }, { "entropy": 1.8454117804765702, "epoch": 0.29424369497170944, "grad_norm": 10.14136791229248, "learning_rate": 4.663815746747437e-06, "loss": 0.4786, "mean_token_accuracy": 0.8382917091250419, "num_tokens": 114141760.0, "step": 94920 }, { "entropy": 1.8560292690992355, "epoch": 0.2942746940967591, "grad_norm": 7.97593879699707, "learning_rate": 4.66357009270194e-06, "loss": 0.5148, "mean_token_accuracy": 0.8398404493927956, "num_tokens": 114154144.0, "step": 94930 }, { "entropy": 1.9188194185495377, "epoch": 0.29430569322180883, "grad_norm": 8.06023120880127, "learning_rate": 4.663324477469867e-06, "loss": 0.4748, "mean_token_accuracy": 0.8514153569936752, "num_tokens": 114165232.0, "step": 94940 }, { "entropy": 1.9558087676763534, "epoch": 0.2943366923468585, "grad_norm": 9.450501441955566, "learning_rate": 4.663078901040999e-06, "loss": 0.5441, "mean_token_accuracy": 0.8372850298881531, "num_tokens": 114176045.0, "step": 94950 }, { "entropy": 1.8351658523082732, "epoch": 0.2943676914719082, "grad_norm": 4.117894649505615, "learning_rate": 4.662833363405119e-06, "loss": 0.4534, "mean_token_accuracy": 0.8479064837098121, "num_tokens": 114189019.0, "step": 94960 }, { "entropy": 1.8557307586073875, "epoch": 0.2943986905969579, "grad_norm": 8.198458671569824, "learning_rate": 4.662587864552017e-06, "loss": 0.464, "mean_token_accuracy": 0.8527988627552986, "num_tokens": 114201808.0, "step": 94970 }, { "entropy": 1.927764955163002, "epoch": 0.2944296897220076, "grad_norm": 9.65707015991211, "learning_rate": 4.662342404471482e-06, "loss": 0.5003, "mean_token_accuracy": 0.8513452783226967, "num_tokens": 114213290.0, "step": 94980 }, { "entropy": 1.76514892578125, "epoch": 0.2944606888470573, "grad_norm": 8.054937362670898, "learning_rate": 4.662096983153311e-06, "loss": 0.3548, "mean_token_accuracy": 0.8645235538482666, "num_tokens": 114226877.0, "step": 94990 }, { "entropy": 1.8766354367136955, "epoch": 0.294491687972107, "grad_norm": 4.424029350280762, "learning_rate": 4.661851600587301e-06, "loss": 0.4644, "mean_token_accuracy": 0.8427970930933952, "num_tokens": 114239241.0, "step": 95000 }, { "entropy": 1.9168139308691026, "epoch": 0.2945226870971567, "grad_norm": 7.28840970993042, "learning_rate": 4.661606256763257e-06, "loss": 0.528, "mean_token_accuracy": 0.8504084393382072, "num_tokens": 114250255.0, "step": 95010 }, { "entropy": 1.7996359452605248, "epoch": 0.2945536862222064, "grad_norm": 8.724395751953125, "learning_rate": 4.661360951670983e-06, "loss": 0.4217, "mean_token_accuracy": 0.8572775185108185, "num_tokens": 114262731.0, "step": 95020 }, { "entropy": 1.847113935649395, "epoch": 0.29458468534725607, "grad_norm": 8.312970161437988, "learning_rate": 4.661115685300293e-06, "loss": 0.4897, "mean_token_accuracy": 0.8483634769916535, "num_tokens": 114274768.0, "step": 95030 }, { "entropy": 1.8922452926635742, "epoch": 0.2946156844723058, "grad_norm": 6.005066871643066, "learning_rate": 4.660870457640998e-06, "loss": 0.4613, "mean_token_accuracy": 0.846984113752842, "num_tokens": 114286772.0, "step": 95040 }, { "entropy": 1.8506748765707015, "epoch": 0.29464668359735546, "grad_norm": 3.484794855117798, "learning_rate": 4.660625268682915e-06, "loss": 0.4566, "mean_token_accuracy": 0.8471810176968575, "num_tokens": 114298457.0, "step": 95050 }, { "entropy": 1.952257016301155, "epoch": 0.2946776827224052, "grad_norm": 9.706525802612305, "learning_rate": 4.660380118415869e-06, "loss": 0.5516, "mean_token_accuracy": 0.8365617975592613, "num_tokens": 114309656.0, "step": 95060 }, { "entropy": 1.854885457456112, "epoch": 0.29470868184745486, "grad_norm": 7.192900657653809, "learning_rate": 4.660135006829682e-06, "loss": 0.4771, "mean_token_accuracy": 0.8500230267643929, "num_tokens": 114321375.0, "step": 95070 }, { "entropy": 1.8961684226989746, "epoch": 0.2947396809725045, "grad_norm": 8.305758476257324, "learning_rate": 4.659889933914185e-06, "loss": 0.4983, "mean_token_accuracy": 0.8406580999493599, "num_tokens": 114333189.0, "step": 95080 }, { "entropy": 1.914514508843422, "epoch": 0.29477068009755425, "grad_norm": 7.5317702293396, "learning_rate": 4.65964489965921e-06, "loss": 0.5174, "mean_token_accuracy": 0.8434663847088814, "num_tokens": 114344937.0, "step": 95090 }, { "entropy": 1.8632069528102875, "epoch": 0.2948016792226039, "grad_norm": 8.437536239624023, "learning_rate": 4.659399904054594e-06, "loss": 0.512, "mean_token_accuracy": 0.8462415754795074, "num_tokens": 114357269.0, "step": 95100 }, { "entropy": 1.9126679688692092, "epoch": 0.29483267834765364, "grad_norm": 4.100780010223389, "learning_rate": 4.6591549470901755e-06, "loss": 0.5264, "mean_token_accuracy": 0.8231175735592842, "num_tokens": 114369778.0, "step": 95110 }, { "entropy": 1.84993801638484, "epoch": 0.2948636774727033, "grad_norm": 8.060639381408691, "learning_rate": 4.6589100287558015e-06, "loss": 0.4221, "mean_token_accuracy": 0.8632707759737969, "num_tokens": 114383231.0, "step": 95120 }, { "entropy": 1.9371715486049652, "epoch": 0.29489467659775304, "grad_norm": 7.608775615692139, "learning_rate": 4.658665149041318e-06, "loss": 0.5282, "mean_token_accuracy": 0.8381532579660416, "num_tokens": 114393685.0, "step": 95130 }, { "entropy": 1.8963730677962303, "epoch": 0.2949256757228027, "grad_norm": 7.8309550285339355, "learning_rate": 4.6584203079365756e-06, "loss": 0.4819, "mean_token_accuracy": 0.8506500810384751, "num_tokens": 114404986.0, "step": 95140 }, { "entropy": 1.8873082131147385, "epoch": 0.29495667484785243, "grad_norm": 8.878643989562988, "learning_rate": 4.658175505431431e-06, "loss": 0.4859, "mean_token_accuracy": 0.8476304590702057, "num_tokens": 114416424.0, "step": 95150 }, { "entropy": 1.7684010401368142, "epoch": 0.2949876739729021, "grad_norm": 4.694974422454834, "learning_rate": 4.657930741515742e-06, "loss": 0.4157, "mean_token_accuracy": 0.853473761677742, "num_tokens": 114430299.0, "step": 95160 }, { "entropy": 1.8848821252584458, "epoch": 0.2950186730979518, "grad_norm": 9.460819244384766, "learning_rate": 4.657686016179372e-06, "loss": 0.5366, "mean_token_accuracy": 0.8388624176383018, "num_tokens": 114441497.0, "step": 95170 }, { "entropy": 1.8895638823509215, "epoch": 0.2950496722230015, "grad_norm": 7.845742702484131, "learning_rate": 4.6574413294121865e-06, "loss": 0.5091, "mean_token_accuracy": 0.840872798860073, "num_tokens": 114453312.0, "step": 95180 }, { "entropy": 1.8671682730317116, "epoch": 0.2950806713480512, "grad_norm": 8.617532730102539, "learning_rate": 4.657196681204057e-06, "loss": 0.4543, "mean_token_accuracy": 0.858017711341381, "num_tokens": 114464871.0, "step": 95190 }, { "entropy": 1.8859140858054162, "epoch": 0.2951116704731009, "grad_norm": 8.359928131103516, "learning_rate": 4.656952071544857e-06, "loss": 0.503, "mean_token_accuracy": 0.8446494430303574, "num_tokens": 114476109.0, "step": 95200 }, { "entropy": 1.9119530409574508, "epoch": 0.2951426695981506, "grad_norm": 9.479840278625488, "learning_rate": 4.656707500424463e-06, "loss": 0.5339, "mean_token_accuracy": 0.8339703768491745, "num_tokens": 114487221.0, "step": 95210 }, { "entropy": 1.8878633320331573, "epoch": 0.2951736687232003, "grad_norm": 3.638047218322754, "learning_rate": 4.656462967832758e-06, "loss": 0.4996, "mean_token_accuracy": 0.8457856431603432, "num_tokens": 114498809.0, "step": 95220 }, { "entropy": 1.893609069287777, "epoch": 0.29520466784825, "grad_norm": 7.487365245819092, "learning_rate": 4.656218473759623e-06, "loss": 0.4817, "mean_token_accuracy": 0.849824258685112, "num_tokens": 114510866.0, "step": 95230 }, { "entropy": 1.9306818440556526, "epoch": 0.29523566697329967, "grad_norm": 7.329023838043213, "learning_rate": 4.655974018194953e-06, "loss": 0.4857, "mean_token_accuracy": 0.8432904377579689, "num_tokens": 114521966.0, "step": 95240 }, { "entropy": 1.9246254205703734, "epoch": 0.2952666660983494, "grad_norm": 8.638476371765137, "learning_rate": 4.655729601128635e-06, "loss": 0.493, "mean_token_accuracy": 0.8454683497548103, "num_tokens": 114533785.0, "step": 95250 }, { "entropy": 1.8969884738326073, "epoch": 0.29529766522339906, "grad_norm": 7.14600944519043, "learning_rate": 4.655485222550568e-06, "loss": 0.4495, "mean_token_accuracy": 0.8438623622059822, "num_tokens": 114545791.0, "step": 95260 }, { "entropy": 1.8377904728055001, "epoch": 0.2953286643484488, "grad_norm": 6.43510103225708, "learning_rate": 4.65524088245065e-06, "loss": 0.433, "mean_token_accuracy": 0.8514227852225303, "num_tokens": 114558462.0, "step": 95270 }, { "entropy": 1.9160344362258912, "epoch": 0.29535966347349846, "grad_norm": 10.534104347229004, "learning_rate": 4.654996580818786e-06, "loss": 0.4926, "mean_token_accuracy": 0.845592126250267, "num_tokens": 114569485.0, "step": 95280 }, { "entropy": 1.8697632998228073, "epoch": 0.2953906625985482, "grad_norm": 8.796942710876465, "learning_rate": 4.654752317644883e-06, "loss": 0.5373, "mean_token_accuracy": 0.8387501269578934, "num_tokens": 114581277.0, "step": 95290 }, { "entropy": 1.831010890007019, "epoch": 0.29542166172359785, "grad_norm": 9.258670806884766, "learning_rate": 4.654508092918852e-06, "loss": 0.4995, "mean_token_accuracy": 0.8350206702947617, "num_tokens": 114593427.0, "step": 95300 }, { "entropy": 1.8626672565937041, "epoch": 0.2954526608486475, "grad_norm": 4.265320777893066, "learning_rate": 4.6542639066306065e-06, "loss": 0.4421, "mean_token_accuracy": 0.8505603194236755, "num_tokens": 114605252.0, "step": 95310 }, { "entropy": 1.7814742475748062, "epoch": 0.29548365997369724, "grad_norm": 9.163718223571777, "learning_rate": 4.654019758770067e-06, "loss": 0.4164, "mean_token_accuracy": 0.8585087850689888, "num_tokens": 114618618.0, "step": 95320 }, { "entropy": 1.8076193630695343, "epoch": 0.2955146590987469, "grad_norm": 7.5134477615356445, "learning_rate": 4.653775649327154e-06, "loss": 0.4022, "mean_token_accuracy": 0.8535345688462257, "num_tokens": 114632466.0, "step": 95330 }, { "entropy": 1.878467933833599, "epoch": 0.29554565822379664, "grad_norm": 3.287616729736328, "learning_rate": 4.653531578291793e-06, "loss": 0.4508, "mean_token_accuracy": 0.8520422443747521, "num_tokens": 114644129.0, "step": 95340 }, { "entropy": 1.7941744670271873, "epoch": 0.2955766573488463, "grad_norm": 8.888310432434082, "learning_rate": 4.653287545653915e-06, "loss": 0.4527, "mean_token_accuracy": 0.8592835500836372, "num_tokens": 114657506.0, "step": 95350 }, { "entropy": 1.842213323712349, "epoch": 0.29560765647389603, "grad_norm": 8.71075439453125, "learning_rate": 4.653043551403452e-06, "loss": 0.4674, "mean_token_accuracy": 0.8478049322962761, "num_tokens": 114669587.0, "step": 95360 }, { "entropy": 1.8689180329442023, "epoch": 0.2956386555989457, "grad_norm": 7.912755489349365, "learning_rate": 4.652799595530342e-06, "loss": 0.4909, "mean_token_accuracy": 0.8550763368606568, "num_tokens": 114682059.0, "step": 95370 }, { "entropy": 1.9113721013069154, "epoch": 0.2956696547239954, "grad_norm": 9.693363189697266, "learning_rate": 4.652555678024524e-06, "loss": 0.4759, "mean_token_accuracy": 0.853804387152195, "num_tokens": 114693470.0, "step": 95380 }, { "entropy": 1.8694558992981911, "epoch": 0.2957006538490451, "grad_norm": 8.779112815856934, "learning_rate": 4.652311798875943e-06, "loss": 0.4829, "mean_token_accuracy": 0.8518740639090538, "num_tokens": 114705358.0, "step": 95390 }, { "entropy": 1.8623104050755501, "epoch": 0.2957316529740948, "grad_norm": 9.614526748657227, "learning_rate": 4.652067958074547e-06, "loss": 0.4875, "mean_token_accuracy": 0.8510076805949212, "num_tokens": 114718125.0, "step": 95400 }, { "entropy": 1.8787400797009468, "epoch": 0.2957626520991445, "grad_norm": 8.458456039428711, "learning_rate": 4.651824155610288e-06, "loss": 0.4505, "mean_token_accuracy": 0.8569131478667259, "num_tokens": 114729728.0, "step": 95410 }, { "entropy": 1.9131328269839287, "epoch": 0.2957936512241942, "grad_norm": 9.49421215057373, "learning_rate": 4.6515803914731215e-06, "loss": 0.472, "mean_token_accuracy": 0.845634426176548, "num_tokens": 114742431.0, "step": 95420 }, { "entropy": 1.7844593867659568, "epoch": 0.2958246503492439, "grad_norm": 8.483735084533691, "learning_rate": 4.651336665653005e-06, "loss": 0.3808, "mean_token_accuracy": 0.8685527250170708, "num_tokens": 114755668.0, "step": 95430 }, { "entropy": 1.922539620101452, "epoch": 0.2958556494742936, "grad_norm": 7.755213260650635, "learning_rate": 4.651092978139902e-06, "loss": 0.5134, "mean_token_accuracy": 0.8362011343240738, "num_tokens": 114766944.0, "step": 95440 }, { "entropy": 1.8400570914149283, "epoch": 0.29588664859934327, "grad_norm": 7.850890636444092, "learning_rate": 4.650849328923779e-06, "loss": 0.4173, "mean_token_accuracy": 0.8529717952013016, "num_tokens": 114779470.0, "step": 95450 }, { "entropy": 1.8972987502813339, "epoch": 0.295917647724393, "grad_norm": 10.900968551635742, "learning_rate": 4.650605717994607e-06, "loss": 0.4505, "mean_token_accuracy": 0.8552507728338241, "num_tokens": 114791012.0, "step": 95460 }, { "entropy": 1.904093398153782, "epoch": 0.29594864684944266, "grad_norm": 7.401911735534668, "learning_rate": 4.650362145342358e-06, "loss": 0.4572, "mean_token_accuracy": 0.8534610956907273, "num_tokens": 114802550.0, "step": 95470 }, { "entropy": 1.8667383641004562, "epoch": 0.2959796459744924, "grad_norm": 7.704689025878906, "learning_rate": 4.650118610957009e-06, "loss": 0.468, "mean_token_accuracy": 0.8388277173042298, "num_tokens": 114814877.0, "step": 95480 }, { "entropy": 1.8931495070457458, "epoch": 0.29601064509954206, "grad_norm": 9.703409194946289, "learning_rate": 4.649875114828544e-06, "loss": 0.5122, "mean_token_accuracy": 0.8397621661424637, "num_tokens": 114826671.0, "step": 95490 }, { "entropy": 1.949988493323326, "epoch": 0.2960416442245918, "grad_norm": 7.655276298522949, "learning_rate": 4.6496316569469455e-06, "loss": 0.515, "mean_token_accuracy": 0.8395925149321556, "num_tokens": 114837908.0, "step": 95500 }, { "entropy": 1.9334761276841164, "epoch": 0.29607264334964145, "grad_norm": 8.131454467773438, "learning_rate": 4.649388237302203e-06, "loss": 0.5127, "mean_token_accuracy": 0.8403454497456551, "num_tokens": 114849471.0, "step": 95510 }, { "entropy": 1.902390295267105, "epoch": 0.2961036424746912, "grad_norm": 8.340156555175781, "learning_rate": 4.6491448558843065e-06, "loss": 0.5007, "mean_token_accuracy": 0.8446046605706214, "num_tokens": 114861012.0, "step": 95520 }, { "entropy": 1.9234414175152779, "epoch": 0.29613464159974084, "grad_norm": 9.107486724853516, "learning_rate": 4.648901512683255e-06, "loss": 0.4532, "mean_token_accuracy": 0.84765195697546, "num_tokens": 114872674.0, "step": 95530 }, { "entropy": 1.8331530943512917, "epoch": 0.29616564072479057, "grad_norm": 8.991997718811035, "learning_rate": 4.648658207689045e-06, "loss": 0.4409, "mean_token_accuracy": 0.8575505539774895, "num_tokens": 114885367.0, "step": 95540 }, { "entropy": 1.8234670519828797, "epoch": 0.29619663984984024, "grad_norm": 5.19529914855957, "learning_rate": 4.648414940891681e-06, "loss": 0.4943, "mean_token_accuracy": 0.8523637726902962, "num_tokens": 114899055.0, "step": 95550 }, { "entropy": 1.8568585798144341, "epoch": 0.2962276389748899, "grad_norm": 4.264057636260986, "learning_rate": 4.648171712281169e-06, "loss": 0.4132, "mean_token_accuracy": 0.8576111733913422, "num_tokens": 114911683.0, "step": 95560 }, { "entropy": 1.9596162497997285, "epoch": 0.29625863809993963, "grad_norm": 9.092710494995117, "learning_rate": 4.64792852184752e-06, "loss": 0.5458, "mean_token_accuracy": 0.8393025726079941, "num_tokens": 114922365.0, "step": 95570 }, { "entropy": 1.8301913827657699, "epoch": 0.2962896372249893, "grad_norm": 7.440187454223633, "learning_rate": 4.647685369580747e-06, "loss": 0.4163, "mean_token_accuracy": 0.857902692258358, "num_tokens": 114935376.0, "step": 95580 }, { "entropy": 1.9472972556948662, "epoch": 0.296320636350039, "grad_norm": 8.745131492614746, "learning_rate": 4.64744225547087e-06, "loss": 0.5434, "mean_token_accuracy": 0.828769737482071, "num_tokens": 114946969.0, "step": 95590 }, { "entropy": 1.935869987308979, "epoch": 0.2963516354750887, "grad_norm": 7.479891777038574, "learning_rate": 4.647199179507909e-06, "loss": 0.4902, "mean_token_accuracy": 0.8483911573886871, "num_tokens": 114958473.0, "step": 95600 }, { "entropy": 1.899821263551712, "epoch": 0.2963826346001384, "grad_norm": 7.5076680183410645, "learning_rate": 4.646956141681888e-06, "loss": 0.5001, "mean_token_accuracy": 0.845979979634285, "num_tokens": 114969919.0, "step": 95610 }, { "entropy": 1.9035681039094925, "epoch": 0.2964136337251881, "grad_norm": 8.332342147827148, "learning_rate": 4.646713141982837e-06, "loss": 0.4396, "mean_token_accuracy": 0.8571434035897255, "num_tokens": 114982144.0, "step": 95620 }, { "entropy": 1.924658827483654, "epoch": 0.2964446328502378, "grad_norm": 9.30444622039795, "learning_rate": 4.646470180400788e-06, "loss": 0.4844, "mean_token_accuracy": 0.8478532791137695, "num_tokens": 114993947.0, "step": 95630 }, { "entropy": 1.8619504556059838, "epoch": 0.2964756319752875, "grad_norm": 5.051435470581055, "learning_rate": 4.646227256925777e-06, "loss": 0.4141, "mean_token_accuracy": 0.8470785349607468, "num_tokens": 115006275.0, "step": 95640 }, { "entropy": 1.8626134410500526, "epoch": 0.2965066311003372, "grad_norm": 4.338906288146973, "learning_rate": 4.645984371547844e-06, "loss": 0.4394, "mean_token_accuracy": 0.8533647701144218, "num_tokens": 115018973.0, "step": 95650 }, { "entropy": 1.8923157513141633, "epoch": 0.2965376302253869, "grad_norm": 8.258339881896973, "learning_rate": 4.645741524257032e-06, "loss": 0.5561, "mean_token_accuracy": 0.8394008025527, "num_tokens": 115030711.0, "step": 95660 }, { "entropy": 1.9015984013676643, "epoch": 0.2965686293504366, "grad_norm": 10.353814125061035, "learning_rate": 4.645498715043387e-06, "loss": 0.5335, "mean_token_accuracy": 0.8356149435043335, "num_tokens": 115042545.0, "step": 95670 }, { "entropy": 1.9278481543064117, "epoch": 0.29659962847548627, "grad_norm": 8.37845516204834, "learning_rate": 4.645255943896961e-06, "loss": 0.4757, "mean_token_accuracy": 0.8446251779794693, "num_tokens": 115054273.0, "step": 95680 }, { "entropy": 1.9885007172822953, "epoch": 0.296630627600536, "grad_norm": 8.631330490112305, "learning_rate": 4.645013210807806e-06, "loss": 0.4925, "mean_token_accuracy": 0.8454102978110314, "num_tokens": 115064894.0, "step": 95690 }, { "entropy": 1.8312623113393784, "epoch": 0.29666162672558566, "grad_norm": 8.99056339263916, "learning_rate": 4.644770515765983e-06, "loss": 0.3975, "mean_token_accuracy": 0.8604427292943001, "num_tokens": 115077844.0, "step": 95700 }, { "entropy": 1.8540172457695008, "epoch": 0.2966926258506354, "grad_norm": 2.9684078693389893, "learning_rate": 4.64452785876155e-06, "loss": 0.4706, "mean_token_accuracy": 0.844263382256031, "num_tokens": 115090181.0, "step": 95710 }, { "entropy": 1.9741147756576538, "epoch": 0.29672362497568505, "grad_norm": 8.870850563049316, "learning_rate": 4.644285239784575e-06, "loss": 0.5372, "mean_token_accuracy": 0.8312768504023552, "num_tokens": 115100406.0, "step": 95720 }, { "entropy": 1.9829291343688964, "epoch": 0.2967546241007348, "grad_norm": 3.856050729751587, "learning_rate": 4.644042658825126e-06, "loss": 0.538, "mean_token_accuracy": 0.8350884988903999, "num_tokens": 115111439.0, "step": 95730 }, { "entropy": 1.9604087606072427, "epoch": 0.29678562322578445, "grad_norm": 8.375349998474121, "learning_rate": 4.643800115873274e-06, "loss": 0.5206, "mean_token_accuracy": 0.8406010925769806, "num_tokens": 115122511.0, "step": 95740 }, { "entropy": 1.8680056795477866, "epoch": 0.29681662235083417, "grad_norm": 3.6315958499908447, "learning_rate": 4.643557610919095e-06, "loss": 0.4559, "mean_token_accuracy": 0.851527801156044, "num_tokens": 115134569.0, "step": 95750 }, { "entropy": 1.860308752954006, "epoch": 0.29684762147588384, "grad_norm": 7.783955097198486, "learning_rate": 4.643315143952671e-06, "loss": 0.4196, "mean_token_accuracy": 0.8580260217189789, "num_tokens": 115146701.0, "step": 95760 }, { "entropy": 1.9309222459793092, "epoch": 0.29687862060093356, "grad_norm": 8.806280136108398, "learning_rate": 4.643072714964084e-06, "loss": 0.5613, "mean_token_accuracy": 0.836700190603733, "num_tokens": 115157808.0, "step": 95770 }, { "entropy": 1.9781962364912034, "epoch": 0.29690961972598323, "grad_norm": 7.529308319091797, "learning_rate": 4.64283032394342e-06, "loss": 0.524, "mean_token_accuracy": 0.8430225938558579, "num_tokens": 115168935.0, "step": 95780 }, { "entropy": 1.8590948715806008, "epoch": 0.29694061885103296, "grad_norm": 8.396674156188965, "learning_rate": 4.642587970880769e-06, "loss": 0.4035, "mean_token_accuracy": 0.8598534435033798, "num_tokens": 115180678.0, "step": 95790 }, { "entropy": 1.8222449347376823, "epoch": 0.2969716179760826, "grad_norm": 5.250387191772461, "learning_rate": 4.642345655766227e-06, "loss": 0.4177, "mean_token_accuracy": 0.8539089500904083, "num_tokens": 115193480.0, "step": 95800 }, { "entropy": 1.8699547871947289, "epoch": 0.2970026171011323, "grad_norm": 10.275331497192383, "learning_rate": 4.642103378589891e-06, "loss": 0.4747, "mean_token_accuracy": 0.8331056758761406, "num_tokens": 115205319.0, "step": 95810 }, { "entropy": 1.8728644341230392, "epoch": 0.297033616226182, "grad_norm": 3.5322608947753906, "learning_rate": 4.641861139341863e-06, "loss": 0.4816, "mean_token_accuracy": 0.8516603350639343, "num_tokens": 115217604.0, "step": 95820 }, { "entropy": 1.9369212195277214, "epoch": 0.2970646153512317, "grad_norm": 7.612384796142578, "learning_rate": 4.641618938012246e-06, "loss": 0.4872, "mean_token_accuracy": 0.8410941004753113, "num_tokens": 115229163.0, "step": 95830 }, { "entropy": 1.930888931453228, "epoch": 0.2970956144762814, "grad_norm": 8.152316093444824, "learning_rate": 4.64137677459115e-06, "loss": 0.5037, "mean_token_accuracy": 0.8360361099243164, "num_tokens": 115241832.0, "step": 95840 }, { "entropy": 1.860244083404541, "epoch": 0.2971266136013311, "grad_norm": 2.479391574859619, "learning_rate": 4.641134649068688e-06, "loss": 0.4802, "mean_token_accuracy": 0.8496681988239289, "num_tokens": 115254340.0, "step": 95850 }, { "entropy": 1.8632259294390678, "epoch": 0.2971576127263808, "grad_norm": 4.353872299194336, "learning_rate": 4.640892561434973e-06, "loss": 0.4722, "mean_token_accuracy": 0.8551715791225434, "num_tokens": 115266838.0, "step": 95860 }, { "entropy": 1.999795663356781, "epoch": 0.2971886118514305, "grad_norm": 9.249789237976074, "learning_rate": 4.640650511680128e-06, "loss": 0.5064, "mean_token_accuracy": 0.8440039098262787, "num_tokens": 115277215.0, "step": 95870 }, { "entropy": 1.954987433552742, "epoch": 0.2972196109764802, "grad_norm": 13.373074531555176, "learning_rate": 4.640408499794271e-06, "loss": 0.5134, "mean_token_accuracy": 0.8423504129052162, "num_tokens": 115287841.0, "step": 95880 }, { "entropy": 1.821416835486889, "epoch": 0.29725061010152987, "grad_norm": 3.3793182373046875, "learning_rate": 4.640166525767535e-06, "loss": 0.3991, "mean_token_accuracy": 0.8553699105978012, "num_tokens": 115300828.0, "step": 95890 }, { "entropy": 1.9414212331175804, "epoch": 0.2972816092265796, "grad_norm": 3.7656142711639404, "learning_rate": 4.639924589590045e-06, "loss": 0.4728, "mean_token_accuracy": 0.8337204396724701, "num_tokens": 115312598.0, "step": 95900 }, { "entropy": 1.9279131293296814, "epoch": 0.29731260835162926, "grad_norm": 10.359955787658691, "learning_rate": 4.639682691251938e-06, "loss": 0.4972, "mean_token_accuracy": 0.8390220627188683, "num_tokens": 115324584.0, "step": 95910 }, { "entropy": 1.859154610335827, "epoch": 0.297343607476679, "grad_norm": 7.873774528503418, "learning_rate": 4.6394408307433494e-06, "loss": 0.3915, "mean_token_accuracy": 0.8545582070946693, "num_tokens": 115337114.0, "step": 95920 }, { "entropy": 1.9387451350688933, "epoch": 0.29737460660172865, "grad_norm": 10.652080535888672, "learning_rate": 4.639199008054421e-06, "loss": 0.4937, "mean_token_accuracy": 0.8364695340394974, "num_tokens": 115348545.0, "step": 95930 }, { "entropy": 2.010096028447151, "epoch": 0.2974056057267784, "grad_norm": 9.396890640258789, "learning_rate": 4.638957223175298e-06, "loss": 0.5485, "mean_token_accuracy": 0.8352597668766976, "num_tokens": 115359072.0, "step": 95940 }, { "entropy": 1.845473076403141, "epoch": 0.29743660485182805, "grad_norm": 7.396862030029297, "learning_rate": 4.638715476096127e-06, "loss": 0.4258, "mean_token_accuracy": 0.8523558273911476, "num_tokens": 115372520.0, "step": 95950 }, { "entropy": 1.8642360031604768, "epoch": 0.29746760397687777, "grad_norm": 3.6565442085266113, "learning_rate": 4.638473766807061e-06, "loss": 0.4411, "mean_token_accuracy": 0.8511311173439026, "num_tokens": 115385427.0, "step": 95960 }, { "entropy": 1.9817214131355285, "epoch": 0.29749860310192744, "grad_norm": 9.105077743530273, "learning_rate": 4.638232095298256e-06, "loss": 0.5501, "mean_token_accuracy": 0.8311897858977317, "num_tokens": 115396089.0, "step": 95970 }, { "entropy": 1.8148702546954154, "epoch": 0.29752960222697716, "grad_norm": 4.5015106201171875, "learning_rate": 4.6379904615598705e-06, "loss": 0.4107, "mean_token_accuracy": 0.8546622708439827, "num_tokens": 115408984.0, "step": 95980 }, { "entropy": 1.8370864361524581, "epoch": 0.29756060135202683, "grad_norm": 8.784217834472656, "learning_rate": 4.637748865582065e-06, "loss": 0.4133, "mean_token_accuracy": 0.8529459208250045, "num_tokens": 115421841.0, "step": 95990 }, { "entropy": 1.9331490993499756, "epoch": 0.29759160047707656, "grad_norm": 8.97538948059082, "learning_rate": 4.637507307355009e-06, "loss": 0.4849, "mean_token_accuracy": 0.8470474123954773, "num_tokens": 115432927.0, "step": 96000 }, { "entropy": 1.8707862794399261, "epoch": 0.2976225996021262, "grad_norm": 8.228986740112305, "learning_rate": 4.63726578686887e-06, "loss": 0.4487, "mean_token_accuracy": 0.857261911034584, "num_tokens": 115444664.0, "step": 96010 }, { "entropy": 1.9656662926077844, "epoch": 0.29765359872717595, "grad_norm": 8.407549858093262, "learning_rate": 4.637024304113822e-06, "loss": 0.4903, "mean_token_accuracy": 0.847040732204914, "num_tokens": 115456093.0, "step": 96020 }, { "entropy": 2.015861451625824, "epoch": 0.2976845978522256, "grad_norm": 8.171123504638672, "learning_rate": 4.636782859080041e-06, "loss": 0.5187, "mean_token_accuracy": 0.8443436101078987, "num_tokens": 115467201.0, "step": 96030 }, { "entropy": 2.0067085534334184, "epoch": 0.29771559697727534, "grad_norm": 9.067146301269531, "learning_rate": 4.636541451757711e-06, "loss": 0.5731, "mean_token_accuracy": 0.8257392793893814, "num_tokens": 115478314.0, "step": 96040 }, { "entropy": 1.8171662881970405, "epoch": 0.297746596102325, "grad_norm": 9.887383460998535, "learning_rate": 4.636300082137011e-06, "loss": 0.4195, "mean_token_accuracy": 0.860226346552372, "num_tokens": 115490891.0, "step": 96050 }, { "entropy": 1.8969256058335304, "epoch": 0.2977775952273747, "grad_norm": 3.9007980823516846, "learning_rate": 4.636058750208131e-06, "loss": 0.5259, "mean_token_accuracy": 0.8481943354010582, "num_tokens": 115502955.0, "step": 96060 }, { "entropy": 1.855407053232193, "epoch": 0.2978085943524244, "grad_norm": 7.683308124542236, "learning_rate": 4.635817455961264e-06, "loss": 0.4505, "mean_token_accuracy": 0.8527497768402099, "num_tokens": 115516432.0, "step": 96070 }, { "entropy": 1.9460287556052207, "epoch": 0.2978395934774741, "grad_norm": 10.421119689941406, "learning_rate": 4.635576199386602e-06, "loss": 0.556, "mean_token_accuracy": 0.8432202488183975, "num_tokens": 115528309.0, "step": 96080 }, { "entropy": 1.9160909160971642, "epoch": 0.2978705926025238, "grad_norm": 7.117789268493652, "learning_rate": 4.635334980474345e-06, "loss": 0.5228, "mean_token_accuracy": 0.8440079033374787, "num_tokens": 115539915.0, "step": 96090 }, { "entropy": 1.9123343035578728, "epoch": 0.29790159172757347, "grad_norm": 7.615013122558594, "learning_rate": 4.635093799214693e-06, "loss": 0.483, "mean_token_accuracy": 0.8447704032063484, "num_tokens": 115551725.0, "step": 96100 }, { "entropy": 1.8921069249510765, "epoch": 0.2979325908526232, "grad_norm": 8.272263526916504, "learning_rate": 4.634852655597854e-06, "loss": 0.4898, "mean_token_accuracy": 0.8503027930855751, "num_tokens": 115563104.0, "step": 96110 }, { "entropy": 1.9330130770802498, "epoch": 0.29796358997767286, "grad_norm": 7.9516143798828125, "learning_rate": 4.634611549614036e-06, "loss": 0.4694, "mean_token_accuracy": 0.846668167412281, "num_tokens": 115574420.0, "step": 96120 }, { "entropy": 1.930530808866024, "epoch": 0.2979945891027226, "grad_norm": 10.271747589111328, "learning_rate": 4.634370481253451e-06, "loss": 0.5066, "mean_token_accuracy": 0.8329700931906701, "num_tokens": 115586106.0, "step": 96130 }, { "entropy": 1.9371146902441978, "epoch": 0.29802558822777225, "grad_norm": 8.164349555969238, "learning_rate": 4.634129450506316e-06, "loss": 0.5208, "mean_token_accuracy": 0.8380087822675705, "num_tokens": 115597047.0, "step": 96140 }, { "entropy": 1.93800760358572, "epoch": 0.298056587352822, "grad_norm": 8.42188549041748, "learning_rate": 4.633888457362851e-06, "loss": 0.4878, "mean_token_accuracy": 0.8453196853399276, "num_tokens": 115608165.0, "step": 96150 }, { "entropy": 1.8701598271727562, "epoch": 0.29808758647787165, "grad_norm": 8.132607460021973, "learning_rate": 4.633647501813278e-06, "loss": 0.4596, "mean_token_accuracy": 0.8482308521866798, "num_tokens": 115620759.0, "step": 96160 }, { "entropy": 1.8687379583716393, "epoch": 0.29811858560292137, "grad_norm": 3.990684747695923, "learning_rate": 4.633406583847825e-06, "loss": 0.4503, "mean_token_accuracy": 0.8474869653582573, "num_tokens": 115633358.0, "step": 96170 }, { "entropy": 1.8821706905961038, "epoch": 0.29814958472797104, "grad_norm": 7.216353416442871, "learning_rate": 4.633165703456723e-06, "loss": 0.433, "mean_token_accuracy": 0.8504739284515381, "num_tokens": 115646131.0, "step": 96180 }, { "entropy": 1.98152334690094, "epoch": 0.29818058385302076, "grad_norm": 9.739982604980469, "learning_rate": 4.6329248606302045e-06, "loss": 0.5019, "mean_token_accuracy": 0.8421538904309273, "num_tokens": 115656444.0, "step": 96190 }, { "entropy": 1.7937684386968613, "epoch": 0.29821158297807043, "grad_norm": 4.097762107849121, "learning_rate": 4.6326840553585075e-06, "loss": 0.3803, "mean_token_accuracy": 0.8663532704114913, "num_tokens": 115669849.0, "step": 96200 }, { "entropy": 1.960816130042076, "epoch": 0.29824258210312016, "grad_norm": 7.547084331512451, "learning_rate": 4.632443287631873e-06, "loss": 0.538, "mean_token_accuracy": 0.8430081829428673, "num_tokens": 115680669.0, "step": 96210 }, { "entropy": 1.8645331501960754, "epoch": 0.2982735812281698, "grad_norm": 4.153885364532471, "learning_rate": 4.632202557440546e-06, "loss": 0.4595, "mean_token_accuracy": 0.8473611772060394, "num_tokens": 115693855.0, "step": 96220 }, { "entropy": 1.932821586728096, "epoch": 0.29830458035321955, "grad_norm": 6.955663681030273, "learning_rate": 4.631961864774775e-06, "loss": 0.4661, "mean_token_accuracy": 0.853759004175663, "num_tokens": 115705441.0, "step": 96230 }, { "entropy": 1.8800847560167313, "epoch": 0.2983355794782692, "grad_norm": 10.34147834777832, "learning_rate": 4.631721209624811e-06, "loss": 0.5122, "mean_token_accuracy": 0.8399926438927651, "num_tokens": 115718673.0, "step": 96240 }, { "entropy": 1.868877911567688, "epoch": 0.29836657860331894, "grad_norm": 9.02034854888916, "learning_rate": 4.63148059198091e-06, "loss": 0.4677, "mean_token_accuracy": 0.8474683433771133, "num_tokens": 115731147.0, "step": 96250 }, { "entropy": 1.7052193224430083, "epoch": 0.2983975777283686, "grad_norm": 4.165556907653809, "learning_rate": 4.63124001183333e-06, "loss": 0.392, "mean_token_accuracy": 0.8648575574159623, "num_tokens": 115745846.0, "step": 96260 }, { "entropy": 1.8481420859694482, "epoch": 0.29842857685341834, "grad_norm": 8.495997428894043, "learning_rate": 4.630999469172333e-06, "loss": 0.4131, "mean_token_accuracy": 0.8595449611544609, "num_tokens": 115758179.0, "step": 96270 }, { "entropy": 1.8788498505949973, "epoch": 0.298459575978468, "grad_norm": 7.667300701141357, "learning_rate": 4.630758963988187e-06, "loss": 0.456, "mean_token_accuracy": 0.8503319293260574, "num_tokens": 115769809.0, "step": 96280 }, { "entropy": 1.9287839099764823, "epoch": 0.29849057510351773, "grad_norm": 8.444547653198242, "learning_rate": 4.63051849627116e-06, "loss": 0.4939, "mean_token_accuracy": 0.8440607368946076, "num_tokens": 115781425.0, "step": 96290 }, { "entropy": 1.9038120612502099, "epoch": 0.2985215742285674, "grad_norm": 7.381701946258545, "learning_rate": 4.630278066011525e-06, "loss": 0.4942, "mean_token_accuracy": 0.8477517932653427, "num_tokens": 115793181.0, "step": 96300 }, { "entropy": 1.8960516452789307, "epoch": 0.29855257335361707, "grad_norm": 7.266051292419434, "learning_rate": 4.630037673199559e-06, "loss": 0.4783, "mean_token_accuracy": 0.8412499457597733, "num_tokens": 115805564.0, "step": 96310 }, { "entropy": 1.9981954544782639, "epoch": 0.2985835724786668, "grad_norm": 8.080859184265137, "learning_rate": 4.62979731782554e-06, "loss": 0.4944, "mean_token_accuracy": 0.8427246376872063, "num_tokens": 115816757.0, "step": 96320 }, { "entropy": 1.824733631312847, "epoch": 0.29861457160371646, "grad_norm": 3.8523154258728027, "learning_rate": 4.629556999879755e-06, "loss": 0.4328, "mean_token_accuracy": 0.8527489557862282, "num_tokens": 115829066.0, "step": 96330 }, { "entropy": 1.9180657356977462, "epoch": 0.2986455707287662, "grad_norm": 9.480389595031738, "learning_rate": 4.629316719352488e-06, "loss": 0.4889, "mean_token_accuracy": 0.8383275628089905, "num_tokens": 115840829.0, "step": 96340 }, { "entropy": 1.8489946901798249, "epoch": 0.29867656985381585, "grad_norm": 4.126688003540039, "learning_rate": 4.629076476234032e-06, "loss": 0.4585, "mean_token_accuracy": 0.8490898162126541, "num_tokens": 115853917.0, "step": 96350 }, { "entropy": 1.8371913895010947, "epoch": 0.2987075689788656, "grad_norm": 3.543614387512207, "learning_rate": 4.628836270514679e-06, "loss": 0.4319, "mean_token_accuracy": 0.8514250427484512, "num_tokens": 115867183.0, "step": 96360 }, { "entropy": 1.853197917342186, "epoch": 0.29873856810391525, "grad_norm": 3.921837329864502, "learning_rate": 4.628596102184729e-06, "loss": 0.4699, "mean_token_accuracy": 0.8422363549470901, "num_tokens": 115879237.0, "step": 96370 }, { "entropy": 1.818500466644764, "epoch": 0.29876956722896497, "grad_norm": 8.878031730651855, "learning_rate": 4.6283559712344825e-06, "loss": 0.4574, "mean_token_accuracy": 0.8616446629166603, "num_tokens": 115892081.0, "step": 96380 }, { "entropy": 1.9318446889519691, "epoch": 0.29880056635401464, "grad_norm": 8.830316543579102, "learning_rate": 4.628115877654243e-06, "loss": 0.5193, "mean_token_accuracy": 0.8325408533215523, "num_tokens": 115903514.0, "step": 96390 }, { "entropy": 1.8660483628511428, "epoch": 0.29883156547906436, "grad_norm": 8.622795104980469, "learning_rate": 4.627875821434319e-06, "loss": 0.4325, "mean_token_accuracy": 0.8535114452242851, "num_tokens": 115915306.0, "step": 96400 }, { "entropy": 1.8813782200217246, "epoch": 0.29886256460411403, "grad_norm": 9.0267333984375, "learning_rate": 4.627635802565024e-06, "loss": 0.499, "mean_token_accuracy": 0.846407724916935, "num_tokens": 115927498.0, "step": 96410 }, { "entropy": 1.9499717622995376, "epoch": 0.29889356372916376, "grad_norm": 9.516846656799316, "learning_rate": 4.627395821036672e-06, "loss": 0.4815, "mean_token_accuracy": 0.853526496887207, "num_tokens": 115938787.0, "step": 96420 }, { "entropy": 1.9189068511128426, "epoch": 0.2989245628542134, "grad_norm": 7.334571361541748, "learning_rate": 4.6271558768395816e-06, "loss": 0.4921, "mean_token_accuracy": 0.8488484725356102, "num_tokens": 115950064.0, "step": 96430 }, { "entropy": 1.8531944811344148, "epoch": 0.29895556197926315, "grad_norm": 8.624135971069336, "learning_rate": 4.6269159699640755e-06, "loss": 0.4706, "mean_token_accuracy": 0.8516410380601883, "num_tokens": 115962309.0, "step": 96440 }, { "entropy": 1.903770998120308, "epoch": 0.2989865611043128, "grad_norm": 8.187854766845703, "learning_rate": 4.62667610040048e-06, "loss": 0.4689, "mean_token_accuracy": 0.8519768640398979, "num_tokens": 115973845.0, "step": 96450 }, { "entropy": 1.8115949898958206, "epoch": 0.29901756022936254, "grad_norm": 8.5139799118042, "learning_rate": 4.626436268139122e-06, "loss": 0.4351, "mean_token_accuracy": 0.8568135395646095, "num_tokens": 115986832.0, "step": 96460 }, { "entropy": 1.979418794810772, "epoch": 0.2990485593544122, "grad_norm": 7.29787015914917, "learning_rate": 4.626196473170338e-06, "loss": 0.5356, "mean_token_accuracy": 0.8345787361264229, "num_tokens": 115997910.0, "step": 96470 }, { "entropy": 1.9397563070058823, "epoch": 0.29907955847946194, "grad_norm": 8.139474868774414, "learning_rate": 4.625956715484463e-06, "loss": 0.4978, "mean_token_accuracy": 0.8472580030560494, "num_tokens": 116009904.0, "step": 96480 }, { "entropy": 1.8948927074670792, "epoch": 0.2991105576045116, "grad_norm": 9.053715705871582, "learning_rate": 4.625716995071836e-06, "loss": 0.4591, "mean_token_accuracy": 0.8533863142132759, "num_tokens": 116021106.0, "step": 96490 }, { "entropy": 1.8922299653291703, "epoch": 0.29914155672956133, "grad_norm": 9.0900239944458, "learning_rate": 4.6254773119228004e-06, "loss": 0.4598, "mean_token_accuracy": 0.8474109992384911, "num_tokens": 116033322.0, "step": 96500 }, { "entropy": 1.847077339887619, "epoch": 0.299172555854611, "grad_norm": 3.035865068435669, "learning_rate": 4.625237666027704e-06, "loss": 0.4026, "mean_token_accuracy": 0.8617709457874299, "num_tokens": 116045141.0, "step": 96510 }, { "entropy": 1.8434469774365425, "epoch": 0.2992035549796607, "grad_norm": 10.367220878601074, "learning_rate": 4.624998057376896e-06, "loss": 0.43, "mean_token_accuracy": 0.859175056219101, "num_tokens": 116057311.0, "step": 96520 }, { "entropy": 1.922757549583912, "epoch": 0.2992345541047104, "grad_norm": 7.6684417724609375, "learning_rate": 4.624758485960731e-06, "loss": 0.5211, "mean_token_accuracy": 0.8412060752511025, "num_tokens": 116068781.0, "step": 96530 }, { "entropy": 1.874141050875187, "epoch": 0.2992655532297601, "grad_norm": 5.786795139312744, "learning_rate": 4.624518951769568e-06, "loss": 0.5015, "mean_token_accuracy": 0.8453152433037758, "num_tokens": 116080384.0, "step": 96540 }, { "entropy": 1.9315826326608658, "epoch": 0.2992965523548098, "grad_norm": 7.588706016540527, "learning_rate": 4.624279454793765e-06, "loss": 0.5078, "mean_token_accuracy": 0.84673622995615, "num_tokens": 116090980.0, "step": 96550 }, { "entropy": 1.910808216035366, "epoch": 0.29932755147985945, "grad_norm": 4.097476482391357, "learning_rate": 4.624039995023688e-06, "loss": 0.4614, "mean_token_accuracy": 0.8506806045770645, "num_tokens": 116102714.0, "step": 96560 }, { "entropy": 1.8671618595719337, "epoch": 0.2993585506049092, "grad_norm": 4.14247465133667, "learning_rate": 4.623800572449704e-06, "loss": 0.4883, "mean_token_accuracy": 0.8386751100420952, "num_tokens": 116114873.0, "step": 96570 }, { "entropy": 1.8730577558279038, "epoch": 0.29938954972995885, "grad_norm": 11.251909255981445, "learning_rate": 4.623561187062184e-06, "loss": 0.4503, "mean_token_accuracy": 0.8561120167374611, "num_tokens": 116126620.0, "step": 96580 }, { "entropy": 1.8773257568478585, "epoch": 0.2994205488550086, "grad_norm": 9.164704322814941, "learning_rate": 4.623321838851505e-06, "loss": 0.4981, "mean_token_accuracy": 0.8373346477746964, "num_tokens": 116137916.0, "step": 96590 }, { "entropy": 1.9126343131065369, "epoch": 0.29945154798005824, "grad_norm": 8.017783164978027, "learning_rate": 4.623082527808043e-06, "loss": 0.4813, "mean_token_accuracy": 0.841637770831585, "num_tokens": 116149896.0, "step": 96600 }, { "entropy": 1.8838177442550659, "epoch": 0.29948254710510797, "grad_norm": 4.913340091705322, "learning_rate": 4.622843253922182e-06, "loss": 0.4407, "mean_token_accuracy": 0.8604057088494301, "num_tokens": 116162580.0, "step": 96610 }, { "entropy": 1.8277122244238853, "epoch": 0.29951354623015763, "grad_norm": 8.11374282836914, "learning_rate": 4.622604017184304e-06, "loss": 0.4217, "mean_token_accuracy": 0.8429702982306481, "num_tokens": 116175332.0, "step": 96620 }, { "entropy": 1.9389643400907517, "epoch": 0.29954454535520736, "grad_norm": 8.148730278015137, "learning_rate": 4.622364817584801e-06, "loss": 0.541, "mean_token_accuracy": 0.8371641963720322, "num_tokens": 116186424.0, "step": 96630 }, { "entropy": 1.9510870546102523, "epoch": 0.29957554448025703, "grad_norm": 6.960958957672119, "learning_rate": 4.622125655114065e-06, "loss": 0.5032, "mean_token_accuracy": 0.839006906747818, "num_tokens": 116197799.0, "step": 96640 }, { "entropy": 1.9510822862386703, "epoch": 0.29960654360530675, "grad_norm": 8.637950897216797, "learning_rate": 4.621886529762488e-06, "loss": 0.5395, "mean_token_accuracy": 0.8410506933927536, "num_tokens": 116208985.0, "step": 96650 }, { "entropy": 1.868352036178112, "epoch": 0.2996375427303564, "grad_norm": 10.132604598999023, "learning_rate": 4.621647441520475e-06, "loss": 0.4289, "mean_token_accuracy": 0.8492979109287262, "num_tokens": 116221911.0, "step": 96660 }, { "entropy": 1.9646795377135278, "epoch": 0.29966854185540615, "grad_norm": 9.418262481689453, "learning_rate": 4.621408390378424e-06, "loss": 0.4937, "mean_token_accuracy": 0.8449583351612091, "num_tokens": 116232932.0, "step": 96670 }, { "entropy": 1.9498365700244904, "epoch": 0.2996995409804558, "grad_norm": 7.607828617095947, "learning_rate": 4.621169376326742e-06, "loss": 0.5144, "mean_token_accuracy": 0.8415804550051689, "num_tokens": 116243576.0, "step": 96680 }, { "entropy": 1.862950399518013, "epoch": 0.29973054010550554, "grad_norm": 8.767817497253418, "learning_rate": 4.620930399355841e-06, "loss": 0.4621, "mean_token_accuracy": 0.8488745912909508, "num_tokens": 116256004.0, "step": 96690 }, { "entropy": 1.795606505870819, "epoch": 0.2997615392305552, "grad_norm": 4.170324325561523, "learning_rate": 4.620691459456132e-06, "loss": 0.4269, "mean_token_accuracy": 0.8523398399353027, "num_tokens": 116269297.0, "step": 96700 }, { "entropy": 1.8912088066339492, "epoch": 0.29979253835560493, "grad_norm": 7.616124153137207, "learning_rate": 4.620452556618031e-06, "loss": 0.4653, "mean_token_accuracy": 0.8536741435527802, "num_tokens": 116281401.0, "step": 96710 }, { "entropy": 1.9044504195451737, "epoch": 0.2998235374806546, "grad_norm": 3.9353320598602295, "learning_rate": 4.6202136908319606e-06, "loss": 0.4888, "mean_token_accuracy": 0.8428740501403809, "num_tokens": 116293236.0, "step": 96720 }, { "entropy": 1.831810677051544, "epoch": 0.2998545366057043, "grad_norm": 9.96127986907959, "learning_rate": 4.6199748620883425e-06, "loss": 0.4187, "mean_token_accuracy": 0.856151320040226, "num_tokens": 116305330.0, "step": 96730 }, { "entropy": 1.915688943862915, "epoch": 0.299885535730754, "grad_norm": 8.897388458251953, "learning_rate": 4.619736070377604e-06, "loss": 0.4791, "mean_token_accuracy": 0.8483431145548821, "num_tokens": 116316018.0, "step": 96740 }, { "entropy": 1.8551687330007554, "epoch": 0.2999165348558037, "grad_norm": 3.1668596267700195, "learning_rate": 4.619497315690176e-06, "loss": 0.5315, "mean_token_accuracy": 0.8362834542989731, "num_tokens": 116328555.0, "step": 96750 }, { "entropy": 1.8770240753889085, "epoch": 0.2999475339808534, "grad_norm": 2.5352485179901123, "learning_rate": 4.61925859801649e-06, "loss": 0.4974, "mean_token_accuracy": 0.8435857251286507, "num_tokens": 116340178.0, "step": 96760 }, { "entropy": 1.9596908926963805, "epoch": 0.2999785331059031, "grad_norm": 8.359101295471191, "learning_rate": 4.619019917346987e-06, "loss": 0.5595, "mean_token_accuracy": 0.8392258763313294, "num_tokens": 116351512.0, "step": 96770 }, { "entropy": 1.7745133236050605, "epoch": 0.3000095322309528, "grad_norm": 6.980304718017578, "learning_rate": 4.618781273672105e-06, "loss": 0.3915, "mean_token_accuracy": 0.8550699412822723, "num_tokens": 116365493.0, "step": 96780 }, { "entropy": 1.8009425699710846, "epoch": 0.3000405313560025, "grad_norm": 9.106100082397461, "learning_rate": 4.618542666982291e-06, "loss": 0.4403, "mean_token_accuracy": 0.8583745554089546, "num_tokens": 116378953.0, "step": 96790 }, { "entropy": 1.8405057787895203, "epoch": 0.3000715304810522, "grad_norm": 6.269810199737549, "learning_rate": 4.6183040972679905e-06, "loss": 0.455, "mean_token_accuracy": 0.853019006550312, "num_tokens": 116391720.0, "step": 96800 }, { "entropy": 1.8243362039327622, "epoch": 0.30010252960610184, "grad_norm": 8.785292625427246, "learning_rate": 4.618065564519655e-06, "loss": 0.4258, "mean_token_accuracy": 0.8519068777561187, "num_tokens": 116404782.0, "step": 96810 }, { "entropy": 1.9162081688642503, "epoch": 0.30013352873115157, "grad_norm": 9.158159255981445, "learning_rate": 4.617827068727739e-06, "loss": 0.5059, "mean_token_accuracy": 0.8442169070243836, "num_tokens": 116415863.0, "step": 96820 }, { "entropy": 1.8956199631094932, "epoch": 0.30016452785620124, "grad_norm": 5.014525890350342, "learning_rate": 4.617588609882702e-06, "loss": 0.5128, "mean_token_accuracy": 0.8389386609196663, "num_tokens": 116427960.0, "step": 96830 }, { "entropy": 1.890786738693714, "epoch": 0.30019552698125096, "grad_norm": 10.428237915039062, "learning_rate": 4.617350187975004e-06, "loss": 0.4924, "mean_token_accuracy": 0.8487927287817001, "num_tokens": 116439841.0, "step": 96840 }, { "entropy": 1.8615357890725135, "epoch": 0.30022652610630063, "grad_norm": 9.423101425170898, "learning_rate": 4.617111802995109e-06, "loss": 0.4574, "mean_token_accuracy": 0.8472002789378166, "num_tokens": 116451475.0, "step": 96850 }, { "entropy": 1.9217775925993918, "epoch": 0.30025752523135035, "grad_norm": 7.137108325958252, "learning_rate": 4.616873454933489e-06, "loss": 0.5065, "mean_token_accuracy": 0.8377560988068581, "num_tokens": 116462859.0, "step": 96860 }, { "entropy": 1.8776028633117676, "epoch": 0.3002885243564, "grad_norm": 4.520118236541748, "learning_rate": 4.616635143780614e-06, "loss": 0.5238, "mean_token_accuracy": 0.8304355323314667, "num_tokens": 116475060.0, "step": 96870 }, { "entropy": 1.8731580957770348, "epoch": 0.30031952348144975, "grad_norm": 7.3886942863464355, "learning_rate": 4.616396869526958e-06, "loss": 0.4684, "mean_token_accuracy": 0.8490657389163971, "num_tokens": 116487261.0, "step": 96880 }, { "entropy": 1.8840817973017692, "epoch": 0.3003505226064994, "grad_norm": 8.366750717163086, "learning_rate": 4.616158632163e-06, "loss": 0.4508, "mean_token_accuracy": 0.857377803325653, "num_tokens": 116498999.0, "step": 96890 }, { "entropy": 1.8003995344042778, "epoch": 0.30038152173154914, "grad_norm": 2.8149616718292236, "learning_rate": 4.615920431679226e-06, "loss": 0.4015, "mean_token_accuracy": 0.8655429676175117, "num_tokens": 116512337.0, "step": 96900 }, { "entropy": 1.8423293381929398, "epoch": 0.3004125208565988, "grad_norm": 3.726378917694092, "learning_rate": 4.615682268066116e-06, "loss": 0.4743, "mean_token_accuracy": 0.8466761097311973, "num_tokens": 116525215.0, "step": 96910 }, { "entropy": 1.8393646717071532, "epoch": 0.30044351998164853, "grad_norm": 9.186988830566406, "learning_rate": 4.615444141314163e-06, "loss": 0.4738, "mean_token_accuracy": 0.8458735197782516, "num_tokens": 116537163.0, "step": 96920 }, { "entropy": 1.9296279013156892, "epoch": 0.3004745191066982, "grad_norm": 9.516865730285645, "learning_rate": 4.615206051413857e-06, "loss": 0.4996, "mean_token_accuracy": 0.8410545736551285, "num_tokens": 116548242.0, "step": 96930 }, { "entropy": 1.867045633494854, "epoch": 0.3005055182317479, "grad_norm": 7.079854965209961, "learning_rate": 4.614967998355696e-06, "loss": 0.4553, "mean_token_accuracy": 0.8517505764961243, "num_tokens": 116560243.0, "step": 96940 }, { "entropy": 1.9103495597839355, "epoch": 0.3005365173567976, "grad_norm": 5.358558177947998, "learning_rate": 4.614729982130179e-06, "loss": 0.5171, "mean_token_accuracy": 0.8406784102320671, "num_tokens": 116571891.0, "step": 96950 }, { "entropy": 1.9171627387404442, "epoch": 0.3005675164818473, "grad_norm": 8.190808296203613, "learning_rate": 4.614492002727808e-06, "loss": 0.4863, "mean_token_accuracy": 0.8321561366319656, "num_tokens": 116583993.0, "step": 96960 }, { "entropy": 1.84826198220253, "epoch": 0.300598515606897, "grad_norm": 9.064087867736816, "learning_rate": 4.61425406013909e-06, "loss": 0.4424, "mean_token_accuracy": 0.8504088371992111, "num_tokens": 116596878.0, "step": 96970 }, { "entropy": 1.941984808444977, "epoch": 0.3006295147319467, "grad_norm": 4.682338237762451, "learning_rate": 4.614016154354533e-06, "loss": 0.5337, "mean_token_accuracy": 0.8427264273166657, "num_tokens": 116608036.0, "step": 96980 }, { "entropy": 1.9229829460382462, "epoch": 0.3006605138569964, "grad_norm": 7.0781097412109375, "learning_rate": 4.6137782853646524e-06, "loss": 0.5264, "mean_token_accuracy": 0.839740289747715, "num_tokens": 116619904.0, "step": 96990 }, { "entropy": 1.8459256619215012, "epoch": 0.3006915129820461, "grad_norm": 7.523281097412109, "learning_rate": 4.613540453159963e-06, "loss": 0.4691, "mean_token_accuracy": 0.8505584686994553, "num_tokens": 116632261.0, "step": 97000 }, { "entropy": 1.8035550713539124, "epoch": 0.3007225121070958, "grad_norm": 5.279726982116699, "learning_rate": 4.613302657730985e-06, "loss": 0.4401, "mean_token_accuracy": 0.854595598578453, "num_tokens": 116645315.0, "step": 97010 }, { "entropy": 1.907500149309635, "epoch": 0.3007535112321455, "grad_norm": 3.997288227081299, "learning_rate": 4.613064899068243e-06, "loss": 0.5113, "mean_token_accuracy": 0.8380198463797569, "num_tokens": 116657205.0, "step": 97020 }, { "entropy": 1.815874347090721, "epoch": 0.30078451035719517, "grad_norm": 3.8312273025512695, "learning_rate": 4.612827177162262e-06, "loss": 0.4213, "mean_token_accuracy": 0.8552998825907707, "num_tokens": 116669443.0, "step": 97030 }, { "entropy": 1.93814327865839, "epoch": 0.30081550948224484, "grad_norm": 9.422027587890625, "learning_rate": 4.612589492003573e-06, "loss": 0.5309, "mean_token_accuracy": 0.840138903260231, "num_tokens": 116681521.0, "step": 97040 }, { "entropy": 1.9181523829698564, "epoch": 0.30084650860729456, "grad_norm": 7.659515380859375, "learning_rate": 4.6123518435827095e-06, "loss": 0.4698, "mean_token_accuracy": 0.8537562400102615, "num_tokens": 116692024.0, "step": 97050 }, { "entropy": 1.8777270019054413, "epoch": 0.30087750773234423, "grad_norm": 4.077333450317383, "learning_rate": 4.612114231890209e-06, "loss": 0.4429, "mean_token_accuracy": 0.8504292860627174, "num_tokens": 116703672.0, "step": 97060 }, { "entropy": 1.8734031990170479, "epoch": 0.30090850685739395, "grad_norm": 8.679883003234863, "learning_rate": 4.61187665691661e-06, "loss": 0.5151, "mean_token_accuracy": 0.8424584448337555, "num_tokens": 116715061.0, "step": 97070 }, { "entropy": 1.919617336988449, "epoch": 0.3009395059824436, "grad_norm": 9.382501602172852, "learning_rate": 4.611639118652459e-06, "loss": 0.5369, "mean_token_accuracy": 0.8328076407313347, "num_tokens": 116726486.0, "step": 97080 }, { "entropy": 1.890953540802002, "epoch": 0.30097050510749335, "grad_norm": 10.713085174560547, "learning_rate": 4.611401617088301e-06, "loss": 0.563, "mean_token_accuracy": 0.8449400596320629, "num_tokens": 116738906.0, "step": 97090 }, { "entropy": 1.8357614412903787, "epoch": 0.301001504232543, "grad_norm": 8.783710479736328, "learning_rate": 4.611164152214689e-06, "loss": 0.4748, "mean_token_accuracy": 0.8468549698591232, "num_tokens": 116751196.0, "step": 97100 }, { "entropy": 1.9422936275601388, "epoch": 0.30103250335759274, "grad_norm": 8.501259803771973, "learning_rate": 4.6109267240221755e-06, "loss": 0.527, "mean_token_accuracy": 0.8422497108578682, "num_tokens": 116763810.0, "step": 97110 }, { "entropy": 1.9155998945236206, "epoch": 0.3010635024826424, "grad_norm": 7.944989204406738, "learning_rate": 4.610689332501317e-06, "loss": 0.5321, "mean_token_accuracy": 0.823591648042202, "num_tokens": 116776024.0, "step": 97120 }, { "entropy": 1.8754037857055663, "epoch": 0.30109450160769213, "grad_norm": 7.605261325836182, "learning_rate": 4.610451977642677e-06, "loss": 0.4565, "mean_token_accuracy": 0.8444375693798065, "num_tokens": 116787890.0, "step": 97130 }, { "entropy": 1.8820706829428673, "epoch": 0.3011255007327418, "grad_norm": 4.352447509765625, "learning_rate": 4.610214659436818e-06, "loss": 0.4916, "mean_token_accuracy": 0.8387032672762871, "num_tokens": 116799997.0, "step": 97140 }, { "entropy": 1.8734632670879363, "epoch": 0.3011564998577915, "grad_norm": 7.570825576782227, "learning_rate": 4.609977377874307e-06, "loss": 0.4729, "mean_token_accuracy": 0.8414079815149307, "num_tokens": 116812400.0, "step": 97150 }, { "entropy": 1.9303068399429322, "epoch": 0.3011874989828412, "grad_norm": 3.602426528930664, "learning_rate": 4.609740132945716e-06, "loss": 0.4627, "mean_token_accuracy": 0.8503539338707924, "num_tokens": 116825076.0, "step": 97160 }, { "entropy": 1.8318970784544946, "epoch": 0.3012184981078909, "grad_norm": 9.406216621398926, "learning_rate": 4.609502924641619e-06, "loss": 0.4565, "mean_token_accuracy": 0.8480072125792504, "num_tokens": 116837724.0, "step": 97170 }, { "entropy": 1.9633089289069177, "epoch": 0.3012494972329406, "grad_norm": 8.81785774230957, "learning_rate": 4.609265752952596e-06, "loss": 0.543, "mean_token_accuracy": 0.8268476709723472, "num_tokens": 116849107.0, "step": 97180 }, { "entropy": 1.8834416687488555, "epoch": 0.3012804963579903, "grad_norm": 4.704411506652832, "learning_rate": 4.609028617869224e-06, "loss": 0.4607, "mean_token_accuracy": 0.8506078109145164, "num_tokens": 116861833.0, "step": 97190 }, { "entropy": 1.7407511696219444, "epoch": 0.30131149548304, "grad_norm": 2.5168025493621826, "learning_rate": 4.6087915193820916e-06, "loss": 0.3506, "mean_token_accuracy": 0.8639174044132233, "num_tokens": 116876452.0, "step": 97200 }, { "entropy": 1.8649092674255372, "epoch": 0.3013424946080897, "grad_norm": 3.976242780685425, "learning_rate": 4.608554457481785e-06, "loss": 0.4238, "mean_token_accuracy": 0.8543577790260315, "num_tokens": 116888349.0, "step": 97210 }, { "entropy": 1.850515154004097, "epoch": 0.3013734937331394, "grad_norm": 4.014977931976318, "learning_rate": 4.608317432158896e-06, "loss": 0.4406, "mean_token_accuracy": 0.8551569744944573, "num_tokens": 116900337.0, "step": 97220 }, { "entropy": 1.904152835905552, "epoch": 0.3014044928581891, "grad_norm": 7.234251022338867, "learning_rate": 4.60808044340402e-06, "loss": 0.4614, "mean_token_accuracy": 0.8568873882293702, "num_tokens": 116912573.0, "step": 97230 }, { "entropy": 1.8178890123963356, "epoch": 0.30143549198323877, "grad_norm": 2.6580164432525635, "learning_rate": 4.607843491207752e-06, "loss": 0.4276, "mean_token_accuracy": 0.8546398520469666, "num_tokens": 116924977.0, "step": 97240 }, { "entropy": 1.9117593422532082, "epoch": 0.3014664911082885, "grad_norm": 13.091536521911621, "learning_rate": 4.607606575560697e-06, "loss": 0.4832, "mean_token_accuracy": 0.8411953374743462, "num_tokens": 116936606.0, "step": 97250 }, { "entropy": 1.9153368800878525, "epoch": 0.30149749023333816, "grad_norm": 10.279766082763672, "learning_rate": 4.607369696453461e-06, "loss": 0.5155, "mean_token_accuracy": 0.8421198353171349, "num_tokens": 116948020.0, "step": 97260 }, { "entropy": 1.892761492729187, "epoch": 0.3015284893583879, "grad_norm": 8.145535469055176, "learning_rate": 4.6071328538766486e-06, "loss": 0.4989, "mean_token_accuracy": 0.8441890180110931, "num_tokens": 116959806.0, "step": 97270 }, { "entropy": 1.9218065708875656, "epoch": 0.30155948848343755, "grad_norm": 7.536378860473633, "learning_rate": 4.606896047820874e-06, "loss": 0.5161, "mean_token_accuracy": 0.8465237602591514, "num_tokens": 116970254.0, "step": 97280 }, { "entropy": 1.8478126615285873, "epoch": 0.3015904876084872, "grad_norm": 8.085580825805664, "learning_rate": 4.60665927827675e-06, "loss": 0.4712, "mean_token_accuracy": 0.8454589918255806, "num_tokens": 116981953.0, "step": 97290 }, { "entropy": 1.7839539676904679, "epoch": 0.30162148673353695, "grad_norm": 4.7135491371154785, "learning_rate": 4.606422545234899e-06, "loss": 0.4022, "mean_token_accuracy": 0.8522520795464515, "num_tokens": 116994789.0, "step": 97300 }, { "entropy": 1.7900249511003494, "epoch": 0.3016524858585866, "grad_norm": 12.013949394226074, "learning_rate": 4.606185848685939e-06, "loss": 0.4124, "mean_token_accuracy": 0.8632920622825623, "num_tokens": 117007458.0, "step": 97310 }, { "entropy": 1.8314153790473937, "epoch": 0.30168348498363634, "grad_norm": 8.588581085205078, "learning_rate": 4.605949188620496e-06, "loss": 0.454, "mean_token_accuracy": 0.8470736399292946, "num_tokens": 117019927.0, "step": 97320 }, { "entropy": 1.8574935659766196, "epoch": 0.301714484108686, "grad_norm": 8.47958755493164, "learning_rate": 4.6057125650292e-06, "loss": 0.4717, "mean_token_accuracy": 0.8512878254055977, "num_tokens": 117031729.0, "step": 97330 }, { "entropy": 1.9198235914111137, "epoch": 0.30174548323373573, "grad_norm": 10.255192756652832, "learning_rate": 4.605475977902682e-06, "loss": 0.4887, "mean_token_accuracy": 0.8471052810549736, "num_tokens": 117042862.0, "step": 97340 }, { "entropy": 1.8835196584463119, "epoch": 0.3017764823587854, "grad_norm": 9.062085151672363, "learning_rate": 4.605239427231577e-06, "loss": 0.5183, "mean_token_accuracy": 0.8494998052716255, "num_tokens": 117054126.0, "step": 97350 }, { "entropy": 1.9018598824739457, "epoch": 0.3018074814838351, "grad_norm": 8.304078102111816, "learning_rate": 4.6050029130065245e-06, "loss": 0.5274, "mean_token_accuracy": 0.8413213551044464, "num_tokens": 117065364.0, "step": 97360 }, { "entropy": 1.8747293829917908, "epoch": 0.3018384806088848, "grad_norm": 8.438359260559082, "learning_rate": 4.604766435218166e-06, "loss": 0.498, "mean_token_accuracy": 0.8443131268024444, "num_tokens": 117076938.0, "step": 97370 }, { "entropy": 1.8413864582777024, "epoch": 0.3018694797339345, "grad_norm": 8.747175216674805, "learning_rate": 4.604529993857147e-06, "loss": 0.4393, "mean_token_accuracy": 0.8414786517620086, "num_tokens": 117089574.0, "step": 97380 }, { "entropy": 1.9370706051588058, "epoch": 0.3019004788589842, "grad_norm": 7.200348854064941, "learning_rate": 4.604293588914116e-06, "loss": 0.5174, "mean_token_accuracy": 0.8398382663726807, "num_tokens": 117100625.0, "step": 97390 }, { "entropy": 1.850702230632305, "epoch": 0.3019314779840339, "grad_norm": 8.624340057373047, "learning_rate": 4.604057220379726e-06, "loss": 0.4435, "mean_token_accuracy": 0.8501965671777725, "num_tokens": 117113376.0, "step": 97400 }, { "entropy": 1.8446269080042839, "epoch": 0.3019624771090836, "grad_norm": 3.699392318725586, "learning_rate": 4.603820888244632e-06, "loss": 0.4437, "mean_token_accuracy": 0.8584934890270233, "num_tokens": 117126682.0, "step": 97410 }, { "entropy": 1.943909691274166, "epoch": 0.3019934762341333, "grad_norm": 3.7331109046936035, "learning_rate": 4.603584592499492e-06, "loss": 0.569, "mean_token_accuracy": 0.8290564298629761, "num_tokens": 117138961.0, "step": 97420 }, { "entropy": 1.8698807314038277, "epoch": 0.302024475359183, "grad_norm": 8.145543098449707, "learning_rate": 4.603348333134969e-06, "loss": 0.466, "mean_token_accuracy": 0.8512676984071732, "num_tokens": 117150754.0, "step": 97430 }, { "entropy": 1.9144811987876893, "epoch": 0.3020554744842327, "grad_norm": 9.019363403320312, "learning_rate": 4.60311211014173e-06, "loss": 0.4746, "mean_token_accuracy": 0.8448151677846909, "num_tokens": 117162759.0, "step": 97440 }, { "entropy": 1.8737313896417618, "epoch": 0.30208647360928237, "grad_norm": 7.63023567199707, "learning_rate": 4.602875923510441e-06, "loss": 0.4619, "mean_token_accuracy": 0.8467231750488281, "num_tokens": 117174665.0, "step": 97450 }, { "entropy": 1.8207977265119553, "epoch": 0.3021174727343321, "grad_norm": 8.674139976501465, "learning_rate": 4.602639773231776e-06, "loss": 0.471, "mean_token_accuracy": 0.8548212826251984, "num_tokens": 117187809.0, "step": 97460 }, { "entropy": 1.8152425453066825, "epoch": 0.30214847185938176, "grad_norm": 5.675132751464844, "learning_rate": 4.602403659296411e-06, "loss": 0.4768, "mean_token_accuracy": 0.8367105752229691, "num_tokens": 117201085.0, "step": 97470 }, { "entropy": 1.857864636182785, "epoch": 0.3021794709844315, "grad_norm": 8.102805137634277, "learning_rate": 4.602167581695023e-06, "loss": 0.4151, "mean_token_accuracy": 0.8479077383875847, "num_tokens": 117213732.0, "step": 97480 }, { "entropy": 1.925594538450241, "epoch": 0.30221047010948116, "grad_norm": 7.970809459686279, "learning_rate": 4.601931540418297e-06, "loss": 0.523, "mean_token_accuracy": 0.8393017098307609, "num_tokens": 117224935.0, "step": 97490 }, { "entropy": 1.86695496737957, "epoch": 0.3022414692345309, "grad_norm": 9.16865348815918, "learning_rate": 4.601695535456917e-06, "loss": 0.4778, "mean_token_accuracy": 0.8479771926999092, "num_tokens": 117236716.0, "step": 97500 }, { "entropy": 1.9163902431726456, "epoch": 0.30227246835958055, "grad_norm": 8.983051300048828, "learning_rate": 4.601459566801571e-06, "loss": 0.4887, "mean_token_accuracy": 0.8492978289723396, "num_tokens": 117247883.0, "step": 97510 }, { "entropy": 1.8202176943421364, "epoch": 0.3023034674846303, "grad_norm": 7.500359535217285, "learning_rate": 4.601223634442954e-06, "loss": 0.3772, "mean_token_accuracy": 0.8624223947525025, "num_tokens": 117261178.0, "step": 97520 }, { "entropy": 1.9173918321728707, "epoch": 0.30233446660967994, "grad_norm": 8.454766273498535, "learning_rate": 4.600987738371759e-06, "loss": 0.4866, "mean_token_accuracy": 0.8437489971518517, "num_tokens": 117273426.0, "step": 97530 }, { "entropy": 1.77590219527483, "epoch": 0.3023654657347296, "grad_norm": 8.413315773010254, "learning_rate": 4.600751878578687e-06, "loss": 0.3971, "mean_token_accuracy": 0.8557587161660194, "num_tokens": 117286603.0, "step": 97540 }, { "entropy": 1.9176592335104943, "epoch": 0.30239646485977933, "grad_norm": 10.7260103225708, "learning_rate": 4.600516055054439e-06, "loss": 0.5096, "mean_token_accuracy": 0.8402794227004051, "num_tokens": 117297972.0, "step": 97550 }, { "entropy": 1.8973240569233893, "epoch": 0.302427463984829, "grad_norm": 4.1420087814331055, "learning_rate": 4.600280267789722e-06, "loss": 0.4362, "mean_token_accuracy": 0.8584436431527138, "num_tokens": 117309456.0, "step": 97560 }, { "entropy": 1.9296040430665016, "epoch": 0.30245846310987873, "grad_norm": 8.410199165344238, "learning_rate": 4.600044516775245e-06, "loss": 0.5078, "mean_token_accuracy": 0.8443345412611961, "num_tokens": 117321272.0, "step": 97570 }, { "entropy": 1.9159424543380736, "epoch": 0.3024894622349284, "grad_norm": 8.414966583251953, "learning_rate": 4.5998088020017186e-06, "loss": 0.4903, "mean_token_accuracy": 0.8412731990218163, "num_tokens": 117333250.0, "step": 97580 }, { "entropy": 1.7769809067249298, "epoch": 0.3025204613599781, "grad_norm": 3.658212661743164, "learning_rate": 4.599573123459859e-06, "loss": 0.38, "mean_token_accuracy": 0.8634654805064201, "num_tokens": 117346443.0, "step": 97590 }, { "entropy": 1.8901727512478828, "epoch": 0.3025514604850278, "grad_norm": 9.50108814239502, "learning_rate": 4.599337481140387e-06, "loss": 0.4743, "mean_token_accuracy": 0.8505553930997849, "num_tokens": 117358066.0, "step": 97600 }, { "entropy": 1.8625748187303544, "epoch": 0.3025824596100775, "grad_norm": 8.508552551269531, "learning_rate": 4.5991018750340235e-06, "loss": 0.4556, "mean_token_accuracy": 0.849023912847042, "num_tokens": 117369805.0, "step": 97610 }, { "entropy": 1.9069266065955162, "epoch": 0.3026134587351272, "grad_norm": 8.270326614379883, "learning_rate": 4.5988663051314944e-06, "loss": 0.4747, "mean_token_accuracy": 0.8539838179945946, "num_tokens": 117381262.0, "step": 97620 }, { "entropy": 1.9128611549735068, "epoch": 0.3026444578601769, "grad_norm": 8.971906661987305, "learning_rate": 4.5986307714235286e-06, "loss": 0.5167, "mean_token_accuracy": 0.832034258544445, "num_tokens": 117393179.0, "step": 97630 }, { "entropy": 1.8143539875745773, "epoch": 0.3026754569852266, "grad_norm": 7.190959453582764, "learning_rate": 4.5983952739008585e-06, "loss": 0.4082, "mean_token_accuracy": 0.8518998950719834, "num_tokens": 117406161.0, "step": 97640 }, { "entropy": 1.8843399420380593, "epoch": 0.3027064561102763, "grad_norm": 9.908082008361816, "learning_rate": 4.598159812554219e-06, "loss": 0.4768, "mean_token_accuracy": 0.8463353976607323, "num_tokens": 117418576.0, "step": 97650 }, { "entropy": 1.8831077113747596, "epoch": 0.30273745523532597, "grad_norm": 9.612486839294434, "learning_rate": 4.597924387374351e-06, "loss": 0.4527, "mean_token_accuracy": 0.8505209341645241, "num_tokens": 117430122.0, "step": 97660 }, { "entropy": 1.8462072387337685, "epoch": 0.3027684543603757, "grad_norm": 4.665594100952148, "learning_rate": 4.597688998351995e-06, "loss": 0.3917, "mean_token_accuracy": 0.8621034786105156, "num_tokens": 117442896.0, "step": 97670 }, { "entropy": 1.9408204436302186, "epoch": 0.30279945348542536, "grad_norm": 7.827560901641846, "learning_rate": 4.597453645477898e-06, "loss": 0.5212, "mean_token_accuracy": 0.8394302636384964, "num_tokens": 117453977.0, "step": 97680 }, { "entropy": 1.8475888326764107, "epoch": 0.3028304526104751, "grad_norm": 8.166518211364746, "learning_rate": 4.597218328742807e-06, "loss": 0.4278, "mean_token_accuracy": 0.8580135375261306, "num_tokens": 117466325.0, "step": 97690 }, { "entropy": 1.8173963025212287, "epoch": 0.30286145173552476, "grad_norm": 4.001589775085449, "learning_rate": 4.596983048137475e-06, "loss": 0.4122, "mean_token_accuracy": 0.8486342668533325, "num_tokens": 117478782.0, "step": 97700 }, { "entropy": 1.8551449865102767, "epoch": 0.3028924508605745, "grad_norm": 8.81025218963623, "learning_rate": 4.596747803652658e-06, "loss": 0.4866, "mean_token_accuracy": 0.8483422353863717, "num_tokens": 117491302.0, "step": 97710 }, { "entropy": 1.8700275629758836, "epoch": 0.30292344998562415, "grad_norm": 3.273538112640381, "learning_rate": 4.596512595279115e-06, "loss": 0.4879, "mean_token_accuracy": 0.8363815456628799, "num_tokens": 117504377.0, "step": 97720 }, { "entropy": 1.9091096714138984, "epoch": 0.3029544491106739, "grad_norm": 4.058509349822998, "learning_rate": 4.5962774230076075e-06, "loss": 0.4511, "mean_token_accuracy": 0.8540859907865525, "num_tokens": 117515754.0, "step": 97730 }, { "entropy": 1.979654061794281, "epoch": 0.30298544823572354, "grad_norm": 8.739022254943848, "learning_rate": 4.596042286828902e-06, "loss": 0.5324, "mean_token_accuracy": 0.8428146079182625, "num_tokens": 117526416.0, "step": 97740 }, { "entropy": 1.9291452512145042, "epoch": 0.30301644736077327, "grad_norm": 8.007489204406738, "learning_rate": 4.595807186733767e-06, "loss": 0.5136, "mean_token_accuracy": 0.8436457946896553, "num_tokens": 117538034.0, "step": 97750 }, { "entropy": 1.8014676854014398, "epoch": 0.30304744648582294, "grad_norm": 9.48009967803955, "learning_rate": 4.595572122712974e-06, "loss": 0.4153, "mean_token_accuracy": 0.8604965686798096, "num_tokens": 117551372.0, "step": 97760 }, { "entropy": 1.8575839295983314, "epoch": 0.30307844561087266, "grad_norm": 2.4057457447052, "learning_rate": 4.595337094757297e-06, "loss": 0.4555, "mean_token_accuracy": 0.8554554954171181, "num_tokens": 117563424.0, "step": 97770 }, { "entropy": 1.8977721393108369, "epoch": 0.30310944473592233, "grad_norm": 9.092135429382324, "learning_rate": 4.595102102857518e-06, "loss": 0.4895, "mean_token_accuracy": 0.8410274058580398, "num_tokens": 117575055.0, "step": 97780 }, { "entropy": 1.89531751871109, "epoch": 0.303140443860972, "grad_norm": 8.363005638122559, "learning_rate": 4.594867147004416e-06, "loss": 0.484, "mean_token_accuracy": 0.8437793105840683, "num_tokens": 117586328.0, "step": 97790 }, { "entropy": 1.8966928854584695, "epoch": 0.3031714429860217, "grad_norm": 8.183049201965332, "learning_rate": 4.594632227188778e-06, "loss": 0.4881, "mean_token_accuracy": 0.8496061801910401, "num_tokens": 117598053.0, "step": 97800 }, { "entropy": 1.7953755557537079, "epoch": 0.3032024421110714, "grad_norm": 7.209178924560547, "learning_rate": 4.594397343401393e-06, "loss": 0.3953, "mean_token_accuracy": 0.8673543766140938, "num_tokens": 117610778.0, "step": 97810 }, { "entropy": 1.8411307245492936, "epoch": 0.3032334412361211, "grad_norm": 4.28125, "learning_rate": 4.59416249563305e-06, "loss": 0.4508, "mean_token_accuracy": 0.8453304886817932, "num_tokens": 117623608.0, "step": 97820 }, { "entropy": 1.849568995833397, "epoch": 0.3032644403611708, "grad_norm": 7.9680023193359375, "learning_rate": 4.593927683874549e-06, "loss": 0.4654, "mean_token_accuracy": 0.8446632474660873, "num_tokens": 117635895.0, "step": 97830 }, { "entropy": 1.9411183446645737, "epoch": 0.3032954394862205, "grad_norm": 8.592232704162598, "learning_rate": 4.593692908116683e-06, "loss": 0.49, "mean_token_accuracy": 0.8454527780413628, "num_tokens": 117647042.0, "step": 97840 }, { "entropy": 1.9133759438991547, "epoch": 0.3033264386112702, "grad_norm": 8.33125114440918, "learning_rate": 4.593458168350257e-06, "loss": 0.4768, "mean_token_accuracy": 0.8406713515520096, "num_tokens": 117658493.0, "step": 97850 }, { "entropy": 1.9860854953527451, "epoch": 0.3033574377363199, "grad_norm": 8.772767066955566, "learning_rate": 4.593223464566075e-06, "loss": 0.5346, "mean_token_accuracy": 0.836230854690075, "num_tokens": 117669208.0, "step": 97860 }, { "entropy": 1.8566246941685676, "epoch": 0.30338843686136957, "grad_norm": 2.640052556991577, "learning_rate": 4.592988796754947e-06, "loss": 0.4453, "mean_token_accuracy": 0.8560033723711967, "num_tokens": 117681693.0, "step": 97870 }, { "entropy": 1.90928722769022, "epoch": 0.3034194359864193, "grad_norm": 7.907132148742676, "learning_rate": 4.592754164907683e-06, "loss": 0.5258, "mean_token_accuracy": 0.8417934641242028, "num_tokens": 117693406.0, "step": 97880 }, { "entropy": 1.9087349817156791, "epoch": 0.30345043511146896, "grad_norm": 11.36373519897461, "learning_rate": 4.592519569015098e-06, "loss": 0.4901, "mean_token_accuracy": 0.8501476049423218, "num_tokens": 117704501.0, "step": 97890 }, { "entropy": 1.8858125910162926, "epoch": 0.3034814342365187, "grad_norm": 7.878525733947754, "learning_rate": 4.592285009068011e-06, "loss": 0.4703, "mean_token_accuracy": 0.8410215243697167, "num_tokens": 117716498.0, "step": 97900 }, { "entropy": 1.9222358748316766, "epoch": 0.30351243336156836, "grad_norm": 9.170816421508789, "learning_rate": 4.592050485057241e-06, "loss": 0.4626, "mean_token_accuracy": 0.8559870198369026, "num_tokens": 117727660.0, "step": 97910 }, { "entropy": 1.9125974863767623, "epoch": 0.3035434324866181, "grad_norm": 8.71693229675293, "learning_rate": 4.591815996973617e-06, "loss": 0.4696, "mean_token_accuracy": 0.8463147193193435, "num_tokens": 117739528.0, "step": 97920 }, { "entropy": 1.8418530434370042, "epoch": 0.30357443161166775, "grad_norm": 4.724992752075195, "learning_rate": 4.591581544807964e-06, "loss": 0.4521, "mean_token_accuracy": 0.849432897567749, "num_tokens": 117752050.0, "step": 97930 }, { "entropy": 1.8719761818647385, "epoch": 0.3036054307367175, "grad_norm": 8.673355102539062, "learning_rate": 4.591347128551114e-06, "loss": 0.4561, "mean_token_accuracy": 0.8506433501839638, "num_tokens": 117763777.0, "step": 97940 }, { "entropy": 1.834936611354351, "epoch": 0.30363642986176714, "grad_norm": 4.6238603591918945, "learning_rate": 4.5911127481939e-06, "loss": 0.4469, "mean_token_accuracy": 0.8557968467473984, "num_tokens": 117776099.0, "step": 97950 }, { "entropy": 1.7863100260496139, "epoch": 0.30366742898681687, "grad_norm": 4.946314811706543, "learning_rate": 4.590878403727164e-06, "loss": 0.44, "mean_token_accuracy": 0.8499247878789902, "num_tokens": 117789509.0, "step": 97960 }, { "entropy": 1.915014560520649, "epoch": 0.30369842811186654, "grad_norm": 8.316082000732422, "learning_rate": 4.5906440951417435e-06, "loss": 0.4681, "mean_token_accuracy": 0.8483388528227807, "num_tokens": 117800926.0, "step": 97970 }, { "entropy": 1.9649482190608978, "epoch": 0.30372942723691626, "grad_norm": 10.604179382324219, "learning_rate": 4.590409822428485e-06, "loss": 0.524, "mean_token_accuracy": 0.8398748651146889, "num_tokens": 117811622.0, "step": 97980 }, { "entropy": 1.7894397795200347, "epoch": 0.30376042636196593, "grad_norm": 6.535918712615967, "learning_rate": 4.590175585578233e-06, "loss": 0.3897, "mean_token_accuracy": 0.8634741649031639, "num_tokens": 117824809.0, "step": 97990 }, { "entropy": 1.8452475354075433, "epoch": 0.30379142548701565, "grad_norm": 9.069293975830078, "learning_rate": 4.589941384581842e-06, "loss": 0.4343, "mean_token_accuracy": 0.8545988813042641, "num_tokens": 117837581.0, "step": 98000 }, { "entropy": 1.9193524435162543, "epoch": 0.3038224246120653, "grad_norm": 9.2352933883667, "learning_rate": 4.589707219430166e-06, "loss": 0.483, "mean_token_accuracy": 0.8450753495097161, "num_tokens": 117849268.0, "step": 98010 }, { "entropy": 1.8406575858592986, "epoch": 0.30385342373711505, "grad_norm": 8.772045135498047, "learning_rate": 4.589473090114059e-06, "loss": 0.4357, "mean_token_accuracy": 0.8576031014323234, "num_tokens": 117861975.0, "step": 98020 }, { "entropy": 1.8483123630285263, "epoch": 0.3038844228621647, "grad_norm": 9.5717191696167, "learning_rate": 4.5892389966243866e-06, "loss": 0.4226, "mean_token_accuracy": 0.8474526911973953, "num_tokens": 117874143.0, "step": 98030 }, { "entropy": 1.9467709794640542, "epoch": 0.3039154219872144, "grad_norm": 8.297847747802734, "learning_rate": 4.589004938952009e-06, "loss": 0.488, "mean_token_accuracy": 0.8526682198047638, "num_tokens": 117885322.0, "step": 98040 }, { "entropy": 1.859838457405567, "epoch": 0.3039464211122641, "grad_norm": 8.134725570678711, "learning_rate": 4.588770917087794e-06, "loss": 0.4836, "mean_token_accuracy": 0.8431189700961113, "num_tokens": 117897060.0, "step": 98050 }, { "entropy": 1.8738309249281884, "epoch": 0.3039774202373138, "grad_norm": 4.704370021820068, "learning_rate": 4.5885369310226145e-06, "loss": 0.4939, "mean_token_accuracy": 0.8435174241662026, "num_tokens": 117909430.0, "step": 98060 }, { "entropy": 1.8503778785467149, "epoch": 0.3040084193623635, "grad_norm": 3.1746442317962646, "learning_rate": 4.588302980747341e-06, "loss": 0.4732, "mean_token_accuracy": 0.8513809859752655, "num_tokens": 117920805.0, "step": 98070 }, { "entropy": 1.8909773424267768, "epoch": 0.30403941848741317, "grad_norm": 8.956877708435059, "learning_rate": 4.588069066252854e-06, "loss": 0.5191, "mean_token_accuracy": 0.8422663122415542, "num_tokens": 117932448.0, "step": 98080 }, { "entropy": 1.8859625205397605, "epoch": 0.3040704176124629, "grad_norm": 6.800996780395508, "learning_rate": 4.587835187530031e-06, "loss": 0.4477, "mean_token_accuracy": 0.8531765311956405, "num_tokens": 117944171.0, "step": 98090 }, { "entropy": 1.9186548352241517, "epoch": 0.30410141673751256, "grad_norm": 8.095877647399902, "learning_rate": 4.587601344569756e-06, "loss": 0.5043, "mean_token_accuracy": 0.8412599250674248, "num_tokens": 117956478.0, "step": 98100 }, { "entropy": 1.8034547924995423, "epoch": 0.3041324158625623, "grad_norm": 4.433010578155518, "learning_rate": 4.587367537362918e-06, "loss": 0.4144, "mean_token_accuracy": 0.8495987445116043, "num_tokens": 117969524.0, "step": 98110 }, { "entropy": 1.8625740513205529, "epoch": 0.30416341498761196, "grad_norm": 8.75419807434082, "learning_rate": 4.587133765900404e-06, "loss": 0.4489, "mean_token_accuracy": 0.8563158795237541, "num_tokens": 117982012.0, "step": 98120 }, { "entropy": 1.8077227592468261, "epoch": 0.3041944141126617, "grad_norm": 8.769314765930176, "learning_rate": 4.586900030173109e-06, "loss": 0.4041, "mean_token_accuracy": 0.8591718584299087, "num_tokens": 117994852.0, "step": 98130 }, { "entropy": 1.8287582576274872, "epoch": 0.30422541323771135, "grad_norm": 5.158862590789795, "learning_rate": 4.58666633017193e-06, "loss": 0.4848, "mean_token_accuracy": 0.8468107968568802, "num_tokens": 118007827.0, "step": 98140 }, { "entropy": 1.917576177418232, "epoch": 0.3042564123627611, "grad_norm": 10.31728458404541, "learning_rate": 4.586432665887766e-06, "loss": 0.5309, "mean_token_accuracy": 0.8305649489164353, "num_tokens": 118018948.0, "step": 98150 }, { "entropy": 1.9343233779072762, "epoch": 0.30428741148781074, "grad_norm": 7.640726089477539, "learning_rate": 4.586199037311519e-06, "loss": 0.5013, "mean_token_accuracy": 0.849469755589962, "num_tokens": 118030240.0, "step": 98160 }, { "entropy": 1.874649804830551, "epoch": 0.30431841061286047, "grad_norm": 7.788882255554199, "learning_rate": 4.585965444434098e-06, "loss": 0.4418, "mean_token_accuracy": 0.8480836063623428, "num_tokens": 118042287.0, "step": 98170 }, { "entropy": 1.894361424446106, "epoch": 0.30434940973791014, "grad_norm": 5.997152328491211, "learning_rate": 4.58573188724641e-06, "loss": 0.533, "mean_token_accuracy": 0.8430122479796409, "num_tokens": 118054246.0, "step": 98180 }, { "entropy": 1.8837274074554444, "epoch": 0.30438040886295986, "grad_norm": 8.705531120300293, "learning_rate": 4.585498365739368e-06, "loss": 0.4851, "mean_token_accuracy": 0.8436176598072052, "num_tokens": 118066667.0, "step": 98190 }, { "entropy": 1.905727145075798, "epoch": 0.30441140798800953, "grad_norm": 8.105955123901367, "learning_rate": 4.585264879903889e-06, "loss": 0.4952, "mean_token_accuracy": 0.8468491420149803, "num_tokens": 118078925.0, "step": 98200 }, { "entropy": 1.9071518421173095, "epoch": 0.30444240711305925, "grad_norm": 8.408157348632812, "learning_rate": 4.585031429730893e-06, "loss": 0.4984, "mean_token_accuracy": 0.8430832877755166, "num_tokens": 118090390.0, "step": 98210 }, { "entropy": 1.8832137271761895, "epoch": 0.3044734062381089, "grad_norm": 4.172749042510986, "learning_rate": 4.5847980152113015e-06, "loss": 0.4766, "mean_token_accuracy": 0.8408018171787262, "num_tokens": 118103271.0, "step": 98220 }, { "entropy": 1.8779758632183075, "epoch": 0.30450440536315865, "grad_norm": 6.640566825866699, "learning_rate": 4.584564636336039e-06, "loss": 0.4322, "mean_token_accuracy": 0.8598590075969696, "num_tokens": 118115121.0, "step": 98230 }, { "entropy": 1.8136912897229194, "epoch": 0.3045354044882083, "grad_norm": 9.391630172729492, "learning_rate": 4.584331293096037e-06, "loss": 0.4272, "mean_token_accuracy": 0.8552101209759713, "num_tokens": 118127803.0, "step": 98240 }, { "entropy": 1.7988118350505828, "epoch": 0.30456640361325804, "grad_norm": 3.3279430866241455, "learning_rate": 4.584097985482225e-06, "loss": 0.3666, "mean_token_accuracy": 0.8751557558774948, "num_tokens": 118140663.0, "step": 98250 }, { "entropy": 1.8820227935910225, "epoch": 0.3045974027383077, "grad_norm": 12.045969009399414, "learning_rate": 4.583864713485541e-06, "loss": 0.4841, "mean_token_accuracy": 0.8497835651040078, "num_tokens": 118152695.0, "step": 98260 }, { "entropy": 1.8159508243203164, "epoch": 0.30462840186335743, "grad_norm": 5.319043159484863, "learning_rate": 4.583631477096921e-06, "loss": 0.5005, "mean_token_accuracy": 0.8409684166312218, "num_tokens": 118165387.0, "step": 98270 }, { "entropy": 1.8920594125986099, "epoch": 0.3046594009884071, "grad_norm": 8.387589454650879, "learning_rate": 4.583398276307309e-06, "loss": 0.463, "mean_token_accuracy": 0.8461384385824203, "num_tokens": 118177881.0, "step": 98280 }, { "entropy": 1.8834513157606125, "epoch": 0.30469040011345677, "grad_norm": 7.704468250274658, "learning_rate": 4.58316511110765e-06, "loss": 0.5188, "mean_token_accuracy": 0.84387187063694, "num_tokens": 118189579.0, "step": 98290 }, { "entropy": 1.9479789227247237, "epoch": 0.3047213992385065, "grad_norm": 9.058480262756348, "learning_rate": 4.582931981488891e-06, "loss": 0.5274, "mean_token_accuracy": 0.822111751139164, "num_tokens": 118201583.0, "step": 98300 }, { "entropy": 1.914680229127407, "epoch": 0.30475239836355617, "grad_norm": 9.425095558166504, "learning_rate": 4.582698887441983e-06, "loss": 0.4838, "mean_token_accuracy": 0.8463716924190521, "num_tokens": 118212830.0, "step": 98310 }, { "entropy": 1.9096634030342101, "epoch": 0.3047833974886059, "grad_norm": 4.8716912269592285, "learning_rate": 4.582465828957883e-06, "loss": 0.5376, "mean_token_accuracy": 0.8446076348423958, "num_tokens": 118223765.0, "step": 98320 }, { "entropy": 1.8158196270465852, "epoch": 0.30481439661365556, "grad_norm": 7.788163185119629, "learning_rate": 4.582232806027548e-06, "loss": 0.4027, "mean_token_accuracy": 0.8658209949731827, "num_tokens": 118236493.0, "step": 98330 }, { "entropy": 1.8198370561003685, "epoch": 0.3048453957387053, "grad_norm": 3.936845541000366, "learning_rate": 4.581999818641939e-06, "loss": 0.4219, "mean_token_accuracy": 0.8498961672186851, "num_tokens": 118248905.0, "step": 98340 }, { "entropy": 1.8224836379289626, "epoch": 0.30487639486375495, "grad_norm": 2.4840080738067627, "learning_rate": 4.5817668667920205e-06, "loss": 0.4328, "mean_token_accuracy": 0.8523753926157951, "num_tokens": 118262429.0, "step": 98350 }, { "entropy": 1.776771107316017, "epoch": 0.3049073939888047, "grad_norm": 7.824039936065674, "learning_rate": 4.58153395046876e-06, "loss": 0.3806, "mean_token_accuracy": 0.8675816237926484, "num_tokens": 118275691.0, "step": 98360 }, { "entropy": 1.8841424211859703, "epoch": 0.30493839311385434, "grad_norm": 7.849323749542236, "learning_rate": 4.581301069663129e-06, "loss": 0.5043, "mean_token_accuracy": 0.8445948898792267, "num_tokens": 118288240.0, "step": 98370 }, { "entropy": 1.8760451026260854, "epoch": 0.30496939223890407, "grad_norm": 8.8077974319458, "learning_rate": 4.5810682243661e-06, "loss": 0.4661, "mean_token_accuracy": 0.8469317957758904, "num_tokens": 118301110.0, "step": 98380 }, { "entropy": 1.8852962955832482, "epoch": 0.30500039136395374, "grad_norm": 8.56628704071045, "learning_rate": 4.580835414568652e-06, "loss": 0.476, "mean_token_accuracy": 0.849009807407856, "num_tokens": 118312570.0, "step": 98390 }, { "entropy": 1.8616637766361237, "epoch": 0.30503139048900346, "grad_norm": 4.301907539367676, "learning_rate": 4.580602640261765e-06, "loss": 0.44, "mean_token_accuracy": 0.852383928000927, "num_tokens": 118324622.0, "step": 98400 }, { "entropy": 1.9427771091461181, "epoch": 0.30506238961405313, "grad_norm": 6.355135440826416, "learning_rate": 4.580369901436422e-06, "loss": 0.5128, "mean_token_accuracy": 0.8413548216223716, "num_tokens": 118336164.0, "step": 98410 }, { "entropy": 1.9013810217380525, "epoch": 0.30509338873910286, "grad_norm": 8.685293197631836, "learning_rate": 4.580137198083611e-06, "loss": 0.5058, "mean_token_accuracy": 0.8395018517971039, "num_tokens": 118348429.0, "step": 98420 }, { "entropy": 1.8083545833826065, "epoch": 0.3051243878641525, "grad_norm": 9.026556968688965, "learning_rate": 4.5799045301943205e-06, "loss": 0.4327, "mean_token_accuracy": 0.8529153957962989, "num_tokens": 118361098.0, "step": 98430 }, { "entropy": 1.8322697907686234, "epoch": 0.30515538698920225, "grad_norm": 8.164215087890625, "learning_rate": 4.579671897759546e-06, "loss": 0.49, "mean_token_accuracy": 0.8390917211771012, "num_tokens": 118374073.0, "step": 98440 }, { "entropy": 1.8684598043560983, "epoch": 0.3051863861142519, "grad_norm": 7.570199489593506, "learning_rate": 4.579439300770282e-06, "loss": 0.4692, "mean_token_accuracy": 0.8506176099181175, "num_tokens": 118385894.0, "step": 98450 }, { "entropy": 1.959225982427597, "epoch": 0.30521738523930164, "grad_norm": 8.114801406860352, "learning_rate": 4.579206739217529e-06, "loss": 0.5804, "mean_token_accuracy": 0.8300017550587654, "num_tokens": 118397286.0, "step": 98460 }, { "entropy": 1.865318314731121, "epoch": 0.3052483843643513, "grad_norm": 9.157855033874512, "learning_rate": 4.57897421309229e-06, "loss": 0.4553, "mean_token_accuracy": 0.8516108497977257, "num_tokens": 118408785.0, "step": 98470 }, { "entropy": 1.8846929490566253, "epoch": 0.30527938348940103, "grad_norm": 7.941446304321289, "learning_rate": 4.5787417223855705e-06, "loss": 0.4774, "mean_token_accuracy": 0.8428527131676674, "num_tokens": 118420758.0, "step": 98480 }, { "entropy": 1.8963923439383508, "epoch": 0.3053103826144507, "grad_norm": 9.210304260253906, "learning_rate": 4.57850926708838e-06, "loss": 0.4957, "mean_token_accuracy": 0.8412079870700836, "num_tokens": 118432505.0, "step": 98490 }, { "entropy": 1.9127984009683132, "epoch": 0.30534138173950043, "grad_norm": 8.439544677734375, "learning_rate": 4.578276847191734e-06, "loss": 0.4771, "mean_token_accuracy": 0.8416270837187767, "num_tokens": 118444368.0, "step": 98500 }, { "entropy": 1.8232401758432388, "epoch": 0.3053723808645501, "grad_norm": 8.125101089477539, "learning_rate": 4.578044462686643e-06, "loss": 0.4172, "mean_token_accuracy": 0.8568010330200195, "num_tokens": 118457274.0, "step": 98510 }, { "entropy": 1.988058003783226, "epoch": 0.30540337998959977, "grad_norm": 8.513818740844727, "learning_rate": 4.577812113564129e-06, "loss": 0.5254, "mean_token_accuracy": 0.8399447157979012, "num_tokens": 118468141.0, "step": 98520 }, { "entropy": 1.8777014121413231, "epoch": 0.3054343791146495, "grad_norm": 7.6989850997924805, "learning_rate": 4.577579799815213e-06, "loss": 0.4963, "mean_token_accuracy": 0.8421044409275055, "num_tokens": 118479910.0, "step": 98530 }, { "entropy": 1.8567018955945969, "epoch": 0.30546537823969916, "grad_norm": 8.173233032226562, "learning_rate": 4.57734752143092e-06, "loss": 0.4427, "mean_token_accuracy": 0.8535317838191986, "num_tokens": 118491450.0, "step": 98540 }, { "entropy": 1.8805242463946343, "epoch": 0.3054963773647489, "grad_norm": 8.804410934448242, "learning_rate": 4.577115278402281e-06, "loss": 0.4536, "mean_token_accuracy": 0.8533170059323311, "num_tokens": 118502975.0, "step": 98550 }, { "entropy": 1.7814088612794876, "epoch": 0.30552737648979855, "grad_norm": 8.247437477111816, "learning_rate": 4.5768830707203236e-06, "loss": 0.4217, "mean_token_accuracy": 0.8531454712152481, "num_tokens": 118515769.0, "step": 98560 }, { "entropy": 1.8369492530822753, "epoch": 0.3055583756148483, "grad_norm": 7.819140911102295, "learning_rate": 4.576650898376085e-06, "loss": 0.4961, "mean_token_accuracy": 0.8478553339838981, "num_tokens": 118528258.0, "step": 98570 }, { "entropy": 1.868079724907875, "epoch": 0.30558937473989795, "grad_norm": 4.0178446769714355, "learning_rate": 4.5764187613606045e-06, "loss": 0.4468, "mean_token_accuracy": 0.8433499991893768, "num_tokens": 118540637.0, "step": 98580 }, { "entropy": 1.825770527124405, "epoch": 0.30562037386494767, "grad_norm": 6.859602451324463, "learning_rate": 4.57618665966492e-06, "loss": 0.4441, "mean_token_accuracy": 0.8496432945132255, "num_tokens": 118553277.0, "step": 98590 }, { "entropy": 1.8329296857118607, "epoch": 0.30565137298999734, "grad_norm": 7.438360214233398, "learning_rate": 4.575954593280079e-06, "loss": 0.4231, "mean_token_accuracy": 0.858691118657589, "num_tokens": 118564885.0, "step": 98600 }, { "entropy": 1.8381449341773988, "epoch": 0.30568237211504706, "grad_norm": 4.074737071990967, "learning_rate": 4.575722562197127e-06, "loss": 0.4524, "mean_token_accuracy": 0.8469588398933411, "num_tokens": 118577848.0, "step": 98610 }, { "entropy": 1.85487762093544, "epoch": 0.30571337124009673, "grad_norm": 7.444735527038574, "learning_rate": 4.575490566407115e-06, "loss": 0.4466, "mean_token_accuracy": 0.849855688214302, "num_tokens": 118590154.0, "step": 98620 }, { "entropy": 1.8266935624182223, "epoch": 0.30574437036514646, "grad_norm": 4.6787896156311035, "learning_rate": 4.575258605901098e-06, "loss": 0.419, "mean_token_accuracy": 0.8533053979277611, "num_tokens": 118603425.0, "step": 98630 }, { "entropy": 1.886932836472988, "epoch": 0.3057753694901961, "grad_norm": 7.9331889152526855, "learning_rate": 4.575026680670132e-06, "loss": 0.5059, "mean_token_accuracy": 0.8492220997810364, "num_tokens": 118614798.0, "step": 98640 }, { "entropy": 1.803956750035286, "epoch": 0.30580636861524585, "grad_norm": 7.170061111450195, "learning_rate": 4.5747947907052775e-06, "loss": 0.4131, "mean_token_accuracy": 0.8577531769871711, "num_tokens": 118628191.0, "step": 98650 }, { "entropy": 1.8016648098826409, "epoch": 0.3058373677402955, "grad_norm": 3.502978801727295, "learning_rate": 4.574562935997597e-06, "loss": 0.4022, "mean_token_accuracy": 0.8521159499883652, "num_tokens": 118641385.0, "step": 98660 }, { "entropy": 1.8296085387468337, "epoch": 0.30586836686534524, "grad_norm": 8.538934707641602, "learning_rate": 4.574331116538158e-06, "loss": 0.4299, "mean_token_accuracy": 0.8565147161483765, "num_tokens": 118653485.0, "step": 98670 }, { "entropy": 1.8369739711284638, "epoch": 0.3058993659903949, "grad_norm": 7.255573749542236, "learning_rate": 4.574099332318032e-06, "loss": 0.4302, "mean_token_accuracy": 0.8531767651438713, "num_tokens": 118665318.0, "step": 98680 }, { "entropy": 1.8349349424242973, "epoch": 0.30593036511544464, "grad_norm": 8.855729103088379, "learning_rate": 4.573867583328289e-06, "loss": 0.4502, "mean_token_accuracy": 0.8501504331827163, "num_tokens": 118678218.0, "step": 98690 }, { "entropy": 1.8983908250927926, "epoch": 0.3059613642404943, "grad_norm": 8.671244621276855, "learning_rate": 4.573635869560006e-06, "loss": 0.472, "mean_token_accuracy": 0.8487844780087471, "num_tokens": 118690095.0, "step": 98700 }, { "entropy": 1.8810410752892495, "epoch": 0.30599236336554403, "grad_norm": 8.668618202209473, "learning_rate": 4.573404191004263e-06, "loss": 0.482, "mean_token_accuracy": 0.8451575711369514, "num_tokens": 118701551.0, "step": 98710 }, { "entropy": 1.9009374052286148, "epoch": 0.3060233624905937, "grad_norm": 8.427909851074219, "learning_rate": 4.573172547652142e-06, "loss": 0.4786, "mean_token_accuracy": 0.839051516354084, "num_tokens": 118713008.0, "step": 98720 }, { "entropy": 1.9209471434354781, "epoch": 0.3060543616156434, "grad_norm": 7.763575077056885, "learning_rate": 4.572940939494728e-06, "loss": 0.5099, "mean_token_accuracy": 0.8371775403618813, "num_tokens": 118724161.0, "step": 98730 }, { "entropy": 1.9421784669160842, "epoch": 0.3060853607406931, "grad_norm": 8.41716194152832, "learning_rate": 4.5727093665231095e-06, "loss": 0.498, "mean_token_accuracy": 0.8471307173371315, "num_tokens": 118734733.0, "step": 98740 }, { "entropy": 1.8512969225645066, "epoch": 0.3061163598657428, "grad_norm": 4.9068193435668945, "learning_rate": 4.57247782872838e-06, "loss": 0.4509, "mean_token_accuracy": 0.8417642742395401, "num_tokens": 118747239.0, "step": 98750 }, { "entropy": 1.8801366731524467, "epoch": 0.3061473589907925, "grad_norm": 8.114398956298828, "learning_rate": 4.5722463261016335e-06, "loss": 0.4223, "mean_token_accuracy": 0.8596544772386551, "num_tokens": 118759918.0, "step": 98760 }, { "entropy": 1.8776177152991296, "epoch": 0.30617835811584215, "grad_norm": 7.3587141036987305, "learning_rate": 4.572014858633968e-06, "loss": 0.4446, "mean_token_accuracy": 0.8482073560357094, "num_tokens": 118771855.0, "step": 98770 }, { "entropy": 1.8818192645907401, "epoch": 0.3062093572408919, "grad_norm": 10.548823356628418, "learning_rate": 4.571783426316486e-06, "loss": 0.5322, "mean_token_accuracy": 0.835597887635231, "num_tokens": 118783654.0, "step": 98780 }, { "entropy": 1.8536282077431678, "epoch": 0.30624035636594155, "grad_norm": 3.757561206817627, "learning_rate": 4.571552029140291e-06, "loss": 0.4171, "mean_token_accuracy": 0.860873955488205, "num_tokens": 118795662.0, "step": 98790 }, { "entropy": 1.8578494921326638, "epoch": 0.30627135549099127, "grad_norm": 6.849057197570801, "learning_rate": 4.57132066709649e-06, "loss": 0.5194, "mean_token_accuracy": 0.8355998247861862, "num_tokens": 118807279.0, "step": 98800 }, { "entropy": 1.9314350634813309, "epoch": 0.30630235461604094, "grad_norm": 9.63023853302002, "learning_rate": 4.571089340176196e-06, "loss": 0.5303, "mean_token_accuracy": 0.8435678333044052, "num_tokens": 118818128.0, "step": 98810 }, { "entropy": 1.816218902170658, "epoch": 0.30633335374109066, "grad_norm": 8.004508972167969, "learning_rate": 4.570858048370521e-06, "loss": 0.4384, "mean_token_accuracy": 0.8539455533027649, "num_tokens": 118830413.0, "step": 98820 }, { "entropy": 1.9110864594578743, "epoch": 0.30636435286614033, "grad_norm": 8.797080039978027, "learning_rate": 4.570626791670582e-06, "loss": 0.5289, "mean_token_accuracy": 0.8348402082920074, "num_tokens": 118841694.0, "step": 98830 }, { "entropy": 1.8853062570095063, "epoch": 0.30639535199119006, "grad_norm": 8.307490348815918, "learning_rate": 4.570395570067499e-06, "loss": 0.4921, "mean_token_accuracy": 0.8420171350240707, "num_tokens": 118853649.0, "step": 98840 }, { "entropy": 1.902986891567707, "epoch": 0.3064263511162397, "grad_norm": 6.329873561859131, "learning_rate": 4.5701643835523984e-06, "loss": 0.4958, "mean_token_accuracy": 0.8435382351279259, "num_tokens": 118864431.0, "step": 98850 }, { "entropy": 1.9434664219617843, "epoch": 0.30645735024128945, "grad_norm": 7.9458184242248535, "learning_rate": 4.569933232116404e-06, "loss": 0.5506, "mean_token_accuracy": 0.8374830722808838, "num_tokens": 118875327.0, "step": 98860 }, { "entropy": 1.824971318244934, "epoch": 0.3064883493663391, "grad_norm": 8.772302627563477, "learning_rate": 4.569702115750646e-06, "loss": 0.417, "mean_token_accuracy": 0.8616791382431984, "num_tokens": 118888102.0, "step": 98870 }, { "entropy": 1.924868457019329, "epoch": 0.30651934849138884, "grad_norm": 7.891080379486084, "learning_rate": 4.569471034446258e-06, "loss": 0.4636, "mean_token_accuracy": 0.8509886354207993, "num_tokens": 118898982.0, "step": 98880 }, { "entropy": 1.8691664278507232, "epoch": 0.3065503476164385, "grad_norm": 8.10325813293457, "learning_rate": 4.5692399881943754e-06, "loss": 0.4811, "mean_token_accuracy": 0.8400322616100311, "num_tokens": 118910630.0, "step": 98890 }, { "entropy": 1.8352098122239113, "epoch": 0.30658134674148824, "grad_norm": 3.6725313663482666, "learning_rate": 4.569008976986136e-06, "loss": 0.3725, "mean_token_accuracy": 0.8578842118382454, "num_tokens": 118923666.0, "step": 98900 }, { "entropy": 1.852015070617199, "epoch": 0.3066123458665379, "grad_norm": 7.963640213012695, "learning_rate": 4.568778000812685e-06, "loss": 0.458, "mean_token_accuracy": 0.8513859167695046, "num_tokens": 118935379.0, "step": 98910 }, { "entropy": 1.8301201090216637, "epoch": 0.30664334499158763, "grad_norm": 7.35858678817749, "learning_rate": 4.568547059665164e-06, "loss": 0.4167, "mean_token_accuracy": 0.8498409286141395, "num_tokens": 118948414.0, "step": 98920 }, { "entropy": 1.8332775115966797, "epoch": 0.3066743441166373, "grad_norm": 4.402047157287598, "learning_rate": 4.568316153534725e-06, "loss": 0.4252, "mean_token_accuracy": 0.8492154240608215, "num_tokens": 118961750.0, "step": 98930 }, { "entropy": 1.912602500617504, "epoch": 0.306705343241687, "grad_norm": 8.157501220703125, "learning_rate": 4.568085282412518e-06, "loss": 0.5092, "mean_token_accuracy": 0.8328171208500862, "num_tokens": 118973215.0, "step": 98940 }, { "entropy": 1.8730465933680533, "epoch": 0.3067363423667367, "grad_norm": 10.181466102600098, "learning_rate": 4.567854446289697e-06, "loss": 0.4826, "mean_token_accuracy": 0.8437885701656341, "num_tokens": 118985524.0, "step": 98950 }, { "entropy": 1.902740554511547, "epoch": 0.3067673414917864, "grad_norm": 9.36436939239502, "learning_rate": 4.567623645157422e-06, "loss": 0.4682, "mean_token_accuracy": 0.8489101201295852, "num_tokens": 118997046.0, "step": 98960 }, { "entropy": 1.8747300267219544, "epoch": 0.3067983406168361, "grad_norm": 9.957647323608398, "learning_rate": 4.567392879006852e-06, "loss": 0.4768, "mean_token_accuracy": 0.8495268151164055, "num_tokens": 119009030.0, "step": 98970 }, { "entropy": 1.8686933800578118, "epoch": 0.3068293397418858, "grad_norm": 7.555231094360352, "learning_rate": 4.567162147829152e-06, "loss": 0.491, "mean_token_accuracy": 0.8467110440135002, "num_tokens": 119020843.0, "step": 98980 }, { "entropy": 1.884840413928032, "epoch": 0.3068603388669355, "grad_norm": 7.894239902496338, "learning_rate": 4.56693145161549e-06, "loss": 0.4907, "mean_token_accuracy": 0.8425130993127823, "num_tokens": 119032739.0, "step": 98990 }, { "entropy": 1.846720139682293, "epoch": 0.3068913379919852, "grad_norm": 7.1045403480529785, "learning_rate": 4.566700790357034e-06, "loss": 0.4442, "mean_token_accuracy": 0.8530875831842423, "num_tokens": 119044858.0, "step": 99000 }, { "entropy": 1.9004247322678567, "epoch": 0.30692233711703487, "grad_norm": 3.9099464416503906, "learning_rate": 4.56647016404496e-06, "loss": 0.4907, "mean_token_accuracy": 0.8432266771793365, "num_tokens": 119056720.0, "step": 99010 }, { "entropy": 1.845305010676384, "epoch": 0.30695333624208454, "grad_norm": 4.365594387054443, "learning_rate": 4.566239572670445e-06, "loss": 0.4271, "mean_token_accuracy": 0.8529621243476868, "num_tokens": 119069481.0, "step": 99020 }, { "entropy": 1.8452321007847785, "epoch": 0.30698433536713426, "grad_norm": 7.914214611053467, "learning_rate": 4.566009016224666e-06, "loss": 0.451, "mean_token_accuracy": 0.8480882182717323, "num_tokens": 119081620.0, "step": 99030 }, { "entropy": 1.8590731248259544, "epoch": 0.30701533449218393, "grad_norm": 8.533273696899414, "learning_rate": 4.565778494698808e-06, "loss": 0.4815, "mean_token_accuracy": 0.8423286929726601, "num_tokens": 119093680.0, "step": 99040 }, { "entropy": 1.8688816666603087, "epoch": 0.30704633361723366, "grad_norm": 8.64100456237793, "learning_rate": 4.5655480080840556e-06, "loss": 0.4382, "mean_token_accuracy": 0.8599061399698258, "num_tokens": 119105568.0, "step": 99050 }, { "entropy": 1.853632289171219, "epoch": 0.3070773327422833, "grad_norm": 2.459723949432373, "learning_rate": 4.565317556371598e-06, "loss": 0.4447, "mean_token_accuracy": 0.8528544023633003, "num_tokens": 119118403.0, "step": 99060 }, { "entropy": 1.867573080956936, "epoch": 0.30710833186733305, "grad_norm": 7.473991394042969, "learning_rate": 4.56508713955263e-06, "loss": 0.477, "mean_token_accuracy": 0.844258151948452, "num_tokens": 119130445.0, "step": 99070 }, { "entropy": 1.8869913056492806, "epoch": 0.3071393309923827, "grad_norm": 6.70831298828125, "learning_rate": 4.564856757618344e-06, "loss": 0.4929, "mean_token_accuracy": 0.8497973084449768, "num_tokens": 119142338.0, "step": 99080 }, { "entropy": 1.8372093975543975, "epoch": 0.30717033011743244, "grad_norm": 7.9739603996276855, "learning_rate": 4.564626410559939e-06, "loss": 0.4441, "mean_token_accuracy": 0.8506271958351135, "num_tokens": 119154394.0, "step": 99090 }, { "entropy": 1.8323589369654656, "epoch": 0.3072013292424821, "grad_norm": 8.829228401184082, "learning_rate": 4.564396098368618e-06, "loss": 0.4315, "mean_token_accuracy": 0.8452668026089668, "num_tokens": 119167066.0, "step": 99100 }, { "entropy": 1.9608383178710938, "epoch": 0.30723232836753184, "grad_norm": 8.926813125610352, "learning_rate": 4.564165821035583e-06, "loss": 0.5453, "mean_token_accuracy": 0.8367260545492172, "num_tokens": 119178017.0, "step": 99110 }, { "entropy": 1.861896750330925, "epoch": 0.3072633274925815, "grad_norm": 3.713798761367798, "learning_rate": 4.563935578552043e-06, "loss": 0.4896, "mean_token_accuracy": 0.8421308517456054, "num_tokens": 119190396.0, "step": 99120 }, { "entropy": 1.9013278767466546, "epoch": 0.30729432661763123, "grad_norm": 9.141477584838867, "learning_rate": 4.563705370909211e-06, "loss": 0.5019, "mean_token_accuracy": 0.8393938109278679, "num_tokens": 119201792.0, "step": 99130 }, { "entropy": 1.850807547569275, "epoch": 0.3073253257426809, "grad_norm": 4.37838888168335, "learning_rate": 4.563475198098299e-06, "loss": 0.4531, "mean_token_accuracy": 0.8489650651812554, "num_tokens": 119214868.0, "step": 99140 }, { "entropy": 1.7602989837527274, "epoch": 0.3073563248677306, "grad_norm": 3.604959726333618, "learning_rate": 4.563245060110523e-06, "loss": 0.412, "mean_token_accuracy": 0.8620150581002235, "num_tokens": 119228299.0, "step": 99150 }, { "entropy": 1.949777290225029, "epoch": 0.3073873239927803, "grad_norm": 7.350498199462891, "learning_rate": 4.563014956937104e-06, "loss": 0.5215, "mean_token_accuracy": 0.8472014635801315, "num_tokens": 119239986.0, "step": 99160 }, { "entropy": 1.9088080897927284, "epoch": 0.30741832311783, "grad_norm": 8.599087715148926, "learning_rate": 4.562784888569266e-06, "loss": 0.4898, "mean_token_accuracy": 0.8541075602173805, "num_tokens": 119251421.0, "step": 99170 }, { "entropy": 1.9487151458859444, "epoch": 0.3074493222428797, "grad_norm": 9.30274772644043, "learning_rate": 4.562554854998235e-06, "loss": 0.4915, "mean_token_accuracy": 0.842944149672985, "num_tokens": 119263018.0, "step": 99180 }, { "entropy": 1.8647134892642498, "epoch": 0.3074803213679294, "grad_norm": 7.472745418548584, "learning_rate": 4.56232485621524e-06, "loss": 0.4288, "mean_token_accuracy": 0.8561579942703247, "num_tokens": 119275984.0, "step": 99190 }, { "entropy": 1.8933315068483352, "epoch": 0.3075113204929791, "grad_norm": 8.288209915161133, "learning_rate": 4.5620948922115156e-06, "loss": 0.5475, "mean_token_accuracy": 0.831151905655861, "num_tokens": 119287771.0, "step": 99200 }, { "entropy": 1.8554987102746963, "epoch": 0.3075423196180288, "grad_norm": 7.526989459991455, "learning_rate": 4.561864962978294e-06, "loss": 0.4617, "mean_token_accuracy": 0.8532639876008034, "num_tokens": 119299903.0, "step": 99210 }, { "entropy": 1.9998096972703934, "epoch": 0.30757331874307847, "grad_norm": 9.368232727050781, "learning_rate": 4.5616350685068165e-06, "loss": 0.567, "mean_token_accuracy": 0.8291105717420578, "num_tokens": 119310444.0, "step": 99220 }, { "entropy": 1.898825192451477, "epoch": 0.3076043178681282, "grad_norm": 8.540651321411133, "learning_rate": 4.561405208788324e-06, "loss": 0.5238, "mean_token_accuracy": 0.8367143541574478, "num_tokens": 119321635.0, "step": 99230 }, { "entropy": 1.8685836613178253, "epoch": 0.30763531699317787, "grad_norm": 3.111175775527954, "learning_rate": 4.561175383814061e-06, "loss": 0.4296, "mean_token_accuracy": 0.8581192702054977, "num_tokens": 119333768.0, "step": 99240 }, { "entropy": 1.8917466133832932, "epoch": 0.3076663161182276, "grad_norm": 8.266393661499023, "learning_rate": 4.560945593575276e-06, "loss": 0.4361, "mean_token_accuracy": 0.8560093253850937, "num_tokens": 119345845.0, "step": 99250 }, { "entropy": 1.9194523006677628, "epoch": 0.30769731524327726, "grad_norm": 7.8154683113098145, "learning_rate": 4.560715838063221e-06, "loss": 0.4998, "mean_token_accuracy": 0.8392749205231667, "num_tokens": 119356959.0, "step": 99260 }, { "entropy": 1.9294108331203461, "epoch": 0.3077283143683269, "grad_norm": 7.9482831954956055, "learning_rate": 4.560486117269149e-06, "loss": 0.5774, "mean_token_accuracy": 0.8344770103693009, "num_tokens": 119368443.0, "step": 99270 }, { "entropy": 1.8910026296973228, "epoch": 0.30775931349337665, "grad_norm": 6.961696624755859, "learning_rate": 4.560256431184316e-06, "loss": 0.4661, "mean_token_accuracy": 0.8482179164886474, "num_tokens": 119380584.0, "step": 99280 }, { "entropy": 1.9641872704029084, "epoch": 0.3077903126184263, "grad_norm": 9.529417991638184, "learning_rate": 4.5600267797999856e-06, "loss": 0.5362, "mean_token_accuracy": 0.8355140164494514, "num_tokens": 119390739.0, "step": 99290 }, { "entropy": 1.8502513483166694, "epoch": 0.30782131174347604, "grad_norm": 7.629122734069824, "learning_rate": 4.55979716310742e-06, "loss": 0.4319, "mean_token_accuracy": 0.8559545025229454, "num_tokens": 119403174.0, "step": 99300 }, { "entropy": 1.9880924820899963, "epoch": 0.3078523108685257, "grad_norm": 11.568595886230469, "learning_rate": 4.5595675810978835e-06, "loss": 0.5379, "mean_token_accuracy": 0.8477470189332962, "num_tokens": 119414065.0, "step": 99310 }, { "entropy": 1.9389469519257545, "epoch": 0.30788330999357544, "grad_norm": 8.663850784301758, "learning_rate": 4.559338033762647e-06, "loss": 0.4673, "mean_token_accuracy": 0.8486195042729378, "num_tokens": 119425228.0, "step": 99320 }, { "entropy": 1.8266063407063484, "epoch": 0.3079143091186251, "grad_norm": 5.284822940826416, "learning_rate": 4.559108521092985e-06, "loss": 0.3787, "mean_token_accuracy": 0.861154480278492, "num_tokens": 119439104.0, "step": 99330 }, { "entropy": 1.8920496240258218, "epoch": 0.30794530824367483, "grad_norm": 3.9276669025421143, "learning_rate": 4.558879043080171e-06, "loss": 0.4505, "mean_token_accuracy": 0.8588141456246376, "num_tokens": 119451373.0, "step": 99340 }, { "entropy": 1.951807254552841, "epoch": 0.3079763073687245, "grad_norm": 7.063690662384033, "learning_rate": 4.5586495997154835e-06, "loss": 0.5193, "mean_token_accuracy": 0.8351276248693467, "num_tokens": 119462888.0, "step": 99350 }, { "entropy": 2.002240237593651, "epoch": 0.3080073064937742, "grad_norm": 9.491320610046387, "learning_rate": 4.558420190990207e-06, "loss": 0.5519, "mean_token_accuracy": 0.8310097977519035, "num_tokens": 119473869.0, "step": 99360 }, { "entropy": 1.847242882847786, "epoch": 0.3080383056188239, "grad_norm": 8.938675880432129, "learning_rate": 4.558190816895623e-06, "loss": 0.464, "mean_token_accuracy": 0.8449984312057495, "num_tokens": 119486256.0, "step": 99370 }, { "entropy": 1.865819439291954, "epoch": 0.3080693047438736, "grad_norm": 7.645913600921631, "learning_rate": 4.557961477423024e-06, "loss": 0.4424, "mean_token_accuracy": 0.848369000852108, "num_tokens": 119497901.0, "step": 99380 }, { "entropy": 1.8621415674686432, "epoch": 0.3081003038689233, "grad_norm": 9.088942527770996, "learning_rate": 4.557732172563696e-06, "loss": 0.4749, "mean_token_accuracy": 0.8500281676650048, "num_tokens": 119510003.0, "step": 99390 }, { "entropy": 1.8306550726294517, "epoch": 0.308131302993973, "grad_norm": 7.942266941070557, "learning_rate": 4.557502902308936e-06, "loss": 0.3983, "mean_token_accuracy": 0.8491405546665192, "num_tokens": 119522659.0, "step": 99400 }, { "entropy": 1.9008427813649178, "epoch": 0.3081623021190227, "grad_norm": 10.027981758117676, "learning_rate": 4.557273666650041e-06, "loss": 0.4978, "mean_token_accuracy": 0.8394996523857117, "num_tokens": 119534862.0, "step": 99410 }, { "entropy": 1.9339268684387207, "epoch": 0.3081933012440724, "grad_norm": 8.332462310791016, "learning_rate": 4.5570444655783105e-06, "loss": 0.5281, "mean_token_accuracy": 0.848136767745018, "num_tokens": 119546767.0, "step": 99420 }, { "entropy": 1.8939686760306358, "epoch": 0.3082243003691221, "grad_norm": 7.6620259284973145, "learning_rate": 4.556815299085049e-06, "loss": 0.4571, "mean_token_accuracy": 0.8582808762788773, "num_tokens": 119558146.0, "step": 99430 }, { "entropy": 1.8960684776306151, "epoch": 0.3082552994941718, "grad_norm": 8.92011547088623, "learning_rate": 4.556586167161562e-06, "loss": 0.5102, "mean_token_accuracy": 0.8485264018177986, "num_tokens": 119569433.0, "step": 99440 }, { "entropy": 1.8812731936573983, "epoch": 0.30828629861922147, "grad_norm": 7.918720722198486, "learning_rate": 4.556357069799159e-06, "loss": 0.5013, "mean_token_accuracy": 0.8485799968242645, "num_tokens": 119581623.0, "step": 99450 }, { "entropy": 1.9013389393687248, "epoch": 0.3083172977442712, "grad_norm": 8.778814315795898, "learning_rate": 4.556128006989152e-06, "loss": 0.4826, "mean_token_accuracy": 0.8451300621032715, "num_tokens": 119593213.0, "step": 99460 }, { "entropy": 1.8785056233406068, "epoch": 0.30834829686932086, "grad_norm": 7.801638126373291, "learning_rate": 4.555898978722858e-06, "loss": 0.4349, "mean_token_accuracy": 0.8590483710169792, "num_tokens": 119605344.0, "step": 99470 }, { "entropy": 1.9365007862448693, "epoch": 0.3083792959943706, "grad_norm": 10.32783317565918, "learning_rate": 4.555669984991595e-06, "loss": 0.5556, "mean_token_accuracy": 0.830289502441883, "num_tokens": 119617211.0, "step": 99480 }, { "entropy": 1.9110652074217795, "epoch": 0.30841029511942025, "grad_norm": 7.38109016418457, "learning_rate": 4.555441025786685e-06, "loss": 0.5, "mean_token_accuracy": 0.8413866236805916, "num_tokens": 119628112.0, "step": 99490 }, { "entropy": 1.9205258354544639, "epoch": 0.30844129424447, "grad_norm": 7.779130458831787, "learning_rate": 4.555212101099452e-06, "loss": 0.4783, "mean_token_accuracy": 0.8441153079271316, "num_tokens": 119639907.0, "step": 99500 }, { "entropy": 1.9123625665903092, "epoch": 0.30847229336951965, "grad_norm": 3.761373281478882, "learning_rate": 4.554983210921223e-06, "loss": 0.4505, "mean_token_accuracy": 0.8475768864154816, "num_tokens": 119651966.0, "step": 99510 }, { "entropy": 1.8980589851737022, "epoch": 0.3085032924945693, "grad_norm": 9.282678604125977, "learning_rate": 4.55475435524333e-06, "loss": 0.4699, "mean_token_accuracy": 0.8474918335676194, "num_tokens": 119663224.0, "step": 99520 }, { "entropy": 1.807744611799717, "epoch": 0.30853429161961904, "grad_norm": 8.637643814086914, "learning_rate": 4.554525534057108e-06, "loss": 0.4036, "mean_token_accuracy": 0.8565024748444557, "num_tokens": 119676021.0, "step": 99530 }, { "entropy": 1.936571803689003, "epoch": 0.3085652907446687, "grad_norm": 8.615496635437012, "learning_rate": 4.554296747353892e-06, "loss": 0.4591, "mean_token_accuracy": 0.8555189058184623, "num_tokens": 119686726.0, "step": 99540 }, { "entropy": 1.869454000890255, "epoch": 0.30859628986971843, "grad_norm": 8.078155517578125, "learning_rate": 4.554067995125023e-06, "loss": 0.4736, "mean_token_accuracy": 0.8461382001638412, "num_tokens": 119698220.0, "step": 99550 }, { "entropy": 1.8459780648350717, "epoch": 0.3086272889947681, "grad_norm": 4.338291168212891, "learning_rate": 4.5538392773618436e-06, "loss": 0.4567, "mean_token_accuracy": 0.8427159383893013, "num_tokens": 119710621.0, "step": 99560 }, { "entropy": 1.8610286951065063, "epoch": 0.3086582881198178, "grad_norm": 8.782474517822266, "learning_rate": 4.553610594055699e-06, "loss": 0.4504, "mean_token_accuracy": 0.8488268256187439, "num_tokens": 119722541.0, "step": 99570 }, { "entropy": 1.9070811703801156, "epoch": 0.3086892872448675, "grad_norm": 7.238033771514893, "learning_rate": 4.553381945197941e-06, "loss": 0.4898, "mean_token_accuracy": 0.836867593228817, "num_tokens": 119733908.0, "step": 99580 }, { "entropy": 1.9172224178910255, "epoch": 0.3087202863699172, "grad_norm": 8.506465911865234, "learning_rate": 4.553153330779919e-06, "loss": 0.5402, "mean_token_accuracy": 0.8402721121907234, "num_tokens": 119745237.0, "step": 99590 }, { "entropy": 1.8876484125852584, "epoch": 0.3087512854949669, "grad_norm": 4.413023471832275, "learning_rate": 4.552924750792989e-06, "loss": 0.4712, "mean_token_accuracy": 0.846372652053833, "num_tokens": 119757146.0, "step": 99600 }, { "entropy": 1.8622125744819642, "epoch": 0.3087822846200166, "grad_norm": 8.20417308807373, "learning_rate": 4.552696205228509e-06, "loss": 0.4326, "mean_token_accuracy": 0.8524843230843544, "num_tokens": 119769534.0, "step": 99610 }, { "entropy": 1.847460262477398, "epoch": 0.3088132837450663, "grad_norm": 7.062432289123535, "learning_rate": 4.552467694077842e-06, "loss": 0.4164, "mean_token_accuracy": 0.8552860572934151, "num_tokens": 119781883.0, "step": 99620 }, { "entropy": 1.8279512047767639, "epoch": 0.308844282870116, "grad_norm": 3.6404471397399902, "learning_rate": 4.552239217332351e-06, "loss": 0.4187, "mean_token_accuracy": 0.8477238088846206, "num_tokens": 119795146.0, "step": 99630 }, { "entropy": 1.8767915710806846, "epoch": 0.3088752819951657, "grad_norm": 10.762784004211426, "learning_rate": 4.552010774983402e-06, "loss": 0.466, "mean_token_accuracy": 0.8552503928542137, "num_tokens": 119806803.0, "step": 99640 }, { "entropy": 1.8293926134705543, "epoch": 0.3089062811202154, "grad_norm": 7.491893291473389, "learning_rate": 4.551782367022367e-06, "loss": 0.4573, "mean_token_accuracy": 0.8511452227830887, "num_tokens": 119819906.0, "step": 99650 }, { "entropy": 1.8923638671636582, "epoch": 0.30893728024526507, "grad_norm": 9.077742576599121, "learning_rate": 4.55155399344062e-06, "loss": 0.474, "mean_token_accuracy": 0.8456351578235626, "num_tokens": 119831777.0, "step": 99660 }, { "entropy": 1.9003022998571395, "epoch": 0.3089682793703148, "grad_norm": 8.246248245239258, "learning_rate": 4.551325654229535e-06, "loss": 0.4391, "mean_token_accuracy": 0.8550438165664673, "num_tokens": 119843799.0, "step": 99670 }, { "entropy": 1.8228402808308601, "epoch": 0.30899927849536446, "grad_norm": 5.1642746925354, "learning_rate": 4.551097349380495e-06, "loss": 0.4592, "mean_token_accuracy": 0.8491297498345375, "num_tokens": 119856606.0, "step": 99680 }, { "entropy": 1.96132450401783, "epoch": 0.3090302776204142, "grad_norm": 9.276740074157715, "learning_rate": 4.550869078884878e-06, "loss": 0.5335, "mean_token_accuracy": 0.8399526447057724, "num_tokens": 119867481.0, "step": 99690 }, { "entropy": 1.8656087532639503, "epoch": 0.30906127674546385, "grad_norm": 8.782885551452637, "learning_rate": 4.550640842734073e-06, "loss": 0.4616, "mean_token_accuracy": 0.8477922558784485, "num_tokens": 119879627.0, "step": 99700 }, { "entropy": 1.8200937718153, "epoch": 0.3090922758705136, "grad_norm": 9.573867797851562, "learning_rate": 4.550412640919468e-06, "loss": 0.5322, "mean_token_accuracy": 0.8348228052258492, "num_tokens": 119892028.0, "step": 99710 }, { "entropy": 1.8446323826909066, "epoch": 0.30912327499556325, "grad_norm": 8.252911567687988, "learning_rate": 4.550184473432453e-06, "loss": 0.4397, "mean_token_accuracy": 0.8526568725705147, "num_tokens": 119904371.0, "step": 99720 }, { "entropy": 1.9776091009378434, "epoch": 0.30915427412061297, "grad_norm": 9.516276359558105, "learning_rate": 4.5499563402644234e-06, "loss": 0.5689, "mean_token_accuracy": 0.8284485876560211, "num_tokens": 119915090.0, "step": 99730 }, { "entropy": 1.9320985347032547, "epoch": 0.30918527324566264, "grad_norm": 9.039697647094727, "learning_rate": 4.5497282414067775e-06, "loss": 0.4897, "mean_token_accuracy": 0.850306898355484, "num_tokens": 119926461.0, "step": 99740 }, { "entropy": 1.9415932521224022, "epoch": 0.30921627237071236, "grad_norm": 9.981196403503418, "learning_rate": 4.549500176850916e-06, "loss": 0.5159, "mean_token_accuracy": 0.848623238503933, "num_tokens": 119937402.0, "step": 99750 }, { "entropy": 1.8851657703518867, "epoch": 0.30924727149576203, "grad_norm": 6.050877571105957, "learning_rate": 4.549272146588241e-06, "loss": 0.525, "mean_token_accuracy": 0.8291679427027703, "num_tokens": 119948670.0, "step": 99760 }, { "entropy": 1.8636073276400567, "epoch": 0.3092782706208117, "grad_norm": 9.428133964538574, "learning_rate": 4.54904415061016e-06, "loss": 0.4713, "mean_token_accuracy": 0.8505959510803223, "num_tokens": 119961140.0, "step": 99770 }, { "entropy": 1.8380408152937888, "epoch": 0.3093092697458614, "grad_norm": 8.414907455444336, "learning_rate": 4.548816188908081e-06, "loss": 0.501, "mean_token_accuracy": 0.839817276597023, "num_tokens": 119973426.0, "step": 99780 }, { "entropy": 1.8116330251097679, "epoch": 0.3093402688709111, "grad_norm": 8.415764808654785, "learning_rate": 4.548588261473421e-06, "loss": 0.4328, "mean_token_accuracy": 0.8535687699913979, "num_tokens": 119986183.0, "step": 99790 }, { "entropy": 1.8176366582512855, "epoch": 0.3093712679959608, "grad_norm": 9.07093620300293, "learning_rate": 4.548360368297591e-06, "loss": 0.4462, "mean_token_accuracy": 0.8550222635269165, "num_tokens": 119998588.0, "step": 99800 }, { "entropy": 1.9179295137524606, "epoch": 0.3094022671210105, "grad_norm": 8.444205284118652, "learning_rate": 4.548132509372013e-06, "loss": 0.5187, "mean_token_accuracy": 0.8391743138432503, "num_tokens": 120010012.0, "step": 99810 }, { "entropy": 1.8938566118478775, "epoch": 0.3094332662460602, "grad_norm": 8.31008529663086, "learning_rate": 4.5479046846881064e-06, "loss": 0.4787, "mean_token_accuracy": 0.8451041653752327, "num_tokens": 120022944.0, "step": 99820 }, { "entropy": 1.906507858633995, "epoch": 0.3094642653711099, "grad_norm": 8.747406959533691, "learning_rate": 4.547676894237297e-06, "loss": 0.4485, "mean_token_accuracy": 0.8478483647108078, "num_tokens": 120034662.0, "step": 99830 }, { "entropy": 1.8798761084675788, "epoch": 0.3094952644961596, "grad_norm": 9.963387489318848, "learning_rate": 4.547449138011013e-06, "loss": 0.466, "mean_token_accuracy": 0.8498962029814721, "num_tokens": 120047172.0, "step": 99840 }, { "entropy": 1.8998943746089936, "epoch": 0.3095262636212093, "grad_norm": 8.54676342010498, "learning_rate": 4.5472214160006844e-06, "loss": 0.5129, "mean_token_accuracy": 0.8516019433736801, "num_tokens": 120058507.0, "step": 99850 }, { "entropy": 1.8910057291388511, "epoch": 0.309557262746259, "grad_norm": 9.665603637695312, "learning_rate": 4.546993728197744e-06, "loss": 0.5021, "mean_token_accuracy": 0.8422492250800133, "num_tokens": 120070406.0, "step": 99860 }, { "entropy": 1.9007139950990677, "epoch": 0.30958826187130867, "grad_norm": 8.319001197814941, "learning_rate": 4.546766074593631e-06, "loss": 0.4697, "mean_token_accuracy": 0.8561020061373711, "num_tokens": 120082456.0, "step": 99870 }, { "entropy": 1.9839264631271363, "epoch": 0.3096192609963584, "grad_norm": 7.754944324493408, "learning_rate": 4.546538455179782e-06, "loss": 0.5336, "mean_token_accuracy": 0.836320398747921, "num_tokens": 120093393.0, "step": 99880 }, { "entropy": 1.827572251856327, "epoch": 0.30965026012140806, "grad_norm": 6.521658897399902, "learning_rate": 4.546310869947643e-06, "loss": 0.4143, "mean_token_accuracy": 0.8617355406284333, "num_tokens": 120106497.0, "step": 99890 }, { "entropy": 1.952468115091324, "epoch": 0.3096812592464578, "grad_norm": 7.609374523162842, "learning_rate": 4.546083318888656e-06, "loss": 0.4451, "mean_token_accuracy": 0.8570076480507851, "num_tokens": 120117150.0, "step": 99900 }, { "entropy": 1.9119912654161453, "epoch": 0.30971225837150745, "grad_norm": 8.479022979736328, "learning_rate": 4.5458558019942736e-06, "loss": 0.4679, "mean_token_accuracy": 0.8490529209375381, "num_tokens": 120128766.0, "step": 99910 }, { "entropy": 1.8472444072365761, "epoch": 0.3097432574965572, "grad_norm": 8.135096549987793, "learning_rate": 4.5456283192559455e-06, "loss": 0.4387, "mean_token_accuracy": 0.8514307633042335, "num_tokens": 120140846.0, "step": 99920 }, { "entropy": 1.8320681855082512, "epoch": 0.30977425662160685, "grad_norm": 7.373263835906982, "learning_rate": 4.5454008706651255e-06, "loss": 0.4321, "mean_token_accuracy": 0.8532561391592026, "num_tokens": 120152671.0, "step": 99930 }, { "entropy": 1.8469135209918022, "epoch": 0.30980525574665657, "grad_norm": 6.601968288421631, "learning_rate": 4.545173456213272e-06, "loss": 0.4331, "mean_token_accuracy": 0.8549439549446106, "num_tokens": 120165189.0, "step": 99940 }, { "entropy": 1.8578027948737144, "epoch": 0.30983625487170624, "grad_norm": 9.094026565551758, "learning_rate": 4.5449460758918485e-06, "loss": 0.4063, "mean_token_accuracy": 0.8560818761587143, "num_tokens": 120177445.0, "step": 99950 }, { "entropy": 1.8183515168726445, "epoch": 0.30986725399675596, "grad_norm": 8.197356224060059, "learning_rate": 4.544718729692315e-06, "loss": 0.4337, "mean_token_accuracy": 0.8482865273952485, "num_tokens": 120190572.0, "step": 99960 }, { "entropy": 1.8944060549139976, "epoch": 0.30989825312180563, "grad_norm": 8.696087837219238, "learning_rate": 4.544491417606139e-06, "loss": 0.5191, "mean_token_accuracy": 0.8414214491844177, "num_tokens": 120202087.0, "step": 99970 }, { "entropy": 1.7967160180211068, "epoch": 0.30992925224685536, "grad_norm": 8.540863990783691, "learning_rate": 4.544264139624791e-06, "loss": 0.4583, "mean_token_accuracy": 0.8545686945319175, "num_tokens": 120214607.0, "step": 99980 }, { "entropy": 1.814838644862175, "epoch": 0.309960251371905, "grad_norm": 6.886814594268799, "learning_rate": 4.544036895739743e-06, "loss": 0.42, "mean_token_accuracy": 0.8569683223962784, "num_tokens": 120227295.0, "step": 99990 }, { "entropy": 1.8500340178608894, "epoch": 0.30999125049695475, "grad_norm": 4.226168632507324, "learning_rate": 4.5438096859424714e-06, "loss": 0.4386, "mean_token_accuracy": 0.8484739258885383, "num_tokens": 120240549.0, "step": 100000 }, { "entropy": 1.7947644501924516, "epoch": 0.3100222496220044, "grad_norm": 4.175483703613281, "learning_rate": 4.543582510224454e-06, "loss": 0.4234, "mean_token_accuracy": 0.8510774120688438, "num_tokens": 120253357.0, "step": 100010 }, { "entropy": 1.8611189171671867, "epoch": 0.3100532487470541, "grad_norm": 11.313339233398438, "learning_rate": 4.543355368577173e-06, "loss": 0.4428, "mean_token_accuracy": 0.8596706509590148, "num_tokens": 120265081.0, "step": 100020 }, { "entropy": 1.8015659287571908, "epoch": 0.3100842478721038, "grad_norm": 8.804397583007812, "learning_rate": 4.543128260992112e-06, "loss": 0.4095, "mean_token_accuracy": 0.8543227300047874, "num_tokens": 120278300.0, "step": 100030 }, { "entropy": 1.8594049900770186, "epoch": 0.3101152469971535, "grad_norm": 9.17402458190918, "learning_rate": 4.54290118746076e-06, "loss": 0.4722, "mean_token_accuracy": 0.8479228600859642, "num_tokens": 120289822.0, "step": 100040 }, { "entropy": 1.808043359220028, "epoch": 0.3101462461222032, "grad_norm": 7.376701831817627, "learning_rate": 4.542674147974606e-06, "loss": 0.4142, "mean_token_accuracy": 0.8577449038624764, "num_tokens": 120302594.0, "step": 100050 }, { "entropy": 1.9016132436692714, "epoch": 0.3101772452472529, "grad_norm": 8.157571792602539, "learning_rate": 4.5424471425251435e-06, "loss": 0.4858, "mean_token_accuracy": 0.8419206649065017, "num_tokens": 120314205.0, "step": 100060 }, { "entropy": 1.8275883719325066, "epoch": 0.3102082443723026, "grad_norm": 8.187117576599121, "learning_rate": 4.542220171103871e-06, "loss": 0.4082, "mean_token_accuracy": 0.8554164975881576, "num_tokens": 120327120.0, "step": 100070 }, { "entropy": 1.8765954077243805, "epoch": 0.31023924349735227, "grad_norm": 2.51336932182312, "learning_rate": 4.541993233702286e-06, "loss": 0.5119, "mean_token_accuracy": 0.8441577076911926, "num_tokens": 120338971.0, "step": 100080 }, { "entropy": 1.8348333820700646, "epoch": 0.310270242622402, "grad_norm": 9.080048561096191, "learning_rate": 4.541766330311893e-06, "loss": 0.4552, "mean_token_accuracy": 0.8454143956303597, "num_tokens": 120351609.0, "step": 100090 }, { "entropy": 1.8745562806725502, "epoch": 0.31030124174745166, "grad_norm": 7.322804927825928, "learning_rate": 4.541539460924194e-06, "loss": 0.4858, "mean_token_accuracy": 0.8521630093455315, "num_tokens": 120363483.0, "step": 100100 }, { "entropy": 1.899325506389141, "epoch": 0.3103322408725014, "grad_norm": 9.89995002746582, "learning_rate": 4.541312625530701e-06, "loss": 0.4891, "mean_token_accuracy": 0.8446615591645241, "num_tokens": 120374672.0, "step": 100110 }, { "entropy": 1.8418832674622536, "epoch": 0.31036323999755105, "grad_norm": 7.776636123657227, "learning_rate": 4.541085824122922e-06, "loss": 0.4485, "mean_token_accuracy": 0.8611134603619576, "num_tokens": 120386805.0, "step": 100120 }, { "entropy": 1.908632791042328, "epoch": 0.3103942391226008, "grad_norm": 9.52039909362793, "learning_rate": 4.540859056692375e-06, "loss": 0.5108, "mean_token_accuracy": 0.8384132295846939, "num_tokens": 120398539.0, "step": 100130 }, { "entropy": 1.9329130694270134, "epoch": 0.31042523824765045, "grad_norm": 8.478260040283203, "learning_rate": 4.540632323230573e-06, "loss": 0.5144, "mean_token_accuracy": 0.8484991729259491, "num_tokens": 120409458.0, "step": 100140 }, { "entropy": 1.9148776784539223, "epoch": 0.3104562373727002, "grad_norm": 8.679959297180176, "learning_rate": 4.54040562372904e-06, "loss": 0.505, "mean_token_accuracy": 0.8426013261079788, "num_tokens": 120420579.0, "step": 100150 }, { "entropy": 1.8648797929286958, "epoch": 0.31048723649774984, "grad_norm": 4.328078269958496, "learning_rate": 4.5401789581792985e-06, "loss": 0.4867, "mean_token_accuracy": 0.846242044866085, "num_tokens": 120432678.0, "step": 100160 }, { "entropy": 1.8050495654344558, "epoch": 0.31051823562279957, "grad_norm": 9.263623237609863, "learning_rate": 4.539952326572873e-06, "loss": 0.4235, "mean_token_accuracy": 0.8572616815567017, "num_tokens": 120446167.0, "step": 100170 }, { "entropy": 2.000722700357437, "epoch": 0.31054923474784923, "grad_norm": 7.7509355545043945, "learning_rate": 4.539725728901292e-06, "loss": 0.5552, "mean_token_accuracy": 0.8357238873839379, "num_tokens": 120457079.0, "step": 100180 }, { "entropy": 1.8446622014045715, "epoch": 0.31058023387289896, "grad_norm": 7.534979820251465, "learning_rate": 4.539499165156091e-06, "loss": 0.4584, "mean_token_accuracy": 0.8397464141249656, "num_tokens": 120469823.0, "step": 100190 }, { "entropy": 1.8747181564569473, "epoch": 0.3106112329979486, "grad_norm": 3.7008886337280273, "learning_rate": 4.539272635328803e-06, "loss": 0.472, "mean_token_accuracy": 0.846717146039009, "num_tokens": 120482088.0, "step": 100200 }, { "entropy": 1.8778936117887497, "epoch": 0.31064223212299835, "grad_norm": 8.01706314086914, "learning_rate": 4.539046139410965e-06, "loss": 0.5166, "mean_token_accuracy": 0.8487760618329048, "num_tokens": 120494848.0, "step": 100210 }, { "entropy": 1.8788290441036224, "epoch": 0.310673231248048, "grad_norm": 7.9294915199279785, "learning_rate": 4.53881967739412e-06, "loss": 0.4745, "mean_token_accuracy": 0.8471907436847687, "num_tokens": 120506578.0, "step": 100220 }, { "entropy": 1.8919280216097831, "epoch": 0.31070423037309775, "grad_norm": 4.634057521820068, "learning_rate": 4.53859324926981e-06, "loss": 0.4983, "mean_token_accuracy": 0.8390332102775574, "num_tokens": 120518962.0, "step": 100230 }, { "entropy": 1.73563092648983, "epoch": 0.3107352294981474, "grad_norm": 6.966121196746826, "learning_rate": 4.538366855029584e-06, "loss": 0.408, "mean_token_accuracy": 0.8594012647867203, "num_tokens": 120532862.0, "step": 100240 }, { "entropy": 1.8883020922541618, "epoch": 0.3107662286231971, "grad_norm": 2.8376030921936035, "learning_rate": 4.53814049466499e-06, "loss": 0.4781, "mean_token_accuracy": 0.8497428327798844, "num_tokens": 120544852.0, "step": 100250 }, { "entropy": 1.980030670762062, "epoch": 0.3107972277482468, "grad_norm": 9.871118545532227, "learning_rate": 4.537914168167582e-06, "loss": 0.517, "mean_token_accuracy": 0.8421619832515717, "num_tokens": 120555617.0, "step": 100260 }, { "entropy": 1.9399761855602264, "epoch": 0.3108282268732965, "grad_norm": 8.313036918640137, "learning_rate": 4.5376878755289136e-06, "loss": 0.4704, "mean_token_accuracy": 0.8462784796953201, "num_tokens": 120567039.0, "step": 100270 }, { "entropy": 1.8688796371221543, "epoch": 0.3108592259983462, "grad_norm": 8.893397331237793, "learning_rate": 4.537461616740546e-06, "loss": 0.4449, "mean_token_accuracy": 0.8576091736555099, "num_tokens": 120579341.0, "step": 100280 }, { "entropy": 1.902301235496998, "epoch": 0.31089022512339587, "grad_norm": 8.338861465454102, "learning_rate": 4.53723539179404e-06, "loss": 0.4307, "mean_token_accuracy": 0.8621632814407348, "num_tokens": 120590733.0, "step": 100290 }, { "entropy": 1.9393490612506867, "epoch": 0.3109212242484456, "grad_norm": 7.925196170806885, "learning_rate": 4.53700920068096e-06, "loss": 0.5162, "mean_token_accuracy": 0.8450248718261719, "num_tokens": 120601534.0, "step": 100300 }, { "entropy": 1.8962682634592056, "epoch": 0.31095222337349526, "grad_norm": 7.99591064453125, "learning_rate": 4.536783043392873e-06, "loss": 0.4653, "mean_token_accuracy": 0.8482768073678016, "num_tokens": 120612997.0, "step": 100310 }, { "entropy": 1.9093605667352676, "epoch": 0.310983222498545, "grad_norm": 8.260348320007324, "learning_rate": 4.536556919921349e-06, "loss": 0.4934, "mean_token_accuracy": 0.8399380281567573, "num_tokens": 120624613.0, "step": 100320 }, { "entropy": 1.8416539490222932, "epoch": 0.31101422162359466, "grad_norm": 4.14152193069458, "learning_rate": 4.536330830257964e-06, "loss": 0.4217, "mean_token_accuracy": 0.8592947632074356, "num_tokens": 120637123.0, "step": 100330 }, { "entropy": 1.9081314831972123, "epoch": 0.3110452207486444, "grad_norm": 10.202445030212402, "learning_rate": 4.536104774394291e-06, "loss": 0.5284, "mean_token_accuracy": 0.8314228966832161, "num_tokens": 120648950.0, "step": 100340 }, { "entropy": 1.8932959333062171, "epoch": 0.31107621987369405, "grad_norm": 9.009939193725586, "learning_rate": 4.5358787523219115e-06, "loss": 0.4466, "mean_token_accuracy": 0.8498683959245682, "num_tokens": 120660896.0, "step": 100350 }, { "entropy": 1.8493494719266892, "epoch": 0.3111072189987438, "grad_norm": 3.8888235092163086, "learning_rate": 4.535652764032407e-06, "loss": 0.4515, "mean_token_accuracy": 0.8555730670690537, "num_tokens": 120673291.0, "step": 100360 }, { "entropy": 1.8772372379899025, "epoch": 0.31113821812379344, "grad_norm": 8.41147232055664, "learning_rate": 4.535426809517363e-06, "loss": 0.5155, "mean_token_accuracy": 0.844378887116909, "num_tokens": 120685691.0, "step": 100370 }, { "entropy": 1.86507987678051, "epoch": 0.31116921724884317, "grad_norm": 8.958087921142578, "learning_rate": 4.535200888768366e-06, "loss": 0.4501, "mean_token_accuracy": 0.8503421053290368, "num_tokens": 120697935.0, "step": 100380 }, { "entropy": 1.849946916103363, "epoch": 0.31120021637389284, "grad_norm": 7.368825435638428, "learning_rate": 4.534975001777008e-06, "loss": 0.4453, "mean_token_accuracy": 0.8594206288456917, "num_tokens": 120709752.0, "step": 100390 }, { "entropy": 1.859765648841858, "epoch": 0.31123121549894256, "grad_norm": 7.649475574493408, "learning_rate": 4.5347491485348835e-06, "loss": 0.4323, "mean_token_accuracy": 0.8555939674377442, "num_tokens": 120721721.0, "step": 100400 }, { "entropy": 1.877078004181385, "epoch": 0.31126221462399223, "grad_norm": 8.297224998474121, "learning_rate": 4.534523329033589e-06, "loss": 0.4486, "mean_token_accuracy": 0.8504548445343971, "num_tokens": 120732890.0, "step": 100410 }, { "entropy": 1.8714374005794525, "epoch": 0.31129321374904195, "grad_norm": 4.207152366638184, "learning_rate": 4.534297543264725e-06, "loss": 0.5621, "mean_token_accuracy": 0.8266729831695556, "num_tokens": 120745682.0, "step": 100420 }, { "entropy": 1.8621924549341202, "epoch": 0.3113242128740916, "grad_norm": 8.956070899963379, "learning_rate": 4.534071791219892e-06, "loss": 0.4647, "mean_token_accuracy": 0.8492157995700836, "num_tokens": 120758011.0, "step": 100430 }, { "entropy": 1.9551179885864258, "epoch": 0.31135521199914135, "grad_norm": 8.54804515838623, "learning_rate": 4.533846072890697e-06, "loss": 0.5251, "mean_token_accuracy": 0.83407664000988, "num_tokens": 120768890.0, "step": 100440 }, { "entropy": 1.8986855819821358, "epoch": 0.311386211124191, "grad_norm": 8.727336883544922, "learning_rate": 4.533620388268749e-06, "loss": 0.4687, "mean_token_accuracy": 0.8469956681132317, "num_tokens": 120780472.0, "step": 100450 }, { "entropy": 1.8609646454453468, "epoch": 0.31141721024924074, "grad_norm": 7.309536457061768, "learning_rate": 4.533394737345659e-06, "loss": 0.4281, "mean_token_accuracy": 0.8551263570785522, "num_tokens": 120792295.0, "step": 100460 }, { "entropy": 1.8796255081892013, "epoch": 0.3114482093742904, "grad_norm": 10.224238395690918, "learning_rate": 4.5331691201130415e-06, "loss": 0.4551, "mean_token_accuracy": 0.8438882201910018, "num_tokens": 120804242.0, "step": 100470 }, { "entropy": 1.907512204349041, "epoch": 0.31147920849934013, "grad_norm": 10.103514671325684, "learning_rate": 4.532943536562514e-06, "loss": 0.4917, "mean_token_accuracy": 0.8437295973300933, "num_tokens": 120815319.0, "step": 100480 }, { "entropy": 1.8957229569554328, "epoch": 0.3115102076243898, "grad_norm": 5.280575752258301, "learning_rate": 4.5327179866856965e-06, "loss": 0.4682, "mean_token_accuracy": 0.8484357133507728, "num_tokens": 120827094.0, "step": 100490 }, { "entropy": 1.856425480544567, "epoch": 0.31154120674943947, "grad_norm": 8.965953826904297, "learning_rate": 4.532492470474212e-06, "loss": 0.4583, "mean_token_accuracy": 0.8441311463713645, "num_tokens": 120839247.0, "step": 100500 }, { "entropy": 1.8147829815745353, "epoch": 0.3115722058744892, "grad_norm": 9.263935089111328, "learning_rate": 4.532266987919687e-06, "loss": 0.4365, "mean_token_accuracy": 0.8509012356400489, "num_tokens": 120852066.0, "step": 100510 }, { "entropy": 1.9309598222374915, "epoch": 0.31160320499953886, "grad_norm": 7.9816975593566895, "learning_rate": 4.53204153901375e-06, "loss": 0.4874, "mean_token_accuracy": 0.8446980267763138, "num_tokens": 120863149.0, "step": 100520 }, { "entropy": 1.8563687920570373, "epoch": 0.3116342041245886, "grad_norm": 8.456686973571777, "learning_rate": 4.531816123748033e-06, "loss": 0.4465, "mean_token_accuracy": 0.8485777124762535, "num_tokens": 120875622.0, "step": 100530 }, { "entropy": 1.9105036780238152, "epoch": 0.31166520324963826, "grad_norm": 8.747064590454102, "learning_rate": 4.531590742114171e-06, "loss": 0.4662, "mean_token_accuracy": 0.8524237051606178, "num_tokens": 120886828.0, "step": 100540 }, { "entropy": 1.9014281533658504, "epoch": 0.311696202374688, "grad_norm": 10.515109062194824, "learning_rate": 4.531365394103802e-06, "loss": 0.491, "mean_token_accuracy": 0.8461366832256317, "num_tokens": 120898889.0, "step": 100550 }, { "entropy": 1.7405464336276055, "epoch": 0.31172720149973765, "grad_norm": 4.015621662139893, "learning_rate": 4.531140079708566e-06, "loss": 0.3677, "mean_token_accuracy": 0.8626249134540558, "num_tokens": 120913012.0, "step": 100560 }, { "entropy": 1.875110612809658, "epoch": 0.3117582006247874, "grad_norm": 9.846309661865234, "learning_rate": 4.530914798920107e-06, "loss": 0.4614, "mean_token_accuracy": 0.8495632246136665, "num_tokens": 120924428.0, "step": 100570 }, { "entropy": 1.7907123163342475, "epoch": 0.31178919974983704, "grad_norm": 8.938268661499023, "learning_rate": 4.530689551730072e-06, "loss": 0.4333, "mean_token_accuracy": 0.8574684172868728, "num_tokens": 120937451.0, "step": 100580 }, { "entropy": 1.9089242190122604, "epoch": 0.31182019887488677, "grad_norm": 10.029963493347168, "learning_rate": 4.5304643381301094e-06, "loss": 0.4889, "mean_token_accuracy": 0.8454339489340782, "num_tokens": 120948860.0, "step": 100590 }, { "entropy": 1.9048386812210083, "epoch": 0.31185119799993644, "grad_norm": 8.10409927368164, "learning_rate": 4.530239158111872e-06, "loss": 0.4551, "mean_token_accuracy": 0.8487098172307015, "num_tokens": 120960368.0, "step": 100600 }, { "entropy": 1.7758634328842162, "epoch": 0.31188219712498616, "grad_norm": 3.4512832164764404, "learning_rate": 4.530014011667015e-06, "loss": 0.379, "mean_token_accuracy": 0.8484735369682312, "num_tokens": 120973621.0, "step": 100610 }, { "entropy": 1.9328245520591736, "epoch": 0.31191319625003583, "grad_norm": 9.077108383178711, "learning_rate": 4.5297888987871956e-06, "loss": 0.5028, "mean_token_accuracy": 0.8441737592220306, "num_tokens": 120984223.0, "step": 100620 }, { "entropy": 1.8256781995296478, "epoch": 0.31194419537508555, "grad_norm": 7.908056735992432, "learning_rate": 4.529563819464075e-06, "loss": 0.4364, "mean_token_accuracy": 0.8539151340723038, "num_tokens": 120997133.0, "step": 100630 }, { "entropy": 1.8427986733615398, "epoch": 0.3119751945001352, "grad_norm": 9.019706726074219, "learning_rate": 4.529338773689319e-06, "loss": 0.4545, "mean_token_accuracy": 0.8452917739748955, "num_tokens": 121009965.0, "step": 100640 }, { "entropy": 1.912809681892395, "epoch": 0.31200619362518495, "grad_norm": 8.718803405761719, "learning_rate": 4.529113761454591e-06, "loss": 0.5344, "mean_token_accuracy": 0.8374342992901802, "num_tokens": 121021538.0, "step": 100650 }, { "entropy": 1.8587751030921935, "epoch": 0.3120371927502346, "grad_norm": 8.5054931640625, "learning_rate": 4.528888782751565e-06, "loss": 0.4487, "mean_token_accuracy": 0.8497788742184639, "num_tokens": 121033842.0, "step": 100660 }, { "entropy": 1.8983284369111062, "epoch": 0.31206819187528434, "grad_norm": 8.823607444763184, "learning_rate": 4.52866383757191e-06, "loss": 0.4662, "mean_token_accuracy": 0.8428234368562698, "num_tokens": 121045774.0, "step": 100670 }, { "entropy": 1.9349606305360794, "epoch": 0.312099191000334, "grad_norm": 8.035542488098145, "learning_rate": 4.528438925907303e-06, "loss": 0.5277, "mean_token_accuracy": 0.8406255826354027, "num_tokens": 121057292.0, "step": 100680 }, { "entropy": 1.8452520370483398, "epoch": 0.31213019012538373, "grad_norm": 9.03587818145752, "learning_rate": 4.528214047749422e-06, "loss": 0.4873, "mean_token_accuracy": 0.8514409214258194, "num_tokens": 121070931.0, "step": 100690 }, { "entropy": 1.7869605004787446, "epoch": 0.3121611892504334, "grad_norm": 10.495682716369629, "learning_rate": 4.5279892030899485e-06, "loss": 0.4015, "mean_token_accuracy": 0.8594531282782555, "num_tokens": 121083578.0, "step": 100700 }, { "entropy": 1.8690698593854904, "epoch": 0.3121921883754831, "grad_norm": 8.482170104980469, "learning_rate": 4.527764391920566e-06, "loss": 0.4727, "mean_token_accuracy": 0.8500708416104317, "num_tokens": 121095247.0, "step": 100710 }, { "entropy": 1.9164055377244948, "epoch": 0.3122231875005328, "grad_norm": 8.222436904907227, "learning_rate": 4.527539614232962e-06, "loss": 0.4953, "mean_token_accuracy": 0.8398829951882363, "num_tokens": 121107205.0, "step": 100720 }, { "entropy": 1.8966273233294486, "epoch": 0.3122541866255825, "grad_norm": 10.007433891296387, "learning_rate": 4.527314870018826e-06, "loss": 0.4603, "mean_token_accuracy": 0.8499047040939331, "num_tokens": 121118189.0, "step": 100730 }, { "entropy": 1.9177552998065948, "epoch": 0.3122851857506322, "grad_norm": 9.334546089172363, "learning_rate": 4.527090159269853e-06, "loss": 0.4732, "mean_token_accuracy": 0.8446200251579284, "num_tokens": 121129883.0, "step": 100740 }, { "entropy": 1.951483702659607, "epoch": 0.31231618487568186, "grad_norm": 9.152246475219727, "learning_rate": 4.5268654819777355e-06, "loss": 0.5432, "mean_token_accuracy": 0.8399815455079078, "num_tokens": 121140945.0, "step": 100750 }, { "entropy": 1.843865168094635, "epoch": 0.3123471840007316, "grad_norm": 6.65300178527832, "learning_rate": 4.5266408381341735e-06, "loss": 0.4655, "mean_token_accuracy": 0.846330837905407, "num_tokens": 121153921.0, "step": 100760 }, { "entropy": 1.9063566982746125, "epoch": 0.31237818312578125, "grad_norm": 7.834247589111328, "learning_rate": 4.526416227730868e-06, "loss": 0.549, "mean_token_accuracy": 0.8455073088407516, "num_tokens": 121165017.0, "step": 100770 }, { "entropy": 1.8952314227819442, "epoch": 0.312409182250831, "grad_norm": 8.009584426879883, "learning_rate": 4.526191650759525e-06, "loss": 0.4638, "mean_token_accuracy": 0.846976974606514, "num_tokens": 121176827.0, "step": 100780 }, { "entropy": 1.9277762562036513, "epoch": 0.31244018137588064, "grad_norm": 8.662957191467285, "learning_rate": 4.52596710721185e-06, "loss": 0.5201, "mean_token_accuracy": 0.8444670990109444, "num_tokens": 121187950.0, "step": 100790 }, { "entropy": 1.943731963634491, "epoch": 0.31247118050093037, "grad_norm": 7.965757846832275, "learning_rate": 4.525742597079554e-06, "loss": 0.529, "mean_token_accuracy": 0.8364956364035606, "num_tokens": 121199219.0, "step": 100800 }, { "entropy": 1.9287627547979356, "epoch": 0.31250217962598004, "grad_norm": 6.632516384124756, "learning_rate": 4.52551812035435e-06, "loss": 0.4824, "mean_token_accuracy": 0.8509150311350823, "num_tokens": 121210268.0, "step": 100810 }, { "entropy": 1.9134871885180473, "epoch": 0.31253317875102976, "grad_norm": 4.123570919036865, "learning_rate": 4.525293677027954e-06, "loss": 0.461, "mean_token_accuracy": 0.8477164819836617, "num_tokens": 121222196.0, "step": 100820 }, { "entropy": 1.916480553150177, "epoch": 0.31256417787607943, "grad_norm": 10.33231258392334, "learning_rate": 4.525069267092083e-06, "loss": 0.5124, "mean_token_accuracy": 0.8356222257018089, "num_tokens": 121233821.0, "step": 100830 }, { "entropy": 1.8253281712532043, "epoch": 0.31259517700112915, "grad_norm": 8.607940673828125, "learning_rate": 4.52484489053846e-06, "loss": 0.4572, "mean_token_accuracy": 0.8448203921318054, "num_tokens": 121246155.0, "step": 100840 }, { "entropy": 1.9012310445308684, "epoch": 0.3126261761261788, "grad_norm": 7.7749247550964355, "learning_rate": 4.524620547358811e-06, "loss": 0.4896, "mean_token_accuracy": 0.8430942222476006, "num_tokens": 121258155.0, "step": 100850 }, { "entropy": 1.9564151272177697, "epoch": 0.31265717525122855, "grad_norm": 8.548589706420898, "learning_rate": 4.52439623754486e-06, "loss": 0.4858, "mean_token_accuracy": 0.8461882799863816, "num_tokens": 121269354.0, "step": 100860 }, { "entropy": 1.8889728412032127, "epoch": 0.3126881743762782, "grad_norm": 5.758060455322266, "learning_rate": 4.524171961088339e-06, "loss": 0.4714, "mean_token_accuracy": 0.857310351729393, "num_tokens": 121281304.0, "step": 100870 }, { "entropy": 1.930538135766983, "epoch": 0.31271917350132794, "grad_norm": 8.554981231689453, "learning_rate": 4.523947717980982e-06, "loss": 0.4988, "mean_token_accuracy": 0.8413084149360657, "num_tokens": 121293664.0, "step": 100880 }, { "entropy": 1.950375673174858, "epoch": 0.3127501726263776, "grad_norm": 8.186065673828125, "learning_rate": 4.5237235082145235e-06, "loss": 0.4897, "mean_token_accuracy": 0.849441209435463, "num_tokens": 121304745.0, "step": 100890 }, { "entropy": 1.9061787739396094, "epoch": 0.31278117175142733, "grad_norm": 3.3713457584381104, "learning_rate": 4.523499331780703e-06, "loss": 0.5062, "mean_token_accuracy": 0.8370168238878251, "num_tokens": 121316310.0, "step": 100900 }, { "entropy": 1.830712193250656, "epoch": 0.312812170876477, "grad_norm": 4.071151256561279, "learning_rate": 4.5232751886712615e-06, "loss": 0.4785, "mean_token_accuracy": 0.8477034792304039, "num_tokens": 121328915.0, "step": 100910 }, { "entropy": 1.9675110548734664, "epoch": 0.3128431700015267, "grad_norm": 8.07413101196289, "learning_rate": 4.523051078877946e-06, "loss": 0.5416, "mean_token_accuracy": 0.8389467343688011, "num_tokens": 121339986.0, "step": 100920 }, { "entropy": 1.8929841727018357, "epoch": 0.3128741691265764, "grad_norm": 8.73824691772461, "learning_rate": 4.5228270023925e-06, "loss": 0.4723, "mean_token_accuracy": 0.8490571796894073, "num_tokens": 121352037.0, "step": 100930 }, { "entropy": 1.8892315909266473, "epoch": 0.3129051682516261, "grad_norm": 4.492147922515869, "learning_rate": 4.522602959206678e-06, "loss": 0.4653, "mean_token_accuracy": 0.8552650704979896, "num_tokens": 121364144.0, "step": 100940 }, { "entropy": 1.923583671450615, "epoch": 0.3129361673766758, "grad_norm": 10.050043106079102, "learning_rate": 4.52237894931223e-06, "loss": 0.4653, "mean_token_accuracy": 0.844260835647583, "num_tokens": 121375808.0, "step": 100950 }, { "entropy": 1.9433955758810044, "epoch": 0.3129671665017255, "grad_norm": 9.592854499816895, "learning_rate": 4.522154972700912e-06, "loss": 0.5002, "mean_token_accuracy": 0.8481174424290657, "num_tokens": 121387011.0, "step": 100960 }, { "entropy": 1.847121460735798, "epoch": 0.3129981656267752, "grad_norm": 8.493393898010254, "learning_rate": 4.5219310293644856e-06, "loss": 0.4003, "mean_token_accuracy": 0.8667575031518936, "num_tokens": 121399217.0, "step": 100970 }, { "entropy": 1.895681183040142, "epoch": 0.3130291647518249, "grad_norm": 8.458040237426758, "learning_rate": 4.52170711929471e-06, "loss": 0.4853, "mean_token_accuracy": 0.8476063832640648, "num_tokens": 121411380.0, "step": 100980 }, { "entropy": 1.8396486029028893, "epoch": 0.3130601638768746, "grad_norm": 8.87826156616211, "learning_rate": 4.521483242483351e-06, "loss": 0.4117, "mean_token_accuracy": 0.8522458449006081, "num_tokens": 121424542.0, "step": 100990 }, { "entropy": 1.8425541028380394, "epoch": 0.31309116300192424, "grad_norm": 8.997958183288574, "learning_rate": 4.521259398922175e-06, "loss": 0.4341, "mean_token_accuracy": 0.8560503959655762, "num_tokens": 121437235.0, "step": 101000 }, { "entropy": 1.857857908308506, "epoch": 0.31312216212697397, "grad_norm": 9.751863479614258, "learning_rate": 4.521035588602953e-06, "loss": 0.4642, "mean_token_accuracy": 0.8419872790575027, "num_tokens": 121449522.0, "step": 101010 }, { "entropy": 1.858136995136738, "epoch": 0.31315316125202364, "grad_norm": 3.528123617172241, "learning_rate": 4.520811811517458e-06, "loss": 0.4313, "mean_token_accuracy": 0.8612496837973594, "num_tokens": 121462199.0, "step": 101020 }, { "entropy": 1.907412128150463, "epoch": 0.31318416037707336, "grad_norm": 6.3340744972229, "learning_rate": 4.520588067657467e-06, "loss": 0.4864, "mean_token_accuracy": 0.8482323855161666, "num_tokens": 121474247.0, "step": 101030 }, { "entropy": 1.8920990169048308, "epoch": 0.31321515950212303, "grad_norm": 7.975953578948975, "learning_rate": 4.520364357014758e-06, "loss": 0.4311, "mean_token_accuracy": 0.8457095667719841, "num_tokens": 121486893.0, "step": 101040 }, { "entropy": 1.8772882983088492, "epoch": 0.31324615862717275, "grad_norm": 8.792813301086426, "learning_rate": 4.520140679581111e-06, "loss": 0.4382, "mean_token_accuracy": 0.8604818403720855, "num_tokens": 121499181.0, "step": 101050 }, { "entropy": 1.8207586660981179, "epoch": 0.3132771577522224, "grad_norm": 7.4785590171813965, "learning_rate": 4.519917035348314e-06, "loss": 0.3837, "mean_token_accuracy": 0.8638281837105751, "num_tokens": 121511510.0, "step": 101060 }, { "entropy": 1.847452338039875, "epoch": 0.31330815687727215, "grad_norm": 4.0800886154174805, "learning_rate": 4.519693424308152e-06, "loss": 0.4483, "mean_token_accuracy": 0.8405447885394096, "num_tokens": 121524110.0, "step": 101070 }, { "entropy": 1.9054955273866654, "epoch": 0.3133391560023218, "grad_norm": 5.289824485778809, "learning_rate": 4.519469846452415e-06, "loss": 0.48, "mean_token_accuracy": 0.8457769706845284, "num_tokens": 121535699.0, "step": 101080 }, { "entropy": 1.915067094564438, "epoch": 0.31337015512737154, "grad_norm": 8.302640914916992, "learning_rate": 4.519246301772896e-06, "loss": 0.5167, "mean_token_accuracy": 0.8345979511737823, "num_tokens": 121547506.0, "step": 101090 }, { "entropy": 1.8765003278851509, "epoch": 0.3134011542524212, "grad_norm": 9.370650291442871, "learning_rate": 4.519022790261393e-06, "loss": 0.4425, "mean_token_accuracy": 0.8564626336097717, "num_tokens": 121560190.0, "step": 101100 }, { "entropy": 1.6999532222747802, "epoch": 0.31343215337747093, "grad_norm": 3.211577892303467, "learning_rate": 4.5187993119097045e-06, "loss": 0.3213, "mean_token_accuracy": 0.8762291237711907, "num_tokens": 121574646.0, "step": 101110 }, { "entropy": 1.8197277091443538, "epoch": 0.3134631525025206, "grad_norm": 7.400459289550781, "learning_rate": 4.5185758667096295e-06, "loss": 0.3932, "mean_token_accuracy": 0.8629156649112701, "num_tokens": 121588419.0, "step": 101120 }, { "entropy": 1.9415106505155564, "epoch": 0.3134941516275703, "grad_norm": 9.660387992858887, "learning_rate": 4.518352454652974e-06, "loss": 0.5114, "mean_token_accuracy": 0.8417216569185257, "num_tokens": 121599422.0, "step": 101130 }, { "entropy": 1.9480364575982094, "epoch": 0.31352515075262, "grad_norm": 9.710609436035156, "learning_rate": 4.518129075731546e-06, "loss": 0.5141, "mean_token_accuracy": 0.8369411841034889, "num_tokens": 121610421.0, "step": 101140 }, { "entropy": 1.9201453104615211, "epoch": 0.3135561498776697, "grad_norm": 3.905876874923706, "learning_rate": 4.517905729937153e-06, "loss": 0.5488, "mean_token_accuracy": 0.8450462609529495, "num_tokens": 121622409.0, "step": 101150 }, { "entropy": 1.9243328019976615, "epoch": 0.3135871490027194, "grad_norm": 8.177422523498535, "learning_rate": 4.517682417261611e-06, "loss": 0.4732, "mean_token_accuracy": 0.8475325971841812, "num_tokens": 121634012.0, "step": 101160 }, { "entropy": 1.8541806608438491, "epoch": 0.3136181481277691, "grad_norm": 9.121359825134277, "learning_rate": 4.517459137696734e-06, "loss": 0.4359, "mean_token_accuracy": 0.8584785044193268, "num_tokens": 121646345.0, "step": 101170 }, { "entropy": 1.8980771958827973, "epoch": 0.3136491472528188, "grad_norm": 8.328401565551758, "learning_rate": 4.517235891234341e-06, "loss": 0.4921, "mean_token_accuracy": 0.8456721261143685, "num_tokens": 121657741.0, "step": 101180 }, { "entropy": 1.8372810363769532, "epoch": 0.3136801463778685, "grad_norm": 9.05524730682373, "learning_rate": 4.517012677866254e-06, "loss": 0.4301, "mean_token_accuracy": 0.857391194999218, "num_tokens": 121670981.0, "step": 101190 }, { "entropy": 1.8419764667749405, "epoch": 0.3137111455029182, "grad_norm": 3.9297235012054443, "learning_rate": 4.516789497584297e-06, "loss": 0.4726, "mean_token_accuracy": 0.8536780118942261, "num_tokens": 121682826.0, "step": 101200 }, { "entropy": 1.8321501910686493, "epoch": 0.3137421446279679, "grad_norm": 4.388725757598877, "learning_rate": 4.516566350380297e-06, "loss": 0.4487, "mean_token_accuracy": 0.852819861471653, "num_tokens": 121695953.0, "step": 101210 }, { "entropy": 1.8966490373015403, "epoch": 0.31377314375301757, "grad_norm": 4.4960737228393555, "learning_rate": 4.5163432362460825e-06, "loss": 0.4519, "mean_token_accuracy": 0.8485043302178383, "num_tokens": 121707846.0, "step": 101220 }, { "entropy": 1.8889821529388429, "epoch": 0.3138041428780673, "grad_norm": 9.122937202453613, "learning_rate": 4.516120155173487e-06, "loss": 0.4919, "mean_token_accuracy": 0.8423930436372757, "num_tokens": 121719869.0, "step": 101230 }, { "entropy": 1.832286487519741, "epoch": 0.31383514200311696, "grad_norm": 3.832523822784424, "learning_rate": 4.515897107154348e-06, "loss": 0.4331, "mean_token_accuracy": 0.8503172323107719, "num_tokens": 121732133.0, "step": 101240 }, { "entropy": 1.8415155798196792, "epoch": 0.31386614112816663, "grad_norm": 3.9735727310180664, "learning_rate": 4.515674092180501e-06, "loss": 0.4646, "mean_token_accuracy": 0.848311547935009, "num_tokens": 121744373.0, "step": 101250 }, { "entropy": 1.9100284710526467, "epoch": 0.31389714025321636, "grad_norm": 8.413751602172852, "learning_rate": 4.51545111024379e-06, "loss": 0.4904, "mean_token_accuracy": 0.8440921515226364, "num_tokens": 121755112.0, "step": 101260 }, { "entropy": 1.8759055025875568, "epoch": 0.313928139378266, "grad_norm": 9.331445693969727, "learning_rate": 4.515228161336056e-06, "loss": 0.4728, "mean_token_accuracy": 0.8457464009523392, "num_tokens": 121767583.0, "step": 101270 }, { "entropy": 1.8825054869055748, "epoch": 0.31395913850331575, "grad_norm": 3.7135350704193115, "learning_rate": 4.515005245449148e-06, "loss": 0.4498, "mean_token_accuracy": 0.8491608336567879, "num_tokens": 121779121.0, "step": 101280 }, { "entropy": 1.8358449339866638, "epoch": 0.3139901376283654, "grad_norm": 8.02281379699707, "learning_rate": 4.514782362574916e-06, "loss": 0.4137, "mean_token_accuracy": 0.8574068158864975, "num_tokens": 121791200.0, "step": 101290 }, { "entropy": 1.7923202894628047, "epoch": 0.31402113675341514, "grad_norm": 8.330782890319824, "learning_rate": 4.514559512705209e-06, "loss": 0.4104, "mean_token_accuracy": 0.8662643000483513, "num_tokens": 121804297.0, "step": 101300 }, { "entropy": 1.9169503509998322, "epoch": 0.3140521358784648, "grad_norm": 8.749537467956543, "learning_rate": 4.514336695831886e-06, "loss": 0.5041, "mean_token_accuracy": 0.8455056488513947, "num_tokens": 121815612.0, "step": 101310 }, { "entropy": 1.7872198984026908, "epoch": 0.31408313500351454, "grad_norm": 7.533987522125244, "learning_rate": 4.514113911946806e-06, "loss": 0.4268, "mean_token_accuracy": 0.858782111108303, "num_tokens": 121828475.0, "step": 101320 }, { "entropy": 1.9148533344268799, "epoch": 0.3141141341285642, "grad_norm": 8.236104011535645, "learning_rate": 4.5138911610418245e-06, "loss": 0.5253, "mean_token_accuracy": 0.8438421174883842, "num_tokens": 121838831.0, "step": 101330 }, { "entropy": 1.9065877795219421, "epoch": 0.31414513325361393, "grad_norm": 8.377083778381348, "learning_rate": 4.51366844310881e-06, "loss": 0.4838, "mean_token_accuracy": 0.8459710478782654, "num_tokens": 121850367.0, "step": 101340 }, { "entropy": 1.9168651878833771, "epoch": 0.3141761323786636, "grad_norm": 8.766077995300293, "learning_rate": 4.513445758139627e-06, "loss": 0.4972, "mean_token_accuracy": 0.8526653304696084, "num_tokens": 121860852.0, "step": 101350 }, { "entropy": 1.8872756123542787, "epoch": 0.3142071315037133, "grad_norm": 8.51346492767334, "learning_rate": 4.513223106126145e-06, "loss": 0.4947, "mean_token_accuracy": 0.8485476955771446, "num_tokens": 121871508.0, "step": 101360 }, { "entropy": 1.8321407943964005, "epoch": 0.314238130628763, "grad_norm": 7.802825450897217, "learning_rate": 4.513000487060237e-06, "loss": 0.4132, "mean_token_accuracy": 0.8585144847631454, "num_tokens": 121883191.0, "step": 101370 }, { "entropy": 1.8902756407856942, "epoch": 0.3142691297538127, "grad_norm": 8.383158683776855, "learning_rate": 4.5127779009337785e-06, "loss": 0.5165, "mean_token_accuracy": 0.840720933675766, "num_tokens": 121894503.0, "step": 101380 }, { "entropy": 1.9538305789232253, "epoch": 0.3143001288788624, "grad_norm": 10.174178123474121, "learning_rate": 4.5125553477386455e-06, "loss": 0.5445, "mean_token_accuracy": 0.8378095537424087, "num_tokens": 121905568.0, "step": 101390 }, { "entropy": 1.8209902182221414, "epoch": 0.3143311280039121, "grad_norm": 9.949512481689453, "learning_rate": 4.512332827466718e-06, "loss": 0.466, "mean_token_accuracy": 0.8366820439696312, "num_tokens": 121918020.0, "step": 101400 }, { "entropy": 1.8949487626552581, "epoch": 0.3143621271289618, "grad_norm": 7.5901360511779785, "learning_rate": 4.5121103401098816e-06, "loss": 0.472, "mean_token_accuracy": 0.8482216715812683, "num_tokens": 121929522.0, "step": 101410 }, { "entropy": 1.805781890451908, "epoch": 0.3143931262540115, "grad_norm": 7.198166370391846, "learning_rate": 4.5118878856600216e-06, "loss": 0.4295, "mean_token_accuracy": 0.8508959487080574, "num_tokens": 121941861.0, "step": 101420 }, { "entropy": 1.9467833280563354, "epoch": 0.31442412537906117, "grad_norm": 7.574606895446777, "learning_rate": 4.511665464109026e-06, "loss": 0.531, "mean_token_accuracy": 0.8413808315992355, "num_tokens": 121952629.0, "step": 101430 }, { "entropy": 1.8594104155898095, "epoch": 0.3144551245041109, "grad_norm": 8.646973609924316, "learning_rate": 4.511443075448789e-06, "loss": 0.4942, "mean_token_accuracy": 0.8487472698092461, "num_tokens": 121964076.0, "step": 101440 }, { "entropy": 1.8867372497916222, "epoch": 0.31448612362916056, "grad_norm": 9.636038780212402, "learning_rate": 4.511220719671201e-06, "loss": 0.4743, "mean_token_accuracy": 0.8549345463514328, "num_tokens": 121975596.0, "step": 101450 }, { "entropy": 1.8463930860161781, "epoch": 0.3145171227542103, "grad_norm": 4.379640579223633, "learning_rate": 4.510998396768163e-06, "loss": 0.4399, "mean_token_accuracy": 0.8542065605521202, "num_tokens": 121988727.0, "step": 101460 }, { "entropy": 1.7745927199721336, "epoch": 0.31454812187925996, "grad_norm": 8.495243072509766, "learning_rate": 4.510776106731574e-06, "loss": 0.4697, "mean_token_accuracy": 0.8536753922700882, "num_tokens": 122001880.0, "step": 101470 }, { "entropy": 1.857958073914051, "epoch": 0.3145791210043097, "grad_norm": 8.16500473022461, "learning_rate": 4.510553849553338e-06, "loss": 0.4352, "mean_token_accuracy": 0.8449077561497689, "num_tokens": 122013484.0, "step": 101480 }, { "entropy": 1.894733229279518, "epoch": 0.31461012012935935, "grad_norm": 8.241775512695312, "learning_rate": 4.5103316252253596e-06, "loss": 0.4974, "mean_token_accuracy": 0.8418209850788116, "num_tokens": 122024930.0, "step": 101490 }, { "entropy": 1.8933887377381324, "epoch": 0.314641119254409, "grad_norm": 6.502079963684082, "learning_rate": 4.510109433739546e-06, "loss": 0.5135, "mean_token_accuracy": 0.8463633015751839, "num_tokens": 122036290.0, "step": 101500 }, { "entropy": 1.7902062192559243, "epoch": 0.31467211837945874, "grad_norm": 7.463756084442139, "learning_rate": 4.5098872750878105e-06, "loss": 0.4618, "mean_token_accuracy": 0.8511275082826615, "num_tokens": 122048789.0, "step": 101510 }, { "entropy": 1.8736846506595612, "epoch": 0.3147031175045084, "grad_norm": 8.760028839111328, "learning_rate": 4.509665149262067e-06, "loss": 0.5182, "mean_token_accuracy": 0.8344025030732155, "num_tokens": 122060583.0, "step": 101520 }, { "entropy": 1.863266195356846, "epoch": 0.31473411662955814, "grad_norm": 8.545906066894531, "learning_rate": 4.509443056254233e-06, "loss": 0.427, "mean_token_accuracy": 0.8548782065510749, "num_tokens": 122072828.0, "step": 101530 }, { "entropy": 1.8797732189297676, "epoch": 0.3147651157546078, "grad_norm": 7.313852787017822, "learning_rate": 4.509220996056225e-06, "loss": 0.4835, "mean_token_accuracy": 0.8434521064162255, "num_tokens": 122084579.0, "step": 101540 }, { "entropy": 1.823886838555336, "epoch": 0.31479611487965753, "grad_norm": 5.079513072967529, "learning_rate": 4.508998968659968e-06, "loss": 0.4358, "mean_token_accuracy": 0.8580957293510437, "num_tokens": 122097466.0, "step": 101550 }, { "entropy": 1.8132152885198594, "epoch": 0.3148271140047072, "grad_norm": 10.782577514648438, "learning_rate": 4.508776974057388e-06, "loss": 0.4318, "mean_token_accuracy": 0.838719479739666, "num_tokens": 122109870.0, "step": 101560 }, { "entropy": 1.8953714028000832, "epoch": 0.3148581131297569, "grad_norm": 8.248565673828125, "learning_rate": 4.508555012240411e-06, "loss": 0.5109, "mean_token_accuracy": 0.8457905784249306, "num_tokens": 122121740.0, "step": 101570 }, { "entropy": 1.8286928310990334, "epoch": 0.3148891122548066, "grad_norm": 9.213640213012695, "learning_rate": 4.50833308320097e-06, "loss": 0.421, "mean_token_accuracy": 0.8573044881224632, "num_tokens": 122133522.0, "step": 101580 }, { "entropy": 1.8590153113007546, "epoch": 0.3149201113798563, "grad_norm": 5.629488945007324, "learning_rate": 4.508111186930996e-06, "loss": 0.5116, "mean_token_accuracy": 0.8378587946295738, "num_tokens": 122145368.0, "step": 101590 }, { "entropy": 1.8506905019283295, "epoch": 0.314951110504906, "grad_norm": 7.348735332489014, "learning_rate": 4.507889323422427e-06, "loss": 0.4933, "mean_token_accuracy": 0.8469028800725937, "num_tokens": 122157779.0, "step": 101600 }, { "entropy": 1.8291833832859994, "epoch": 0.3149821096299557, "grad_norm": 2.6711909770965576, "learning_rate": 4.507667492667202e-06, "loss": 0.418, "mean_token_accuracy": 0.8578299075365067, "num_tokens": 122170157.0, "step": 101610 }, { "entropy": 1.9278810530900956, "epoch": 0.3150131087550054, "grad_norm": 6.753495693206787, "learning_rate": 4.507445694657263e-06, "loss": 0.5123, "mean_token_accuracy": 0.8417115181684494, "num_tokens": 122181568.0, "step": 101620 }, { "entropy": 1.8500063866376877, "epoch": 0.3150441078800551, "grad_norm": 9.120920181274414, "learning_rate": 4.507223929384555e-06, "loss": 0.459, "mean_token_accuracy": 0.8450711652636528, "num_tokens": 122193790.0, "step": 101630 }, { "entropy": 1.8878964528441429, "epoch": 0.31507510700510477, "grad_norm": 7.948580265045166, "learning_rate": 4.507002196841023e-06, "loss": 0.4774, "mean_token_accuracy": 0.8558190748095512, "num_tokens": 122205835.0, "step": 101640 }, { "entropy": 1.870565366744995, "epoch": 0.3151061061301545, "grad_norm": 9.229273796081543, "learning_rate": 4.506780497018622e-06, "loss": 0.4816, "mean_token_accuracy": 0.8505363553762436, "num_tokens": 122217561.0, "step": 101650 }, { "entropy": 1.8509984895586968, "epoch": 0.31513710525520416, "grad_norm": 9.244194984436035, "learning_rate": 4.506558829909301e-06, "loss": 0.4356, "mean_token_accuracy": 0.8476543664932251, "num_tokens": 122230248.0, "step": 101660 }, { "entropy": 1.7851860001683235, "epoch": 0.3151681043802539, "grad_norm": 8.21959114074707, "learning_rate": 4.506337195505015e-06, "loss": 0.3845, "mean_token_accuracy": 0.8607925057411194, "num_tokens": 122243301.0, "step": 101670 }, { "entropy": 1.843540082871914, "epoch": 0.31519910350530356, "grad_norm": 8.222967147827148, "learning_rate": 4.506115593797727e-06, "loss": 0.4182, "mean_token_accuracy": 0.8591127768158913, "num_tokens": 122255771.0, "step": 101680 }, { "entropy": 1.8111511290073394, "epoch": 0.3152301026303533, "grad_norm": 4.0410075187683105, "learning_rate": 4.5058940247793955e-06, "loss": 0.4031, "mean_token_accuracy": 0.8595302075147628, "num_tokens": 122268473.0, "step": 101690 }, { "entropy": 1.849461354315281, "epoch": 0.31526110175540295, "grad_norm": 7.890182971954346, "learning_rate": 4.505672488441985e-06, "loss": 0.4679, "mean_token_accuracy": 0.8466949865221978, "num_tokens": 122280554.0, "step": 101700 }, { "entropy": 1.8682439640164374, "epoch": 0.3152921008804527, "grad_norm": 3.920172691345215, "learning_rate": 4.505450984777461e-06, "loss": 0.4249, "mean_token_accuracy": 0.8509997203946114, "num_tokens": 122293129.0, "step": 101710 }, { "entropy": 1.8806020855903625, "epoch": 0.31532310000550234, "grad_norm": 9.121551513671875, "learning_rate": 4.505229513777795e-06, "loss": 0.4952, "mean_token_accuracy": 0.8400007635354996, "num_tokens": 122304732.0, "step": 101720 }, { "entropy": 1.902934204041958, "epoch": 0.315354099130552, "grad_norm": 8.14372730255127, "learning_rate": 4.50500807543496e-06, "loss": 0.4798, "mean_token_accuracy": 0.8440614491701126, "num_tokens": 122315872.0, "step": 101730 }, { "entropy": 1.8729471534490585, "epoch": 0.31538509825560174, "grad_norm": 7.441555023193359, "learning_rate": 4.50478666974093e-06, "loss": 0.4782, "mean_token_accuracy": 0.8485934257507324, "num_tokens": 122327035.0, "step": 101740 }, { "entropy": 1.9011955708265305, "epoch": 0.3154160973806514, "grad_norm": 8.446456909179688, "learning_rate": 4.50456529668768e-06, "loss": 0.5141, "mean_token_accuracy": 0.8409095421433449, "num_tokens": 122338445.0, "step": 101750 }, { "entropy": 1.7948980048298835, "epoch": 0.31544709650570113, "grad_norm": 10.517394065856934, "learning_rate": 4.5043439562671966e-06, "loss": 0.4413, "mean_token_accuracy": 0.8519067704677582, "num_tokens": 122351396.0, "step": 101760 }, { "entropy": 1.7647177547216415, "epoch": 0.3154780956307508, "grad_norm": 7.822119235992432, "learning_rate": 4.504122648471458e-06, "loss": 0.3804, "mean_token_accuracy": 0.8645659565925599, "num_tokens": 122364593.0, "step": 101770 }, { "entropy": 1.912806712090969, "epoch": 0.3155090947558005, "grad_norm": 8.647019386291504, "learning_rate": 4.503901373292454e-06, "loss": 0.5724, "mean_token_accuracy": 0.8309219375252723, "num_tokens": 122376209.0, "step": 101780 }, { "entropy": 1.8549558535218238, "epoch": 0.3155400938808502, "grad_norm": 7.488748073577881, "learning_rate": 4.503680130722171e-06, "loss": 0.4575, "mean_token_accuracy": 0.8498001918196678, "num_tokens": 122388129.0, "step": 101790 }, { "entropy": 1.9467090874910356, "epoch": 0.3155710930058999, "grad_norm": 7.977104663848877, "learning_rate": 4.5034589207526026e-06, "loss": 0.5692, "mean_token_accuracy": 0.8243438869714736, "num_tokens": 122399489.0, "step": 101800 }, { "entropy": 1.8753120481967926, "epoch": 0.3156020921309496, "grad_norm": 7.6365275382995605, "learning_rate": 4.503237743375743e-06, "loss": 0.5049, "mean_token_accuracy": 0.8401636779308319, "num_tokens": 122411610.0, "step": 101810 }, { "entropy": 1.8790741473436356, "epoch": 0.3156330912559993, "grad_norm": 9.488393783569336, "learning_rate": 4.503016598583588e-06, "loss": 0.4516, "mean_token_accuracy": 0.8573433116078377, "num_tokens": 122423284.0, "step": 101820 }, { "entropy": 1.9299686640501021, "epoch": 0.315664090381049, "grad_norm": 7.945889472961426, "learning_rate": 4.502795486368138e-06, "loss": 0.5097, "mean_token_accuracy": 0.8483487293124199, "num_tokens": 122434547.0, "step": 101830 }, { "entropy": 1.778617675602436, "epoch": 0.3156950895060987, "grad_norm": 9.670980453491211, "learning_rate": 4.502574406721396e-06, "loss": 0.4062, "mean_token_accuracy": 0.8523766458034515, "num_tokens": 122447628.0, "step": 101840 }, { "entropy": 1.7951662212610244, "epoch": 0.31572608863114837, "grad_norm": 8.881525039672852, "learning_rate": 4.502353359635368e-06, "loss": 0.385, "mean_token_accuracy": 0.861037427186966, "num_tokens": 122460464.0, "step": 101850 }, { "entropy": 1.8687596887350082, "epoch": 0.3157570877561981, "grad_norm": 7.735236644744873, "learning_rate": 4.502132345102062e-06, "loss": 0.4575, "mean_token_accuracy": 0.853579594194889, "num_tokens": 122471873.0, "step": 101860 }, { "entropy": 1.8538660779595375, "epoch": 0.31578808688124776, "grad_norm": 9.091965675354004, "learning_rate": 4.501911363113488e-06, "loss": 0.4883, "mean_token_accuracy": 0.8424102887511253, "num_tokens": 122483758.0, "step": 101870 }, { "entropy": 1.867423267662525, "epoch": 0.3158190860062975, "grad_norm": 7.476759433746338, "learning_rate": 4.50169041366166e-06, "loss": 0.4226, "mean_token_accuracy": 0.855827559530735, "num_tokens": 122495774.0, "step": 101880 }, { "entropy": 1.9684102550148963, "epoch": 0.31585008513134716, "grad_norm": 9.030182838439941, "learning_rate": 4.501469496738595e-06, "loss": 0.5092, "mean_token_accuracy": 0.8357870906591416, "num_tokens": 122507503.0, "step": 101890 }, { "entropy": 1.8453612983226777, "epoch": 0.3158810842563969, "grad_norm": 3.91621732711792, "learning_rate": 4.501248612336311e-06, "loss": 0.4265, "mean_token_accuracy": 0.8493709444999695, "num_tokens": 122519916.0, "step": 101900 }, { "entropy": 1.8780709460377694, "epoch": 0.31591208338144655, "grad_norm": 4.055104732513428, "learning_rate": 4.501027760446832e-06, "loss": 0.4607, "mean_token_accuracy": 0.8418286308646202, "num_tokens": 122532278.0, "step": 101910 }, { "entropy": 1.8270500496029853, "epoch": 0.3159430825064963, "grad_norm": 8.200374603271484, "learning_rate": 4.500806941062181e-06, "loss": 0.4187, "mean_token_accuracy": 0.8556545317173004, "num_tokens": 122544680.0, "step": 101920 }, { "entropy": 1.8527694910764694, "epoch": 0.31597408163154594, "grad_norm": 7.4447526931762695, "learning_rate": 4.500586154174386e-06, "loss": 0.4066, "mean_token_accuracy": 0.8587725609540939, "num_tokens": 122557057.0, "step": 101930 }, { "entropy": 1.8287393301725388, "epoch": 0.31600508075659567, "grad_norm": 8.269747734069824, "learning_rate": 4.500365399775477e-06, "loss": 0.4286, "mean_token_accuracy": 0.8537361025810242, "num_tokens": 122570020.0, "step": 101940 }, { "entropy": 1.9338515534996987, "epoch": 0.31603607988164534, "grad_norm": 4.0301194190979, "learning_rate": 4.500144677857487e-06, "loss": 0.508, "mean_token_accuracy": 0.8461434602737427, "num_tokens": 122580842.0, "step": 101950 }, { "entropy": 1.8989423006772994, "epoch": 0.31606707900669506, "grad_norm": 9.705557823181152, "learning_rate": 4.499923988412451e-06, "loss": 0.4887, "mean_token_accuracy": 0.844089737534523, "num_tokens": 122592807.0, "step": 101960 }, { "entropy": 1.7944003835320472, "epoch": 0.31609807813174473, "grad_norm": 8.264009475708008, "learning_rate": 4.4997033314324076e-06, "loss": 0.4369, "mean_token_accuracy": 0.853903503715992, "num_tokens": 122606158.0, "step": 101970 }, { "entropy": 1.9085728481411934, "epoch": 0.3161290772567944, "grad_norm": 4.879970073699951, "learning_rate": 4.499482706909398e-06, "loss": 0.5186, "mean_token_accuracy": 0.8358641535043716, "num_tokens": 122618189.0, "step": 101980 }, { "entropy": 1.9110256865620614, "epoch": 0.3161600763818441, "grad_norm": 11.392416954040527, "learning_rate": 4.4992621148354666e-06, "loss": 0.5015, "mean_token_accuracy": 0.8411575809121132, "num_tokens": 122630458.0, "step": 101990 }, { "entropy": 1.8680866047739983, "epoch": 0.3161910755068938, "grad_norm": 8.535940170288086, "learning_rate": 4.499041555202658e-06, "loss": 0.4669, "mean_token_accuracy": 0.8537841528654099, "num_tokens": 122642497.0, "step": 102000 }, { "entropy": 1.9318107053637505, "epoch": 0.3162220746319435, "grad_norm": 7.174762725830078, "learning_rate": 4.498821028003023e-06, "loss": 0.4721, "mean_token_accuracy": 0.8535374835133552, "num_tokens": 122654391.0, "step": 102010 }, { "entropy": 1.8076833948493003, "epoch": 0.3162530737569932, "grad_norm": 8.442280769348145, "learning_rate": 4.498600533228614e-06, "loss": 0.3978, "mean_token_accuracy": 0.8697671309113503, "num_tokens": 122667692.0, "step": 102020 }, { "entropy": 1.9444027364253997, "epoch": 0.3162840728820429, "grad_norm": 8.13665771484375, "learning_rate": 4.498380070871485e-06, "loss": 0.5283, "mean_token_accuracy": 0.8343800812959671, "num_tokens": 122679172.0, "step": 102030 }, { "entropy": 1.8926381140947341, "epoch": 0.3163150720070926, "grad_norm": 4.520792484283447, "learning_rate": 4.498159640923693e-06, "loss": 0.4614, "mean_token_accuracy": 0.8546594932675362, "num_tokens": 122691105.0, "step": 102040 }, { "entropy": 1.9276057749986648, "epoch": 0.3163460711321423, "grad_norm": 9.296843528747559, "learning_rate": 4.497939243377298e-06, "loss": 0.4684, "mean_token_accuracy": 0.8524391040205955, "num_tokens": 122702895.0, "step": 102050 }, { "entropy": 1.9363893195986748, "epoch": 0.316377070257192, "grad_norm": 6.961995601654053, "learning_rate": 4.497718878224365e-06, "loss": 0.5188, "mean_token_accuracy": 0.8385184094309807, "num_tokens": 122714385.0, "step": 102060 }, { "entropy": 1.9360941782593728, "epoch": 0.3164080693822417, "grad_norm": 8.984434127807617, "learning_rate": 4.497498545456957e-06, "loss": 0.4717, "mean_token_accuracy": 0.8492272660136223, "num_tokens": 122725974.0, "step": 102070 }, { "entropy": 1.88591488301754, "epoch": 0.31643906850729137, "grad_norm": 9.188456535339355, "learning_rate": 4.497278245067143e-06, "loss": 0.4751, "mean_token_accuracy": 0.8486263528466225, "num_tokens": 122737041.0, "step": 102080 }, { "entropy": 1.9170858517289162, "epoch": 0.3164700676323411, "grad_norm": 8.99303913116455, "learning_rate": 4.497057977046996e-06, "loss": 0.5033, "mean_token_accuracy": 0.8467420935630798, "num_tokens": 122748309.0, "step": 102090 }, { "entropy": 1.950069211423397, "epoch": 0.31650106675739076, "grad_norm": 8.753754615783691, "learning_rate": 4.4968377413885885e-06, "loss": 0.4896, "mean_token_accuracy": 0.849167463183403, "num_tokens": 122759466.0, "step": 102100 }, { "entropy": 1.9460075750947, "epoch": 0.3165320658824405, "grad_norm": 8.198152542114258, "learning_rate": 4.496617538083995e-06, "loss": 0.4837, "mean_token_accuracy": 0.847400327026844, "num_tokens": 122770569.0, "step": 102110 }, { "entropy": 1.9162710309028625, "epoch": 0.31656306500749015, "grad_norm": 8.986804008483887, "learning_rate": 4.496397367125297e-06, "loss": 0.4862, "mean_token_accuracy": 0.8529955089092255, "num_tokens": 122782469.0, "step": 102120 }, { "entropy": 1.9209685817360878, "epoch": 0.3165940641325399, "grad_norm": 9.081794738769531, "learning_rate": 4.496177228504574e-06, "loss": 0.5058, "mean_token_accuracy": 0.8467201054096222, "num_tokens": 122794654.0, "step": 102130 }, { "entropy": 1.976951664686203, "epoch": 0.31662506325758955, "grad_norm": 7.217496395111084, "learning_rate": 4.495957122213915e-06, "loss": 0.5074, "mean_token_accuracy": 0.852759413421154, "num_tokens": 122805399.0, "step": 102140 }, { "entropy": 1.7019195973873138, "epoch": 0.31665606238263927, "grad_norm": 8.643301963806152, "learning_rate": 4.495737048245404e-06, "loss": 0.4252, "mean_token_accuracy": 0.8538576439023018, "num_tokens": 122819420.0, "step": 102150 }, { "entropy": 1.8965346455574035, "epoch": 0.31668706150768894, "grad_norm": 8.723462104797363, "learning_rate": 4.495517006591132e-06, "loss": 0.476, "mean_token_accuracy": 0.8469094544649124, "num_tokens": 122831159.0, "step": 102160 }, { "entropy": 1.8087441340088843, "epoch": 0.31671806063273866, "grad_norm": 7.827014923095703, "learning_rate": 4.495296997243191e-06, "loss": 0.4458, "mean_token_accuracy": 0.8492914751172066, "num_tokens": 122844637.0, "step": 102170 }, { "entropy": 1.8186364084482194, "epoch": 0.31674905975778833, "grad_norm": 8.309804916381836, "learning_rate": 4.495077020193676e-06, "loss": 0.4508, "mean_token_accuracy": 0.8481403559446334, "num_tokens": 122857719.0, "step": 102180 }, { "entropy": 1.924719001352787, "epoch": 0.31678005888283806, "grad_norm": 8.645857810974121, "learning_rate": 4.494857075434688e-06, "loss": 0.46, "mean_token_accuracy": 0.8571926310658455, "num_tokens": 122869193.0, "step": 102190 }, { "entropy": 1.9453157484531403, "epoch": 0.3168110580078877, "grad_norm": 8.812455177307129, "learning_rate": 4.494637162958325e-06, "loss": 0.5329, "mean_token_accuracy": 0.8361015647649765, "num_tokens": 122879724.0, "step": 102200 }, { "entropy": 1.8693825080990791, "epoch": 0.31684205713293745, "grad_norm": 10.073441505432129, "learning_rate": 4.494417282756691e-06, "loss": 0.4661, "mean_token_accuracy": 0.8441578835248947, "num_tokens": 122892200.0, "step": 102210 }, { "entropy": 1.8365435376763344, "epoch": 0.3168730562579871, "grad_norm": 3.941197633743286, "learning_rate": 4.494197434821895e-06, "loss": 0.4078, "mean_token_accuracy": 0.8510231733322143, "num_tokens": 122904637.0, "step": 102220 }, { "entropy": 1.8284949362277985, "epoch": 0.3169040553830368, "grad_norm": 7.839810371398926, "learning_rate": 4.493977619146042e-06, "loss": 0.4285, "mean_token_accuracy": 0.8462954074144363, "num_tokens": 122917337.0, "step": 102230 }, { "entropy": 1.8114649429917336, "epoch": 0.3169350545080865, "grad_norm": 7.352813243865967, "learning_rate": 4.493757835721245e-06, "loss": 0.4804, "mean_token_accuracy": 0.8442055761814118, "num_tokens": 122930610.0, "step": 102240 }, { "entropy": 1.9509634003043175, "epoch": 0.3169660536331362, "grad_norm": 8.08341121673584, "learning_rate": 4.49353808453962e-06, "loss": 0.5304, "mean_token_accuracy": 0.8404506236314774, "num_tokens": 122941825.0, "step": 102250 }, { "entropy": 1.9052265360951424, "epoch": 0.3169970527581859, "grad_norm": 8.447335243225098, "learning_rate": 4.4933183655932825e-06, "loss": 0.483, "mean_token_accuracy": 0.8546632289886474, "num_tokens": 122953086.0, "step": 102260 }, { "entropy": 1.8697161242365836, "epoch": 0.3170280518832356, "grad_norm": 7.899079322814941, "learning_rate": 4.493098678874353e-06, "loss": 0.4338, "mean_token_accuracy": 0.8598993286490441, "num_tokens": 122965302.0, "step": 102270 }, { "entropy": 1.9763395830988884, "epoch": 0.3170590510082853, "grad_norm": 7.561324119567871, "learning_rate": 4.4928790243749535e-06, "loss": 0.4768, "mean_token_accuracy": 0.8493476256728172, "num_tokens": 122976430.0, "step": 102280 }, { "entropy": 1.9137044921517372, "epoch": 0.31709005013333497, "grad_norm": 8.271001815795898, "learning_rate": 4.49265940208721e-06, "loss": 0.5022, "mean_token_accuracy": 0.8426155328750611, "num_tokens": 122987244.0, "step": 102290 }, { "entropy": 1.8825251743197442, "epoch": 0.3171210492583847, "grad_norm": 9.43757152557373, "learning_rate": 4.4924398120032505e-06, "loss": 0.4783, "mean_token_accuracy": 0.851975978910923, "num_tokens": 122998662.0, "step": 102300 }, { "entropy": 1.8874466240406036, "epoch": 0.31715204838343436, "grad_norm": 8.041158676147461, "learning_rate": 4.492220254115204e-06, "loss": 0.496, "mean_token_accuracy": 0.8433241844177246, "num_tokens": 123010762.0, "step": 102310 }, { "entropy": 1.9816171437501908, "epoch": 0.3171830475084841, "grad_norm": 9.10623836517334, "learning_rate": 4.492000728415204e-06, "loss": 0.5231, "mean_token_accuracy": 0.8387098371982574, "num_tokens": 123021337.0, "step": 102320 }, { "entropy": 1.9616368293762207, "epoch": 0.31721404663353375, "grad_norm": 9.265618324279785, "learning_rate": 4.491781234895389e-06, "loss": 0.5216, "mean_token_accuracy": 0.8416030526161193, "num_tokens": 123032883.0, "step": 102330 }, { "entropy": 1.8963989228010179, "epoch": 0.3172450457585835, "grad_norm": 8.671114921569824, "learning_rate": 4.4915617735478936e-06, "loss": 0.4523, "mean_token_accuracy": 0.8574667409062385, "num_tokens": 123044915.0, "step": 102340 }, { "entropy": 1.8731760695576667, "epoch": 0.31727604488363315, "grad_norm": 7.5588297843933105, "learning_rate": 4.491342344364861e-06, "loss": 0.4867, "mean_token_accuracy": 0.8465674906969071, "num_tokens": 123056529.0, "step": 102350 }, { "entropy": 1.9712413370609283, "epoch": 0.31730704400868287, "grad_norm": 8.025903701782227, "learning_rate": 4.491122947338437e-06, "loss": 0.5524, "mean_token_accuracy": 0.8367073446512222, "num_tokens": 123067329.0, "step": 102360 }, { "entropy": 1.849116511642933, "epoch": 0.31733804313373254, "grad_norm": 7.625123023986816, "learning_rate": 4.490903582460766e-06, "loss": 0.419, "mean_token_accuracy": 0.847317686676979, "num_tokens": 123080562.0, "step": 102370 }, { "entropy": 1.980668443441391, "epoch": 0.31736904225878226, "grad_norm": 7.4852705001831055, "learning_rate": 4.490684249724e-06, "loss": 0.5089, "mean_token_accuracy": 0.8438632428646088, "num_tokens": 123092615.0, "step": 102380 }, { "entropy": 1.9390667513012887, "epoch": 0.31740004138383193, "grad_norm": 8.869697570800781, "learning_rate": 4.4904649491202866e-06, "loss": 0.4443, "mean_token_accuracy": 0.8526061177253723, "num_tokens": 123104152.0, "step": 102390 }, { "entropy": 1.866631529480219, "epoch": 0.31743104050888166, "grad_norm": 7.319535255432129, "learning_rate": 4.490245680641784e-06, "loss": 0.4087, "mean_token_accuracy": 0.8547472521662712, "num_tokens": 123116432.0, "step": 102400 }, { "entropy": 1.9240225657820702, "epoch": 0.3174620396339313, "grad_norm": 7.42906379699707, "learning_rate": 4.490026444280649e-06, "loss": 0.4896, "mean_token_accuracy": 0.8577197745442391, "num_tokens": 123128281.0, "step": 102410 }, { "entropy": 1.8641801089048387, "epoch": 0.31749303875898105, "grad_norm": 8.197351455688477, "learning_rate": 4.48980724002904e-06, "loss": 0.4176, "mean_token_accuracy": 0.8553032830357552, "num_tokens": 123140851.0, "step": 102420 }, { "entropy": 1.8485140323638916, "epoch": 0.3175240378840307, "grad_norm": 11.487910270690918, "learning_rate": 4.489588067879123e-06, "loss": 0.459, "mean_token_accuracy": 0.8603411689400673, "num_tokens": 123153326.0, "step": 102430 }, { "entropy": 1.93117448836565, "epoch": 0.31755503700908044, "grad_norm": 9.506771087646484, "learning_rate": 4.489368927823061e-06, "loss": 0.4954, "mean_token_accuracy": 0.8498334422707557, "num_tokens": 123165165.0, "step": 102440 }, { "entropy": 1.958940689265728, "epoch": 0.3175860361341301, "grad_norm": 7.967811107635498, "learning_rate": 4.489149819853024e-06, "loss": 0.4962, "mean_token_accuracy": 0.847984354197979, "num_tokens": 123176618.0, "step": 102450 }, { "entropy": 1.9103962182998657, "epoch": 0.31761703525917984, "grad_norm": 8.108328819274902, "learning_rate": 4.4889307439611805e-06, "loss": 0.4917, "mean_token_accuracy": 0.8437720760703087, "num_tokens": 123188100.0, "step": 102460 }, { "entropy": 1.916041937470436, "epoch": 0.3176480343842295, "grad_norm": 8.520224571228027, "learning_rate": 4.488711700139705e-06, "loss": 0.4949, "mean_token_accuracy": 0.8483468234539032, "num_tokens": 123199271.0, "step": 102470 }, { "entropy": 1.8879058375954627, "epoch": 0.3176790335092792, "grad_norm": 6.797135829925537, "learning_rate": 4.488492688380775e-06, "loss": 0.4652, "mean_token_accuracy": 0.8475539967417717, "num_tokens": 123211068.0, "step": 102480 }, { "entropy": 1.9228038251399995, "epoch": 0.3177100326343289, "grad_norm": 8.574908256530762, "learning_rate": 4.488273708676567e-06, "loss": 0.48, "mean_token_accuracy": 0.8467731967568397, "num_tokens": 123222860.0, "step": 102490 }, { "entropy": 1.9041634023189544, "epoch": 0.31774103175937857, "grad_norm": 3.588517427444458, "learning_rate": 4.488054761019265e-06, "loss": 0.4622, "mean_token_accuracy": 0.8531323969364166, "num_tokens": 123234902.0, "step": 102500 }, { "entropy": 1.880505882203579, "epoch": 0.3177720308844283, "grad_norm": 7.573834419250488, "learning_rate": 4.487835845401051e-06, "loss": 0.4511, "mean_token_accuracy": 0.8521980285644531, "num_tokens": 123247483.0, "step": 102510 }, { "entropy": 1.9097674801945685, "epoch": 0.31780303000947796, "grad_norm": 8.0872163772583, "learning_rate": 4.487616961814113e-06, "loss": 0.5004, "mean_token_accuracy": 0.8391719311475754, "num_tokens": 123259632.0, "step": 102520 }, { "entropy": 1.9394026383757592, "epoch": 0.3178340291345277, "grad_norm": 7.864894390106201, "learning_rate": 4.48739811025064e-06, "loss": 0.5, "mean_token_accuracy": 0.8454336494207382, "num_tokens": 123270948.0, "step": 102530 }, { "entropy": 1.8619687780737877, "epoch": 0.31786502825957735, "grad_norm": 3.913823127746582, "learning_rate": 4.487179290702825e-06, "loss": 0.465, "mean_token_accuracy": 0.8360814332962037, "num_tokens": 123283604.0, "step": 102540 }, { "entropy": 1.934596875309944, "epoch": 0.3178960273846271, "grad_norm": 8.981165885925293, "learning_rate": 4.486960503162861e-06, "loss": 0.4668, "mean_token_accuracy": 0.8490733042359352, "num_tokens": 123294899.0, "step": 102550 }, { "entropy": 1.9189150497317313, "epoch": 0.31792702650967675, "grad_norm": 7.386570453643799, "learning_rate": 4.4867417476229475e-06, "loss": 0.496, "mean_token_accuracy": 0.8401806384325028, "num_tokens": 123306606.0, "step": 102560 }, { "entropy": 1.746788166463375, "epoch": 0.31795802563472647, "grad_norm": 8.746540069580078, "learning_rate": 4.486523024075284e-06, "loss": 0.3844, "mean_token_accuracy": 0.8678254216909409, "num_tokens": 123319627.0, "step": 102570 }, { "entropy": 1.8892858445644378, "epoch": 0.31798902475977614, "grad_norm": 9.26871109008789, "learning_rate": 4.486304332512073e-06, "loss": 0.4685, "mean_token_accuracy": 0.8421612724661827, "num_tokens": 123331981.0, "step": 102580 }, { "entropy": 1.847158958017826, "epoch": 0.31802002388482586, "grad_norm": 8.670111656188965, "learning_rate": 4.48608567292552e-06, "loss": 0.4146, "mean_token_accuracy": 0.8566896244883537, "num_tokens": 123344136.0, "step": 102590 }, { "entropy": 1.839782963693142, "epoch": 0.31805102300987553, "grad_norm": 3.6876060962677, "learning_rate": 4.485867045307833e-06, "loss": 0.435, "mean_token_accuracy": 0.8545563265681266, "num_tokens": 123356337.0, "step": 102600 }, { "entropy": 1.8948510199785233, "epoch": 0.31808202213492526, "grad_norm": 8.7213773727417, "learning_rate": 4.485648449651225e-06, "loss": 0.4486, "mean_token_accuracy": 0.8595595166087151, "num_tokens": 123367945.0, "step": 102610 }, { "entropy": 1.8524553552269936, "epoch": 0.3181130212599749, "grad_norm": 8.925561904907227, "learning_rate": 4.485429885947906e-06, "loss": 0.4511, "mean_token_accuracy": 0.8614932551980019, "num_tokens": 123380029.0, "step": 102620 }, { "entropy": 1.8003907322883606, "epoch": 0.31814402038502465, "grad_norm": 7.866354465484619, "learning_rate": 4.485211354190095e-06, "loss": 0.3988, "mean_token_accuracy": 0.8583444371819496, "num_tokens": 123392498.0, "step": 102630 }, { "entropy": 1.8503638431429863, "epoch": 0.3181750195100743, "grad_norm": 9.890091896057129, "learning_rate": 4.4849928543700085e-06, "loss": 0.4831, "mean_token_accuracy": 0.8414298683404923, "num_tokens": 123405679.0, "step": 102640 }, { "entropy": 1.8876867666840553, "epoch": 0.31820601863512404, "grad_norm": 4.612701416015625, "learning_rate": 4.4847743864798694e-06, "loss": 0.4913, "mean_token_accuracy": 0.8404382139444351, "num_tokens": 123417818.0, "step": 102650 }, { "entropy": 1.8589577794075012, "epoch": 0.3182370177601737, "grad_norm": 8.45789909362793, "learning_rate": 4.4845559505119026e-06, "loss": 0.4453, "mean_token_accuracy": 0.8587053641676903, "num_tokens": 123430054.0, "step": 102660 }, { "entropy": 1.8961587071418762, "epoch": 0.31826801688522344, "grad_norm": 7.61850118637085, "learning_rate": 4.484337546458332e-06, "loss": 0.4666, "mean_token_accuracy": 0.8480044439435005, "num_tokens": 123442303.0, "step": 102670 }, { "entropy": 1.8941489323973655, "epoch": 0.3182990160102731, "grad_norm": 7.859434604644775, "learning_rate": 4.484119174311389e-06, "loss": 0.4962, "mean_token_accuracy": 0.8440102905035018, "num_tokens": 123454240.0, "step": 102680 }, { "entropy": 1.8841777712106704, "epoch": 0.31833001513532283, "grad_norm": 3.9713032245635986, "learning_rate": 4.483900834063305e-06, "loss": 0.505, "mean_token_accuracy": 0.8437553867697716, "num_tokens": 123465668.0, "step": 102690 }, { "entropy": 1.8325778350234032, "epoch": 0.3183610142603725, "grad_norm": 7.278379917144775, "learning_rate": 4.483682525706316e-06, "loss": 0.4385, "mean_token_accuracy": 0.8592413857579231, "num_tokens": 123478018.0, "step": 102700 }, { "entropy": 1.868157622218132, "epoch": 0.3183920133854222, "grad_norm": 4.052938461303711, "learning_rate": 4.483464249232657e-06, "loss": 0.4497, "mean_token_accuracy": 0.8553399801254272, "num_tokens": 123490636.0, "step": 102710 }, { "entropy": 1.8868106931447983, "epoch": 0.3184230125104719, "grad_norm": 7.89746618270874, "learning_rate": 4.483246004634569e-06, "loss": 0.4555, "mean_token_accuracy": 0.8540831059217453, "num_tokens": 123502078.0, "step": 102720 }, { "entropy": 1.9166793465614318, "epoch": 0.31845401163552156, "grad_norm": 8.033954620361328, "learning_rate": 4.4830277919042956e-06, "loss": 0.4815, "mean_token_accuracy": 0.8464692577719688, "num_tokens": 123513541.0, "step": 102730 }, { "entropy": 1.8216803684830665, "epoch": 0.3184850107605713, "grad_norm": 7.996546745300293, "learning_rate": 4.482809611034082e-06, "loss": 0.4857, "mean_token_accuracy": 0.8364669859409333, "num_tokens": 123526355.0, "step": 102740 }, { "entropy": 1.8570113122463225, "epoch": 0.31851600988562095, "grad_norm": 7.581510066986084, "learning_rate": 4.482591462016174e-06, "loss": 0.429, "mean_token_accuracy": 0.8555145755410194, "num_tokens": 123538211.0, "step": 102750 }, { "entropy": 1.812189969420433, "epoch": 0.3185470090106707, "grad_norm": 9.858771324157715, "learning_rate": 4.482373344842824e-06, "loss": 0.4234, "mean_token_accuracy": 0.8614937171339989, "num_tokens": 123550270.0, "step": 102760 }, { "entropy": 1.8564748376607896, "epoch": 0.31857800813572035, "grad_norm": 6.510000228881836, "learning_rate": 4.482155259506284e-06, "loss": 0.5008, "mean_token_accuracy": 0.8492413744330406, "num_tokens": 123561940.0, "step": 102770 }, { "entropy": 1.9163195461034774, "epoch": 0.31860900726077007, "grad_norm": 7.715924263000488, "learning_rate": 4.4819372059988115e-06, "loss": 0.4913, "mean_token_accuracy": 0.8402003988623619, "num_tokens": 123573352.0, "step": 102780 }, { "entropy": 1.851955994963646, "epoch": 0.31864000638581974, "grad_norm": 8.36665153503418, "learning_rate": 4.4817191843126635e-06, "loss": 0.4581, "mean_token_accuracy": 0.8535103261470794, "num_tokens": 123584588.0, "step": 102790 }, { "entropy": 1.9031839981675147, "epoch": 0.31867100551086947, "grad_norm": 8.757781982421875, "learning_rate": 4.4815011944401015e-06, "loss": 0.469, "mean_token_accuracy": 0.8555735006928444, "num_tokens": 123596035.0, "step": 102800 }, { "entropy": 1.8560205325484276, "epoch": 0.31870200463591913, "grad_norm": 8.463615417480469, "learning_rate": 4.4812832363733894e-06, "loss": 0.4696, "mean_token_accuracy": 0.8499815389513969, "num_tokens": 123607275.0, "step": 102810 }, { "entropy": 1.848600834608078, "epoch": 0.31873300376096886, "grad_norm": 7.636923313140869, "learning_rate": 4.481065310104793e-06, "loss": 0.4672, "mean_token_accuracy": 0.8392280101776123, "num_tokens": 123619498.0, "step": 102820 }, { "entropy": 1.9192082822322845, "epoch": 0.3187640028860185, "grad_norm": 7.930289268493652, "learning_rate": 4.480847415626582e-06, "loss": 0.5126, "mean_token_accuracy": 0.8376297026872634, "num_tokens": 123631251.0, "step": 102830 }, { "entropy": 1.897643305361271, "epoch": 0.31879500201106825, "grad_norm": 8.173277854919434, "learning_rate": 4.480629552931028e-06, "loss": 0.4469, "mean_token_accuracy": 0.8563371822237968, "num_tokens": 123642062.0, "step": 102840 }, { "entropy": 1.88473329693079, "epoch": 0.3188260011361179, "grad_norm": 3.9585182666778564, "learning_rate": 4.480411722010404e-06, "loss": 0.4589, "mean_token_accuracy": 0.8535011231899261, "num_tokens": 123652956.0, "step": 102850 }, { "entropy": 1.8646925508975982, "epoch": 0.31885700026116764, "grad_norm": 9.49219799041748, "learning_rate": 4.4801939228569895e-06, "loss": 0.4491, "mean_token_accuracy": 0.8506576329469681, "num_tokens": 123665406.0, "step": 102860 }, { "entropy": 1.8895646423101424, "epoch": 0.3188879993862173, "grad_norm": 8.293597221374512, "learning_rate": 4.4799761554630605e-06, "loss": 0.5429, "mean_token_accuracy": 0.8343157604336738, "num_tokens": 123677366.0, "step": 102870 }, { "entropy": 1.8557443410158156, "epoch": 0.31891899851126704, "grad_norm": 3.470597505569458, "learning_rate": 4.479758419820902e-06, "loss": 0.4917, "mean_token_accuracy": 0.8372866466641427, "num_tokens": 123689278.0, "step": 102880 }, { "entropy": 1.8852709233760834, "epoch": 0.3189499976363167, "grad_norm": 8.924466133117676, "learning_rate": 4.479540715922798e-06, "loss": 0.488, "mean_token_accuracy": 0.8414081260561943, "num_tokens": 123701095.0, "step": 102890 }, { "entropy": 1.8459098264575005, "epoch": 0.31898099676136643, "grad_norm": 9.664191246032715, "learning_rate": 4.479323043761035e-06, "loss": 0.4614, "mean_token_accuracy": 0.8502274408936501, "num_tokens": 123713345.0, "step": 102900 }, { "entropy": 1.8116124227643013, "epoch": 0.3190119958864161, "grad_norm": 8.264068603515625, "learning_rate": 4.479105403327904e-06, "loss": 0.4638, "mean_token_accuracy": 0.8466699972748757, "num_tokens": 123725722.0, "step": 102910 }, { "entropy": 1.8975732818245887, "epoch": 0.3190429950114658, "grad_norm": 8.010908126831055, "learning_rate": 4.478887794615696e-06, "loss": 0.5165, "mean_token_accuracy": 0.8428432583808899, "num_tokens": 123736798.0, "step": 102920 }, { "entropy": 1.836000031232834, "epoch": 0.3190739941365155, "grad_norm": 10.32380485534668, "learning_rate": 4.4786702176167084e-06, "loss": 0.4547, "mean_token_accuracy": 0.8488903164863586, "num_tokens": 123748662.0, "step": 102930 }, { "entropy": 1.8443504258990289, "epoch": 0.3191049932615652, "grad_norm": 7.753139019012451, "learning_rate": 4.478452672323238e-06, "loss": 0.4525, "mean_token_accuracy": 0.8580980643630027, "num_tokens": 123761662.0, "step": 102940 }, { "entropy": 1.8338020756840705, "epoch": 0.3191359923866149, "grad_norm": 9.386250495910645, "learning_rate": 4.4782351587275865e-06, "loss": 0.4805, "mean_token_accuracy": 0.844118581712246, "num_tokens": 123773805.0, "step": 102950 }, { "entropy": 1.8451603963971137, "epoch": 0.3191669915116646, "grad_norm": 8.358642578125, "learning_rate": 4.478017676822054e-06, "loss": 0.4361, "mean_token_accuracy": 0.8548496574163437, "num_tokens": 123786092.0, "step": 102960 }, { "entropy": 1.9037387266755104, "epoch": 0.3191979906367143, "grad_norm": 10.784769058227539, "learning_rate": 4.47780022659895e-06, "loss": 0.4779, "mean_token_accuracy": 0.8458883672952652, "num_tokens": 123798134.0, "step": 102970 }, { "entropy": 1.9118751406669616, "epoch": 0.31922898976176395, "grad_norm": 8.400568962097168, "learning_rate": 4.47758280805058e-06, "loss": 0.5022, "mean_token_accuracy": 0.8501716613769531, "num_tokens": 123809256.0, "step": 102980 }, { "entropy": 1.9268585234880446, "epoch": 0.3192599888868137, "grad_norm": 9.43397045135498, "learning_rate": 4.477365421169256e-06, "loss": 0.5159, "mean_token_accuracy": 0.8423597529530525, "num_tokens": 123819919.0, "step": 102990 }, { "entropy": 1.7849035665392876, "epoch": 0.31929098801186334, "grad_norm": 8.134185791015625, "learning_rate": 4.477148065947293e-06, "loss": 0.4243, "mean_token_accuracy": 0.8558384835720062, "num_tokens": 123832624.0, "step": 103000 }, { "entropy": 1.8854929059743881, "epoch": 0.31932198713691307, "grad_norm": 7.981983661651611, "learning_rate": 4.476930742377004e-06, "loss": 0.4774, "mean_token_accuracy": 0.8419500634074211, "num_tokens": 123843833.0, "step": 103010 }, { "entropy": 1.910216248035431, "epoch": 0.31935298626196273, "grad_norm": 8.01833438873291, "learning_rate": 4.47671345045071e-06, "loss": 0.527, "mean_token_accuracy": 0.8421685129404068, "num_tokens": 123854736.0, "step": 103020 }, { "entropy": 1.869119329750538, "epoch": 0.31938398538701246, "grad_norm": 9.197733879089355, "learning_rate": 4.4764961901607315e-06, "loss": 0.5199, "mean_token_accuracy": 0.8363765180110931, "num_tokens": 123866473.0, "step": 103030 }, { "entropy": 1.9347961366176605, "epoch": 0.31941498451206213, "grad_norm": 7.009976863861084, "learning_rate": 4.476278961499394e-06, "loss": 0.5033, "mean_token_accuracy": 0.8408634766936303, "num_tokens": 123877914.0, "step": 103040 }, { "entropy": 1.765951743721962, "epoch": 0.31944598363711185, "grad_norm": 6.8173933029174805, "learning_rate": 4.4760617644590216e-06, "loss": 0.3763, "mean_token_accuracy": 0.8628764286637306, "num_tokens": 123891728.0, "step": 103050 }, { "entropy": 1.8665249332785607, "epoch": 0.3194769827621615, "grad_norm": 7.756616592407227, "learning_rate": 4.475844599031945e-06, "loss": 0.4528, "mean_token_accuracy": 0.8461050614714622, "num_tokens": 123904519.0, "step": 103060 }, { "entropy": 1.9601498633623122, "epoch": 0.31950798188721125, "grad_norm": 9.403843879699707, "learning_rate": 4.475627465210497e-06, "loss": 0.4825, "mean_token_accuracy": 0.8456299170851708, "num_tokens": 123915934.0, "step": 103070 }, { "entropy": 1.872973631322384, "epoch": 0.3195389810122609, "grad_norm": 8.8914155960083, "learning_rate": 4.475410362987011e-06, "loss": 0.4958, "mean_token_accuracy": 0.837957626581192, "num_tokens": 123928155.0, "step": 103080 }, { "entropy": 1.909055621922016, "epoch": 0.31956998013731064, "grad_norm": 7.7289509773254395, "learning_rate": 4.475193292353822e-06, "loss": 0.4712, "mean_token_accuracy": 0.848746582865715, "num_tokens": 123939530.0, "step": 103090 }, { "entropy": 1.9184820994734764, "epoch": 0.3196009792623603, "grad_norm": 8.20902156829834, "learning_rate": 4.474976253303274e-06, "loss": 0.4887, "mean_token_accuracy": 0.843792799115181, "num_tokens": 123950860.0, "step": 103100 }, { "entropy": 1.8916692659258842, "epoch": 0.31963197838741003, "grad_norm": 11.049240112304688, "learning_rate": 4.4747592458277056e-06, "loss": 0.5029, "mean_token_accuracy": 0.8373655050992965, "num_tokens": 123962960.0, "step": 103110 }, { "entropy": 1.9099874019622802, "epoch": 0.3196629775124597, "grad_norm": 7.249629497528076, "learning_rate": 4.474542269919464e-06, "loss": 0.5163, "mean_token_accuracy": 0.8369929790496826, "num_tokens": 123974112.0, "step": 103120 }, { "entropy": 1.8539279848337173, "epoch": 0.3196939766375094, "grad_norm": 8.467607498168945, "learning_rate": 4.474325325570893e-06, "loss": 0.4473, "mean_token_accuracy": 0.8553314670920372, "num_tokens": 123986201.0, "step": 103130 }, { "entropy": 1.9297155156731605, "epoch": 0.3197249757625591, "grad_norm": 11.687642097473145, "learning_rate": 4.474108412774347e-06, "loss": 0.5208, "mean_token_accuracy": 0.8387605383992195, "num_tokens": 123997260.0, "step": 103140 }, { "entropy": 1.8051437750458716, "epoch": 0.3197559748876088, "grad_norm": 7.733287334442139, "learning_rate": 4.473891531522177e-06, "loss": 0.3998, "mean_token_accuracy": 0.8608478337526322, "num_tokens": 124009391.0, "step": 103150 }, { "entropy": 1.7467816695570946, "epoch": 0.3197869740126585, "grad_norm": 2.378082036972046, "learning_rate": 4.473674681806737e-06, "loss": 0.3921, "mean_token_accuracy": 0.869289082288742, "num_tokens": 124022647.0, "step": 103160 }, { "entropy": 1.876607745885849, "epoch": 0.3198179731377082, "grad_norm": 9.72205924987793, "learning_rate": 4.473457863620386e-06, "loss": 0.4799, "mean_token_accuracy": 0.8447935312986374, "num_tokens": 124034377.0, "step": 103170 }, { "entropy": 1.8972087427973747, "epoch": 0.3198489722627579, "grad_norm": 7.99284553527832, "learning_rate": 4.473241076955484e-06, "loss": 0.4895, "mean_token_accuracy": 0.847452299296856, "num_tokens": 124045591.0, "step": 103180 }, { "entropy": 1.8817680388689042, "epoch": 0.3198799713878076, "grad_norm": 8.3921480178833, "learning_rate": 4.473024321804395e-06, "loss": 0.4908, "mean_token_accuracy": 0.8453219920396805, "num_tokens": 124056874.0, "step": 103190 }, { "entropy": 1.845687435567379, "epoch": 0.3199109705128573, "grad_norm": 8.912002563476562, "learning_rate": 4.472807598159483e-06, "loss": 0.4672, "mean_token_accuracy": 0.845789223909378, "num_tokens": 124068631.0, "step": 103200 }, { "entropy": 1.7905006155371666, "epoch": 0.319941969637907, "grad_norm": 9.591071128845215, "learning_rate": 4.472590906013117e-06, "loss": 0.4584, "mean_token_accuracy": 0.8493612468242645, "num_tokens": 124081286.0, "step": 103210 }, { "entropy": 1.9089469194412232, "epoch": 0.31997296876295667, "grad_norm": 7.169071197509766, "learning_rate": 4.4723742453576675e-06, "loss": 0.502, "mean_token_accuracy": 0.8382734417915344, "num_tokens": 124092376.0, "step": 103220 }, { "entropy": 1.8337425097823143, "epoch": 0.32000396788800634, "grad_norm": 8.232246398925781, "learning_rate": 4.472157616185508e-06, "loss": 0.4213, "mean_token_accuracy": 0.8578509896993637, "num_tokens": 124104861.0, "step": 103230 }, { "entropy": 1.9013035476207734, "epoch": 0.32003496701305606, "grad_norm": 8.12004280090332, "learning_rate": 4.471941018489015e-06, "loss": 0.4745, "mean_token_accuracy": 0.8492400407791137, "num_tokens": 124116168.0, "step": 103240 }, { "entropy": 1.8799728155136108, "epoch": 0.32006596613810573, "grad_norm": 7.341602325439453, "learning_rate": 4.471724452260566e-06, "loss": 0.4878, "mean_token_accuracy": 0.8415581986308098, "num_tokens": 124127690.0, "step": 103250 }, { "entropy": 1.8120905488729477, "epoch": 0.32009696526315545, "grad_norm": 9.098899841308594, "learning_rate": 4.471507917492542e-06, "loss": 0.4301, "mean_token_accuracy": 0.8570401355624199, "num_tokens": 124140339.0, "step": 103260 }, { "entropy": 1.8739472836256028, "epoch": 0.3201279643882051, "grad_norm": 9.830232620239258, "learning_rate": 4.471291414177328e-06, "loss": 0.4751, "mean_token_accuracy": 0.8453883707523346, "num_tokens": 124151956.0, "step": 103270 }, { "entropy": 1.84071436971426, "epoch": 0.32015896351325485, "grad_norm": 8.632100105285645, "learning_rate": 4.47107494230731e-06, "loss": 0.4897, "mean_token_accuracy": 0.8345205932855606, "num_tokens": 124165412.0, "step": 103280 }, { "entropy": 1.8590612232685089, "epoch": 0.3201899626383045, "grad_norm": 4.4268975257873535, "learning_rate": 4.470858501874875e-06, "loss": 0.4495, "mean_token_accuracy": 0.8525667667388916, "num_tokens": 124177344.0, "step": 103290 }, { "entropy": 1.887212759256363, "epoch": 0.32022096176335424, "grad_norm": 7.400831699371338, "learning_rate": 4.470642092872416e-06, "loss": 0.4953, "mean_token_accuracy": 0.8470708400011062, "num_tokens": 124188119.0, "step": 103300 }, { "entropy": 1.8698201969265937, "epoch": 0.3202519608884039, "grad_norm": 9.03722858428955, "learning_rate": 4.470425715292328e-06, "loss": 0.4894, "mean_token_accuracy": 0.8450605183839798, "num_tokens": 124199654.0, "step": 103310 }, { "entropy": 1.9616728246212005, "epoch": 0.32028296001345363, "grad_norm": 7.2699503898620605, "learning_rate": 4.4702093691270045e-06, "loss": 0.4896, "mean_token_accuracy": 0.8387251660227776, "num_tokens": 124210832.0, "step": 103320 }, { "entropy": 1.888064543902874, "epoch": 0.3203139591385033, "grad_norm": 9.468146324157715, "learning_rate": 4.469993054368849e-06, "loss": 0.52, "mean_token_accuracy": 0.8365824237465859, "num_tokens": 124223111.0, "step": 103330 }, { "entropy": 1.8957604005932809, "epoch": 0.320344958263553, "grad_norm": 4.3390302658081055, "learning_rate": 4.469776771010258e-06, "loss": 0.5079, "mean_token_accuracy": 0.8410006389021873, "num_tokens": 124234226.0, "step": 103340 }, { "entropy": 1.9063668623566628, "epoch": 0.3203759573886027, "grad_norm": 8.515616416931152, "learning_rate": 4.46956051904364e-06, "loss": 0.4756, "mean_token_accuracy": 0.8457826554775238, "num_tokens": 124245737.0, "step": 103350 }, { "entropy": 1.8583616137504577, "epoch": 0.3204069565136524, "grad_norm": 8.301240921020508, "learning_rate": 4.469344298461399e-06, "loss": 0.4415, "mean_token_accuracy": 0.8528946593403817, "num_tokens": 124258363.0, "step": 103360 }, { "entropy": 1.8560489758849144, "epoch": 0.3204379556387021, "grad_norm": 7.1716532707214355, "learning_rate": 4.4691281092559474e-06, "loss": 0.4214, "mean_token_accuracy": 0.8615340426564216, "num_tokens": 124270052.0, "step": 103370 }, { "entropy": 1.762592874467373, "epoch": 0.3204689547637518, "grad_norm": 3.3581652641296387, "learning_rate": 4.468911951419696e-06, "loss": 0.3764, "mean_token_accuracy": 0.8641716584563255, "num_tokens": 124283377.0, "step": 103380 }, { "entropy": 1.8988282978534698, "epoch": 0.3204999538888015, "grad_norm": 6.756303310394287, "learning_rate": 4.468695824945058e-06, "loss": 0.4711, "mean_token_accuracy": 0.8476831942796708, "num_tokens": 124294752.0, "step": 103390 }, { "entropy": 1.846705262362957, "epoch": 0.3205309530138512, "grad_norm": 3.833975315093994, "learning_rate": 4.46847972982445e-06, "loss": 0.4031, "mean_token_accuracy": 0.8555555865168571, "num_tokens": 124306566.0, "step": 103400 }, { "entropy": 1.8886190831661225, "epoch": 0.3205619521389009, "grad_norm": 7.964195728302002, "learning_rate": 4.4682636660502945e-06, "loss": 0.5005, "mean_token_accuracy": 0.8455360010266304, "num_tokens": 124318282.0, "step": 103410 }, { "entropy": 1.9076228067278862, "epoch": 0.3205929512639506, "grad_norm": 7.338057994842529, "learning_rate": 4.468047633615013e-06, "loss": 0.4968, "mean_token_accuracy": 0.8368429213762283, "num_tokens": 124329917.0, "step": 103420 }, { "entropy": 1.9275338113307954, "epoch": 0.32062395038900027, "grad_norm": 10.642918586730957, "learning_rate": 4.46783163251103e-06, "loss": 0.5179, "mean_token_accuracy": 0.8408781468868256, "num_tokens": 124341468.0, "step": 103430 }, { "entropy": 1.967161425948143, "epoch": 0.32065494951405, "grad_norm": 7.011558532714844, "learning_rate": 4.467615662730772e-06, "loss": 0.535, "mean_token_accuracy": 0.8357431918382645, "num_tokens": 124352683.0, "step": 103440 }, { "entropy": 1.8656065806746482, "epoch": 0.32068594863909966, "grad_norm": 7.173338413238525, "learning_rate": 4.467399724266671e-06, "loss": 0.4432, "mean_token_accuracy": 0.8562691912055016, "num_tokens": 124364944.0, "step": 103450 }, { "entropy": 1.7879148676991463, "epoch": 0.32071694776414933, "grad_norm": 8.152206420898438, "learning_rate": 4.467183817111157e-06, "loss": 0.422, "mean_token_accuracy": 0.8556606069207191, "num_tokens": 124378575.0, "step": 103460 }, { "entropy": 1.9068648159503936, "epoch": 0.32074794688919905, "grad_norm": 10.881712913513184, "learning_rate": 4.466967941256668e-06, "loss": 0.4996, "mean_token_accuracy": 0.8376988887786865, "num_tokens": 124390206.0, "step": 103470 }, { "entropy": 1.9137423366308213, "epoch": 0.3207789460142487, "grad_norm": 7.468229293823242, "learning_rate": 4.4667520966956385e-06, "loss": 0.5072, "mean_token_accuracy": 0.8356907978653908, "num_tokens": 124401455.0, "step": 103480 }, { "entropy": 1.9200310036540031, "epoch": 0.32080994513929845, "grad_norm": 6.459151744842529, "learning_rate": 4.466536283420511e-06, "loss": 0.4964, "mean_token_accuracy": 0.8393821954727173, "num_tokens": 124412634.0, "step": 103490 }, { "entropy": 1.9078100383281709, "epoch": 0.3208409442643481, "grad_norm": 6.7956013679504395, "learning_rate": 4.466320501423726e-06, "loss": 0.4456, "mean_token_accuracy": 0.8611381649971008, "num_tokens": 124423788.0, "step": 103500 }, { "entropy": 1.9619006097316742, "epoch": 0.32087194338939784, "grad_norm": 9.467850685119629, "learning_rate": 4.466104750697733e-06, "loss": 0.5486, "mean_token_accuracy": 0.8296520456671714, "num_tokens": 124434439.0, "step": 103510 }, { "entropy": 1.8933661192655564, "epoch": 0.3209029425144475, "grad_norm": 4.713155746459961, "learning_rate": 4.465889031234975e-06, "loss": 0.458, "mean_token_accuracy": 0.8469473794102669, "num_tokens": 124446667.0, "step": 103520 }, { "entropy": 1.853841246664524, "epoch": 0.32093394163949723, "grad_norm": 8.347504615783691, "learning_rate": 4.465673343027906e-06, "loss": 0.4666, "mean_token_accuracy": 0.844866256415844, "num_tokens": 124459684.0, "step": 103530 }, { "entropy": 1.8124510049819946, "epoch": 0.3209649407645469, "grad_norm": 4.092881202697754, "learning_rate": 4.465457686068977e-06, "loss": 0.4031, "mean_token_accuracy": 0.8626367405056954, "num_tokens": 124472022.0, "step": 103540 }, { "entropy": 1.8816181853413583, "epoch": 0.3209959398895966, "grad_norm": 7.426637172698975, "learning_rate": 4.465242060350643e-06, "loss": 0.4722, "mean_token_accuracy": 0.844549834728241, "num_tokens": 124484006.0, "step": 103550 }, { "entropy": 1.8122664958238601, "epoch": 0.3210269390146463, "grad_norm": 3.879347562789917, "learning_rate": 4.4650264658653655e-06, "loss": 0.4232, "mean_token_accuracy": 0.8467624381184577, "num_tokens": 124496792.0, "step": 103560 }, { "entropy": 1.8967302456498145, "epoch": 0.321057938139696, "grad_norm": 5.540412902832031, "learning_rate": 4.464810902605601e-06, "loss": 0.5402, "mean_token_accuracy": 0.8287664383649826, "num_tokens": 124508254.0, "step": 103570 }, { "entropy": 1.8487597212195397, "epoch": 0.3210889372647457, "grad_norm": 4.171758651733398, "learning_rate": 4.464595370563815e-06, "loss": 0.4429, "mean_token_accuracy": 0.8530851736664772, "num_tokens": 124520515.0, "step": 103580 }, { "entropy": 1.8957301035523415, "epoch": 0.3211199363897954, "grad_norm": 8.205263137817383, "learning_rate": 4.464379869732473e-06, "loss": 0.4656, "mean_token_accuracy": 0.8478555142879486, "num_tokens": 124532156.0, "step": 103590 }, { "entropy": 1.8724404126405716, "epoch": 0.3211509355148451, "grad_norm": 8.49742317199707, "learning_rate": 4.464164400104043e-06, "loss": 0.4443, "mean_token_accuracy": 0.8556983798742295, "num_tokens": 124543672.0, "step": 103600 }, { "entropy": 1.8402127534151078, "epoch": 0.3211819346398948, "grad_norm": 8.611581802368164, "learning_rate": 4.4639489616709956e-06, "loss": 0.4504, "mean_token_accuracy": 0.8543668672442436, "num_tokens": 124555679.0, "step": 103610 }, { "entropy": 1.8802747756242753, "epoch": 0.3212129337649445, "grad_norm": 4.206470012664795, "learning_rate": 4.463733554425804e-06, "loss": 0.4831, "mean_token_accuracy": 0.8467269361019134, "num_tokens": 124567696.0, "step": 103620 }, { "entropy": 1.864666721224785, "epoch": 0.3212439328899942, "grad_norm": 10.374842643737793, "learning_rate": 4.4635181783609455e-06, "loss": 0.4742, "mean_token_accuracy": 0.8502185359597206, "num_tokens": 124580191.0, "step": 103630 }, { "entropy": 1.839669795334339, "epoch": 0.32127493201504387, "grad_norm": 3.81731915473938, "learning_rate": 4.463302833468897e-06, "loss": 0.4586, "mean_token_accuracy": 0.8490176141262055, "num_tokens": 124592216.0, "step": 103640 }, { "entropy": 1.8732964277267456, "epoch": 0.3213059311400936, "grad_norm": 7.773388385772705, "learning_rate": 4.463087519742139e-06, "loss": 0.5065, "mean_token_accuracy": 0.8358982741832733, "num_tokens": 124604399.0, "step": 103650 }, { "entropy": 1.8805906429886818, "epoch": 0.32133693026514326, "grad_norm": 8.990577697753906, "learning_rate": 4.462872237173157e-06, "loss": 0.4652, "mean_token_accuracy": 0.8515235692262649, "num_tokens": 124615677.0, "step": 103660 }, { "entropy": 1.8140804409980773, "epoch": 0.321367929390193, "grad_norm": 8.409754753112793, "learning_rate": 4.462656985754436e-06, "loss": 0.4065, "mean_token_accuracy": 0.8694860532879829, "num_tokens": 124628163.0, "step": 103670 }, { "entropy": 1.8692081153392792, "epoch": 0.32139892851524265, "grad_norm": 8.463071823120117, "learning_rate": 4.462441765478465e-06, "loss": 0.438, "mean_token_accuracy": 0.8528854250907898, "num_tokens": 124640372.0, "step": 103680 }, { "entropy": 1.8829544261097908, "epoch": 0.3214299276402924, "grad_norm": 9.008349418640137, "learning_rate": 4.462226576337735e-06, "loss": 0.4922, "mean_token_accuracy": 0.8522943645715714, "num_tokens": 124652104.0, "step": 103690 }, { "entropy": 1.908381125330925, "epoch": 0.32146092676534205, "grad_norm": 8.958301544189453, "learning_rate": 4.462011418324738e-06, "loss": 0.4776, "mean_token_accuracy": 0.8482077926397323, "num_tokens": 124663567.0, "step": 103700 }, { "entropy": 1.8642259582877159, "epoch": 0.3214919258903917, "grad_norm": 8.490578651428223, "learning_rate": 4.461796291431973e-06, "loss": 0.4727, "mean_token_accuracy": 0.8547911241650581, "num_tokens": 124675305.0, "step": 103710 }, { "entropy": 1.8607799611985683, "epoch": 0.32152292501544144, "grad_norm": 7.851955413818359, "learning_rate": 4.461581195651937e-06, "loss": 0.4312, "mean_token_accuracy": 0.8484767660498619, "num_tokens": 124688192.0, "step": 103720 }, { "entropy": 1.930846455693245, "epoch": 0.3215539241404911, "grad_norm": 3.855354070663452, "learning_rate": 4.461366130977132e-06, "loss": 0.4822, "mean_token_accuracy": 0.8470954403281212, "num_tokens": 124699818.0, "step": 103730 }, { "entropy": 1.7706571273505687, "epoch": 0.32158492326554083, "grad_norm": 4.847051620483398, "learning_rate": 4.461151097400059e-06, "loss": 0.3651, "mean_token_accuracy": 0.8597544595599175, "num_tokens": 124714446.0, "step": 103740 }, { "entropy": 1.9413534983992577, "epoch": 0.3216159223905905, "grad_norm": 7.653467655181885, "learning_rate": 4.460936094913229e-06, "loss": 0.4902, "mean_token_accuracy": 0.8460107401013375, "num_tokens": 124725683.0, "step": 103750 }, { "entropy": 1.899087278544903, "epoch": 0.3216469215156402, "grad_norm": 9.189424514770508, "learning_rate": 4.460721123509149e-06, "loss": 0.476, "mean_token_accuracy": 0.8444273769855499, "num_tokens": 124738312.0, "step": 103760 }, { "entropy": 1.951054508984089, "epoch": 0.3216779206406899, "grad_norm": 8.07810115814209, "learning_rate": 4.460506183180329e-06, "loss": 0.4815, "mean_token_accuracy": 0.841863676905632, "num_tokens": 124749524.0, "step": 103770 }, { "entropy": 1.9111177667975425, "epoch": 0.3217089197657396, "grad_norm": 8.68226146697998, "learning_rate": 4.4602912739192835e-06, "loss": 0.4722, "mean_token_accuracy": 0.8471680104732513, "num_tokens": 124760976.0, "step": 103780 }, { "entropy": 1.8448022976517677, "epoch": 0.3217399188907893, "grad_norm": 7.906839847564697, "learning_rate": 4.4600763957185295e-06, "loss": 0.4239, "mean_token_accuracy": 0.8532932892441749, "num_tokens": 124773367.0, "step": 103790 }, { "entropy": 1.9335693135857581, "epoch": 0.321770918015839, "grad_norm": 8.961040496826172, "learning_rate": 4.459861548570586e-06, "loss": 0.4793, "mean_token_accuracy": 0.8489025041460991, "num_tokens": 124784721.0, "step": 103800 }, { "entropy": 1.8875031411647796, "epoch": 0.3218019171408887, "grad_norm": 3.5718820095062256, "learning_rate": 4.459646732467974e-06, "loss": 0.4734, "mean_token_accuracy": 0.8439980506896972, "num_tokens": 124797599.0, "step": 103810 }, { "entropy": 1.8897089153528213, "epoch": 0.3218329162659384, "grad_norm": 9.0376558303833, "learning_rate": 4.459431947403218e-06, "loss": 0.4489, "mean_token_accuracy": 0.8548488318920135, "num_tokens": 124808832.0, "step": 103820 }, { "entropy": 1.8961857289075852, "epoch": 0.3218639153909881, "grad_norm": 7.7893829345703125, "learning_rate": 4.4592171933688435e-06, "loss": 0.4688, "mean_token_accuracy": 0.8496623903512954, "num_tokens": 124821106.0, "step": 103830 }, { "entropy": 1.8757227182388305, "epoch": 0.3218949145160378, "grad_norm": 4.173471927642822, "learning_rate": 4.45900247035738e-06, "loss": 0.4201, "mean_token_accuracy": 0.8542967364192009, "num_tokens": 124833050.0, "step": 103840 }, { "entropy": 1.9881081491708756, "epoch": 0.32192591364108747, "grad_norm": 7.131890296936035, "learning_rate": 4.458787778361361e-06, "loss": 0.5528, "mean_token_accuracy": 0.8397052332758903, "num_tokens": 124844238.0, "step": 103850 }, { "entropy": 1.8623345792293549, "epoch": 0.3219569127661372, "grad_norm": 3.7538177967071533, "learning_rate": 4.458573117373317e-06, "loss": 0.413, "mean_token_accuracy": 0.8614621505141258, "num_tokens": 124856830.0, "step": 103860 }, { "entropy": 1.8851841911673546, "epoch": 0.32198791189118686, "grad_norm": 4.068458557128906, "learning_rate": 4.458358487385787e-06, "loss": 0.4739, "mean_token_accuracy": 0.8413153782486915, "num_tokens": 124869280.0, "step": 103870 }, { "entropy": 1.8985486201941968, "epoch": 0.3220189110162366, "grad_norm": 4.279972076416016, "learning_rate": 4.458143888391309e-06, "loss": 0.4229, "mean_token_accuracy": 0.8557350024580955, "num_tokens": 124880870.0, "step": 103880 }, { "entropy": 1.834354992210865, "epoch": 0.32204991014128626, "grad_norm": 3.6236605644226074, "learning_rate": 4.457929320382427e-06, "loss": 0.3981, "mean_token_accuracy": 0.8552977159619332, "num_tokens": 124893391.0, "step": 103890 }, { "entropy": 1.9463069021701813, "epoch": 0.322080909266336, "grad_norm": 8.719969749450684, "learning_rate": 4.457714783351681e-06, "loss": 0.5187, "mean_token_accuracy": 0.8463471621274948, "num_tokens": 124904359.0, "step": 103900 }, { "entropy": 1.9136415734887122, "epoch": 0.32211190839138565, "grad_norm": 8.018022537231445, "learning_rate": 4.457500277291621e-06, "loss": 0.4671, "mean_token_accuracy": 0.8480435863137246, "num_tokens": 124916133.0, "step": 103910 }, { "entropy": 1.9327465161681174, "epoch": 0.3221429075164354, "grad_norm": 9.761283874511719, "learning_rate": 4.457285802194794e-06, "loss": 0.5029, "mean_token_accuracy": 0.8388472273945808, "num_tokens": 124927380.0, "step": 103920 }, { "entropy": 1.9528162240982057, "epoch": 0.32217390664148504, "grad_norm": 8.009254455566406, "learning_rate": 4.457071358053754e-06, "loss": 0.5264, "mean_token_accuracy": 0.8379970923066139, "num_tokens": 124937778.0, "step": 103930 }, { "entropy": 1.9495197027921676, "epoch": 0.32220490576653477, "grad_norm": 8.856219291687012, "learning_rate": 4.456856944861052e-06, "loss": 0.5091, "mean_token_accuracy": 0.8489312395453453, "num_tokens": 124948572.0, "step": 103940 }, { "entropy": 1.8287577375769615, "epoch": 0.32223590489158443, "grad_norm": 4.316620349884033, "learning_rate": 4.4566425626092495e-06, "loss": 0.4242, "mean_token_accuracy": 0.8560908138751984, "num_tokens": 124961049.0, "step": 103950 }, { "entropy": 1.8360350668430327, "epoch": 0.3222669040166341, "grad_norm": 8.45445442199707, "learning_rate": 4.456428211290899e-06, "loss": 0.4338, "mean_token_accuracy": 0.843099394440651, "num_tokens": 124973591.0, "step": 103960 }, { "entropy": 1.829276867955923, "epoch": 0.32229790314168383, "grad_norm": 2.9788012504577637, "learning_rate": 4.456213890898567e-06, "loss": 0.4226, "mean_token_accuracy": 0.8461241707205772, "num_tokens": 124986733.0, "step": 103970 }, { "entropy": 1.951300072669983, "epoch": 0.3223289022667335, "grad_norm": 7.77920389175415, "learning_rate": 4.455999601424818e-06, "loss": 0.5354, "mean_token_accuracy": 0.8368278592824936, "num_tokens": 124998066.0, "step": 103980 }, { "entropy": 1.8031010538339616, "epoch": 0.3223599013917832, "grad_norm": 4.566207408905029, "learning_rate": 4.455785342862216e-06, "loss": 0.4264, "mean_token_accuracy": 0.8428297311067581, "num_tokens": 125011581.0, "step": 103990 }, { "entropy": 1.8515203520655632, "epoch": 0.3223909005168329, "grad_norm": 7.359809875488281, "learning_rate": 4.4555711152033325e-06, "loss": 0.4531, "mean_token_accuracy": 0.852726761996746, "num_tokens": 125024051.0, "step": 104000 }, { "entropy": 1.8627462826669217, "epoch": 0.3224218996418826, "grad_norm": 3.8170571327209473, "learning_rate": 4.455356918440736e-06, "loss": 0.4399, "mean_token_accuracy": 0.8479026794433594, "num_tokens": 125037097.0, "step": 104010 }, { "entropy": 1.8624150231480598, "epoch": 0.3224528987669323, "grad_norm": 8.229700088500977, "learning_rate": 4.455142752567004e-06, "loss": 0.4912, "mean_token_accuracy": 0.8403912633657455, "num_tokens": 125049190.0, "step": 104020 }, { "entropy": 1.8673087805509567, "epoch": 0.322483897891982, "grad_norm": 8.417292594909668, "learning_rate": 4.454928617574712e-06, "loss": 0.4518, "mean_token_accuracy": 0.8480266436934472, "num_tokens": 125061183.0, "step": 104030 }, { "entropy": 1.83387650847435, "epoch": 0.3225148970170317, "grad_norm": 7.1093220710754395, "learning_rate": 4.4547145134564384e-06, "loss": 0.4777, "mean_token_accuracy": 0.8557756185531616, "num_tokens": 125074081.0, "step": 104040 }, { "entropy": 1.8816956907510758, "epoch": 0.3225458961420814, "grad_norm": 8.284878730773926, "learning_rate": 4.454500440204765e-06, "loss": 0.4527, "mean_token_accuracy": 0.8433906123042106, "num_tokens": 125086116.0, "step": 104050 }, { "entropy": 1.9563316345214843, "epoch": 0.32257689526713107, "grad_norm": 7.569207668304443, "learning_rate": 4.454286397812278e-06, "loss": 0.5523, "mean_token_accuracy": 0.82982589751482, "num_tokens": 125097380.0, "step": 104060 }, { "entropy": 1.941270676255226, "epoch": 0.3226078943921808, "grad_norm": 9.317317008972168, "learning_rate": 4.454072386271562e-06, "loss": 0.5532, "mean_token_accuracy": 0.8411487266421318, "num_tokens": 125108379.0, "step": 104070 }, { "entropy": 1.920466449856758, "epoch": 0.32263889351723046, "grad_norm": 9.6175537109375, "learning_rate": 4.453858405575206e-06, "loss": 0.5543, "mean_token_accuracy": 0.8352476447820664, "num_tokens": 125119711.0, "step": 104080 }, { "entropy": 1.8606573291122914, "epoch": 0.3226698926422802, "grad_norm": 3.547621488571167, "learning_rate": 4.453644455715805e-06, "loss": 0.4318, "mean_token_accuracy": 0.8499340802431107, "num_tokens": 125131618.0, "step": 104090 }, { "entropy": 1.9070662692189218, "epoch": 0.32270089176732986, "grad_norm": 4.856088638305664, "learning_rate": 4.453430536685948e-06, "loss": 0.4756, "mean_token_accuracy": 0.8461712196469307, "num_tokens": 125143282.0, "step": 104100 }, { "entropy": 1.8635522559285165, "epoch": 0.3227318908923796, "grad_norm": 8.272976875305176, "learning_rate": 4.453216648478236e-06, "loss": 0.4279, "mean_token_accuracy": 0.85389723777771, "num_tokens": 125155283.0, "step": 104110 }, { "entropy": 1.8606613449752332, "epoch": 0.32276289001742925, "grad_norm": 7.335941314697266, "learning_rate": 4.453002791085265e-06, "loss": 0.4972, "mean_token_accuracy": 0.8420170918107033, "num_tokens": 125168312.0, "step": 104120 }, { "entropy": 1.869354782998562, "epoch": 0.322793889142479, "grad_norm": 8.720457077026367, "learning_rate": 4.452788964499638e-06, "loss": 0.4668, "mean_token_accuracy": 0.8450367793440818, "num_tokens": 125180831.0, "step": 104130 }, { "entropy": 1.886098426580429, "epoch": 0.32282488826752864, "grad_norm": 7.61613130569458, "learning_rate": 4.452575168713959e-06, "loss": 0.4643, "mean_token_accuracy": 0.8547211870551109, "num_tokens": 125191572.0, "step": 104140 }, { "entropy": 1.8242208793759347, "epoch": 0.32285588739257837, "grad_norm": 8.236608505249023, "learning_rate": 4.452361403720835e-06, "loss": 0.5051, "mean_token_accuracy": 0.8449318900704383, "num_tokens": 125204559.0, "step": 104150 }, { "entropy": 1.9878936111927032, "epoch": 0.32288688651762804, "grad_norm": 7.960468769073486, "learning_rate": 4.452147669512874e-06, "loss": 0.5418, "mean_token_accuracy": 0.8334811016917228, "num_tokens": 125215145.0, "step": 104160 }, { "entropy": 1.8423168882727623, "epoch": 0.32291788564267776, "grad_norm": 8.506884574890137, "learning_rate": 4.451933966082689e-06, "loss": 0.436, "mean_token_accuracy": 0.8522772178053856, "num_tokens": 125227069.0, "step": 104170 }, { "entropy": 1.9097376301884652, "epoch": 0.32294888476772743, "grad_norm": 9.220355033874512, "learning_rate": 4.451720293422894e-06, "loss": 0.5155, "mean_token_accuracy": 0.8438367456197738, "num_tokens": 125238513.0, "step": 104180 }, { "entropy": 1.8769617334008217, "epoch": 0.32297988389277715, "grad_norm": 8.432296752929688, "learning_rate": 4.451506651526103e-06, "loss": 0.4553, "mean_token_accuracy": 0.8506760001182556, "num_tokens": 125249982.0, "step": 104190 }, { "entropy": 1.8073933839797973, "epoch": 0.3230108830178268, "grad_norm": 7.3089985847473145, "learning_rate": 4.451293040384938e-06, "loss": 0.4174, "mean_token_accuracy": 0.8688766479492187, "num_tokens": 125263337.0, "step": 104200 }, { "entropy": 1.8363840654492378, "epoch": 0.3230418821428765, "grad_norm": 4.247748851776123, "learning_rate": 4.4510794599920185e-06, "loss": 0.463, "mean_token_accuracy": 0.8401136696338654, "num_tokens": 125275329.0, "step": 104210 }, { "entropy": 1.8861171647906303, "epoch": 0.3230728812679262, "grad_norm": 8.15084457397461, "learning_rate": 4.45086591033997e-06, "loss": 0.4632, "mean_token_accuracy": 0.8480441614985466, "num_tokens": 125287001.0, "step": 104220 }, { "entropy": 1.862444320321083, "epoch": 0.3231038803929759, "grad_norm": 9.114710807800293, "learning_rate": 4.450652391421417e-06, "loss": 0.4541, "mean_token_accuracy": 0.8597635760903358, "num_tokens": 125299228.0, "step": 104230 }, { "entropy": 1.8157359957695007, "epoch": 0.3231348795180256, "grad_norm": 10.471318244934082, "learning_rate": 4.45043890322899e-06, "loss": 0.4853, "mean_token_accuracy": 0.8404217720031738, "num_tokens": 125312489.0, "step": 104240 }, { "entropy": 1.8435872822999955, "epoch": 0.3231658786430753, "grad_norm": 6.866489410400391, "learning_rate": 4.45022544575532e-06, "loss": 0.4322, "mean_token_accuracy": 0.8526515334844589, "num_tokens": 125325340.0, "step": 104250 }, { "entropy": 1.7846007272601128, "epoch": 0.323196877768125, "grad_norm": 8.732168197631836, "learning_rate": 4.450012018993041e-06, "loss": 0.4007, "mean_token_accuracy": 0.8618106812238693, "num_tokens": 125338525.0, "step": 104260 }, { "entropy": 1.9067389905452727, "epoch": 0.32322787689317467, "grad_norm": 9.090784072875977, "learning_rate": 4.4497986229347886e-06, "loss": 0.4901, "mean_token_accuracy": 0.8495014622807503, "num_tokens": 125349264.0, "step": 104270 }, { "entropy": 1.8359978944063187, "epoch": 0.3232588760182244, "grad_norm": 8.766422271728516, "learning_rate": 4.449585257573202e-06, "loss": 0.453, "mean_token_accuracy": 0.8507504045963288, "num_tokens": 125361176.0, "step": 104280 }, { "entropy": 1.860073594748974, "epoch": 0.32328987514327406, "grad_norm": 8.850909233093262, "learning_rate": 4.4493719229009234e-06, "loss": 0.4902, "mean_token_accuracy": 0.8514194667339325, "num_tokens": 125372673.0, "step": 104290 }, { "entropy": 1.8551115795969964, "epoch": 0.3233208742683238, "grad_norm": 4.093299388885498, "learning_rate": 4.449158618910594e-06, "loss": 0.4353, "mean_token_accuracy": 0.8481899574398994, "num_tokens": 125385018.0, "step": 104300 }, { "entropy": 1.9149028778076171, "epoch": 0.32335187339337346, "grad_norm": 8.859583854675293, "learning_rate": 4.448945345594864e-06, "loss": 0.5378, "mean_token_accuracy": 0.8369590178132057, "num_tokens": 125396234.0, "step": 104310 }, { "entropy": 1.9485036730766296, "epoch": 0.3233828725184232, "grad_norm": 8.875260353088379, "learning_rate": 4.448732102946378e-06, "loss": 0.5439, "mean_token_accuracy": 0.83717480301857, "num_tokens": 125407354.0, "step": 104320 }, { "entropy": 1.8549368754029274, "epoch": 0.32341387164347285, "grad_norm": 3.1946167945861816, "learning_rate": 4.448518890957789e-06, "loss": 0.4336, "mean_token_accuracy": 0.8576024889945983, "num_tokens": 125419710.0, "step": 104330 }, { "entropy": 1.8528822794556619, "epoch": 0.3234448707685226, "grad_norm": 5.784121036529541, "learning_rate": 4.44830570962175e-06, "loss": 0.4463, "mean_token_accuracy": 0.8541821107268334, "num_tokens": 125432442.0, "step": 104340 }, { "entropy": 1.9138613402843476, "epoch": 0.32347586989357224, "grad_norm": 3.181318998336792, "learning_rate": 4.448092558930918e-06, "loss": 0.4904, "mean_token_accuracy": 0.8456951528787613, "num_tokens": 125444223.0, "step": 104350 }, { "entropy": 1.893858689069748, "epoch": 0.32350686901862197, "grad_norm": 7.411118030548096, "learning_rate": 4.447879438877952e-06, "loss": 0.5144, "mean_token_accuracy": 0.8454016864299774, "num_tokens": 125455851.0, "step": 104360 }, { "entropy": 1.9555244013667106, "epoch": 0.32353786814367164, "grad_norm": 8.233355522155762, "learning_rate": 4.447666349455512e-06, "loss": 0.5283, "mean_token_accuracy": 0.8410166382789612, "num_tokens": 125466870.0, "step": 104370 }, { "entropy": 1.9233867451548576, "epoch": 0.32356886726872136, "grad_norm": 8.369714736938477, "learning_rate": 4.44745329065626e-06, "loss": 0.5215, "mean_token_accuracy": 0.8294639691710473, "num_tokens": 125477884.0, "step": 104380 }, { "entropy": 1.842393586039543, "epoch": 0.32359986639377103, "grad_norm": 8.982789993286133, "learning_rate": 4.447240262472865e-06, "loss": 0.4581, "mean_token_accuracy": 0.8471063405275345, "num_tokens": 125490187.0, "step": 104390 }, { "entropy": 1.695570257306099, "epoch": 0.32363086551882075, "grad_norm": 8.46627426147461, "learning_rate": 4.447027264897993e-06, "loss": 0.387, "mean_token_accuracy": 0.8552982524037361, "num_tokens": 125504771.0, "step": 104400 }, { "entropy": 1.790937101840973, "epoch": 0.3236618646438704, "grad_norm": 7.850381374359131, "learning_rate": 4.446814297924315e-06, "loss": 0.4354, "mean_token_accuracy": 0.8472680017352104, "num_tokens": 125517462.0, "step": 104410 }, { "entropy": 1.9535409420728684, "epoch": 0.32369286376892015, "grad_norm": 8.090740203857422, "learning_rate": 4.446601361544507e-06, "loss": 0.4965, "mean_token_accuracy": 0.8454528123140335, "num_tokens": 125528256.0, "step": 104420 }, { "entropy": 1.8886640325188637, "epoch": 0.3237238628939698, "grad_norm": 4.096344470977783, "learning_rate": 4.44638845575124e-06, "loss": 0.4827, "mean_token_accuracy": 0.8485276773571968, "num_tokens": 125540038.0, "step": 104430 }, { "entropy": 1.8927355989813806, "epoch": 0.32375486201901954, "grad_norm": 8.601818084716797, "learning_rate": 4.446175580537197e-06, "loss": 0.5351, "mean_token_accuracy": 0.8406930550932884, "num_tokens": 125551783.0, "step": 104440 }, { "entropy": 1.933559662103653, "epoch": 0.3237858611440692, "grad_norm": 7.8219475746154785, "learning_rate": 4.445962735895055e-06, "loss": 0.5101, "mean_token_accuracy": 0.8485094651579856, "num_tokens": 125562679.0, "step": 104450 }, { "entropy": 1.8612630635499954, "epoch": 0.3238168602691189, "grad_norm": 8.244562149047852, "learning_rate": 4.445749921817498e-06, "loss": 0.4741, "mean_token_accuracy": 0.8419337302446366, "num_tokens": 125574193.0, "step": 104460 }, { "entropy": 1.8549982547760009, "epoch": 0.3238478593941686, "grad_norm": 9.33348274230957, "learning_rate": 4.445537138297214e-06, "loss": 0.4646, "mean_token_accuracy": 0.8472777247428894, "num_tokens": 125585671.0, "step": 104470 }, { "entropy": 1.8255490154027938, "epoch": 0.32387885851921827, "grad_norm": 7.582674503326416, "learning_rate": 4.445324385326889e-06, "loss": 0.4751, "mean_token_accuracy": 0.8529926791787148, "num_tokens": 125599259.0, "step": 104480 }, { "entropy": 1.8867202073335647, "epoch": 0.323909857644268, "grad_norm": 7.966434478759766, "learning_rate": 4.445111662899213e-06, "loss": 0.4734, "mean_token_accuracy": 0.8491407498717308, "num_tokens": 125611435.0, "step": 104490 }, { "entropy": 1.8497378468513488, "epoch": 0.32394085676931766, "grad_norm": 5.026673793792725, "learning_rate": 4.444898971006879e-06, "loss": 0.4795, "mean_token_accuracy": 0.8529240384697914, "num_tokens": 125623829.0, "step": 104500 }, { "entropy": 1.8570104882121086, "epoch": 0.3239718558943674, "grad_norm": 3.9983022212982178, "learning_rate": 4.444686309642584e-06, "loss": 0.4461, "mean_token_accuracy": 0.8561410129070282, "num_tokens": 125636076.0, "step": 104510 }, { "entropy": 1.9286787793040276, "epoch": 0.32400285501941706, "grad_norm": 9.436726570129395, "learning_rate": 4.444473678799025e-06, "loss": 0.5129, "mean_token_accuracy": 0.8409220159053803, "num_tokens": 125647410.0, "step": 104520 }, { "entropy": 1.790224689245224, "epoch": 0.3240338541444668, "grad_norm": 7.947468280792236, "learning_rate": 4.444261078468901e-06, "loss": 0.3994, "mean_token_accuracy": 0.8576122537255287, "num_tokens": 125660817.0, "step": 104530 }, { "entropy": 1.979808408021927, "epoch": 0.32406485326951645, "grad_norm": 8.681998252868652, "learning_rate": 4.444048508644915e-06, "loss": 0.5421, "mean_token_accuracy": 0.8427085399627685, "num_tokens": 125671400.0, "step": 104540 }, { "entropy": 1.8585129737854005, "epoch": 0.3240958523945662, "grad_norm": 8.156949043273926, "learning_rate": 4.443835969319773e-06, "loss": 0.5264, "mean_token_accuracy": 0.8462090104818344, "num_tokens": 125684225.0, "step": 104550 }, { "entropy": 1.8992905005812646, "epoch": 0.32412685151961584, "grad_norm": 10.150556564331055, "learning_rate": 4.443623460486183e-06, "loss": 0.4891, "mean_token_accuracy": 0.838335144519806, "num_tokens": 125696461.0, "step": 104560 }, { "entropy": 1.8747254326939582, "epoch": 0.32415785064466557, "grad_norm": 8.261116981506348, "learning_rate": 4.443410982136853e-06, "loss": 0.4682, "mean_token_accuracy": 0.8448391363024712, "num_tokens": 125708703.0, "step": 104570 }, { "entropy": 1.812967798113823, "epoch": 0.32418884976971524, "grad_norm": 9.305374145507812, "learning_rate": 4.443198534264497e-06, "loss": 0.4241, "mean_token_accuracy": 0.853294064104557, "num_tokens": 125721322.0, "step": 104580 }, { "entropy": 1.8749259188771248, "epoch": 0.32421984889476496, "grad_norm": 8.172529220581055, "learning_rate": 4.442986116861831e-06, "loss": 0.4241, "mean_token_accuracy": 0.86143379509449, "num_tokens": 125733217.0, "step": 104590 }, { "entropy": 1.9058199599385262, "epoch": 0.32425084801981463, "grad_norm": 4.497680187225342, "learning_rate": 4.442773729921569e-06, "loss": 0.5266, "mean_token_accuracy": 0.832387951016426, "num_tokens": 125744499.0, "step": 104600 }, { "entropy": 1.9057392075657844, "epoch": 0.32428184714486435, "grad_norm": 9.339095115661621, "learning_rate": 4.4425613734364346e-06, "loss": 0.5003, "mean_token_accuracy": 0.8399216368794441, "num_tokens": 125755625.0, "step": 104610 }, { "entropy": 1.887702539563179, "epoch": 0.324312846269914, "grad_norm": 9.061288833618164, "learning_rate": 4.442349047399148e-06, "loss": 0.5048, "mean_token_accuracy": 0.8430167078971863, "num_tokens": 125767881.0, "step": 104620 }, { "entropy": 1.8627898827195168, "epoch": 0.32434384539496375, "grad_norm": 7.091198921203613, "learning_rate": 4.442136751802433e-06, "loss": 0.4502, "mean_token_accuracy": 0.8567918986082077, "num_tokens": 125779283.0, "step": 104630 }, { "entropy": 1.8987063318490982, "epoch": 0.3243748445200134, "grad_norm": 7.691847324371338, "learning_rate": 4.441924486639018e-06, "loss": 0.4419, "mean_token_accuracy": 0.8585168570280075, "num_tokens": 125790750.0, "step": 104640 }, { "entropy": 1.8565407916903496, "epoch": 0.32440584364506314, "grad_norm": 8.805460929870605, "learning_rate": 4.441712251901632e-06, "loss": 0.4991, "mean_token_accuracy": 0.8448611825704575, "num_tokens": 125803070.0, "step": 104650 }, { "entropy": 1.8467104405164718, "epoch": 0.3244368427701128, "grad_norm": 7.713294982910156, "learning_rate": 4.4415000475830064e-06, "loss": 0.4652, "mean_token_accuracy": 0.85001719892025, "num_tokens": 125814740.0, "step": 104660 }, { "entropy": 1.8211202561855315, "epoch": 0.32446784189516253, "grad_norm": 4.372882843017578, "learning_rate": 4.441287873675877e-06, "loss": 0.4047, "mean_token_accuracy": 0.8514401495456696, "num_tokens": 125826554.0, "step": 104670 }, { "entropy": 1.7906322300434112, "epoch": 0.3244988410202122, "grad_norm": 7.335409641265869, "learning_rate": 4.44107573017298e-06, "loss": 0.4032, "mean_token_accuracy": 0.8584406465291977, "num_tokens": 125839095.0, "step": 104680 }, { "entropy": 1.887892808020115, "epoch": 0.3245298401452619, "grad_norm": 9.185553550720215, "learning_rate": 4.4408636170670526e-06, "loss": 0.4883, "mean_token_accuracy": 0.839701421558857, "num_tokens": 125850893.0, "step": 104690 }, { "entropy": 1.9207829117774964, "epoch": 0.3245608392703116, "grad_norm": 10.154474258422852, "learning_rate": 4.4406515343508405e-06, "loss": 0.4934, "mean_token_accuracy": 0.8411696791648865, "num_tokens": 125862536.0, "step": 104700 }, { "entropy": 1.9450317040085792, "epoch": 0.32459183839536127, "grad_norm": 10.254415512084961, "learning_rate": 4.440439482017084e-06, "loss": 0.4989, "mean_token_accuracy": 0.8435680419206619, "num_tokens": 125873937.0, "step": 104710 }, { "entropy": 1.9040479019284249, "epoch": 0.324622837520411, "grad_norm": 7.563967704772949, "learning_rate": 4.440227460058531e-06, "loss": 0.4784, "mean_token_accuracy": 0.8446063458919525, "num_tokens": 125885785.0, "step": 104720 }, { "entropy": 1.9021693363785743, "epoch": 0.32465383664546066, "grad_norm": 8.840468406677246, "learning_rate": 4.440015468467932e-06, "loss": 0.4776, "mean_token_accuracy": 0.8490529716014862, "num_tokens": 125897435.0, "step": 104730 }, { "entropy": 1.9175179213285447, "epoch": 0.3246848357705104, "grad_norm": 3.9579575061798096, "learning_rate": 4.439803507238037e-06, "loss": 0.4974, "mean_token_accuracy": 0.8449933081865311, "num_tokens": 125908914.0, "step": 104740 }, { "entropy": 1.8500194996595383, "epoch": 0.32471583489556005, "grad_norm": 8.99378490447998, "learning_rate": 4.439591576361599e-06, "loss": 0.4437, "mean_token_accuracy": 0.8460746437311173, "num_tokens": 125921013.0, "step": 104750 }, { "entropy": 1.8048820734024047, "epoch": 0.3247468340206098, "grad_norm": 2.689271926879883, "learning_rate": 4.439379675831374e-06, "loss": 0.4395, "mean_token_accuracy": 0.8416306912899018, "num_tokens": 125934812.0, "step": 104760 }, { "entropy": 1.8943803861737252, "epoch": 0.32477783314565944, "grad_norm": 9.720478057861328, "learning_rate": 4.439167805640121e-06, "loss": 0.462, "mean_token_accuracy": 0.8504800125956535, "num_tokens": 125946337.0, "step": 104770 }, { "entropy": 1.8520411089062692, "epoch": 0.32480883227070917, "grad_norm": 7.526730060577393, "learning_rate": 4.438955965780603e-06, "loss": 0.4906, "mean_token_accuracy": 0.8448957860469818, "num_tokens": 125958824.0, "step": 104780 }, { "entropy": 1.9352994859218597, "epoch": 0.32483983139575884, "grad_norm": 9.028653144836426, "learning_rate": 4.438744156245582e-06, "loss": 0.4804, "mean_token_accuracy": 0.8472319498658181, "num_tokens": 125970046.0, "step": 104790 }, { "entropy": 1.8998494163155555, "epoch": 0.32487083052080856, "grad_norm": 7.81031608581543, "learning_rate": 4.438532377027824e-06, "loss": 0.5041, "mean_token_accuracy": 0.847889693081379, "num_tokens": 125981669.0, "step": 104800 }, { "entropy": 1.8678305372595787, "epoch": 0.32490182964585823, "grad_norm": 6.961506366729736, "learning_rate": 4.438320628120095e-06, "loss": 0.4633, "mean_token_accuracy": 0.8497982889413833, "num_tokens": 125994098.0, "step": 104810 }, { "entropy": 1.8759206786751748, "epoch": 0.32493282877090796, "grad_norm": 7.766717433929443, "learning_rate": 4.43810890951517e-06, "loss": 0.5073, "mean_token_accuracy": 0.8452546551823616, "num_tokens": 126006395.0, "step": 104820 }, { "entropy": 1.9110927432775497, "epoch": 0.3249638278959576, "grad_norm": 9.222915649414062, "learning_rate": 4.437897221205818e-06, "loss": 0.5202, "mean_token_accuracy": 0.8430734008550644, "num_tokens": 126017620.0, "step": 104830 }, { "entropy": 1.81925430893898, "epoch": 0.32499482702100735, "grad_norm": 8.071293830871582, "learning_rate": 4.4376855631848185e-06, "loss": 0.4387, "mean_token_accuracy": 0.8518892988562584, "num_tokens": 126030236.0, "step": 104840 }, { "entropy": 1.8442829206585885, "epoch": 0.325025826146057, "grad_norm": 4.356393337249756, "learning_rate": 4.437473935444945e-06, "loss": 0.4454, "mean_token_accuracy": 0.8437263607978821, "num_tokens": 126043043.0, "step": 104850 }, { "entropy": 1.8087534308433533, "epoch": 0.32505682527110674, "grad_norm": 8.250895500183105, "learning_rate": 4.437262337978981e-06, "loss": 0.4178, "mean_token_accuracy": 0.8567241251468658, "num_tokens": 126055544.0, "step": 104860 }, { "entropy": 1.866674281656742, "epoch": 0.3250878243961564, "grad_norm": 7.741235733032227, "learning_rate": 4.437050770779709e-06, "loss": 0.4723, "mean_token_accuracy": 0.8507555142045021, "num_tokens": 126067524.0, "step": 104870 }, { "entropy": 1.9428170025348663, "epoch": 0.32511882352120614, "grad_norm": 7.691112995147705, "learning_rate": 4.436839233839913e-06, "loss": 0.5591, "mean_token_accuracy": 0.833220262825489, "num_tokens": 126078929.0, "step": 104880 }, { "entropy": 1.8663004338741302, "epoch": 0.3251498226462558, "grad_norm": 6.292846202850342, "learning_rate": 4.436627727152381e-06, "loss": 0.4767, "mean_token_accuracy": 0.8499148935079575, "num_tokens": 126091005.0, "step": 104890 }, { "entropy": 1.9085410133004188, "epoch": 0.32518082177130553, "grad_norm": 8.354232788085938, "learning_rate": 4.436416250709903e-06, "loss": 0.4846, "mean_token_accuracy": 0.8494470819830895, "num_tokens": 126102258.0, "step": 104900 }, { "entropy": 1.9257890343666078, "epoch": 0.3252118208963552, "grad_norm": 8.245911598205566, "learning_rate": 4.436204804505272e-06, "loss": 0.5215, "mean_token_accuracy": 0.8361289039254188, "num_tokens": 126112823.0, "step": 104910 }, { "entropy": 1.8960036650300025, "epoch": 0.3252428200214049, "grad_norm": 7.979340553283691, "learning_rate": 4.435993388531282e-06, "loss": 0.4719, "mean_token_accuracy": 0.8566076219081878, "num_tokens": 126123821.0, "step": 104920 }, { "entropy": 1.7642639800906181, "epoch": 0.3252738191464546, "grad_norm": 4.4087324142456055, "learning_rate": 4.435782002780731e-06, "loss": 0.3795, "mean_token_accuracy": 0.8499569103121758, "num_tokens": 126136484.0, "step": 104930 }, { "entropy": 1.8943352848291397, "epoch": 0.32530481827150426, "grad_norm": 7.900808334350586, "learning_rate": 4.435570647246417e-06, "loss": 0.5372, "mean_token_accuracy": 0.8378649175167083, "num_tokens": 126147568.0, "step": 104940 }, { "entropy": 1.7571329042315482, "epoch": 0.325335817396554, "grad_norm": 5.164160251617432, "learning_rate": 4.435359321921144e-06, "loss": 0.4033, "mean_token_accuracy": 0.8640245348215103, "num_tokens": 126161243.0, "step": 104950 }, { "entropy": 1.8740806519985198, "epoch": 0.32536681652160365, "grad_norm": 7.368401050567627, "learning_rate": 4.435148026797714e-06, "loss": 0.4493, "mean_token_accuracy": 0.8456128895282745, "num_tokens": 126173067.0, "step": 104960 }, { "entropy": 1.852415107935667, "epoch": 0.3253978156466534, "grad_norm": 8.494685173034668, "learning_rate": 4.434936761868937e-06, "loss": 0.4704, "mean_token_accuracy": 0.8431513279676437, "num_tokens": 126185590.0, "step": 104970 }, { "entropy": 1.872846459597349, "epoch": 0.32542881477170305, "grad_norm": 2.7987353801727295, "learning_rate": 4.434725527127619e-06, "loss": 0.451, "mean_token_accuracy": 0.8551968723535538, "num_tokens": 126197812.0, "step": 104980 }, { "entropy": 1.8627401351928712, "epoch": 0.32545981389675277, "grad_norm": 4.279714584350586, "learning_rate": 4.434514322566573e-06, "loss": 0.4787, "mean_token_accuracy": 0.8442763239145279, "num_tokens": 126210066.0, "step": 104990 }, { "entropy": 1.88872180134058, "epoch": 0.32549081302180244, "grad_norm": 7.514123439788818, "learning_rate": 4.434303148178613e-06, "loss": 0.4768, "mean_token_accuracy": 0.8437312439084053, "num_tokens": 126222842.0, "step": 105000 }, { "entropy": 1.9054294735193253, "epoch": 0.32552181214685216, "grad_norm": 7.55472993850708, "learning_rate": 4.434092003956556e-06, "loss": 0.4566, "mean_token_accuracy": 0.8568596675992012, "num_tokens": 126233568.0, "step": 105010 }, { "entropy": 1.9124674081802369, "epoch": 0.32555281127190183, "grad_norm": 10.53081226348877, "learning_rate": 4.4338808898932204e-06, "loss": 0.5074, "mean_token_accuracy": 0.84600650370121, "num_tokens": 126245643.0, "step": 105020 }, { "entropy": 1.8103389233350753, "epoch": 0.32558381039695156, "grad_norm": 3.0053796768188477, "learning_rate": 4.433669805981426e-06, "loss": 0.434, "mean_token_accuracy": 0.8554301127791405, "num_tokens": 126257601.0, "step": 105030 }, { "entropy": 1.9138575494289398, "epoch": 0.3256148095220012, "grad_norm": 8.38183879852295, "learning_rate": 4.433458752213998e-06, "loss": 0.5186, "mean_token_accuracy": 0.838289988040924, "num_tokens": 126269101.0, "step": 105040 }, { "entropy": 1.8735656663775444, "epoch": 0.32564580864705095, "grad_norm": 8.337263107299805, "learning_rate": 4.433247728583761e-06, "loss": 0.4654, "mean_token_accuracy": 0.8512678787112236, "num_tokens": 126279995.0, "step": 105050 }, { "entropy": 1.8865850910544395, "epoch": 0.3256768077721006, "grad_norm": 10.122349739074707, "learning_rate": 4.433036735083546e-06, "loss": 0.456, "mean_token_accuracy": 0.8520815536379814, "num_tokens": 126291441.0, "step": 105060 }, { "entropy": 1.8525515541434288, "epoch": 0.32570780689715034, "grad_norm": 9.650886535644531, "learning_rate": 4.43282577170618e-06, "loss": 0.4447, "mean_token_accuracy": 0.8546164348721504, "num_tokens": 126303505.0, "step": 105070 }, { "entropy": 1.813267020881176, "epoch": 0.3257388060222, "grad_norm": 8.041412353515625, "learning_rate": 4.432614838444499e-06, "loss": 0.4331, "mean_token_accuracy": 0.8510597214102745, "num_tokens": 126316240.0, "step": 105080 }, { "entropy": 1.837722858786583, "epoch": 0.32576980514724974, "grad_norm": 7.805492401123047, "learning_rate": 4.432403935291336e-06, "loss": 0.4346, "mean_token_accuracy": 0.8534246236085892, "num_tokens": 126328521.0, "step": 105090 }, { "entropy": 1.916102121770382, "epoch": 0.3258008042722994, "grad_norm": 8.766565322875977, "learning_rate": 4.432193062239532e-06, "loss": 0.5365, "mean_token_accuracy": 0.8374799504876137, "num_tokens": 126340386.0, "step": 105100 }, { "entropy": 1.894291676580906, "epoch": 0.32583180339734913, "grad_norm": 7.835315227508545, "learning_rate": 4.431982219281925e-06, "loss": 0.4592, "mean_token_accuracy": 0.8560818731784821, "num_tokens": 126351585.0, "step": 105110 }, { "entropy": 1.9061674028635025, "epoch": 0.3258628025223988, "grad_norm": 8.130203247070312, "learning_rate": 4.431771406411358e-06, "loss": 0.4926, "mean_token_accuracy": 0.8496122330427169, "num_tokens": 126362872.0, "step": 105120 }, { "entropy": 1.980933591723442, "epoch": 0.3258938016474485, "grad_norm": 8.286996841430664, "learning_rate": 4.431560623620675e-06, "loss": 0.5517, "mean_token_accuracy": 0.8369853526353837, "num_tokens": 126374021.0, "step": 105130 }, { "entropy": 1.8085208341479302, "epoch": 0.3259248007724982, "grad_norm": 6.969970226287842, "learning_rate": 4.431349870902727e-06, "loss": 0.4279, "mean_token_accuracy": 0.8551606863737107, "num_tokens": 126387654.0, "step": 105140 }, { "entropy": 1.9031474754214286, "epoch": 0.3259557998975479, "grad_norm": 7.189389705657959, "learning_rate": 4.431139148250362e-06, "loss": 0.4203, "mean_token_accuracy": 0.8574759498238563, "num_tokens": 126398864.0, "step": 105150 }, { "entropy": 1.7899470388889314, "epoch": 0.3259867990225976, "grad_norm": 7.750818252563477, "learning_rate": 4.430928455656429e-06, "loss": 0.3847, "mean_token_accuracy": 0.8657332420349121, "num_tokens": 126412367.0, "step": 105160 }, { "entropy": 1.812622857093811, "epoch": 0.3260177981476473, "grad_norm": 6.185579776763916, "learning_rate": 4.4307177931137864e-06, "loss": 0.3992, "mean_token_accuracy": 0.8634996026754379, "num_tokens": 126424902.0, "step": 105170 }, { "entropy": 1.9225602239370345, "epoch": 0.326048797272697, "grad_norm": 9.488457679748535, "learning_rate": 4.4305071606152906e-06, "loss": 0.5349, "mean_token_accuracy": 0.8386317417025566, "num_tokens": 126436598.0, "step": 105180 }, { "entropy": 1.8649600803852082, "epoch": 0.32607979639774665, "grad_norm": 9.695698738098145, "learning_rate": 4.4302965581538e-06, "loss": 0.4751, "mean_token_accuracy": 0.8493688553571701, "num_tokens": 126447796.0, "step": 105190 }, { "entropy": 1.8659547924995423, "epoch": 0.32611079552279637, "grad_norm": 8.478187561035156, "learning_rate": 4.4300859857221765e-06, "loss": 0.4578, "mean_token_accuracy": 0.8515862852334977, "num_tokens": 126460081.0, "step": 105200 }, { "entropy": 1.8974830508232117, "epoch": 0.32614179464784604, "grad_norm": 4.3100457191467285, "learning_rate": 4.429875443313283e-06, "loss": 0.4964, "mean_token_accuracy": 0.8492521673440934, "num_tokens": 126471858.0, "step": 105210 }, { "entropy": 1.8549850180745124, "epoch": 0.32617279377289576, "grad_norm": 8.894652366638184, "learning_rate": 4.429664930919989e-06, "loss": 0.4619, "mean_token_accuracy": 0.8504776194691658, "num_tokens": 126483795.0, "step": 105220 }, { "entropy": 1.8823614329099656, "epoch": 0.32620379289794543, "grad_norm": 8.619571685791016, "learning_rate": 4.429454448535162e-06, "loss": 0.5536, "mean_token_accuracy": 0.8416207283735275, "num_tokens": 126495379.0, "step": 105230 }, { "entropy": 1.8804425790905952, "epoch": 0.32623479202299516, "grad_norm": 8.832228660583496, "learning_rate": 4.429243996151671e-06, "loss": 0.4898, "mean_token_accuracy": 0.8424814388155937, "num_tokens": 126507418.0, "step": 105240 }, { "entropy": 1.9131366968154908, "epoch": 0.3262657911480448, "grad_norm": 9.161233901977539, "learning_rate": 4.4290335737623915e-06, "loss": 0.5025, "mean_token_accuracy": 0.8429681926965713, "num_tokens": 126519465.0, "step": 105250 }, { "entropy": 1.8811295062303544, "epoch": 0.32629679027309455, "grad_norm": 7.556185245513916, "learning_rate": 4.4288231813602e-06, "loss": 0.4708, "mean_token_accuracy": 0.8429519325494766, "num_tokens": 126531139.0, "step": 105260 }, { "entropy": 1.9287168130278587, "epoch": 0.3263277893981442, "grad_norm": 8.719147682189941, "learning_rate": 4.428612818937974e-06, "loss": 0.5497, "mean_token_accuracy": 0.8285974323749542, "num_tokens": 126542914.0, "step": 105270 }, { "entropy": 1.8551797360181808, "epoch": 0.32635878852319394, "grad_norm": 3.8900067806243896, "learning_rate": 4.428402486488593e-06, "loss": 0.4313, "mean_token_accuracy": 0.8506741672754288, "num_tokens": 126554599.0, "step": 105280 }, { "entropy": 1.9169548124074935, "epoch": 0.3263897876482436, "grad_norm": 7.38561487197876, "learning_rate": 4.428192184004942e-06, "loss": 0.4984, "mean_token_accuracy": 0.8483023956418038, "num_tokens": 126565301.0, "step": 105290 }, { "entropy": 1.9362420484423637, "epoch": 0.32642078677329334, "grad_norm": 8.075181007385254, "learning_rate": 4.427981911479907e-06, "loss": 0.5269, "mean_token_accuracy": 0.837260690331459, "num_tokens": 126576660.0, "step": 105300 }, { "entropy": 1.9895741552114488, "epoch": 0.326451785898343, "grad_norm": 8.598695755004883, "learning_rate": 4.427771668906373e-06, "loss": 0.5641, "mean_token_accuracy": 0.8313561111688614, "num_tokens": 126587261.0, "step": 105310 }, { "entropy": 1.9371146634221077, "epoch": 0.32648278502339273, "grad_norm": 4.246029376983643, "learning_rate": 4.427561456277231e-06, "loss": 0.4739, "mean_token_accuracy": 0.8425526946783066, "num_tokens": 126598325.0, "step": 105320 }, { "entropy": 1.897105310857296, "epoch": 0.3265137841484424, "grad_norm": 4.175163269042969, "learning_rate": 4.427351273585373e-06, "loss": 0.4996, "mean_token_accuracy": 0.8386947169899941, "num_tokens": 126610456.0, "step": 105330 }, { "entropy": 1.9232771515846252, "epoch": 0.3265447832734921, "grad_norm": 7.9354071617126465, "learning_rate": 4.427141120823697e-06, "loss": 0.5314, "mean_token_accuracy": 0.8433753445744514, "num_tokens": 126621448.0, "step": 105340 }, { "entropy": 1.9362381488084792, "epoch": 0.3265757823985418, "grad_norm": 8.758517265319824, "learning_rate": 4.426930997985096e-06, "loss": 0.5004, "mean_token_accuracy": 0.8483623921871185, "num_tokens": 126632162.0, "step": 105350 }, { "entropy": 1.8089591830968856, "epoch": 0.3266067815235915, "grad_norm": 8.597465515136719, "learning_rate": 4.426720905062472e-06, "loss": 0.4347, "mean_token_accuracy": 0.851777009665966, "num_tokens": 126645035.0, "step": 105360 }, { "entropy": 1.9275547727942466, "epoch": 0.3266377806486412, "grad_norm": 7.728219509124756, "learning_rate": 4.426510842048728e-06, "loss": 0.5099, "mean_token_accuracy": 0.8419729739427566, "num_tokens": 126656366.0, "step": 105370 }, { "entropy": 1.947744831442833, "epoch": 0.3266687797736909, "grad_norm": 8.231205940246582, "learning_rate": 4.426300808936765e-06, "loss": 0.5361, "mean_token_accuracy": 0.8379727810621261, "num_tokens": 126667203.0, "step": 105380 }, { "entropy": 1.89700628221035, "epoch": 0.3266997788987406, "grad_norm": 8.626848220825195, "learning_rate": 4.426090805719492e-06, "loss": 0.4758, "mean_token_accuracy": 0.8507444426417351, "num_tokens": 126679228.0, "step": 105390 }, { "entropy": 1.7944568783044814, "epoch": 0.3267307780237903, "grad_norm": 7.826314926147461, "learning_rate": 4.4258808323898175e-06, "loss": 0.3949, "mean_token_accuracy": 0.8589142486453056, "num_tokens": 126692017.0, "step": 105400 }, { "entropy": 1.8504990950226783, "epoch": 0.32676177714883997, "grad_norm": 8.685942649841309, "learning_rate": 4.425670888940653e-06, "loss": 0.4481, "mean_token_accuracy": 0.8543372765183449, "num_tokens": 126704537.0, "step": 105410 }, { "entropy": 1.774632167816162, "epoch": 0.3267927762738897, "grad_norm": 8.145844459533691, "learning_rate": 4.425460975364912e-06, "loss": 0.4201, "mean_token_accuracy": 0.853350467979908, "num_tokens": 126717875.0, "step": 105420 }, { "entropy": 1.878597044944763, "epoch": 0.32682377539893936, "grad_norm": 8.8296480178833, "learning_rate": 4.425251091655509e-06, "loss": 0.4983, "mean_token_accuracy": 0.8486263751983643, "num_tokens": 126729109.0, "step": 105430 }, { "entropy": 1.8818610459566116, "epoch": 0.32685477452398903, "grad_norm": 7.772421836853027, "learning_rate": 4.425041237805365e-06, "loss": 0.4909, "mean_token_accuracy": 0.8495032295584679, "num_tokens": 126741332.0, "step": 105440 }, { "entropy": 1.8752919748425483, "epoch": 0.32688577364903876, "grad_norm": 7.812198162078857, "learning_rate": 4.4248314138074e-06, "loss": 0.4661, "mean_token_accuracy": 0.8592524334788323, "num_tokens": 126753407.0, "step": 105450 }, { "entropy": 1.8762979090213776, "epoch": 0.3269167727740884, "grad_norm": 3.573888063430786, "learning_rate": 4.424621619654536e-06, "loss": 0.4951, "mean_token_accuracy": 0.8430203974246979, "num_tokens": 126764669.0, "step": 105460 }, { "entropy": 1.9314564675092698, "epoch": 0.32694777189913815, "grad_norm": 8.204230308532715, "learning_rate": 4.4244118553397e-06, "loss": 0.5152, "mean_token_accuracy": 0.8395652711391449, "num_tokens": 126775741.0, "step": 105470 }, { "entropy": 1.8367981255054473, "epoch": 0.3269787710241878, "grad_norm": 8.883301734924316, "learning_rate": 4.424202120855818e-06, "loss": 0.4383, "mean_token_accuracy": 0.8451298654079438, "num_tokens": 126788018.0, "step": 105480 }, { "entropy": 1.9303217083215714, "epoch": 0.32700977014923754, "grad_norm": 7.516885757446289, "learning_rate": 4.423992416195822e-06, "loss": 0.524, "mean_token_accuracy": 0.8447012811899185, "num_tokens": 126798946.0, "step": 105490 }, { "entropy": 1.8869639337062836, "epoch": 0.3270407692742872, "grad_norm": 7.857840538024902, "learning_rate": 4.4237827413526425e-06, "loss": 0.5241, "mean_token_accuracy": 0.8342716425657273, "num_tokens": 126810950.0, "step": 105500 }, { "entropy": 1.8075002878904343, "epoch": 0.32707176839933694, "grad_norm": 3.914418935775757, "learning_rate": 4.423573096319217e-06, "loss": 0.4139, "mean_token_accuracy": 0.8586590677499771, "num_tokens": 126823453.0, "step": 105510 }, { "entropy": 1.898749789595604, "epoch": 0.3271027675243866, "grad_norm": 8.268956184387207, "learning_rate": 4.423363481088481e-06, "loss": 0.5246, "mean_token_accuracy": 0.8393109634518623, "num_tokens": 126835289.0, "step": 105520 }, { "entropy": 1.7726179018616677, "epoch": 0.32713376664943633, "grad_norm": 3.696336030960083, "learning_rate": 4.423153895653373e-06, "loss": 0.4025, "mean_token_accuracy": 0.8711417764425278, "num_tokens": 126847849.0, "step": 105530 }, { "entropy": 1.7812189802527427, "epoch": 0.327164765774486, "grad_norm": 7.559261798858643, "learning_rate": 4.422944340006837e-06, "loss": 0.4275, "mean_token_accuracy": 0.8589012801647187, "num_tokens": 126860795.0, "step": 105540 }, { "entropy": 1.869552193582058, "epoch": 0.3271957648995357, "grad_norm": 7.7391252517700195, "learning_rate": 4.4227348141418165e-06, "loss": 0.5163, "mean_token_accuracy": 0.8421175703406334, "num_tokens": 126872641.0, "step": 105550 }, { "entropy": 1.847855243086815, "epoch": 0.3272267640245854, "grad_norm": 8.301652908325195, "learning_rate": 4.422525318051257e-06, "loss": 0.4613, "mean_token_accuracy": 0.8523299112915993, "num_tokens": 126884824.0, "step": 105560 }, { "entropy": 1.8750713467597961, "epoch": 0.3272577631496351, "grad_norm": 9.099956512451172, "learning_rate": 4.422315851728109e-06, "loss": 0.4838, "mean_token_accuracy": 0.848430560529232, "num_tokens": 126896206.0, "step": 105570 }, { "entropy": 1.8710100874304771, "epoch": 0.3272887622746848, "grad_norm": 7.930703163146973, "learning_rate": 4.422106415165322e-06, "loss": 0.4751, "mean_token_accuracy": 0.8564017862081528, "num_tokens": 126907868.0, "step": 105580 }, { "entropy": 1.9006529331207276, "epoch": 0.3273197613997345, "grad_norm": 3.9899120330810547, "learning_rate": 4.4218970083558505e-06, "loss": 0.5064, "mean_token_accuracy": 0.8440683215856553, "num_tokens": 126919807.0, "step": 105590 }, { "entropy": 1.8714349627494813, "epoch": 0.3273507605247842, "grad_norm": 8.205395698547363, "learning_rate": 4.421687631292651e-06, "loss": 0.5088, "mean_token_accuracy": 0.8427043676376342, "num_tokens": 126931174.0, "step": 105600 }, { "entropy": 1.8943498641252519, "epoch": 0.3273817596498339, "grad_norm": 5.008574485778809, "learning_rate": 4.4214782839686805e-06, "loss": 0.4976, "mean_token_accuracy": 0.8429151371121406, "num_tokens": 126942843.0, "step": 105610 }, { "entropy": 1.8473748803138732, "epoch": 0.32741275877488357, "grad_norm": 8.37645435333252, "learning_rate": 4.421268966376901e-06, "loss": 0.4991, "mean_token_accuracy": 0.8443385511636734, "num_tokens": 126954725.0, "step": 105620 }, { "entropy": 1.7291858568787575, "epoch": 0.3274437578999333, "grad_norm": 3.739603042602539, "learning_rate": 4.421059678510274e-06, "loss": 0.3693, "mean_token_accuracy": 0.8665387943387032, "num_tokens": 126968620.0, "step": 105630 }, { "entropy": 1.7893247798085212, "epoch": 0.32747475702498297, "grad_norm": 4.73966121673584, "learning_rate": 4.420850420361765e-06, "loss": 0.3692, "mean_token_accuracy": 0.8631637006998062, "num_tokens": 126981603.0, "step": 105640 }, { "entropy": 1.8417418763041495, "epoch": 0.3275057561500327, "grad_norm": 3.766573190689087, "learning_rate": 4.420641191924342e-06, "loss": 0.4141, "mean_token_accuracy": 0.8520455986261368, "num_tokens": 126994028.0, "step": 105650 }, { "entropy": 1.8928628534078598, "epoch": 0.32753675527508236, "grad_norm": 10.196480751037598, "learning_rate": 4.420431993190975e-06, "loss": 0.5382, "mean_token_accuracy": 0.8365366339683533, "num_tokens": 127005092.0, "step": 105660 }, { "entropy": 1.8437321752309799, "epoch": 0.3275677544001321, "grad_norm": 3.234116792678833, "learning_rate": 4.4202228241546354e-06, "loss": 0.4473, "mean_token_accuracy": 0.8564884319901467, "num_tokens": 127017233.0, "step": 105670 }, { "entropy": 1.8227815330028534, "epoch": 0.32759875352518175, "grad_norm": 6.8498921394348145, "learning_rate": 4.420013684808299e-06, "loss": 0.4251, "mean_token_accuracy": 0.8651960000395775, "num_tokens": 127028817.0, "step": 105680 }, { "entropy": 1.8443077996373176, "epoch": 0.3276297526502314, "grad_norm": 5.660989761352539, "learning_rate": 4.419804575144942e-06, "loss": 0.4744, "mean_token_accuracy": 0.8490277513861656, "num_tokens": 127041425.0, "step": 105690 }, { "entropy": 1.8642980486154557, "epoch": 0.32766075177528114, "grad_norm": 7.390862464904785, "learning_rate": 4.419595495157543e-06, "loss": 0.4562, "mean_token_accuracy": 0.8533131554722786, "num_tokens": 127053051.0, "step": 105700 }, { "entropy": 1.906336459517479, "epoch": 0.3276917509003308, "grad_norm": 7.918664455413818, "learning_rate": 4.419386444839084e-06, "loss": 0.536, "mean_token_accuracy": 0.8386664062738418, "num_tokens": 127064707.0, "step": 105710 }, { "entropy": 1.8976897805929185, "epoch": 0.32772275002538054, "grad_norm": 8.025975227355957, "learning_rate": 4.419177424182549e-06, "loss": 0.4764, "mean_token_accuracy": 0.8574541077017784, "num_tokens": 127075139.0, "step": 105720 }, { "entropy": 1.7558037593960762, "epoch": 0.3277537491504302, "grad_norm": 8.160879135131836, "learning_rate": 4.418968433180924e-06, "loss": 0.4305, "mean_token_accuracy": 0.8632358491420746, "num_tokens": 127088887.0, "step": 105730 }, { "entropy": 1.7168943211436272, "epoch": 0.32778474827547993, "grad_norm": 4.3874359130859375, "learning_rate": 4.418759471827199e-06, "loss": 0.3729, "mean_token_accuracy": 0.861461877822876, "num_tokens": 127103106.0, "step": 105740 }, { "entropy": 1.84348274320364, "epoch": 0.3278157474005296, "grad_norm": 4.105869770050049, "learning_rate": 4.418550540114362e-06, "loss": 0.4811, "mean_token_accuracy": 0.8481574520468712, "num_tokens": 127115076.0, "step": 105750 }, { "entropy": 1.803061343729496, "epoch": 0.3278467465255793, "grad_norm": 4.1507368087768555, "learning_rate": 4.418341638035409e-06, "loss": 0.4453, "mean_token_accuracy": 0.8516246452927589, "num_tokens": 127127443.0, "step": 105760 }, { "entropy": 1.821767383813858, "epoch": 0.327877745650629, "grad_norm": 3.866901159286499, "learning_rate": 4.4181327655833315e-06, "loss": 0.44, "mean_token_accuracy": 0.8582574099302291, "num_tokens": 127139967.0, "step": 105770 }, { "entropy": 1.8919250801205636, "epoch": 0.3279087447756787, "grad_norm": 7.636270999908447, "learning_rate": 4.417923922751132e-06, "loss": 0.5007, "mean_token_accuracy": 0.8451293349266052, "num_tokens": 127151560.0, "step": 105780 }, { "entropy": 1.8248420521616935, "epoch": 0.3279397439007284, "grad_norm": 3.9394943714141846, "learning_rate": 4.417715109531807e-06, "loss": 0.4343, "mean_token_accuracy": 0.8457361742854118, "num_tokens": 127163965.0, "step": 105790 }, { "entropy": 1.8081183552742004, "epoch": 0.3279707430257781, "grad_norm": 9.723323822021484, "learning_rate": 4.41750632591836e-06, "loss": 0.4581, "mean_token_accuracy": 0.856559830904007, "num_tokens": 127176304.0, "step": 105800 }, { "entropy": 1.9597806930541992, "epoch": 0.3280017421508278, "grad_norm": 9.514747619628906, "learning_rate": 4.417297571903797e-06, "loss": 0.5702, "mean_token_accuracy": 0.823980063199997, "num_tokens": 127187023.0, "step": 105810 }, { "entropy": 1.8679911822080613, "epoch": 0.3280327412758775, "grad_norm": 8.901237487792969, "learning_rate": 4.4170888474811235e-06, "loss": 0.4157, "mean_token_accuracy": 0.8576633468270302, "num_tokens": 127199443.0, "step": 105820 }, { "entropy": 1.7843319460749627, "epoch": 0.3280637404009272, "grad_norm": 7.45582914352417, "learning_rate": 4.4168801526433495e-06, "loss": 0.3749, "mean_token_accuracy": 0.8751977890729904, "num_tokens": 127212000.0, "step": 105830 }, { "entropy": 1.8773673444986343, "epoch": 0.3280947395259769, "grad_norm": 7.825649261474609, "learning_rate": 4.416671487383486e-06, "loss": 0.4706, "mean_token_accuracy": 0.8506253302097321, "num_tokens": 127222912.0, "step": 105840 }, { "entropy": 1.9522982880473136, "epoch": 0.32812573865102657, "grad_norm": 11.103384017944336, "learning_rate": 4.416462851694547e-06, "loss": 0.5427, "mean_token_accuracy": 0.8257913753390312, "num_tokens": 127234149.0, "step": 105850 }, { "entropy": 1.9224087953567506, "epoch": 0.3281567377760763, "grad_norm": 8.676215171813965, "learning_rate": 4.4162542455695495e-06, "loss": 0.4731, "mean_token_accuracy": 0.8379293918609619, "num_tokens": 127245446.0, "step": 105860 }, { "entropy": 1.8061835870146752, "epoch": 0.32818773690112596, "grad_norm": 3.818633794784546, "learning_rate": 4.416045669001512e-06, "loss": 0.4125, "mean_token_accuracy": 0.8506641060113906, "num_tokens": 127258271.0, "step": 105870 }, { "entropy": 1.891014301776886, "epoch": 0.3282187360261757, "grad_norm": 8.472275733947754, "learning_rate": 4.415837121983454e-06, "loss": 0.5129, "mean_token_accuracy": 0.8417830243706703, "num_tokens": 127269244.0, "step": 105880 }, { "entropy": 1.8804811149835587, "epoch": 0.32824973515122535, "grad_norm": 7.921799659729004, "learning_rate": 4.415628604508402e-06, "loss": 0.503, "mean_token_accuracy": 0.843516594171524, "num_tokens": 127281119.0, "step": 105890 }, { "entropy": 1.8675379946827888, "epoch": 0.3282807342762751, "grad_norm": 7.780767917633057, "learning_rate": 4.415420116569378e-06, "loss": 0.4707, "mean_token_accuracy": 0.8497234523296356, "num_tokens": 127292982.0, "step": 105900 }, { "entropy": 1.8299174696207046, "epoch": 0.32831173340132475, "grad_norm": 8.369937896728516, "learning_rate": 4.41521165815941e-06, "loss": 0.46, "mean_token_accuracy": 0.8549463152885437, "num_tokens": 127305007.0, "step": 105910 }, { "entropy": 1.902072112262249, "epoch": 0.32834273252637447, "grad_norm": 6.59075403213501, "learning_rate": 4.4150032292715315e-06, "loss": 0.4707, "mean_token_accuracy": 0.84717957675457, "num_tokens": 127316897.0, "step": 105920 }, { "entropy": 1.9103587403893472, "epoch": 0.32837373165142414, "grad_norm": 9.852415084838867, "learning_rate": 4.414794829898772e-06, "loss": 0.518, "mean_token_accuracy": 0.8442836970090866, "num_tokens": 127328580.0, "step": 105930 }, { "entropy": 1.8694602742791175, "epoch": 0.3284047307764738, "grad_norm": 9.172961235046387, "learning_rate": 4.4145864600341656e-06, "loss": 0.4835, "mean_token_accuracy": 0.8534001842141151, "num_tokens": 127339695.0, "step": 105940 }, { "entropy": 1.8509365767240524, "epoch": 0.32843572990152353, "grad_norm": 7.372804641723633, "learning_rate": 4.414378119670751e-06, "loss": 0.4497, "mean_token_accuracy": 0.85338544100523, "num_tokens": 127350996.0, "step": 105950 }, { "entropy": 1.9594619423151016, "epoch": 0.3284667290265732, "grad_norm": 9.022714614868164, "learning_rate": 4.414169808801567e-06, "loss": 0.5748, "mean_token_accuracy": 0.8271567165851593, "num_tokens": 127361650.0, "step": 105960 }, { "entropy": 1.788424487411976, "epoch": 0.3284977281516229, "grad_norm": 8.611357688903809, "learning_rate": 4.413961527419656e-06, "loss": 0.4161, "mean_token_accuracy": 0.8564797580242157, "num_tokens": 127375584.0, "step": 105970 }, { "entropy": 1.8492983341217042, "epoch": 0.3285287272766726, "grad_norm": 6.395881175994873, "learning_rate": 4.4137532755180615e-06, "loss": 0.4253, "mean_token_accuracy": 0.8596012353897095, "num_tokens": 127388011.0, "step": 105980 }, { "entropy": 1.9068158611655235, "epoch": 0.3285597264017223, "grad_norm": 9.72159481048584, "learning_rate": 4.4135450530898296e-06, "loss": 0.5133, "mean_token_accuracy": 0.8352061986923218, "num_tokens": 127399562.0, "step": 105990 }, { "entropy": 1.8263181149959564, "epoch": 0.328590725526772, "grad_norm": 9.199592590332031, "learning_rate": 4.413336860128008e-06, "loss": 0.4156, "mean_token_accuracy": 0.856217360496521, "num_tokens": 127412251.0, "step": 106000 }, { "entropy": 1.7998489513993263, "epoch": 0.3286217246518217, "grad_norm": 7.553385257720947, "learning_rate": 4.413128696625648e-06, "loss": 0.4414, "mean_token_accuracy": 0.8605973869562149, "num_tokens": 127424851.0, "step": 106010 }, { "entropy": 1.8556075662374496, "epoch": 0.3286527237768714, "grad_norm": 9.884352684020996, "learning_rate": 4.412920562575802e-06, "loss": 0.503, "mean_token_accuracy": 0.8411578819155693, "num_tokens": 127436563.0, "step": 106020 }, { "entropy": 1.8433560684323311, "epoch": 0.3286837229019211, "grad_norm": 9.31692123413086, "learning_rate": 4.412712457971527e-06, "loss": 0.4248, "mean_token_accuracy": 0.85454620718956, "num_tokens": 127448302.0, "step": 106030 }, { "entropy": 1.8037659108638764, "epoch": 0.3287147220269708, "grad_norm": 4.995815753936768, "learning_rate": 4.412504382805881e-06, "loss": 0.4362, "mean_token_accuracy": 0.8571658223867417, "num_tokens": 127459774.0, "step": 106040 }, { "entropy": 1.8213044986128808, "epoch": 0.3287457211520205, "grad_norm": 8.79011344909668, "learning_rate": 4.412296337071922e-06, "loss": 0.4453, "mean_token_accuracy": 0.8526079818606377, "num_tokens": 127472645.0, "step": 106050 }, { "entropy": 1.9069361835718155, "epoch": 0.32877672027707017, "grad_norm": 9.181472778320312, "learning_rate": 4.412088320762712e-06, "loss": 0.481, "mean_token_accuracy": 0.8438616082072258, "num_tokens": 127483985.0, "step": 106060 }, { "entropy": 1.7741859510540963, "epoch": 0.3288077194021199, "grad_norm": 4.265276908874512, "learning_rate": 4.411880333871319e-06, "loss": 0.4459, "mean_token_accuracy": 0.85208670347929, "num_tokens": 127497519.0, "step": 106070 }, { "entropy": 1.774897436797619, "epoch": 0.32883871852716956, "grad_norm": 3.4330549240112305, "learning_rate": 4.411672376390806e-06, "loss": 0.4134, "mean_token_accuracy": 0.862097978591919, "num_tokens": 127510803.0, "step": 106080 }, { "entropy": 1.7656168833374977, "epoch": 0.3288697176522193, "grad_norm": 7.550422191619873, "learning_rate": 4.411464448314243e-06, "loss": 0.3932, "mean_token_accuracy": 0.859065192937851, "num_tokens": 127523417.0, "step": 106090 }, { "entropy": 1.8322931870818138, "epoch": 0.32890071677726895, "grad_norm": 8.406190872192383, "learning_rate": 4.411256549634704e-06, "loss": 0.4203, "mean_token_accuracy": 0.8471446886658669, "num_tokens": 127536405.0, "step": 106100 }, { "entropy": 1.914437648653984, "epoch": 0.3289317159023187, "grad_norm": 10.335504531860352, "learning_rate": 4.411048680345259e-06, "loss": 0.5209, "mean_token_accuracy": 0.8394327759742737, "num_tokens": 127546905.0, "step": 106110 }, { "entropy": 1.863400039076805, "epoch": 0.32896271502736835, "grad_norm": 7.929013252258301, "learning_rate": 4.410840840438987e-06, "loss": 0.4397, "mean_token_accuracy": 0.8584112614393234, "num_tokens": 127558189.0, "step": 106120 }, { "entropy": 1.8088774591684342, "epoch": 0.32899371415241807, "grad_norm": 12.394187927246094, "learning_rate": 4.410633029908964e-06, "loss": 0.4221, "mean_token_accuracy": 0.8598813712596893, "num_tokens": 127570793.0, "step": 106130 }, { "entropy": 1.8091413155198097, "epoch": 0.32902471327746774, "grad_norm": 7.606610298156738, "learning_rate": 4.4104252487482726e-06, "loss": 0.4373, "mean_token_accuracy": 0.8585825636982918, "num_tokens": 127582513.0, "step": 106140 }, { "entropy": 1.8211368069052696, "epoch": 0.32905571240251746, "grad_norm": 3.8526268005371094, "learning_rate": 4.4102174969499945e-06, "loss": 0.4541, "mean_token_accuracy": 0.8475786969065666, "num_tokens": 127595268.0, "step": 106150 }, { "entropy": 1.8827776312828064, "epoch": 0.32908671152756713, "grad_norm": 8.69912052154541, "learning_rate": 4.410009774507214e-06, "loss": 0.5015, "mean_token_accuracy": 0.8499000027775765, "num_tokens": 127606870.0, "step": 106160 }, { "entropy": 1.9105200260877608, "epoch": 0.32911771065261686, "grad_norm": 8.402679443359375, "learning_rate": 4.40980208141302e-06, "loss": 0.4985, "mean_token_accuracy": 0.8454275488853454, "num_tokens": 127617735.0, "step": 106170 }, { "entropy": 1.8559418886899948, "epoch": 0.3291487097776665, "grad_norm": 10.870802879333496, "learning_rate": 4.4095944176605015e-06, "loss": 0.4824, "mean_token_accuracy": 0.8455240190029144, "num_tokens": 127629429.0, "step": 106180 }, { "entropy": 1.8259831920266152, "epoch": 0.3291797089027162, "grad_norm": 8.05710506439209, "learning_rate": 4.40938678324275e-06, "loss": 0.4642, "mean_token_accuracy": 0.8483387231826782, "num_tokens": 127642106.0, "step": 106190 }, { "entropy": 1.8355090886354446, "epoch": 0.3292107080277659, "grad_norm": 6.109237194061279, "learning_rate": 4.40917917815286e-06, "loss": 0.4785, "mean_token_accuracy": 0.8482315883040428, "num_tokens": 127653739.0, "step": 106200 }, { "entropy": 1.8437444880604743, "epoch": 0.3292417071528156, "grad_norm": 3.581789493560791, "learning_rate": 4.408971602383929e-06, "loss": 0.4453, "mean_token_accuracy": 0.8457508951425552, "num_tokens": 127665387.0, "step": 106210 }, { "entropy": 1.7818902999162674, "epoch": 0.3292727062778653, "grad_norm": 8.96249771118164, "learning_rate": 4.408764055929055e-06, "loss": 0.4116, "mean_token_accuracy": 0.8549827590584755, "num_tokens": 127678366.0, "step": 106220 }, { "entropy": 1.8756959035992622, "epoch": 0.329303705402915, "grad_norm": 8.410722732543945, "learning_rate": 4.40855653878134e-06, "loss": 0.4863, "mean_token_accuracy": 0.8546160280704498, "num_tokens": 127689938.0, "step": 106230 }, { "entropy": 1.782509133219719, "epoch": 0.3293347045279647, "grad_norm": 4.164084434509277, "learning_rate": 4.408349050933885e-06, "loss": 0.4215, "mean_token_accuracy": 0.8557716026902199, "num_tokens": 127703501.0, "step": 106240 }, { "entropy": 1.9288730174303055, "epoch": 0.3293657036530144, "grad_norm": 9.329178810119629, "learning_rate": 4.408141592379797e-06, "loss": 0.5069, "mean_token_accuracy": 0.8449020832777023, "num_tokens": 127714411.0, "step": 106250 }, { "entropy": 1.9135432869195939, "epoch": 0.3293967027780641, "grad_norm": 7.811339378356934, "learning_rate": 4.4079341631121846e-06, "loss": 0.5017, "mean_token_accuracy": 0.843287567794323, "num_tokens": 127725960.0, "step": 106260 }, { "entropy": 1.8159632161259651, "epoch": 0.32942770190311377, "grad_norm": 10.536284446716309, "learning_rate": 4.407726763124156e-06, "loss": 0.4572, "mean_token_accuracy": 0.8523865416646004, "num_tokens": 127738519.0, "step": 106270 }, { "entropy": 1.6889009296894073, "epoch": 0.3294587010281635, "grad_norm": 3.854187488555908, "learning_rate": 4.407519392408826e-06, "loss": 0.3543, "mean_token_accuracy": 0.8675200149416924, "num_tokens": 127753002.0, "step": 106280 }, { "entropy": 1.9123955562710762, "epoch": 0.32948970015321316, "grad_norm": 8.895185470581055, "learning_rate": 4.4073120509593084e-06, "loss": 0.5115, "mean_token_accuracy": 0.843984666466713, "num_tokens": 127764315.0, "step": 106290 }, { "entropy": 1.8070273652672768, "epoch": 0.3295206992782629, "grad_norm": 6.719578266143799, "learning_rate": 4.4071047387687186e-06, "loss": 0.4363, "mean_token_accuracy": 0.8462962463498116, "num_tokens": 127777448.0, "step": 106300 }, { "entropy": 1.9051207095384597, "epoch": 0.32955169840331255, "grad_norm": 6.615823268890381, "learning_rate": 4.406897455830177e-06, "loss": 0.5134, "mean_token_accuracy": 0.8489801079034806, "num_tokens": 127788724.0, "step": 106310 }, { "entropy": 1.790643498301506, "epoch": 0.3295826975283623, "grad_norm": 4.387211322784424, "learning_rate": 4.4066902021368055e-06, "loss": 0.4293, "mean_token_accuracy": 0.8558807000517845, "num_tokens": 127801391.0, "step": 106320 }, { "entropy": 1.9066824808716774, "epoch": 0.32961369665341195, "grad_norm": 8.245688438415527, "learning_rate": 4.406482977681727e-06, "loss": 0.5221, "mean_token_accuracy": 0.8399421364068985, "num_tokens": 127812990.0, "step": 106330 }, { "entropy": 1.7780104890465736, "epoch": 0.32964469577846167, "grad_norm": 7.971060276031494, "learning_rate": 4.406275782458069e-06, "loss": 0.439, "mean_token_accuracy": 0.8576152637600899, "num_tokens": 127825366.0, "step": 106340 }, { "entropy": 1.7980201825499535, "epoch": 0.32967569490351134, "grad_norm": 10.708061218261719, "learning_rate": 4.406068616458957e-06, "loss": 0.4155, "mean_token_accuracy": 0.8627904281020164, "num_tokens": 127837420.0, "step": 106350 }, { "entropy": 1.788325347006321, "epoch": 0.32970669402856106, "grad_norm": 7.981190204620361, "learning_rate": 4.405861479677525e-06, "loss": 0.4451, "mean_token_accuracy": 0.8535309046506881, "num_tokens": 127850016.0, "step": 106360 }, { "entropy": 1.8833219155669212, "epoch": 0.32973769315361073, "grad_norm": 7.901392936706543, "learning_rate": 4.4056543721069024e-06, "loss": 0.5564, "mean_token_accuracy": 0.83796806037426, "num_tokens": 127861358.0, "step": 106370 }, { "entropy": 1.9057514756917953, "epoch": 0.32976869227866046, "grad_norm": 7.878623962402344, "learning_rate": 4.405447293740227e-06, "loss": 0.5158, "mean_token_accuracy": 0.850669352710247, "num_tokens": 127872047.0, "step": 106380 }, { "entropy": 1.8584971472620964, "epoch": 0.3297996914037101, "grad_norm": 3.818636894226074, "learning_rate": 4.405240244570635e-06, "loss": 0.4382, "mean_token_accuracy": 0.8438824072480202, "num_tokens": 127884719.0, "step": 106390 }, { "entropy": 1.9173836246132852, "epoch": 0.32983069052875985, "grad_norm": 9.852693557739258, "learning_rate": 4.405033224591264e-06, "loss": 0.4975, "mean_token_accuracy": 0.8338365435600281, "num_tokens": 127896128.0, "step": 106400 }, { "entropy": 1.8377716928720473, "epoch": 0.3298616896538095, "grad_norm": 5.034619331359863, "learning_rate": 4.404826233795259e-06, "loss": 0.4568, "mean_token_accuracy": 0.8457611680030823, "num_tokens": 127908855.0, "step": 106410 }, { "entropy": 1.7964683637022971, "epoch": 0.32989268877885924, "grad_norm": 4.223313808441162, "learning_rate": 4.4046192721757625e-06, "loss": 0.4682, "mean_token_accuracy": 0.847607783973217, "num_tokens": 127921526.0, "step": 106420 }, { "entropy": 1.806885550916195, "epoch": 0.3299236879039089, "grad_norm": 3.7442190647125244, "learning_rate": 4.404412339725922e-06, "loss": 0.4447, "mean_token_accuracy": 0.8525063678622246, "num_tokens": 127934569.0, "step": 106430 }, { "entropy": 1.763914243876934, "epoch": 0.3299546870289586, "grad_norm": 9.370475769042969, "learning_rate": 4.404205436438884e-06, "loss": 0.4088, "mean_token_accuracy": 0.8634126842021942, "num_tokens": 127947927.0, "step": 106440 }, { "entropy": 1.8372886329889297, "epoch": 0.3299856861540083, "grad_norm": 7.662943363189697, "learning_rate": 4.403998562307801e-06, "loss": 0.4612, "mean_token_accuracy": 0.8420701563358307, "num_tokens": 127960835.0, "step": 106450 }, { "entropy": 1.8602828592061997, "epoch": 0.330016685279058, "grad_norm": 8.085721969604492, "learning_rate": 4.403791717325825e-06, "loss": 0.4263, "mean_token_accuracy": 0.8534385710954666, "num_tokens": 127973028.0, "step": 106460 }, { "entropy": 1.8481948718428611, "epoch": 0.3300476844041077, "grad_norm": 9.864887237548828, "learning_rate": 4.403584901486113e-06, "loss": 0.4325, "mean_token_accuracy": 0.8566409692168235, "num_tokens": 127985321.0, "step": 106470 }, { "entropy": 1.8503880456089974, "epoch": 0.33007868352915737, "grad_norm": 8.877874374389648, "learning_rate": 4.403378114781821e-06, "loss": 0.4707, "mean_token_accuracy": 0.8484792679548263, "num_tokens": 127996333.0, "step": 106480 }, { "entropy": 1.850000935792923, "epoch": 0.3301096826542071, "grad_norm": 8.956764221191406, "learning_rate": 4.403171357206109e-06, "loss": 0.4779, "mean_token_accuracy": 0.8442446753382683, "num_tokens": 128007956.0, "step": 106490 }, { "entropy": 1.84800376445055, "epoch": 0.33014068177925676, "grad_norm": 7.336550712585449, "learning_rate": 4.402964628752139e-06, "loss": 0.4393, "mean_token_accuracy": 0.8560839980840683, "num_tokens": 128019797.0, "step": 106500 }, { "entropy": 1.887277567386627, "epoch": 0.3301716809043065, "grad_norm": 8.57533073425293, "learning_rate": 4.4027579294130766e-06, "loss": 0.4599, "mean_token_accuracy": 0.8528825983405113, "num_tokens": 128030487.0, "step": 106510 }, { "entropy": 1.8442515268921853, "epoch": 0.33020268002935615, "grad_norm": 8.904195785522461, "learning_rate": 4.402551259182087e-06, "loss": 0.4448, "mean_token_accuracy": 0.8501736581325531, "num_tokens": 128042554.0, "step": 106520 }, { "entropy": 1.8353326499462128, "epoch": 0.3302336791544059, "grad_norm": 7.989103317260742, "learning_rate": 4.402344618052339e-06, "loss": 0.4452, "mean_token_accuracy": 0.8517720341682434, "num_tokens": 128054608.0, "step": 106530 }, { "entropy": 1.8498850226402284, "epoch": 0.33026467827945555, "grad_norm": 4.291723251342773, "learning_rate": 4.402138006017006e-06, "loss": 0.4413, "mean_token_accuracy": 0.848425455391407, "num_tokens": 128066823.0, "step": 106540 }, { "entropy": 1.8223819851875305, "epoch": 0.3302956774045053, "grad_norm": 4.217907428741455, "learning_rate": 4.401931423069258e-06, "loss": 0.4128, "mean_token_accuracy": 0.8563816383481025, "num_tokens": 128078929.0, "step": 106550 }, { "entropy": 1.943525066971779, "epoch": 0.33032667652955494, "grad_norm": 6.72135066986084, "learning_rate": 4.401724869202272e-06, "loss": 0.5059, "mean_token_accuracy": 0.8440445035696029, "num_tokens": 128089603.0, "step": 106560 }, { "entropy": 1.8807444974780083, "epoch": 0.33035767565460467, "grad_norm": 9.296396255493164, "learning_rate": 4.401518344409226e-06, "loss": 0.4962, "mean_token_accuracy": 0.8367669865489006, "num_tokens": 128101462.0, "step": 106570 }, { "entropy": 1.8421072617173195, "epoch": 0.33038867477965433, "grad_norm": 4.264364719390869, "learning_rate": 4.4013118486833e-06, "loss": 0.447, "mean_token_accuracy": 0.8508803278207779, "num_tokens": 128113035.0, "step": 106580 }, { "entropy": 1.9045246794819832, "epoch": 0.33041967390470406, "grad_norm": 7.94566011428833, "learning_rate": 4.401105382017675e-06, "loss": 0.5491, "mean_token_accuracy": 0.83597691655159, "num_tokens": 128124609.0, "step": 106590 }, { "entropy": 1.8478467270731926, "epoch": 0.3304506730297537, "grad_norm": 7.042159557342529, "learning_rate": 4.400898944405538e-06, "loss": 0.4577, "mean_token_accuracy": 0.8560745418071747, "num_tokens": 128136516.0, "step": 106600 }, { "entropy": 1.8195524469017983, "epoch": 0.33048167215480345, "grad_norm": 8.00984001159668, "learning_rate": 4.400692535840075e-06, "loss": 0.4113, "mean_token_accuracy": 0.8634827017784119, "num_tokens": 128148763.0, "step": 106610 }, { "entropy": 1.864709873497486, "epoch": 0.3305126712798531, "grad_norm": 7.909924030303955, "learning_rate": 4.4004861563144735e-06, "loss": 0.5111, "mean_token_accuracy": 0.8368150681257248, "num_tokens": 128160851.0, "step": 106620 }, { "entropy": 1.8297677680850029, "epoch": 0.33054367040490285, "grad_norm": 4.41657018661499, "learning_rate": 4.4002798058219256e-06, "loss": 0.4139, "mean_token_accuracy": 0.8527476787567139, "num_tokens": 128173705.0, "step": 106630 }, { "entropy": 1.78669353723526, "epoch": 0.3305746695299525, "grad_norm": 8.336793899536133, "learning_rate": 4.400073484355625e-06, "loss": 0.3994, "mean_token_accuracy": 0.8534336164593697, "num_tokens": 128187455.0, "step": 106640 }, { "entropy": 1.872094802558422, "epoch": 0.33060566865500224, "grad_norm": 6.102949142456055, "learning_rate": 4.399867191908766e-06, "loss": 0.5354, "mean_token_accuracy": 0.8369955107569694, "num_tokens": 128199573.0, "step": 106650 }, { "entropy": 1.8278701439499856, "epoch": 0.3306366677800519, "grad_norm": 6.1908793449401855, "learning_rate": 4.399660928474549e-06, "loss": 0.4362, "mean_token_accuracy": 0.8451722010970115, "num_tokens": 128212545.0, "step": 106660 }, { "entropy": 1.80742659419775, "epoch": 0.3306676669051016, "grad_norm": 9.044242858886719, "learning_rate": 4.399454694046172e-06, "loss": 0.4347, "mean_token_accuracy": 0.8628106907010078, "num_tokens": 128225069.0, "step": 106670 }, { "entropy": 1.8953262731432914, "epoch": 0.3306986660301513, "grad_norm": 7.473639965057373, "learning_rate": 4.399248488616839e-06, "loss": 0.4912, "mean_token_accuracy": 0.846845431625843, "num_tokens": 128236139.0, "step": 106680 }, { "entropy": 1.8824147969484328, "epoch": 0.33072966515520097, "grad_norm": 3.392375946044922, "learning_rate": 4.399042312179753e-06, "loss": 0.486, "mean_token_accuracy": 0.847435437142849, "num_tokens": 128247581.0, "step": 106690 }, { "entropy": 1.8657734468579292, "epoch": 0.3307606642802507, "grad_norm": 8.0246000289917, "learning_rate": 4.398836164728121e-06, "loss": 0.4465, "mean_token_accuracy": 0.8519965663552285, "num_tokens": 128259094.0, "step": 106700 }, { "entropy": 1.9068785265088082, "epoch": 0.33079166340530036, "grad_norm": 10.08130168914795, "learning_rate": 4.398630046255153e-06, "loss": 0.4959, "mean_token_accuracy": 0.8427546977996826, "num_tokens": 128270336.0, "step": 106710 }, { "entropy": 1.6765178635716438, "epoch": 0.3308226625303501, "grad_norm": 3.5709166526794434, "learning_rate": 4.39842395675406e-06, "loss": 0.3849, "mean_token_accuracy": 0.8611236557364463, "num_tokens": 128284344.0, "step": 106720 }, { "entropy": 1.8389539405703546, "epoch": 0.33085366165539976, "grad_norm": 11.363747596740723, "learning_rate": 4.398217896218056e-06, "loss": 0.4644, "mean_token_accuracy": 0.8520830184221267, "num_tokens": 128296693.0, "step": 106730 }, { "entropy": 1.807772381603718, "epoch": 0.3308846607804495, "grad_norm": 4.15220308303833, "learning_rate": 4.398011864640357e-06, "loss": 0.4797, "mean_token_accuracy": 0.8365904971957207, "num_tokens": 128309280.0, "step": 106740 }, { "entropy": 1.8760442197322846, "epoch": 0.33091565990549915, "grad_norm": 8.140005111694336, "learning_rate": 4.397805862014179e-06, "loss": 0.4703, "mean_token_accuracy": 0.8473433390259743, "num_tokens": 128320766.0, "step": 106750 }, { "entropy": 1.884317898750305, "epoch": 0.3309466590305489, "grad_norm": 8.764328956604004, "learning_rate": 4.397599888332744e-06, "loss": 0.4655, "mean_token_accuracy": 0.8561802804470062, "num_tokens": 128332462.0, "step": 106760 }, { "entropy": 1.790523299574852, "epoch": 0.33097765815559854, "grad_norm": 7.192575454711914, "learning_rate": 4.397393943589274e-06, "loss": 0.4529, "mean_token_accuracy": 0.841584998369217, "num_tokens": 128344386.0, "step": 106770 }, { "entropy": 1.765225899219513, "epoch": 0.33100865728064827, "grad_norm": 4.198668956756592, "learning_rate": 4.397188027776993e-06, "loss": 0.3832, "mean_token_accuracy": 0.8681812718510628, "num_tokens": 128357282.0, "step": 106780 }, { "entropy": 1.8882387548685073, "epoch": 0.33103965640569794, "grad_norm": 8.834793090820312, "learning_rate": 4.396982140889129e-06, "loss": 0.5091, "mean_token_accuracy": 0.8361070469021797, "num_tokens": 128368511.0, "step": 106790 }, { "entropy": 1.9408668845891952, "epoch": 0.33107065553074766, "grad_norm": 7.283912181854248, "learning_rate": 4.396776282918909e-06, "loss": 0.5231, "mean_token_accuracy": 0.8402132421731949, "num_tokens": 128379340.0, "step": 106800 }, { "entropy": 1.9052711009979248, "epoch": 0.33110165465579733, "grad_norm": 9.804369926452637, "learning_rate": 4.396570453859568e-06, "loss": 0.5315, "mean_token_accuracy": 0.8372917637228966, "num_tokens": 128390571.0, "step": 106810 }, { "entropy": 1.8480284005403518, "epoch": 0.33113265378084705, "grad_norm": 8.21649169921875, "learning_rate": 4.396364653704335e-06, "loss": 0.4565, "mean_token_accuracy": 0.8489649355411529, "num_tokens": 128402248.0, "step": 106820 }, { "entropy": 1.8601251021027565, "epoch": 0.3311636529058967, "grad_norm": 3.575690746307373, "learning_rate": 4.396158882446449e-06, "loss": 0.4446, "mean_token_accuracy": 0.8458746924996376, "num_tokens": 128414209.0, "step": 106830 }, { "entropy": 1.8285237357020379, "epoch": 0.33119465203094645, "grad_norm": 9.334534645080566, "learning_rate": 4.3959531400791465e-06, "loss": 0.4558, "mean_token_accuracy": 0.8475871294736862, "num_tokens": 128426736.0, "step": 106840 }, { "entropy": 1.8750257775187493, "epoch": 0.3312256511559961, "grad_norm": 5.553659439086914, "learning_rate": 4.395747426595669e-06, "loss": 0.5246, "mean_token_accuracy": 0.8419804513454437, "num_tokens": 128438360.0, "step": 106850 }, { "entropy": 1.870549239218235, "epoch": 0.33125665028104584, "grad_norm": 6.454956531524658, "learning_rate": 4.395541741989258e-06, "loss": 0.529, "mean_token_accuracy": 0.8351958334445954, "num_tokens": 128449971.0, "step": 106860 }, { "entropy": 1.8797499716281891, "epoch": 0.3312876494060955, "grad_norm": 6.935102939605713, "learning_rate": 4.395336086253158e-06, "loss": 0.435, "mean_token_accuracy": 0.8618351891636848, "num_tokens": 128461856.0, "step": 106870 }, { "entropy": 1.8809518918395043, "epoch": 0.33131864853114523, "grad_norm": 7.828016757965088, "learning_rate": 4.395130459380615e-06, "loss": 0.4947, "mean_token_accuracy": 0.844907422363758, "num_tokens": 128473604.0, "step": 106880 }, { "entropy": 1.8513751283288002, "epoch": 0.3313496476561949, "grad_norm": 8.020812034606934, "learning_rate": 4.39492486136488e-06, "loss": 0.4934, "mean_token_accuracy": 0.8413770824670792, "num_tokens": 128485181.0, "step": 106890 }, { "entropy": 1.8325881719589234, "epoch": 0.3313806467812446, "grad_norm": 8.994607925415039, "learning_rate": 4.3947192921992015e-06, "loss": 0.4488, "mean_token_accuracy": 0.854704113304615, "num_tokens": 128497524.0, "step": 106900 }, { "entropy": 1.7628432139754295, "epoch": 0.3314116459062943, "grad_norm": 4.272587299346924, "learning_rate": 4.394513751876836e-06, "loss": 0.4269, "mean_token_accuracy": 0.853622929751873, "num_tokens": 128510023.0, "step": 106910 }, { "entropy": 1.8294406726956367, "epoch": 0.33144264503134396, "grad_norm": 7.545256614685059, "learning_rate": 4.394308240391038e-06, "loss": 0.4807, "mean_token_accuracy": 0.8351798385381699, "num_tokens": 128522418.0, "step": 106920 }, { "entropy": 1.765715080499649, "epoch": 0.3314736441563937, "grad_norm": 6.0750508308410645, "learning_rate": 4.394102757735063e-06, "loss": 0.4355, "mean_token_accuracy": 0.8552381888031959, "num_tokens": 128535920.0, "step": 106930 }, { "entropy": 1.8687207847833633, "epoch": 0.33150464328144336, "grad_norm": 4.047634124755859, "learning_rate": 4.393897303902175e-06, "loss": 0.5086, "mean_token_accuracy": 0.8374577537178993, "num_tokens": 128548220.0, "step": 106940 }, { "entropy": 1.7995543777942657, "epoch": 0.3315356424064931, "grad_norm": 2.202331781387329, "learning_rate": 4.3936918788856334e-06, "loss": 0.4747, "mean_token_accuracy": 0.8424097955226898, "num_tokens": 128561328.0, "step": 106950 }, { "entropy": 1.7521587029099464, "epoch": 0.33156664153154275, "grad_norm": 2.6816599369049072, "learning_rate": 4.393486482678704e-06, "loss": 0.3981, "mean_token_accuracy": 0.8565811112523078, "num_tokens": 128575383.0, "step": 106960 }, { "entropy": 1.841699157655239, "epoch": 0.3315976406565925, "grad_norm": 7.942765235900879, "learning_rate": 4.393281115274653e-06, "loss": 0.4587, "mean_token_accuracy": 0.8441447481513024, "num_tokens": 128587295.0, "step": 106970 }, { "entropy": 1.8485121354460716, "epoch": 0.33162863978164214, "grad_norm": 3.5550448894500732, "learning_rate": 4.39307577666675e-06, "loss": 0.444, "mean_token_accuracy": 0.8580849900841713, "num_tokens": 128598970.0, "step": 106980 }, { "entropy": 1.8623722419142723, "epoch": 0.33165963890669187, "grad_norm": 9.27586555480957, "learning_rate": 4.3928704668482655e-06, "loss": 0.4854, "mean_token_accuracy": 0.8380335718393326, "num_tokens": 128611672.0, "step": 106990 }, { "entropy": 1.8408109113574027, "epoch": 0.33169063803174154, "grad_norm": 8.369120597839355, "learning_rate": 4.392665185812473e-06, "loss": 0.519, "mean_token_accuracy": 0.8391941249370575, "num_tokens": 128623923.0, "step": 107000 }, { "entropy": 1.9453389286994933, "epoch": 0.33172163715679126, "grad_norm": 8.78305435180664, "learning_rate": 4.392459933552647e-06, "loss": 0.5335, "mean_token_accuracy": 0.8356341332197189, "num_tokens": 128634685.0, "step": 107010 }, { "entropy": 1.8125431835651398, "epoch": 0.33175263628184093, "grad_norm": 3.5980794429779053, "learning_rate": 4.392254710062065e-06, "loss": 0.4076, "mean_token_accuracy": 0.854391148686409, "num_tokens": 128648003.0, "step": 107020 }, { "entropy": 1.8818465709686278, "epoch": 0.33178363540689065, "grad_norm": 9.284706115722656, "learning_rate": 4.3920495153340094e-06, "loss": 0.5199, "mean_token_accuracy": 0.838673610985279, "num_tokens": 128659608.0, "step": 107030 }, { "entropy": 1.8223556622862815, "epoch": 0.3318146345319403, "grad_norm": 5.312669277191162, "learning_rate": 4.39184434936176e-06, "loss": 0.4506, "mean_token_accuracy": 0.8526388436555863, "num_tokens": 128672024.0, "step": 107040 }, { "entropy": 1.8191813245415687, "epoch": 0.33184563365699005, "grad_norm": 9.039751052856445, "learning_rate": 4.3916392121386025e-06, "loss": 0.4316, "mean_token_accuracy": 0.8523767486214637, "num_tokens": 128684326.0, "step": 107050 }, { "entropy": 1.8492727160453797, "epoch": 0.3318766327820397, "grad_norm": 4.566840648651123, "learning_rate": 4.3914341036578225e-06, "loss": 0.4415, "mean_token_accuracy": 0.8528471812605858, "num_tokens": 128696669.0, "step": 107060 }, { "entropy": 1.8731192171573638, "epoch": 0.33190763190708944, "grad_norm": 10.486961364746094, "learning_rate": 4.391229023912708e-06, "loss": 0.464, "mean_token_accuracy": 0.8397856906056405, "num_tokens": 128708878.0, "step": 107070 }, { "entropy": 1.757045941054821, "epoch": 0.3319386310321391, "grad_norm": 4.080169677734375, "learning_rate": 4.3910239728965514e-06, "loss": 0.3642, "mean_token_accuracy": 0.8683019027113914, "num_tokens": 128722019.0, "step": 107080 }, { "entropy": 1.8372290149331092, "epoch": 0.33196963015718883, "grad_norm": 7.9230265617370605, "learning_rate": 4.390818950602644e-06, "loss": 0.4465, "mean_token_accuracy": 0.8605737552046776, "num_tokens": 128733732.0, "step": 107090 }, { "entropy": 1.7193499386310578, "epoch": 0.3320006292822385, "grad_norm": 3.894108533859253, "learning_rate": 4.390613957024283e-06, "loss": 0.3726, "mean_token_accuracy": 0.8623716205358505, "num_tokens": 128747568.0, "step": 107100 }, { "entropy": 1.8526916906237603, "epoch": 0.3320316284072882, "grad_norm": 7.582345008850098, "learning_rate": 4.390408992154765e-06, "loss": 0.4731, "mean_token_accuracy": 0.8529738247394562, "num_tokens": 128758676.0, "step": 107110 }, { "entropy": 1.7845040023326875, "epoch": 0.3320626275323379, "grad_norm": 4.077572822570801, "learning_rate": 4.390204055987388e-06, "loss": 0.4233, "mean_token_accuracy": 0.8509064018726349, "num_tokens": 128770630.0, "step": 107120 }, { "entropy": 1.9034978345036506, "epoch": 0.3320936266573876, "grad_norm": 11.147791862487793, "learning_rate": 4.389999148515457e-06, "loss": 0.5067, "mean_token_accuracy": 0.8462053269147873, "num_tokens": 128781845.0, "step": 107130 }, { "entropy": 1.8408338278532028, "epoch": 0.3321246257824373, "grad_norm": 5.386666297912598, "learning_rate": 4.389794269732274e-06, "loss": 0.5242, "mean_token_accuracy": 0.8391245663166046, "num_tokens": 128794017.0, "step": 107140 }, { "entropy": 1.8361081883311272, "epoch": 0.332155624907487, "grad_norm": 7.174749374389648, "learning_rate": 4.389589419631145e-06, "loss": 0.462, "mean_token_accuracy": 0.8466487154364586, "num_tokens": 128805772.0, "step": 107150 }, { "entropy": 1.8699164032936095, "epoch": 0.3321866240325367, "grad_norm": 9.90065860748291, "learning_rate": 4.389384598205379e-06, "loss": 0.4566, "mean_token_accuracy": 0.8560563385486603, "num_tokens": 128817741.0, "step": 107160 }, { "entropy": 1.8497094124555589, "epoch": 0.33221762315758635, "grad_norm": 8.163619995117188, "learning_rate": 4.389179805448286e-06, "loss": 0.4387, "mean_token_accuracy": 0.8564316019415855, "num_tokens": 128829031.0, "step": 107170 }, { "entropy": 1.8336839243769645, "epoch": 0.3322486222826361, "grad_norm": 8.241865158081055, "learning_rate": 4.38897504135318e-06, "loss": 0.4941, "mean_token_accuracy": 0.8471016883850098, "num_tokens": 128841424.0, "step": 107180 }, { "entropy": 1.7827384427189827, "epoch": 0.33227962140768574, "grad_norm": 4.903016090393066, "learning_rate": 4.388770305913374e-06, "loss": 0.4137, "mean_token_accuracy": 0.8514184907078743, "num_tokens": 128855066.0, "step": 107190 }, { "entropy": 1.89354208111763, "epoch": 0.33231062053273547, "grad_norm": 8.681282043457031, "learning_rate": 4.388565599122187e-06, "loss": 0.5053, "mean_token_accuracy": 0.8418336838483811, "num_tokens": 128866035.0, "step": 107200 }, { "entropy": 1.8403149604797364, "epoch": 0.33234161965778514, "grad_norm": 4.251253604888916, "learning_rate": 4.3883609209729375e-06, "loss": 0.447, "mean_token_accuracy": 0.8545697897672653, "num_tokens": 128878048.0, "step": 107210 }, { "entropy": 1.9954572170972824, "epoch": 0.33237261878283486, "grad_norm": 8.248795509338379, "learning_rate": 4.388156271458946e-06, "loss": 0.5474, "mean_token_accuracy": 0.8341551095247268, "num_tokens": 128888607.0, "step": 107220 }, { "entropy": 1.7177160628139974, "epoch": 0.33240361790788453, "grad_norm": 7.692461967468262, "learning_rate": 4.387951650573538e-06, "loss": 0.4112, "mean_token_accuracy": 0.8625841051340103, "num_tokens": 128903267.0, "step": 107230 }, { "entropy": 1.9298606514930725, "epoch": 0.33243461703293425, "grad_norm": 7.782931327819824, "learning_rate": 4.387747058310038e-06, "loss": 0.4988, "mean_token_accuracy": 0.8428981378674507, "num_tokens": 128914211.0, "step": 107240 }, { "entropy": 1.9102041110396386, "epoch": 0.3324656161579839, "grad_norm": 9.960110664367676, "learning_rate": 4.387542494661775e-06, "loss": 0.5518, "mean_token_accuracy": 0.8341942340135574, "num_tokens": 128925160.0, "step": 107250 }, { "entropy": 1.7972071468830109, "epoch": 0.33249661528303365, "grad_norm": 8.017036437988281, "learning_rate": 4.3873379596220784e-06, "loss": 0.4138, "mean_token_accuracy": 0.8529744014143944, "num_tokens": 128937771.0, "step": 107260 }, { "entropy": 1.8894135251641273, "epoch": 0.3325276144080833, "grad_norm": 8.887062072753906, "learning_rate": 4.3871334531842805e-06, "loss": 0.4585, "mean_token_accuracy": 0.8535187065601348, "num_tokens": 128949419.0, "step": 107270 }, { "entropy": 1.8428677216172218, "epoch": 0.33255861353313304, "grad_norm": 6.1778645515441895, "learning_rate": 4.386928975341716e-06, "loss": 0.4089, "mean_token_accuracy": 0.8626132607460022, "num_tokens": 128960944.0, "step": 107280 }, { "entropy": 1.8553937733173371, "epoch": 0.3325896126581827, "grad_norm": 8.516434669494629, "learning_rate": 4.386724526087722e-06, "loss": 0.4573, "mean_token_accuracy": 0.8496822208166123, "num_tokens": 128972466.0, "step": 107290 }, { "entropy": 1.8534843027591705, "epoch": 0.33262061178323243, "grad_norm": 8.477677345275879, "learning_rate": 4.386520105415637e-06, "loss": 0.4455, "mean_token_accuracy": 0.8520261511206627, "num_tokens": 128984702.0, "step": 107300 }, { "entropy": 1.8577327355742455, "epoch": 0.3326516109082821, "grad_norm": 8.226881980895996, "learning_rate": 4.386315713318802e-06, "loss": 0.4799, "mean_token_accuracy": 0.8503885567188263, "num_tokens": 128996533.0, "step": 107310 }, { "entropy": 1.766034336388111, "epoch": 0.3326826100333318, "grad_norm": 7.096409320831299, "learning_rate": 4.386111349790562e-06, "loss": 0.4041, "mean_token_accuracy": 0.8526156499981881, "num_tokens": 129009556.0, "step": 107320 }, { "entropy": 1.823246991634369, "epoch": 0.3327136091583815, "grad_norm": 8.441973686218262, "learning_rate": 4.385907014824259e-06, "loss": 0.4427, "mean_token_accuracy": 0.8562351033091545, "num_tokens": 129021912.0, "step": 107330 }, { "entropy": 1.8534847319126129, "epoch": 0.3327446082834312, "grad_norm": 4.784743785858154, "learning_rate": 4.385702708413244e-06, "loss": 0.4468, "mean_token_accuracy": 0.8512447312474251, "num_tokens": 129033807.0, "step": 107340 }, { "entropy": 1.8405146718025207, "epoch": 0.3327756074084809, "grad_norm": 3.7886173725128174, "learning_rate": 4.385498430550864e-06, "loss": 0.4918, "mean_token_accuracy": 0.8435701325535774, "num_tokens": 129046120.0, "step": 107350 }, { "entropy": 1.7437313303351403, "epoch": 0.3328066065335306, "grad_norm": 4.013438701629639, "learning_rate": 4.385294181230472e-06, "loss": 0.3651, "mean_token_accuracy": 0.8705343827605247, "num_tokens": 129060159.0, "step": 107360 }, { "entropy": 1.8268103674054146, "epoch": 0.3328376056585803, "grad_norm": 10.785335540771484, "learning_rate": 4.385089960445422e-06, "loss": 0.403, "mean_token_accuracy": 0.853385554254055, "num_tokens": 129072056.0, "step": 107370 }, { "entropy": 1.7955839529633522, "epoch": 0.33286860478363, "grad_norm": 3.143990993499756, "learning_rate": 4.384885768189071e-06, "loss": 0.4295, "mean_token_accuracy": 0.854556767642498, "num_tokens": 129084936.0, "step": 107380 }, { "entropy": 1.9022680431604386, "epoch": 0.3328996039086797, "grad_norm": 7.960306167602539, "learning_rate": 4.384681604454776e-06, "loss": 0.5184, "mean_token_accuracy": 0.8433623731136322, "num_tokens": 129096132.0, "step": 107390 }, { "entropy": 1.8644019782543182, "epoch": 0.3329306030337294, "grad_norm": 9.219409942626953, "learning_rate": 4.384477469235899e-06, "loss": 0.5549, "mean_token_accuracy": 0.8375217631459236, "num_tokens": 129108399.0, "step": 107400 }, { "entropy": 1.8267999082803725, "epoch": 0.33296160215877907, "grad_norm": 9.918363571166992, "learning_rate": 4.384273362525801e-06, "loss": 0.4495, "mean_token_accuracy": 0.8537021040916443, "num_tokens": 129120240.0, "step": 107410 }, { "entropy": 1.8040613248944282, "epoch": 0.33299260128382874, "grad_norm": 9.216553688049316, "learning_rate": 4.384069284317849e-06, "loss": 0.3904, "mean_token_accuracy": 0.8598408699035645, "num_tokens": 129133042.0, "step": 107420 }, { "entropy": 1.8831330671906472, "epoch": 0.33302360040887846, "grad_norm": 7.262103080749512, "learning_rate": 4.383865234605409e-06, "loss": 0.4714, "mean_token_accuracy": 0.8518488049507141, "num_tokens": 129144475.0, "step": 107430 }, { "entropy": 1.8981991022825242, "epoch": 0.33305459953392813, "grad_norm": 8.177460670471191, "learning_rate": 4.38366121338185e-06, "loss": 0.4907, "mean_token_accuracy": 0.8512781083583831, "num_tokens": 129156011.0, "step": 107440 }, { "entropy": 1.829132416844368, "epoch": 0.33308559865897786, "grad_norm": 9.38082504272461, "learning_rate": 4.383457220640543e-06, "loss": 0.4463, "mean_token_accuracy": 0.8530718177556992, "num_tokens": 129168301.0, "step": 107450 }, { "entropy": 1.76314737200737, "epoch": 0.3331165977840275, "grad_norm": 8.310257911682129, "learning_rate": 4.383253256374863e-06, "loss": 0.3821, "mean_token_accuracy": 0.8589179947972297, "num_tokens": 129181867.0, "step": 107460 }, { "entropy": 1.7426991537213326, "epoch": 0.33314759690907725, "grad_norm": 4.443640232086182, "learning_rate": 4.383049320578185e-06, "loss": 0.4003, "mean_token_accuracy": 0.8596662864089012, "num_tokens": 129194857.0, "step": 107470 }, { "entropy": 1.8887342095375061, "epoch": 0.3331785960341269, "grad_norm": 9.242331504821777, "learning_rate": 4.382845413243886e-06, "loss": 0.5217, "mean_token_accuracy": 0.8357272610068321, "num_tokens": 129206228.0, "step": 107480 }, { "entropy": 1.7917708232998848, "epoch": 0.33320959515917664, "grad_norm": 7.552341938018799, "learning_rate": 4.382641534365348e-06, "loss": 0.4455, "mean_token_accuracy": 0.8635924890637398, "num_tokens": 129219549.0, "step": 107490 }, { "entropy": 1.9377831488847732, "epoch": 0.3332405942842263, "grad_norm": 8.330388069152832, "learning_rate": 4.38243768393595e-06, "loss": 0.5005, "mean_token_accuracy": 0.8412655621767045, "num_tokens": 129230964.0, "step": 107500 }, { "entropy": 1.8014700457453727, "epoch": 0.33327159340927603, "grad_norm": 9.167746543884277, "learning_rate": 4.382233861949079e-06, "loss": 0.4119, "mean_token_accuracy": 0.8664462238550186, "num_tokens": 129242857.0, "step": 107510 }, { "entropy": 1.8836747616529466, "epoch": 0.3333025925343257, "grad_norm": 8.401406288146973, "learning_rate": 4.382030068398122e-06, "loss": 0.48, "mean_token_accuracy": 0.846863467991352, "num_tokens": 129254438.0, "step": 107520 }, { "entropy": 1.8405706137418747, "epoch": 0.33333359165937543, "grad_norm": 3.3866302967071533, "learning_rate": 4.381826303276464e-06, "loss": 0.4663, "mean_token_accuracy": 0.849305622279644, "num_tokens": 129266522.0, "step": 107530 }, { "entropy": 1.8931007355451583, "epoch": 0.3333645907844251, "grad_norm": 9.184843063354492, "learning_rate": 4.3816225665775e-06, "loss": 0.5305, "mean_token_accuracy": 0.8357789561152458, "num_tokens": 129278660.0, "step": 107540 }, { "entropy": 1.8619947463274003, "epoch": 0.3333955899094748, "grad_norm": 8.761635780334473, "learning_rate": 4.38141885829462e-06, "loss": 0.4635, "mean_token_accuracy": 0.8441411420702934, "num_tokens": 129290127.0, "step": 107550 }, { "entropy": 1.8187642887234687, "epoch": 0.3334265890345245, "grad_norm": 7.852782249450684, "learning_rate": 4.3812151784212205e-06, "loss": 0.4222, "mean_token_accuracy": 0.8587213516235351, "num_tokens": 129302352.0, "step": 107560 }, { "entropy": 1.8852619290351869, "epoch": 0.3334575881595742, "grad_norm": 8.888199806213379, "learning_rate": 4.3810115269506985e-06, "loss": 0.5021, "mean_token_accuracy": 0.850848950445652, "num_tokens": 129313337.0, "step": 107570 }, { "entropy": 1.8726006522774696, "epoch": 0.3334885872846239, "grad_norm": 7.686779022216797, "learning_rate": 4.380807903876452e-06, "loss": 0.4997, "mean_token_accuracy": 0.8449873507022858, "num_tokens": 129324483.0, "step": 107580 }, { "entropy": 1.857853028178215, "epoch": 0.3335195864096736, "grad_norm": 9.525311470031738, "learning_rate": 4.3806043091918855e-06, "loss": 0.4275, "mean_token_accuracy": 0.8560262963175773, "num_tokens": 129336845.0, "step": 107590 }, { "entropy": 1.8294879227876664, "epoch": 0.3335505855347233, "grad_norm": 3.713810920715332, "learning_rate": 4.3804007428904e-06, "loss": 0.4397, "mean_token_accuracy": 0.8501683503389359, "num_tokens": 129349320.0, "step": 107600 }, { "entropy": 1.7990604281425475, "epoch": 0.333581584659773, "grad_norm": 8.975895881652832, "learning_rate": 4.380197204965402e-06, "loss": 0.4444, "mean_token_accuracy": 0.850322599709034, "num_tokens": 129361696.0, "step": 107610 }, { "entropy": 1.8823053747415543, "epoch": 0.33361258378482267, "grad_norm": 9.130423545837402, "learning_rate": 4.3799936954103e-06, "loss": 0.5031, "mean_token_accuracy": 0.8529215559363366, "num_tokens": 129373047.0, "step": 107620 }, { "entropy": 1.7972995683550834, "epoch": 0.3336435829098724, "grad_norm": 6.733953952789307, "learning_rate": 4.3797902142185034e-06, "loss": 0.4064, "mean_token_accuracy": 0.8613699153065681, "num_tokens": 129386076.0, "step": 107630 }, { "entropy": 1.9032483518123626, "epoch": 0.33367458203492206, "grad_norm": 8.466991424560547, "learning_rate": 4.379586761383426e-06, "loss": 0.5461, "mean_token_accuracy": 0.8323118224740028, "num_tokens": 129397394.0, "step": 107640 }, { "entropy": 1.8645583361387252, "epoch": 0.3337055811599718, "grad_norm": 9.792939186096191, "learning_rate": 4.379383336898479e-06, "loss": 0.4703, "mean_token_accuracy": 0.8481997266411782, "num_tokens": 129408714.0, "step": 107650 }, { "entropy": 1.9275704205036164, "epoch": 0.33373658028502146, "grad_norm": 8.203200340270996, "learning_rate": 4.3791799407570814e-06, "loss": 0.5707, "mean_token_accuracy": 0.8284709110856057, "num_tokens": 129420176.0, "step": 107660 }, { "entropy": 1.9315796703100205, "epoch": 0.3337675794100711, "grad_norm": 7.3529953956604, "learning_rate": 4.378976572952653e-06, "loss": 0.4796, "mean_token_accuracy": 0.8478155717253685, "num_tokens": 129430827.0, "step": 107670 }, { "entropy": 1.7806938499212266, "epoch": 0.33379857853512085, "grad_norm": 3.7703471183776855, "learning_rate": 4.378773233478612e-06, "loss": 0.4423, "mean_token_accuracy": 0.8491846337914467, "num_tokens": 129443966.0, "step": 107680 }, { "entropy": 1.7839350268244742, "epoch": 0.3338295776601705, "grad_norm": 7.560736179351807, "learning_rate": 4.378569922328383e-06, "loss": 0.3992, "mean_token_accuracy": 0.8620262175798417, "num_tokens": 129457050.0, "step": 107690 }, { "entropy": 1.8768269151449204, "epoch": 0.33386057678522024, "grad_norm": 8.099947929382324, "learning_rate": 4.37836663949539e-06, "loss": 0.4894, "mean_token_accuracy": 0.844311997294426, "num_tokens": 129468839.0, "step": 107700 }, { "entropy": 1.8897486120462417, "epoch": 0.3338915759102699, "grad_norm": 8.095099449157715, "learning_rate": 4.3781633849730605e-06, "loss": 0.5024, "mean_token_accuracy": 0.8418814584612846, "num_tokens": 129480361.0, "step": 107710 }, { "entropy": 1.8843684524297715, "epoch": 0.33392257503531964, "grad_norm": 5.762594223022461, "learning_rate": 4.377960158754823e-06, "loss": 0.482, "mean_token_accuracy": 0.8446230858564376, "num_tokens": 129492583.0, "step": 107720 }, { "entropy": 1.969536703824997, "epoch": 0.3339535741603693, "grad_norm": 8.858036041259766, "learning_rate": 4.377756960834111e-06, "loss": 0.509, "mean_token_accuracy": 0.844764456152916, "num_tokens": 129503430.0, "step": 107730 }, { "entropy": 1.8859269142150878, "epoch": 0.33398457328541903, "grad_norm": 4.305178165435791, "learning_rate": 4.377553791204358e-06, "loss": 0.4548, "mean_token_accuracy": 0.8533265367150307, "num_tokens": 129514786.0, "step": 107740 }, { "entropy": 1.859623844921589, "epoch": 0.3340155724104687, "grad_norm": 7.1102190017700195, "learning_rate": 4.377350649858999e-06, "loss": 0.4748, "mean_token_accuracy": 0.8454333543777466, "num_tokens": 129526854.0, "step": 107750 }, { "entropy": 1.897015392780304, "epoch": 0.3340465715355184, "grad_norm": 8.836033821105957, "learning_rate": 4.377147536791471e-06, "loss": 0.5238, "mean_token_accuracy": 0.8378755912184715, "num_tokens": 129537781.0, "step": 107760 }, { "entropy": 1.8058869987726212, "epoch": 0.3340775706605681, "grad_norm": 2.6924660205841064, "learning_rate": 4.376944451995214e-06, "loss": 0.4335, "mean_token_accuracy": 0.8562029376626015, "num_tokens": 129550764.0, "step": 107770 }, { "entropy": 1.8765144675970078, "epoch": 0.3341085697856178, "grad_norm": 7.857125759124756, "learning_rate": 4.376741395463672e-06, "loss": 0.4803, "mean_token_accuracy": 0.8540433526039124, "num_tokens": 129562412.0, "step": 107780 }, { "entropy": 1.8461247250437736, "epoch": 0.3341395689106675, "grad_norm": 8.548455238342285, "learning_rate": 4.376538367190288e-06, "loss": 0.4783, "mean_token_accuracy": 0.8496983379125596, "num_tokens": 129573651.0, "step": 107790 }, { "entropy": 1.8069075807929038, "epoch": 0.3341705680357172, "grad_norm": 9.008338928222656, "learning_rate": 4.37633536716851e-06, "loss": 0.4151, "mean_token_accuracy": 0.8606833323836327, "num_tokens": 129586574.0, "step": 107800 }, { "entropy": 1.8529318809509276, "epoch": 0.3342015671607669, "grad_norm": 3.940270185470581, "learning_rate": 4.376132395391783e-06, "loss": 0.4425, "mean_token_accuracy": 0.8578683048486709, "num_tokens": 129598430.0, "step": 107810 }, { "entropy": 1.8817769691348076, "epoch": 0.3342325662858166, "grad_norm": 8.67603588104248, "learning_rate": 4.375929451853561e-06, "loss": 0.5161, "mean_token_accuracy": 0.8386638849973679, "num_tokens": 129610580.0, "step": 107820 }, { "entropy": 1.8078914038836955, "epoch": 0.33426356541086627, "grad_norm": 10.10114574432373, "learning_rate": 4.375726536547296e-06, "loss": 0.4132, "mean_token_accuracy": 0.8479559436440468, "num_tokens": 129623248.0, "step": 107830 }, { "entropy": 1.8651557683944702, "epoch": 0.334294564535916, "grad_norm": 7.359059810638428, "learning_rate": 4.375523649466441e-06, "loss": 0.4767, "mean_token_accuracy": 0.8485989525914193, "num_tokens": 129635182.0, "step": 107840 }, { "entropy": 1.744455586373806, "epoch": 0.33432556366096566, "grad_norm": 5.441449165344238, "learning_rate": 4.375320790604457e-06, "loss": 0.3794, "mean_token_accuracy": 0.8629415735602379, "num_tokens": 129649244.0, "step": 107850 }, { "entropy": 1.7221811696887017, "epoch": 0.3343565627860154, "grad_norm": 3.5538432598114014, "learning_rate": 4.375117959954799e-06, "loss": 0.353, "mean_token_accuracy": 0.8725274935364723, "num_tokens": 129662708.0, "step": 107860 }, { "entropy": 1.7094054415822029, "epoch": 0.33438756191106506, "grad_norm": 11.59929370880127, "learning_rate": 4.37491515751093e-06, "loss": 0.39, "mean_token_accuracy": 0.8701628282666206, "num_tokens": 129676262.0, "step": 107870 }, { "entropy": 1.7959278047084808, "epoch": 0.3344185610361148, "grad_norm": 3.8675200939178467, "learning_rate": 4.374712383266314e-06, "loss": 0.4187, "mean_token_accuracy": 0.8563177272677421, "num_tokens": 129689029.0, "step": 107880 }, { "entropy": 1.8378399103879928, "epoch": 0.33444956016116445, "grad_norm": 7.299082279205322, "learning_rate": 4.3745096372144166e-06, "loss": 0.5197, "mean_token_accuracy": 0.8417981162667274, "num_tokens": 129701201.0, "step": 107890 }, { "entropy": 1.8643528178334237, "epoch": 0.3344805592862142, "grad_norm": 7.960921287536621, "learning_rate": 4.374306919348705e-06, "loss": 0.4717, "mean_token_accuracy": 0.8524877533316613, "num_tokens": 129712743.0, "step": 107900 }, { "entropy": 1.900659802556038, "epoch": 0.33451155841126384, "grad_norm": 7.667184829711914, "learning_rate": 4.374104229662648e-06, "loss": 0.4911, "mean_token_accuracy": 0.8470824211835861, "num_tokens": 129723529.0, "step": 107910 }, { "entropy": 1.8972845152020454, "epoch": 0.3345425575363135, "grad_norm": 7.8113484382629395, "learning_rate": 4.3739015681497185e-06, "loss": 0.4998, "mean_token_accuracy": 0.8441723167896271, "num_tokens": 129734887.0, "step": 107920 }, { "entropy": 1.8530030235648156, "epoch": 0.33457355666136324, "grad_norm": 7.784739971160889, "learning_rate": 4.37369893480339e-06, "loss": 0.4475, "mean_token_accuracy": 0.8602523878216743, "num_tokens": 129746166.0, "step": 107930 }, { "entropy": 1.8418569251894952, "epoch": 0.3346045557864129, "grad_norm": 3.6897940635681152, "learning_rate": 4.37349632961714e-06, "loss": 0.4411, "mean_token_accuracy": 0.8551413804292679, "num_tokens": 129758785.0, "step": 107940 }, { "entropy": 1.8382650315761566, "epoch": 0.33463555491146263, "grad_norm": 6.814600467681885, "learning_rate": 4.373293752584445e-06, "loss": 0.4678, "mean_token_accuracy": 0.8480258256196975, "num_tokens": 129771240.0, "step": 107950 }, { "entropy": 1.7906376250088214, "epoch": 0.3346665540365123, "grad_norm": 8.821784973144531, "learning_rate": 4.373091203698785e-06, "loss": 0.3782, "mean_token_accuracy": 0.8606226801872253, "num_tokens": 129783410.0, "step": 107960 }, { "entropy": 1.850863453745842, "epoch": 0.334697553161562, "grad_norm": 3.5716798305511475, "learning_rate": 4.372888682953645e-06, "loss": 0.4767, "mean_token_accuracy": 0.8479766875505448, "num_tokens": 129794773.0, "step": 107970 }, { "entropy": 1.8356069535017014, "epoch": 0.3347285522866117, "grad_norm": 8.035232543945312, "learning_rate": 4.372686190342508e-06, "loss": 0.4409, "mean_token_accuracy": 0.8509008720517158, "num_tokens": 129807004.0, "step": 107980 }, { "entropy": 1.8803606986999513, "epoch": 0.3347595514116614, "grad_norm": 7.623292446136475, "learning_rate": 4.3724837258588594e-06, "loss": 0.5087, "mean_token_accuracy": 0.8468181252479553, "num_tokens": 129818561.0, "step": 107990 }, { "entropy": 1.8860761135816575, "epoch": 0.3347905505367111, "grad_norm": 9.617867469787598, "learning_rate": 4.37228128949619e-06, "loss": 0.5025, "mean_token_accuracy": 0.8485965311527253, "num_tokens": 129830143.0, "step": 108000 }, { "entropy": 1.912020392715931, "epoch": 0.3348215496617608, "grad_norm": 8.843141555786133, "learning_rate": 4.37207888124799e-06, "loss": 0.504, "mean_token_accuracy": 0.8369658708572387, "num_tokens": 129841114.0, "step": 108010 }, { "entropy": 1.9048634201288224, "epoch": 0.3348525487868105, "grad_norm": 6.282487392425537, "learning_rate": 4.3718765011077526e-06, "loss": 0.5192, "mean_token_accuracy": 0.8487017884850502, "num_tokens": 129851752.0, "step": 108020 }, { "entropy": 1.9025760471820832, "epoch": 0.3348835479118602, "grad_norm": 9.245036125183105, "learning_rate": 4.371674149068973e-06, "loss": 0.4887, "mean_token_accuracy": 0.8425028279423714, "num_tokens": 129863830.0, "step": 108030 }, { "entropy": 1.863828182220459, "epoch": 0.33491454703690987, "grad_norm": 3.375025510787964, "learning_rate": 4.3714718251251484e-06, "loss": 0.4603, "mean_token_accuracy": 0.8575912222266198, "num_tokens": 129875417.0, "step": 108040 }, { "entropy": 1.749958410859108, "epoch": 0.3349455461619596, "grad_norm": 3.8491711616516113, "learning_rate": 4.371269529269777e-06, "loss": 0.3726, "mean_token_accuracy": 0.8608524903655053, "num_tokens": 129888923.0, "step": 108050 }, { "entropy": 1.8047884538769723, "epoch": 0.33497654528700926, "grad_norm": 4.140578746795654, "learning_rate": 4.371067261496363e-06, "loss": 0.4493, "mean_token_accuracy": 0.8508468925952911, "num_tokens": 129901648.0, "step": 108060 }, { "entropy": 1.9018609017133712, "epoch": 0.335007544412059, "grad_norm": 7.736186504364014, "learning_rate": 4.3708650217984065e-06, "loss": 0.4772, "mean_token_accuracy": 0.8536763086915016, "num_tokens": 129912303.0, "step": 108070 }, { "entropy": 1.8833556294441223, "epoch": 0.33503854353710866, "grad_norm": 8.170893669128418, "learning_rate": 4.370662810169415e-06, "loss": 0.454, "mean_token_accuracy": 0.8535693794488907, "num_tokens": 129923394.0, "step": 108080 }, { "entropy": 1.7835390165448188, "epoch": 0.3350695426621584, "grad_norm": 8.293116569519043, "learning_rate": 4.370460626602896e-06, "loss": 0.4399, "mean_token_accuracy": 0.8520323082804679, "num_tokens": 129936227.0, "step": 108090 }, { "entropy": 1.8735915690660476, "epoch": 0.33510054178720805, "grad_norm": 8.832682609558105, "learning_rate": 4.37025847109236e-06, "loss": 0.4544, "mean_token_accuracy": 0.8542598769068718, "num_tokens": 129947875.0, "step": 108100 }, { "entropy": 1.8861595943570137, "epoch": 0.3351315409122578, "grad_norm": 4.6345601081848145, "learning_rate": 4.370056343631318e-06, "loss": 0.473, "mean_token_accuracy": 0.8438780650496482, "num_tokens": 129959918.0, "step": 108110 }, { "entropy": 1.8636915504932403, "epoch": 0.33516254003730744, "grad_norm": 9.650102615356445, "learning_rate": 4.3698542442132845e-06, "loss": 0.4528, "mean_token_accuracy": 0.8606543466448784, "num_tokens": 129970846.0, "step": 108120 }, { "entropy": 1.911002305150032, "epoch": 0.33519353916235717, "grad_norm": 9.570003509521484, "learning_rate": 4.369652172831775e-06, "loss": 0.5068, "mean_token_accuracy": 0.8438562154769897, "num_tokens": 129981852.0, "step": 108130 }, { "entropy": 1.8741035476326942, "epoch": 0.33522453828740684, "grad_norm": 6.404723644256592, "learning_rate": 4.3694501294803084e-06, "loss": 0.4746, "mean_token_accuracy": 0.8497413903474808, "num_tokens": 129993561.0, "step": 108140 }, { "entropy": 1.881354147195816, "epoch": 0.3352555374124565, "grad_norm": 8.92978286743164, "learning_rate": 4.369248114152405e-06, "loss": 0.4388, "mean_token_accuracy": 0.8553690627217293, "num_tokens": 130005553.0, "step": 108150 }, { "entropy": 1.9167394667863846, "epoch": 0.33528653653750623, "grad_norm": 6.4430341720581055, "learning_rate": 4.369046126841588e-06, "loss": 0.4929, "mean_token_accuracy": 0.848999661207199, "num_tokens": 130016296.0, "step": 108160 }, { "entropy": 1.7608971558511257, "epoch": 0.3353175356625559, "grad_norm": 11.467843055725098, "learning_rate": 4.3688441675413805e-06, "loss": 0.4082, "mean_token_accuracy": 0.8577596142888069, "num_tokens": 130029257.0, "step": 108170 }, { "entropy": 1.7902075618505477, "epoch": 0.3353485347876056, "grad_norm": 7.754476070404053, "learning_rate": 4.3686422362453095e-06, "loss": 0.4486, "mean_token_accuracy": 0.8457227498292923, "num_tokens": 130042376.0, "step": 108180 }, { "entropy": 1.80088053047657, "epoch": 0.3353795339126553, "grad_norm": 9.529817581176758, "learning_rate": 4.368440332946905e-06, "loss": 0.4318, "mean_token_accuracy": 0.8530313909053803, "num_tokens": 130054996.0, "step": 108190 }, { "entropy": 1.88881815969944, "epoch": 0.335410533037705, "grad_norm": 7.529262542724609, "learning_rate": 4.368238457639695e-06, "loss": 0.4901, "mean_token_accuracy": 0.8425343319773674, "num_tokens": 130066402.0, "step": 108200 }, { "entropy": 1.9495450556278229, "epoch": 0.3354415321627547, "grad_norm": 7.71326208114624, "learning_rate": 4.368036610317216e-06, "loss": 0.5015, "mean_token_accuracy": 0.8409694746136666, "num_tokens": 130077178.0, "step": 108210 }, { "entropy": 1.898306019604206, "epoch": 0.3354725312878044, "grad_norm": 7.798426151275635, "learning_rate": 4.367834790973002e-06, "loss": 0.4673, "mean_token_accuracy": 0.8483818009495735, "num_tokens": 130088667.0, "step": 108220 }, { "entropy": 1.8723418027162553, "epoch": 0.3355035304128541, "grad_norm": 4.362555503845215, "learning_rate": 4.367632999600588e-06, "loss": 0.4794, "mean_token_accuracy": 0.8386352211236954, "num_tokens": 130101166.0, "step": 108230 }, { "entropy": 1.8666965618729592, "epoch": 0.3355345295379038, "grad_norm": 6.942992687225342, "learning_rate": 4.367431236193516e-06, "loss": 0.4756, "mean_token_accuracy": 0.8470498412847519, "num_tokens": 130113744.0, "step": 108240 }, { "entropy": 1.811617687344551, "epoch": 0.33556552866295347, "grad_norm": 10.553791999816895, "learning_rate": 4.367229500745324e-06, "loss": 0.4317, "mean_token_accuracy": 0.8510163232684136, "num_tokens": 130126707.0, "step": 108250 }, { "entropy": 1.9607309699058533, "epoch": 0.3355965277880032, "grad_norm": 8.012662887573242, "learning_rate": 4.367027793249559e-06, "loss": 0.5177, "mean_token_accuracy": 0.8463420748710633, "num_tokens": 130137777.0, "step": 108260 }, { "entropy": 1.7869203016161919, "epoch": 0.33562752691305286, "grad_norm": 7.589692115783691, "learning_rate": 4.366826113699764e-06, "loss": 0.4129, "mean_token_accuracy": 0.8630554795265197, "num_tokens": 130150461.0, "step": 108270 }, { "entropy": 1.880982668697834, "epoch": 0.3356585260381026, "grad_norm": 7.124267101287842, "learning_rate": 4.366624462089486e-06, "loss": 0.5833, "mean_token_accuracy": 0.827752648293972, "num_tokens": 130162409.0, "step": 108280 }, { "entropy": 1.8591894909739495, "epoch": 0.33568952516315226, "grad_norm": 7.850539684295654, "learning_rate": 4.3664228384122775e-06, "loss": 0.4857, "mean_token_accuracy": 0.8456223160028458, "num_tokens": 130175026.0, "step": 108290 }, { "entropy": 1.8133115381002427, "epoch": 0.335720524288202, "grad_norm": 3.5843214988708496, "learning_rate": 4.366221242661688e-06, "loss": 0.4181, "mean_token_accuracy": 0.8574982568621635, "num_tokens": 130187161.0, "step": 108300 }, { "entropy": 1.8298716515302658, "epoch": 0.33575152341325165, "grad_norm": 7.22581148147583, "learning_rate": 4.366019674831272e-06, "loss": 0.4093, "mean_token_accuracy": 0.8628581598401069, "num_tokens": 130199254.0, "step": 108310 }, { "entropy": 1.8459315612912177, "epoch": 0.3357825225383014, "grad_norm": 7.195556640625, "learning_rate": 4.365818134914586e-06, "loss": 0.4566, "mean_token_accuracy": 0.855617669224739, "num_tokens": 130211367.0, "step": 108320 }, { "entropy": 1.7838531404733657, "epoch": 0.33581352166335104, "grad_norm": 3.379918098449707, "learning_rate": 4.3656166229051865e-06, "loss": 0.3927, "mean_token_accuracy": 0.868264564871788, "num_tokens": 130223512.0, "step": 108330 }, { "entropy": 1.805635717511177, "epoch": 0.33584452078840077, "grad_norm": 8.333756446838379, "learning_rate": 4.365415138796635e-06, "loss": 0.4398, "mean_token_accuracy": 0.8466704249382019, "num_tokens": 130235508.0, "step": 108340 }, { "entropy": 1.8783694118261338, "epoch": 0.33587551991345044, "grad_norm": 8.413524627685547, "learning_rate": 4.3652136825824914e-06, "loss": 0.5141, "mean_token_accuracy": 0.8432982712984085, "num_tokens": 130247392.0, "step": 108350 }, { "entropy": 1.8464765295386314, "epoch": 0.33590651903850016, "grad_norm": 10.091898918151855, "learning_rate": 4.365012254256323e-06, "loss": 0.5256, "mean_token_accuracy": 0.836361040174961, "num_tokens": 130259254.0, "step": 108360 }, { "entropy": 1.8891173303127289, "epoch": 0.33593751816354983, "grad_norm": 7.264171123504639, "learning_rate": 4.3648108538116935e-06, "loss": 0.4516, "mean_token_accuracy": 0.854046767950058, "num_tokens": 130270895.0, "step": 108370 }, { "entropy": 1.7990408152341844, "epoch": 0.33596851728859956, "grad_norm": 7.887672424316406, "learning_rate": 4.364609481242173e-06, "loss": 0.3973, "mean_token_accuracy": 0.8649446800351143, "num_tokens": 130283074.0, "step": 108380 }, { "entropy": 1.8251417383551598, "epoch": 0.3359995164136492, "grad_norm": 8.665481567382812, "learning_rate": 4.36440813654133e-06, "loss": 0.4955, "mean_token_accuracy": 0.8497830405831337, "num_tokens": 130295411.0, "step": 108390 }, { "entropy": 1.8189852401614188, "epoch": 0.3360305155386989, "grad_norm": 7.711446762084961, "learning_rate": 4.36420681970274e-06, "loss": 0.4558, "mean_token_accuracy": 0.8486074671149254, "num_tokens": 130307723.0, "step": 108400 }, { "entropy": 1.8464395835995675, "epoch": 0.3360615146637486, "grad_norm": 8.451082229614258, "learning_rate": 4.364005530719975e-06, "loss": 0.4729, "mean_token_accuracy": 0.845195434987545, "num_tokens": 130320326.0, "step": 108410 }, { "entropy": 1.9397107988595963, "epoch": 0.3360925137887983, "grad_norm": 8.73201847076416, "learning_rate": 4.363804269586612e-06, "loss": 0.5236, "mean_token_accuracy": 0.8377207443118095, "num_tokens": 130330931.0, "step": 108420 }, { "entropy": 1.8559130504727364, "epoch": 0.336123512913848, "grad_norm": 7.468655109405518, "learning_rate": 4.363603036296231e-06, "loss": 0.4628, "mean_token_accuracy": 0.8583543822169304, "num_tokens": 130342645.0, "step": 108430 }, { "entropy": 1.8359305024147035, "epoch": 0.3361545120388977, "grad_norm": 6.1533522605896, "learning_rate": 4.363401830842411e-06, "loss": 0.438, "mean_token_accuracy": 0.8503975555300712, "num_tokens": 130354736.0, "step": 108440 }, { "entropy": 1.8281599283218384, "epoch": 0.3361855111639474, "grad_norm": 10.031996726989746, "learning_rate": 4.3632006532187375e-06, "loss": 0.4527, "mean_token_accuracy": 0.8479070514440536, "num_tokens": 130366811.0, "step": 108450 }, { "entropy": 1.8181740254163743, "epoch": 0.3362165102889971, "grad_norm": 9.026259422302246, "learning_rate": 4.3629995034187926e-06, "loss": 0.4531, "mean_token_accuracy": 0.8515908911824226, "num_tokens": 130378443.0, "step": 108460 }, { "entropy": 1.653176798671484, "epoch": 0.3362475094140468, "grad_norm": 3.760293960571289, "learning_rate": 4.362798381436164e-06, "loss": 0.3436, "mean_token_accuracy": 0.8677978113293647, "num_tokens": 130393766.0, "step": 108470 }, { "entropy": 1.8992000639438629, "epoch": 0.33627850853909647, "grad_norm": 8.01069164276123, "learning_rate": 4.362597287264443e-06, "loss": 0.4874, "mean_token_accuracy": 0.8475998684763908, "num_tokens": 130405181.0, "step": 108480 }, { "entropy": 1.8159596145153045, "epoch": 0.3363095076641462, "grad_norm": 4.691624641418457, "learning_rate": 4.362396220897218e-06, "loss": 0.4395, "mean_token_accuracy": 0.8557477623224259, "num_tokens": 130417472.0, "step": 108490 }, { "entropy": 1.8598027050495147, "epoch": 0.33634050678919586, "grad_norm": 4.708345413208008, "learning_rate": 4.362195182328085e-06, "loss": 0.4617, "mean_token_accuracy": 0.8497734665870667, "num_tokens": 130429769.0, "step": 108500 }, { "entropy": 1.8282673090696335, "epoch": 0.3363715059142456, "grad_norm": 7.435179233551025, "learning_rate": 4.361994171550637e-06, "loss": 0.4484, "mean_token_accuracy": 0.8541338846087456, "num_tokens": 130441794.0, "step": 108510 }, { "entropy": 1.875466302037239, "epoch": 0.33640250503929525, "grad_norm": 9.038471221923828, "learning_rate": 4.361793188558471e-06, "loss": 0.5276, "mean_token_accuracy": 0.8360525846481324, "num_tokens": 130453505.0, "step": 108520 }, { "entropy": 1.9005172535777093, "epoch": 0.336433504164345, "grad_norm": 8.704236030578613, "learning_rate": 4.361592233345188e-06, "loss": 0.5057, "mean_token_accuracy": 0.8375848770141602, "num_tokens": 130465066.0, "step": 108530 }, { "entropy": 1.811057348549366, "epoch": 0.33646450328939465, "grad_norm": 10.785872459411621, "learning_rate": 4.361391305904388e-06, "loss": 0.4455, "mean_token_accuracy": 0.8490529522299767, "num_tokens": 130477719.0, "step": 108540 }, { "entropy": 1.8520541563630104, "epoch": 0.33649550241444437, "grad_norm": 7.816697597503662, "learning_rate": 4.361190406229676e-06, "loss": 0.4668, "mean_token_accuracy": 0.849137333035469, "num_tokens": 130489472.0, "step": 108550 }, { "entropy": 1.867296999692917, "epoch": 0.33652650153949404, "grad_norm": 4.023216724395752, "learning_rate": 4.360989534314658e-06, "loss": 0.4848, "mean_token_accuracy": 0.8456032857298851, "num_tokens": 130500834.0, "step": 108560 }, { "entropy": 1.9245821714401246, "epoch": 0.33655750066454376, "grad_norm": 7.590324401855469, "learning_rate": 4.36078869015294e-06, "loss": 0.5113, "mean_token_accuracy": 0.8328892081975937, "num_tokens": 130512705.0, "step": 108570 }, { "entropy": 1.8899716123938561, "epoch": 0.33658849978959343, "grad_norm": 8.06901741027832, "learning_rate": 4.360587873738132e-06, "loss": 0.4813, "mean_token_accuracy": 0.847601892054081, "num_tokens": 130524196.0, "step": 108580 }, { "entropy": 1.8788098588585853, "epoch": 0.33661949891464316, "grad_norm": 8.891491889953613, "learning_rate": 4.360387085063847e-06, "loss": 0.5064, "mean_token_accuracy": 0.844712820649147, "num_tokens": 130536389.0, "step": 108590 }, { "entropy": 1.8847306236624717, "epoch": 0.3366504980396928, "grad_norm": 4.331157684326172, "learning_rate": 4.3601863241237e-06, "loss": 0.4542, "mean_token_accuracy": 0.8504201784729958, "num_tokens": 130548320.0, "step": 108600 }, { "entropy": 1.8718352302908898, "epoch": 0.33668149716474255, "grad_norm": 7.971691131591797, "learning_rate": 4.359985590911303e-06, "loss": 0.4724, "mean_token_accuracy": 0.8353804409503937, "num_tokens": 130560292.0, "step": 108610 }, { "entropy": 1.8931324303150177, "epoch": 0.3367124962897922, "grad_norm": 7.773447513580322, "learning_rate": 4.359784885420276e-06, "loss": 0.4605, "mean_token_accuracy": 0.8583063185214996, "num_tokens": 130570906.0, "step": 108620 }, { "entropy": 1.844712384045124, "epoch": 0.33674349541484194, "grad_norm": 4.043328285217285, "learning_rate": 4.3595842076442394e-06, "loss": 0.4537, "mean_token_accuracy": 0.8548864290118218, "num_tokens": 130583434.0, "step": 108630 }, { "entropy": 1.851549918949604, "epoch": 0.3367744945398916, "grad_norm": 8.612878799438477, "learning_rate": 4.359383557576814e-06, "loss": 0.4301, "mean_token_accuracy": 0.8524287804961205, "num_tokens": 130595397.0, "step": 108640 }, { "entropy": 1.90091462880373, "epoch": 0.3368054936649413, "grad_norm": 9.851509094238281, "learning_rate": 4.359182935211626e-06, "loss": 0.534, "mean_token_accuracy": 0.8377526298165321, "num_tokens": 130606655.0, "step": 108650 }, { "entropy": 1.8972817674279212, "epoch": 0.336836492789991, "grad_norm": 8.680237770080566, "learning_rate": 4.3589823405423e-06, "loss": 0.4671, "mean_token_accuracy": 0.8444389298558235, "num_tokens": 130617788.0, "step": 108660 }, { "entropy": 1.8298688307404518, "epoch": 0.3368674919150407, "grad_norm": 9.806267738342285, "learning_rate": 4.358781773562466e-06, "loss": 0.4908, "mean_token_accuracy": 0.8443355247378349, "num_tokens": 130629777.0, "step": 108670 }, { "entropy": 1.8319950178265572, "epoch": 0.3368984910400904, "grad_norm": 7.356091499328613, "learning_rate": 4.358581234265752e-06, "loss": 0.424, "mean_token_accuracy": 0.8494518354535103, "num_tokens": 130642431.0, "step": 108680 }, { "entropy": 1.9162609457969666, "epoch": 0.33692949016514007, "grad_norm": 4.026700973510742, "learning_rate": 4.3583807226457904e-06, "loss": 0.5023, "mean_token_accuracy": 0.8415546149015427, "num_tokens": 130653469.0, "step": 108690 }, { "entropy": 1.846106144785881, "epoch": 0.3369604892901898, "grad_norm": 8.164050102233887, "learning_rate": 4.358180238696217e-06, "loss": 0.5068, "mean_token_accuracy": 0.8469562456011772, "num_tokens": 130665500.0, "step": 108700 }, { "entropy": 1.8313665881752967, "epoch": 0.33699148841523946, "grad_norm": 9.194726943969727, "learning_rate": 4.357979782410669e-06, "loss": 0.4965, "mean_token_accuracy": 0.8402792170643807, "num_tokens": 130677813.0, "step": 108710 }, { "entropy": 1.9255888879299163, "epoch": 0.3370224875402892, "grad_norm": 10.219868659973145, "learning_rate": 4.357779353782783e-06, "loss": 0.5116, "mean_token_accuracy": 0.8486444607377053, "num_tokens": 130689073.0, "step": 108720 }, { "entropy": 1.87244506329298, "epoch": 0.33705348666533885, "grad_norm": 8.656083106994629, "learning_rate": 4.3575789528062e-06, "loss": 0.4825, "mean_token_accuracy": 0.8389718398451805, "num_tokens": 130701943.0, "step": 108730 }, { "entropy": 1.8355872690677644, "epoch": 0.3370844857903886, "grad_norm": 7.93245792388916, "learning_rate": 4.357378579474562e-06, "loss": 0.4352, "mean_token_accuracy": 0.8615069106221199, "num_tokens": 130714006.0, "step": 108740 }, { "entropy": 1.8478736653923988, "epoch": 0.33711548491543825, "grad_norm": 9.331077575683594, "learning_rate": 4.357178233781516e-06, "loss": 0.474, "mean_token_accuracy": 0.8405976980924607, "num_tokens": 130726086.0, "step": 108750 }, { "entropy": 1.8251071408391, "epoch": 0.33714648404048797, "grad_norm": 9.389089584350586, "learning_rate": 4.356977915720707e-06, "loss": 0.4713, "mean_token_accuracy": 0.8374730452895165, "num_tokens": 130738640.0, "step": 108760 }, { "entropy": 1.8872618451714516, "epoch": 0.33717748316553764, "grad_norm": 6.961200714111328, "learning_rate": 4.356777625285783e-06, "loss": 0.5043, "mean_token_accuracy": 0.8394622817635536, "num_tokens": 130750629.0, "step": 108770 }, { "entropy": 1.8409572780132293, "epoch": 0.33720848229058736, "grad_norm": 4.427051067352295, "learning_rate": 4.356577362470397e-06, "loss": 0.4558, "mean_token_accuracy": 0.8474222555756569, "num_tokens": 130763704.0, "step": 108780 }, { "entropy": 1.8041258588433267, "epoch": 0.33723948141563703, "grad_norm": 2.4223392009735107, "learning_rate": 4.3563771272681995e-06, "loss": 0.494, "mean_token_accuracy": 0.8473551660776139, "num_tokens": 130776752.0, "step": 108790 }, { "entropy": 1.8579876363277434, "epoch": 0.33727048054068676, "grad_norm": 4.547750949859619, "learning_rate": 4.356176919672846e-06, "loss": 0.4858, "mean_token_accuracy": 0.8426438301801682, "num_tokens": 130789486.0, "step": 108800 }, { "entropy": 1.756131686270237, "epoch": 0.3373014796657364, "grad_norm": 4.558795928955078, "learning_rate": 4.355976739677995e-06, "loss": 0.3766, "mean_token_accuracy": 0.8526749208569526, "num_tokens": 130803159.0, "step": 108810 }, { "entropy": 1.736269749701023, "epoch": 0.33733247879078615, "grad_norm": 3.4384872913360596, "learning_rate": 4.355776587277302e-06, "loss": 0.4221, "mean_token_accuracy": 0.8524326965212822, "num_tokens": 130817219.0, "step": 108820 }, { "entropy": 1.7983601108193397, "epoch": 0.3373634779158358, "grad_norm": 7.676399230957031, "learning_rate": 4.355576462464431e-06, "loss": 0.4438, "mean_token_accuracy": 0.8553432121872901, "num_tokens": 130829997.0, "step": 108830 }, { "entropy": 1.849858644604683, "epoch": 0.33739447704088554, "grad_norm": 6.735903739929199, "learning_rate": 4.3553763652330446e-06, "loss": 0.4716, "mean_token_accuracy": 0.8419752418994904, "num_tokens": 130842661.0, "step": 108840 }, { "entropy": 1.9195626258850098, "epoch": 0.3374254761659352, "grad_norm": 8.325475692749023, "learning_rate": 4.355176295576807e-06, "loss": 0.5276, "mean_token_accuracy": 0.8380102962255478, "num_tokens": 130854094.0, "step": 108850 }, { "entropy": 1.6827729061245917, "epoch": 0.33745647529098494, "grad_norm": 3.7391862869262695, "learning_rate": 4.354976253489386e-06, "loss": 0.345, "mean_token_accuracy": 0.8674192950129509, "num_tokens": 130868296.0, "step": 108860 }, { "entropy": 1.8783330723643303, "epoch": 0.3374874744160346, "grad_norm": 7.508564472198486, "learning_rate": 4.35477623896445e-06, "loss": 0.4763, "mean_token_accuracy": 0.8525745928287506, "num_tokens": 130879523.0, "step": 108870 }, { "entropy": 1.831697914004326, "epoch": 0.33751847354108433, "grad_norm": 7.616162300109863, "learning_rate": 4.35457625199567e-06, "loss": 0.4391, "mean_token_accuracy": 0.8564835578203202, "num_tokens": 130891707.0, "step": 108880 }, { "entropy": 1.780026839673519, "epoch": 0.337549472666134, "grad_norm": 6.331145763397217, "learning_rate": 4.354376292576721e-06, "loss": 0.3985, "mean_token_accuracy": 0.8688737958669662, "num_tokens": 130904551.0, "step": 108890 }, { "entropy": 1.8686219871044158, "epoch": 0.33758047179118367, "grad_norm": 8.733838081359863, "learning_rate": 4.354176360701276e-06, "loss": 0.4802, "mean_token_accuracy": 0.8480039656162262, "num_tokens": 130916170.0, "step": 108900 }, { "entropy": 1.8567358165979386, "epoch": 0.3376114709162334, "grad_norm": 9.616422653198242, "learning_rate": 4.353976456363014e-06, "loss": 0.4977, "mean_token_accuracy": 0.83982073366642, "num_tokens": 130927254.0, "step": 108910 }, { "entropy": 1.7685384809970857, "epoch": 0.33764247004128306, "grad_norm": 9.59286117553711, "learning_rate": 4.353776579555613e-06, "loss": 0.3828, "mean_token_accuracy": 0.8706537902355194, "num_tokens": 130939683.0, "step": 108920 }, { "entropy": 1.864783415198326, "epoch": 0.3376734691663328, "grad_norm": 8.993294715881348, "learning_rate": 4.353576730272754e-06, "loss": 0.4998, "mean_token_accuracy": 0.847262118756771, "num_tokens": 130951156.0, "step": 108930 }, { "entropy": 1.850709395110607, "epoch": 0.33770446829138245, "grad_norm": 8.352401733398438, "learning_rate": 4.3533769085081226e-06, "loss": 0.5349, "mean_token_accuracy": 0.8358264163136482, "num_tokens": 130962743.0, "step": 108940 }, { "entropy": 1.8151063159108163, "epoch": 0.3377354674164322, "grad_norm": 7.70052433013916, "learning_rate": 4.353177114255402e-06, "loss": 0.4231, "mean_token_accuracy": 0.8555310294032097, "num_tokens": 130974981.0, "step": 108950 }, { "entropy": 1.813981081545353, "epoch": 0.33776646654148185, "grad_norm": 7.85250997543335, "learning_rate": 4.352977347508281e-06, "loss": 0.408, "mean_token_accuracy": 0.8673159837722778, "num_tokens": 130985978.0, "step": 108960 }, { "entropy": 1.8001227349042892, "epoch": 0.33779746566653157, "grad_norm": 7.5097270011901855, "learning_rate": 4.352777608260448e-06, "loss": 0.4514, "mean_token_accuracy": 0.8529728427529335, "num_tokens": 130998505.0, "step": 108970 }, { "entropy": 1.812097106873989, "epoch": 0.33782846479158124, "grad_norm": 8.388304710388184, "learning_rate": 4.352577896505595e-06, "loss": 0.45, "mean_token_accuracy": 0.8446347638964653, "num_tokens": 131010534.0, "step": 108980 }, { "entropy": 1.8360211491584777, "epoch": 0.33785946391663096, "grad_norm": 9.120504379272461, "learning_rate": 4.352378212237415e-06, "loss": 0.4926, "mean_token_accuracy": 0.8418739259243011, "num_tokens": 131022899.0, "step": 108990 }, { "entropy": 1.9238962471485137, "epoch": 0.33789046304168063, "grad_norm": 8.026063919067383, "learning_rate": 4.352178555449604e-06, "loss": 0.5547, "mean_token_accuracy": 0.8331388279795646, "num_tokens": 131034537.0, "step": 109000 }, { "entropy": 1.8263150677084923, "epoch": 0.33792146216673036, "grad_norm": 8.960397720336914, "learning_rate": 4.351978926135859e-06, "loss": 0.4264, "mean_token_accuracy": 0.8622379198670387, "num_tokens": 131046023.0, "step": 109010 }, { "entropy": 1.8406116291880608, "epoch": 0.33795246129178, "grad_norm": 8.759995460510254, "learning_rate": 4.351779324289881e-06, "loss": 0.4578, "mean_token_accuracy": 0.8474781602621079, "num_tokens": 131057713.0, "step": 109020 }, { "entropy": 1.8527844935655593, "epoch": 0.33798346041682975, "grad_norm": 8.07425594329834, "learning_rate": 4.35157974990537e-06, "loss": 0.4921, "mean_token_accuracy": 0.8427453622221946, "num_tokens": 131069288.0, "step": 109030 }, { "entropy": 1.8225320369005202, "epoch": 0.3380144595418794, "grad_norm": 8.338834762573242, "learning_rate": 4.35138020297603e-06, "loss": 0.4286, "mean_token_accuracy": 0.8498498395085334, "num_tokens": 131082196.0, "step": 109040 }, { "entropy": 1.7835667803883553, "epoch": 0.33804545866692914, "grad_norm": 4.169846534729004, "learning_rate": 4.351180683495567e-06, "loss": 0.4429, "mean_token_accuracy": 0.8465461641550064, "num_tokens": 131094762.0, "step": 109050 }, { "entropy": 1.8385676950216294, "epoch": 0.3380764577919788, "grad_norm": 7.802093505859375, "learning_rate": 4.350981191457688e-06, "loss": 0.5106, "mean_token_accuracy": 0.8392044931650162, "num_tokens": 131107519.0, "step": 109060 }, { "entropy": 1.9263346076011658, "epoch": 0.33810745691702854, "grad_norm": 8.671483039855957, "learning_rate": 4.350781726856105e-06, "loss": 0.5573, "mean_token_accuracy": 0.8391161412000656, "num_tokens": 131118004.0, "step": 109070 }, { "entropy": 1.7481406077742576, "epoch": 0.3381384560420782, "grad_norm": 4.902276039123535, "learning_rate": 4.3505822896845245e-06, "loss": 0.4375, "mean_token_accuracy": 0.8569299519062042, "num_tokens": 131132148.0, "step": 109080 }, { "entropy": 1.8396126255393028, "epoch": 0.33816945516712793, "grad_norm": 9.361617088317871, "learning_rate": 4.350382879936665e-06, "loss": 0.4339, "mean_token_accuracy": 0.8506528735160828, "num_tokens": 131143760.0, "step": 109090 }, { "entropy": 1.7284615464508533, "epoch": 0.3382004542921776, "grad_norm": 3.8647422790527344, "learning_rate": 4.350183497606242e-06, "loss": 0.3681, "mean_token_accuracy": 0.8593399003148079, "num_tokens": 131158374.0, "step": 109100 }, { "entropy": 1.84774319678545, "epoch": 0.3382314534172273, "grad_norm": 7.73523473739624, "learning_rate": 4.34998414268697e-06, "loss": 0.4693, "mean_token_accuracy": 0.8502051830291748, "num_tokens": 131170450.0, "step": 109110 }, { "entropy": 1.8346653819084167, "epoch": 0.338262452542277, "grad_norm": 9.322132110595703, "learning_rate": 4.349784815172573e-06, "loss": 0.4714, "mean_token_accuracy": 0.8492701262235641, "num_tokens": 131182861.0, "step": 109120 }, { "entropy": 1.899721224606037, "epoch": 0.3382934516673267, "grad_norm": 10.050145149230957, "learning_rate": 4.349585515056768e-06, "loss": 0.5082, "mean_token_accuracy": 0.8410122975707054, "num_tokens": 131194694.0, "step": 109130 }, { "entropy": 1.8425235763192176, "epoch": 0.3383244507923764, "grad_norm": 12.264300346374512, "learning_rate": 4.349386242333283e-06, "loss": 0.4549, "mean_token_accuracy": 0.8546083837747573, "num_tokens": 131205712.0, "step": 109140 }, { "entropy": 1.896941477060318, "epoch": 0.33835544991742605, "grad_norm": 8.371997833251953, "learning_rate": 4.349186996995841e-06, "loss": 0.5291, "mean_token_accuracy": 0.8351884350180626, "num_tokens": 131217095.0, "step": 109150 }, { "entropy": 1.863653865456581, "epoch": 0.3383864490424758, "grad_norm": 8.408138275146484, "learning_rate": 4.3489877790381716e-06, "loss": 0.4931, "mean_token_accuracy": 0.8465486764907837, "num_tokens": 131228887.0, "step": 109160 }, { "entropy": 1.7314985610544682, "epoch": 0.33841744816752545, "grad_norm": 3.9695277214050293, "learning_rate": 4.348788588454003e-06, "loss": 0.3653, "mean_token_accuracy": 0.8568756997585296, "num_tokens": 131242417.0, "step": 109170 }, { "entropy": 1.7864164792001247, "epoch": 0.33844844729257517, "grad_norm": 3.7821221351623535, "learning_rate": 4.348589425237069e-06, "loss": 0.4225, "mean_token_accuracy": 0.8489362448453903, "num_tokens": 131255401.0, "step": 109180 }, { "entropy": 1.8366502463817596, "epoch": 0.33847944641762484, "grad_norm": 7.946313381195068, "learning_rate": 4.348390289381101e-06, "loss": 0.4708, "mean_token_accuracy": 0.8512516096234322, "num_tokens": 131267388.0, "step": 109190 }, { "entropy": 1.800801232457161, "epoch": 0.33851044554267457, "grad_norm": 8.96656322479248, "learning_rate": 4.348191180879837e-06, "loss": 0.4545, "mean_token_accuracy": 0.8497388571500778, "num_tokens": 131280203.0, "step": 109200 }, { "entropy": 1.8128297343850135, "epoch": 0.33854144466772423, "grad_norm": 7.974830150604248, "learning_rate": 4.347992099727013e-06, "loss": 0.4203, "mean_token_accuracy": 0.8562871024012566, "num_tokens": 131292537.0, "step": 109210 }, { "entropy": 1.8276149809360505, "epoch": 0.33857244379277396, "grad_norm": 7.294045448303223, "learning_rate": 4.347793045916371e-06, "loss": 0.4722, "mean_token_accuracy": 0.8447601184248924, "num_tokens": 131304251.0, "step": 109220 }, { "entropy": 1.9087721765041352, "epoch": 0.3386034429178236, "grad_norm": 9.217912673950195, "learning_rate": 4.3475940194416516e-06, "loss": 0.4886, "mean_token_accuracy": 0.8431843280792236, "num_tokens": 131315640.0, "step": 109230 }, { "entropy": 1.8142705112695694, "epoch": 0.33863444204287335, "grad_norm": 8.342325210571289, "learning_rate": 4.347395020296598e-06, "loss": 0.4458, "mean_token_accuracy": 0.84480060338974, "num_tokens": 131328126.0, "step": 109240 }, { "entropy": 1.8766521513462067, "epoch": 0.338665441167923, "grad_norm": 7.5658745765686035, "learning_rate": 4.347196048474958e-06, "loss": 0.4759, "mean_token_accuracy": 0.8464653968811036, "num_tokens": 131339951.0, "step": 109250 }, { "entropy": 1.7888486787676812, "epoch": 0.33869644029297274, "grad_norm": 8.956585884094238, "learning_rate": 4.346997103970477e-06, "loss": 0.4082, "mean_token_accuracy": 0.8637697413563729, "num_tokens": 131352826.0, "step": 109260 }, { "entropy": 1.9030503317713738, "epoch": 0.3387274394180224, "grad_norm": 9.701215744018555, "learning_rate": 4.3467981867769075e-06, "loss": 0.5059, "mean_token_accuracy": 0.8389164701104164, "num_tokens": 131364743.0, "step": 109270 }, { "entropy": 1.904677079617977, "epoch": 0.33875843854307214, "grad_norm": 7.389717102050781, "learning_rate": 4.346599296888e-06, "loss": 0.472, "mean_token_accuracy": 0.8455563247203827, "num_tokens": 131376346.0, "step": 109280 }, { "entropy": 1.853348232060671, "epoch": 0.3387894376681218, "grad_norm": 8.647706031799316, "learning_rate": 4.346400434297507e-06, "loss": 0.4289, "mean_token_accuracy": 0.8570475921034812, "num_tokens": 131388648.0, "step": 109290 }, { "entropy": 1.854999852180481, "epoch": 0.33882043679317153, "grad_norm": 5.456407070159912, "learning_rate": 4.346201598999186e-06, "loss": 0.4675, "mean_token_accuracy": 0.845246997475624, "num_tokens": 131401406.0, "step": 109300 }, { "entropy": 1.9005474954843522, "epoch": 0.3388514359182212, "grad_norm": 7.554257869720459, "learning_rate": 4.346002790986796e-06, "loss": 0.5159, "mean_token_accuracy": 0.8443148121237755, "num_tokens": 131412576.0, "step": 109310 }, { "entropy": 1.957038275897503, "epoch": 0.3388824350432709, "grad_norm": 8.818881034851074, "learning_rate": 4.3458040102540945e-06, "loss": 0.5046, "mean_token_accuracy": 0.8357266262173653, "num_tokens": 131424034.0, "step": 109320 }, { "entropy": 1.7917650774121285, "epoch": 0.3389134341683206, "grad_norm": 7.657764911651611, "learning_rate": 4.345605256794846e-06, "loss": 0.438, "mean_token_accuracy": 0.8518206834793091, "num_tokens": 131437119.0, "step": 109330 }, { "entropy": 1.9100136697292327, "epoch": 0.3389444332933703, "grad_norm": 7.320883274078369, "learning_rate": 4.345406530602812e-06, "loss": 0.4628, "mean_token_accuracy": 0.8504310637712479, "num_tokens": 131448308.0, "step": 109340 }, { "entropy": 1.8857989877462387, "epoch": 0.33897543241842, "grad_norm": 8.363656044006348, "learning_rate": 4.3452078316717594e-06, "loss": 0.5471, "mean_token_accuracy": 0.8354710638523102, "num_tokens": 131460352.0, "step": 109350 }, { "entropy": 1.829300546646118, "epoch": 0.3390064315434697, "grad_norm": 3.853241443634033, "learning_rate": 4.345009159995455e-06, "loss": 0.5, "mean_token_accuracy": 0.8451276645064354, "num_tokens": 131472410.0, "step": 109360 }, { "entropy": 1.9137260258197784, "epoch": 0.3390374306685194, "grad_norm": 8.751523971557617, "learning_rate": 4.344810515567671e-06, "loss": 0.4801, "mean_token_accuracy": 0.8443731889128685, "num_tokens": 131484493.0, "step": 109370 }, { "entropy": 1.839902514219284, "epoch": 0.3390684297935691, "grad_norm": 8.783478736877441, "learning_rate": 4.344611898382176e-06, "loss": 0.4881, "mean_token_accuracy": 0.8553899496793747, "num_tokens": 131496822.0, "step": 109380 }, { "entropy": 1.896230974793434, "epoch": 0.3390994289186188, "grad_norm": 8.84762954711914, "learning_rate": 4.3444133084327464e-06, "loss": 0.4819, "mean_token_accuracy": 0.8462594136595726, "num_tokens": 131508185.0, "step": 109390 }, { "entropy": 1.889091356098652, "epoch": 0.33913042804366844, "grad_norm": 9.512887954711914, "learning_rate": 4.344214745713158e-06, "loss": 0.505, "mean_token_accuracy": 0.8467561930418015, "num_tokens": 131519594.0, "step": 109400 }, { "entropy": 1.9061472669243813, "epoch": 0.33916142716871817, "grad_norm": 9.376303672790527, "learning_rate": 4.344016210217188e-06, "loss": 0.5523, "mean_token_accuracy": 0.8321489855647087, "num_tokens": 131531669.0, "step": 109410 }, { "entropy": 1.8296129301190376, "epoch": 0.33919242629376783, "grad_norm": 3.8118650913238525, "learning_rate": 4.343817701938615e-06, "loss": 0.4197, "mean_token_accuracy": 0.8557925269007682, "num_tokens": 131544918.0, "step": 109420 }, { "entropy": 1.8758042737841607, "epoch": 0.33922342541881756, "grad_norm": 8.741567611694336, "learning_rate": 4.3436192208712234e-06, "loss": 0.4798, "mean_token_accuracy": 0.849548727273941, "num_tokens": 131557550.0, "step": 109430 }, { "entropy": 1.8672025337815286, "epoch": 0.33925442454386723, "grad_norm": 7.4333624839782715, "learning_rate": 4.343420767008793e-06, "loss": 0.489, "mean_token_accuracy": 0.8443247124552726, "num_tokens": 131569386.0, "step": 109440 }, { "entropy": 1.9236103370785713, "epoch": 0.33928542366891695, "grad_norm": 9.117382049560547, "learning_rate": 4.343222340345114e-06, "loss": 0.5212, "mean_token_accuracy": 0.8418242424726486, "num_tokens": 131580901.0, "step": 109450 }, { "entropy": 1.7963170796632766, "epoch": 0.3393164227939666, "grad_norm": 7.715478897094727, "learning_rate": 4.343023940873973e-06, "loss": 0.3927, "mean_token_accuracy": 0.8599506393074989, "num_tokens": 131594068.0, "step": 109460 }, { "entropy": 1.7216453760862351, "epoch": 0.33934742191901635, "grad_norm": 7.901419162750244, "learning_rate": 4.342825568589158e-06, "loss": 0.3747, "mean_token_accuracy": 0.8583090871572494, "num_tokens": 131608444.0, "step": 109470 }, { "entropy": 1.9911436915397644, "epoch": 0.339378421044066, "grad_norm": 8.45919132232666, "learning_rate": 4.342627223484461e-06, "loss": 0.5489, "mean_token_accuracy": 0.8311255067586899, "num_tokens": 131619482.0, "step": 109480 }, { "entropy": 1.8852492436766624, "epoch": 0.33940942016911574, "grad_norm": 7.630876064300537, "learning_rate": 4.342428905553678e-06, "loss": 0.442, "mean_token_accuracy": 0.8610611036419868, "num_tokens": 131632114.0, "step": 109490 }, { "entropy": 1.7850932255387306, "epoch": 0.3394404192941654, "grad_norm": 7.689153671264648, "learning_rate": 4.342230614790603e-06, "loss": 0.3813, "mean_token_accuracy": 0.8630185857415199, "num_tokens": 131645342.0, "step": 109500 }, { "entropy": 1.8250619187951087, "epoch": 0.33947141841921513, "grad_norm": 9.863249778747559, "learning_rate": 4.342032351189033e-06, "loss": 0.4457, "mean_token_accuracy": 0.8580538392066955, "num_tokens": 131657561.0, "step": 109510 }, { "entropy": 1.8692012056708336, "epoch": 0.3395024175442648, "grad_norm": 7.83109712600708, "learning_rate": 4.341834114742769e-06, "loss": 0.503, "mean_token_accuracy": 0.8402553409337997, "num_tokens": 131669689.0, "step": 109520 }, { "entropy": 1.8381701841950417, "epoch": 0.3395334166693145, "grad_norm": 9.981649398803711, "learning_rate": 4.341635905445612e-06, "loss": 0.4459, "mean_token_accuracy": 0.8586729139089584, "num_tokens": 131681608.0, "step": 109530 }, { "entropy": 1.879381312429905, "epoch": 0.3395644157943642, "grad_norm": 3.283308506011963, "learning_rate": 4.341437723291367e-06, "loss": 0.4761, "mean_token_accuracy": 0.846294941008091, "num_tokens": 131693388.0, "step": 109540 }, { "entropy": 1.812347574532032, "epoch": 0.3395954149194139, "grad_norm": 8.128570556640625, "learning_rate": 4.341239568273838e-06, "loss": 0.4339, "mean_token_accuracy": 0.8501547113060951, "num_tokens": 131706396.0, "step": 109550 }, { "entropy": 1.8860585197806359, "epoch": 0.3396264140444636, "grad_norm": 8.69900894165039, "learning_rate": 4.341041440386833e-06, "loss": 0.4872, "mean_token_accuracy": 0.844724814593792, "num_tokens": 131718236.0, "step": 109560 }, { "entropy": 1.7079974353313445, "epoch": 0.3396574131695133, "grad_norm": 2.750688314437866, "learning_rate": 4.34084333962416e-06, "loss": 0.3525, "mean_token_accuracy": 0.8660884618759155, "num_tokens": 131732740.0, "step": 109570 }, { "entropy": 1.9225364074110984, "epoch": 0.339688412294563, "grad_norm": 8.820713996887207, "learning_rate": 4.340645265979634e-06, "loss": 0.5022, "mean_token_accuracy": 0.8516004055738449, "num_tokens": 131743968.0, "step": 109580 }, { "entropy": 1.9468763768672943, "epoch": 0.3397194114196127, "grad_norm": 9.133736610412598, "learning_rate": 4.340447219447068e-06, "loss": 0.5167, "mean_token_accuracy": 0.8412905469536781, "num_tokens": 131754741.0, "step": 109590 }, { "entropy": 1.99237479865551, "epoch": 0.3397504105446624, "grad_norm": 8.745469093322754, "learning_rate": 4.340249200020274e-06, "loss": 0.5682, "mean_token_accuracy": 0.8262647077441215, "num_tokens": 131766026.0, "step": 109600 }, { "entropy": 1.9354695051908493, "epoch": 0.3397814096697121, "grad_norm": 7.548391819000244, "learning_rate": 4.340051207693073e-06, "loss": 0.4983, "mean_token_accuracy": 0.8456323340535163, "num_tokens": 131776690.0, "step": 109610 }, { "entropy": 1.8958056300878525, "epoch": 0.33981240879476177, "grad_norm": 4.0965256690979, "learning_rate": 4.339853242459283e-06, "loss": 0.5162, "mean_token_accuracy": 0.8384672284126282, "num_tokens": 131788190.0, "step": 109620 }, { "entropy": 1.8747544452548026, "epoch": 0.3398434079198115, "grad_norm": 9.14088249206543, "learning_rate": 4.339655304312725e-06, "loss": 0.4678, "mean_token_accuracy": 0.8471368581056595, "num_tokens": 131799932.0, "step": 109630 }, { "entropy": 1.9145475149154663, "epoch": 0.33987440704486116, "grad_norm": 8.718486785888672, "learning_rate": 4.339457393247224e-06, "loss": 0.4874, "mean_token_accuracy": 0.8452249899506569, "num_tokens": 131810626.0, "step": 109640 }, { "entropy": 1.9000482648611068, "epoch": 0.33990540616991083, "grad_norm": 5.583610534667969, "learning_rate": 4.339259509256604e-06, "loss": 0.5213, "mean_token_accuracy": 0.8375597059726715, "num_tokens": 131821937.0, "step": 109650 }, { "entropy": 1.7997367039322854, "epoch": 0.33993640529496055, "grad_norm": 7.190251350402832, "learning_rate": 4.339061652334693e-06, "loss": 0.412, "mean_token_accuracy": 0.8610153466463089, "num_tokens": 131834731.0, "step": 109660 }, { "entropy": 1.8661419078707695, "epoch": 0.3399674044200102, "grad_norm": 7.567733287811279, "learning_rate": 4.33886382247532e-06, "loss": 0.4701, "mean_token_accuracy": 0.8479538485407829, "num_tokens": 131846172.0, "step": 109670 }, { "entropy": 1.8096156984567642, "epoch": 0.33999840354505995, "grad_norm": 8.170653343200684, "learning_rate": 4.338666019672315e-06, "loss": 0.3938, "mean_token_accuracy": 0.8594262331724167, "num_tokens": 131859132.0, "step": 109680 }, { "entropy": 1.8735254988074304, "epoch": 0.3400294026701096, "grad_norm": 4.316888809204102, "learning_rate": 4.3384682439195146e-06, "loss": 0.5105, "mean_token_accuracy": 0.8408343940973282, "num_tokens": 131870647.0, "step": 109690 }, { "entropy": 1.8006722897291183, "epoch": 0.34006040179515934, "grad_norm": 3.814378499984741, "learning_rate": 4.338270495210751e-06, "loss": 0.4023, "mean_token_accuracy": 0.8559133544564247, "num_tokens": 131882827.0, "step": 109700 }, { "entropy": 1.8134817466139794, "epoch": 0.340091400920209, "grad_norm": 4.081186771392822, "learning_rate": 4.338072773539862e-06, "loss": 0.4345, "mean_token_accuracy": 0.8579586237668991, "num_tokens": 131894503.0, "step": 109710 }, { "entropy": 1.8244572654366493, "epoch": 0.34012240004525873, "grad_norm": 3.6130762100219727, "learning_rate": 4.337875078900688e-06, "loss": 0.4724, "mean_token_accuracy": 0.8462868794798851, "num_tokens": 131907275.0, "step": 109720 }, { "entropy": 1.911041909456253, "epoch": 0.3401533991703084, "grad_norm": 10.027078628540039, "learning_rate": 4.337677411287069e-06, "loss": 0.4936, "mean_token_accuracy": 0.855143415927887, "num_tokens": 131918604.0, "step": 109730 }, { "entropy": 1.7550349622964858, "epoch": 0.3401843982953581, "grad_norm": 4.361454010009766, "learning_rate": 4.337479770692849e-06, "loss": 0.4275, "mean_token_accuracy": 0.8564995616674423, "num_tokens": 131931986.0, "step": 109740 }, { "entropy": 1.8325242474675179, "epoch": 0.3402153974204078, "grad_norm": 7.0168890953063965, "learning_rate": 4.337282157111871e-06, "loss": 0.4177, "mean_token_accuracy": 0.856087788939476, "num_tokens": 131944417.0, "step": 109750 }, { "entropy": 1.8343531802296638, "epoch": 0.3402463965454575, "grad_norm": 8.980559349060059, "learning_rate": 4.337084570537985e-06, "loss": 0.4608, "mean_token_accuracy": 0.8543761387467385, "num_tokens": 131956829.0, "step": 109760 }, { "entropy": 1.9214718401432038, "epoch": 0.3402773956705072, "grad_norm": 9.840754508972168, "learning_rate": 4.336887010965037e-06, "loss": 0.5471, "mean_token_accuracy": 0.8250068485736847, "num_tokens": 131967623.0, "step": 109770 }, { "entropy": 1.7958107739686966, "epoch": 0.3403083947955569, "grad_norm": 6.940483570098877, "learning_rate": 4.336689478386879e-06, "loss": 0.3996, "mean_token_accuracy": 0.8612048402428627, "num_tokens": 131980237.0, "step": 109780 }, { "entropy": 1.8743787840008737, "epoch": 0.3403393939206066, "grad_norm": 8.042901039123535, "learning_rate": 4.3364919727973655e-06, "loss": 0.4931, "mean_token_accuracy": 0.8504285082221031, "num_tokens": 131991550.0, "step": 109790 }, { "entropy": 1.86217290610075, "epoch": 0.3403703930456563, "grad_norm": 8.980916023254395, "learning_rate": 4.33629449419035e-06, "loss": 0.4735, "mean_token_accuracy": 0.8460476860404015, "num_tokens": 132003560.0, "step": 109800 }, { "entropy": 1.7462153866887093, "epoch": 0.340401392170706, "grad_norm": 7.179854393005371, "learning_rate": 4.336097042559689e-06, "loss": 0.3725, "mean_token_accuracy": 0.8604865953326225, "num_tokens": 132016859.0, "step": 109810 }, { "entropy": 1.8352238804101944, "epoch": 0.3404323912957557, "grad_norm": 8.293797492980957, "learning_rate": 4.3358996178992416e-06, "loss": 0.4728, "mean_token_accuracy": 0.8555570766329765, "num_tokens": 132028207.0, "step": 109820 }, { "entropy": 1.7612206950783729, "epoch": 0.34046339042080537, "grad_norm": 8.661210060119629, "learning_rate": 4.3357022202028685e-06, "loss": 0.452, "mean_token_accuracy": 0.8550508677959442, "num_tokens": 132041782.0, "step": 109830 }, { "entropy": 1.8841624334454536, "epoch": 0.3404943895458551, "grad_norm": 8.303897857666016, "learning_rate": 4.335504849464432e-06, "loss": 0.543, "mean_token_accuracy": 0.8422963932156563, "num_tokens": 132053535.0, "step": 109840 }, { "entropy": 1.7789085522294044, "epoch": 0.34052538867090476, "grad_norm": 3.7404472827911377, "learning_rate": 4.335307505677798e-06, "loss": 0.407, "mean_token_accuracy": 0.8541888251900673, "num_tokens": 132066748.0, "step": 109850 }, { "entropy": 1.8748383790254592, "epoch": 0.3405563877959545, "grad_norm": 7.891180515289307, "learning_rate": 4.335110188836832e-06, "loss": 0.4698, "mean_token_accuracy": 0.8546398028731346, "num_tokens": 132078371.0, "step": 109860 }, { "entropy": 1.8530076041817665, "epoch": 0.34058738692100415, "grad_norm": 8.61839771270752, "learning_rate": 4.334912898935402e-06, "loss": 0.4438, "mean_token_accuracy": 0.845193374156952, "num_tokens": 132090696.0, "step": 109870 }, { "entropy": 1.9376697182655334, "epoch": 0.3406183860460538, "grad_norm": 10.494202613830566, "learning_rate": 4.334715635967379e-06, "loss": 0.5363, "mean_token_accuracy": 0.8341496884822845, "num_tokens": 132101490.0, "step": 109880 }, { "entropy": 1.8309767156839372, "epoch": 0.34064938517110355, "grad_norm": 8.481719970703125, "learning_rate": 4.334518399926636e-06, "loss": 0.4131, "mean_token_accuracy": 0.85340007096529, "num_tokens": 132114554.0, "step": 109890 }, { "entropy": 1.8747343346476555, "epoch": 0.3406803842961532, "grad_norm": 9.323307037353516, "learning_rate": 4.334321190807049e-06, "loss": 0.5063, "mean_token_accuracy": 0.8413134127855301, "num_tokens": 132126561.0, "step": 109900 }, { "entropy": 1.8665272369980812, "epoch": 0.34071138342120294, "grad_norm": 9.879739761352539, "learning_rate": 4.334124008602491e-06, "loss": 0.5596, "mean_token_accuracy": 0.8321680203080177, "num_tokens": 132137909.0, "step": 109910 }, { "entropy": 1.872728843986988, "epoch": 0.3407423825462526, "grad_norm": 9.696663856506348, "learning_rate": 4.333926853306841e-06, "loss": 0.4574, "mean_token_accuracy": 0.8571437358856201, "num_tokens": 132149808.0, "step": 109920 }, { "entropy": 1.9334780007600785, "epoch": 0.34077338167130233, "grad_norm": 8.289495468139648, "learning_rate": 4.333729724913981e-06, "loss": 0.5049, "mean_token_accuracy": 0.837024663388729, "num_tokens": 132160716.0, "step": 109930 }, { "entropy": 1.879612100124359, "epoch": 0.340804380796352, "grad_norm": 8.466344833374023, "learning_rate": 4.333532623417792e-06, "loss": 0.4873, "mean_token_accuracy": 0.8417725265026093, "num_tokens": 132172002.0, "step": 109940 }, { "entropy": 1.8401907190680504, "epoch": 0.3408353799214017, "grad_norm": 8.750163078308105, "learning_rate": 4.333335548812158e-06, "loss": 0.447, "mean_token_accuracy": 0.8564073830842972, "num_tokens": 132183522.0, "step": 109950 }, { "entropy": 1.7649420738220214, "epoch": 0.3408663790464514, "grad_norm": 7.617003917694092, "learning_rate": 4.333138501090967e-06, "loss": 0.4166, "mean_token_accuracy": 0.8601633608341217, "num_tokens": 132197109.0, "step": 109960 }, { "entropy": 1.823840895295143, "epoch": 0.3408973781715011, "grad_norm": 4.131731986999512, "learning_rate": 4.332941480248105e-06, "loss": 0.4298, "mean_token_accuracy": 0.8559944689273834, "num_tokens": 132209939.0, "step": 109970 }, { "entropy": 1.8967970684170723, "epoch": 0.3409283772965508, "grad_norm": 8.258302688598633, "learning_rate": 4.332744486277461e-06, "loss": 0.4842, "mean_token_accuracy": 0.8519051864743232, "num_tokens": 132220802.0, "step": 109980 }, { "entropy": 1.8975848436355591, "epoch": 0.3409593764216005, "grad_norm": 9.49807071685791, "learning_rate": 4.332547519172929e-06, "loss": 0.469, "mean_token_accuracy": 0.8441584467887878, "num_tokens": 132232580.0, "step": 109990 }, { "entropy": 1.7971587955951691, "epoch": 0.3409903755466502, "grad_norm": 8.716933250427246, "learning_rate": 4.332350578928402e-06, "loss": 0.4633, "mean_token_accuracy": 0.8555277913808823, "num_tokens": 132244660.0, "step": 110000 }, { "entropy": 1.7943158000707626, "epoch": 0.3410213746716999, "grad_norm": 9.320921897888184, "learning_rate": 4.332153665537777e-06, "loss": 0.4137, "mean_token_accuracy": 0.8571292623877526, "num_tokens": 132257556.0, "step": 110010 }, { "entropy": 1.883206208050251, "epoch": 0.3410523737967496, "grad_norm": 7.559988021850586, "learning_rate": 4.331956778994951e-06, "loss": 0.4647, "mean_token_accuracy": 0.8475514382123948, "num_tokens": 132269837.0, "step": 110020 }, { "entropy": 1.8622760981321336, "epoch": 0.3410833729217993, "grad_norm": 8.432063102722168, "learning_rate": 4.331759919293823e-06, "loss": 0.4867, "mean_token_accuracy": 0.8502196714282035, "num_tokens": 132281620.0, "step": 110030 }, { "entropy": 1.8266623839735985, "epoch": 0.34111437204684897, "grad_norm": 7.229222774505615, "learning_rate": 4.331563086428295e-06, "loss": 0.3981, "mean_token_accuracy": 0.8582778573036194, "num_tokens": 132293944.0, "step": 110040 }, { "entropy": 1.8780689999461173, "epoch": 0.3411453711718987, "grad_norm": 9.767049789428711, "learning_rate": 4.331366280392271e-06, "loss": 0.4905, "mean_token_accuracy": 0.8368968412280082, "num_tokens": 132306054.0, "step": 110050 }, { "entropy": 1.9592034816741943, "epoch": 0.34117637029694836, "grad_norm": 8.0049467086792, "learning_rate": 4.331169501179655e-06, "loss": 0.5138, "mean_token_accuracy": 0.8449469640851021, "num_tokens": 132316729.0, "step": 110060 }, { "entropy": 1.8753671124577522, "epoch": 0.3412073694219981, "grad_norm": 8.132515907287598, "learning_rate": 4.330972748784358e-06, "loss": 0.452, "mean_token_accuracy": 0.8468053087592124, "num_tokens": 132328733.0, "step": 110070 }, { "entropy": 1.9104474276304244, "epoch": 0.34123836854704775, "grad_norm": 9.560647010803223, "learning_rate": 4.330776023200286e-06, "loss": 0.4779, "mean_token_accuracy": 0.847663089632988, "num_tokens": 132339483.0, "step": 110080 }, { "entropy": 1.7548475116491318, "epoch": 0.3412693676720975, "grad_norm": 7.8430280685424805, "learning_rate": 4.330579324421352e-06, "loss": 0.4229, "mean_token_accuracy": 0.8698502600193023, "num_tokens": 132352754.0, "step": 110090 }, { "entropy": 1.778562317788601, "epoch": 0.34130036679714715, "grad_norm": 3.4906435012817383, "learning_rate": 4.330382652441468e-06, "loss": 0.409, "mean_token_accuracy": 0.8567349463701248, "num_tokens": 132366131.0, "step": 110100 }, { "entropy": 1.8327470771968364, "epoch": 0.34133136592219687, "grad_norm": 9.07868766784668, "learning_rate": 4.33018600725455e-06, "loss": 0.4468, "mean_token_accuracy": 0.8581578373908997, "num_tokens": 132377583.0, "step": 110110 }, { "entropy": 1.8057816326618195, "epoch": 0.34136236504724654, "grad_norm": 9.56470775604248, "learning_rate": 4.329989388854515e-06, "loss": 0.4509, "mean_token_accuracy": 0.8603610098361969, "num_tokens": 132389492.0, "step": 110120 }, { "entropy": 1.8053456172347069, "epoch": 0.3413933641722962, "grad_norm": 8.362168312072754, "learning_rate": 4.3297927972352825e-06, "loss": 0.4358, "mean_token_accuracy": 0.8586013182997704, "num_tokens": 132401915.0, "step": 110130 }, { "entropy": 1.8421172469854354, "epoch": 0.34142436329734593, "grad_norm": 5.665745735168457, "learning_rate": 4.3295962323907735e-06, "loss": 0.4893, "mean_token_accuracy": 0.8431659877300263, "num_tokens": 132414324.0, "step": 110140 }, { "entropy": 1.8274363920092582, "epoch": 0.3414553624223956, "grad_norm": 4.698655128479004, "learning_rate": 4.3293996943149094e-06, "loss": 0.4581, "mean_token_accuracy": 0.8378838330507279, "num_tokens": 132426353.0, "step": 110150 }, { "entropy": 1.8264761924743653, "epoch": 0.3414863615474453, "grad_norm": 8.485648155212402, "learning_rate": 4.329203183001617e-06, "loss": 0.4714, "mean_token_accuracy": 0.8489476129412651, "num_tokens": 132438108.0, "step": 110160 }, { "entropy": 1.7348950892686843, "epoch": 0.341517360672495, "grad_norm": 3.869995355606079, "learning_rate": 4.329006698444822e-06, "loss": 0.3746, "mean_token_accuracy": 0.8635307684540748, "num_tokens": 132452376.0, "step": 110170 }, { "entropy": 1.8900276586413383, "epoch": 0.3415483597975447, "grad_norm": 7.771602630615234, "learning_rate": 4.3288102406384535e-06, "loss": 0.5439, "mean_token_accuracy": 0.8306577250361442, "num_tokens": 132463778.0, "step": 110180 }, { "entropy": 1.816038003563881, "epoch": 0.3415793589225944, "grad_norm": 3.8563501834869385, "learning_rate": 4.3286138095764414e-06, "loss": 0.4445, "mean_token_accuracy": 0.8512075647711754, "num_tokens": 132476109.0, "step": 110190 }, { "entropy": 1.7983648508787156, "epoch": 0.3416103580476441, "grad_norm": 6.0994873046875, "learning_rate": 4.328417405252719e-06, "loss": 0.4387, "mean_token_accuracy": 0.857032585144043, "num_tokens": 132489219.0, "step": 110200 }, { "entropy": 1.7746062129735947, "epoch": 0.3416413571726938, "grad_norm": 7.233607769012451, "learning_rate": 4.3282210276612215e-06, "loss": 0.3889, "mean_token_accuracy": 0.8589346572756767, "num_tokens": 132501835.0, "step": 110210 }, { "entropy": 1.854676540195942, "epoch": 0.3416723562977435, "grad_norm": 8.12541389465332, "learning_rate": 4.328024676795884e-06, "loss": 0.4286, "mean_token_accuracy": 0.8524368569254875, "num_tokens": 132513519.0, "step": 110220 }, { "entropy": 1.7707249775528908, "epoch": 0.3417033554227932, "grad_norm": 11.075340270996094, "learning_rate": 4.3278283526506465e-06, "loss": 0.4199, "mean_token_accuracy": 0.8538433343172074, "num_tokens": 132526640.0, "step": 110230 }, { "entropy": 1.896279625594616, "epoch": 0.3417343545478429, "grad_norm": 10.801493644714355, "learning_rate": 4.3276320552194465e-06, "loss": 0.4733, "mean_token_accuracy": 0.8482535511255265, "num_tokens": 132538190.0, "step": 110240 }, { "entropy": 1.8617658391594887, "epoch": 0.34176535367289257, "grad_norm": 7.516333103179932, "learning_rate": 4.32743578449623e-06, "loss": 0.4729, "mean_token_accuracy": 0.8491196766495704, "num_tokens": 132550185.0, "step": 110250 }, { "entropy": 1.8173480436205864, "epoch": 0.3417963527979423, "grad_norm": 8.738038063049316, "learning_rate": 4.327239540474937e-06, "loss": 0.4669, "mean_token_accuracy": 0.836649663746357, "num_tokens": 132563493.0, "step": 110260 }, { "entropy": 1.9029010236263275, "epoch": 0.34182735192299196, "grad_norm": 8.454696655273438, "learning_rate": 4.3270433231495165e-06, "loss": 0.4533, "mean_token_accuracy": 0.8503329768776894, "num_tokens": 132575617.0, "step": 110270 }, { "entropy": 1.8139191955327987, "epoch": 0.3418583510480417, "grad_norm": 3.524780750274658, "learning_rate": 4.326847132513916e-06, "loss": 0.4293, "mean_token_accuracy": 0.8617829859256745, "num_tokens": 132588755.0, "step": 110280 }, { "entropy": 1.7936020240187645, "epoch": 0.34188935017309136, "grad_norm": 8.44808578491211, "learning_rate": 4.326650968562085e-06, "loss": 0.4323, "mean_token_accuracy": 0.8550214633345604, "num_tokens": 132601884.0, "step": 110290 }, { "entropy": 1.8529448464512825, "epoch": 0.3419203492981411, "grad_norm": 4.374883651733398, "learning_rate": 4.3264548312879736e-06, "loss": 0.4341, "mean_token_accuracy": 0.8531146451830864, "num_tokens": 132614584.0, "step": 110300 }, { "entropy": 1.800635115802288, "epoch": 0.34195134842319075, "grad_norm": 4.555408954620361, "learning_rate": 4.326258720685538e-06, "loss": 0.4128, "mean_token_accuracy": 0.8581522032618523, "num_tokens": 132627313.0, "step": 110310 }, { "entropy": 1.7813701510429383, "epoch": 0.3419823475482405, "grad_norm": 4.661769390106201, "learning_rate": 4.326062636748733e-06, "loss": 0.3958, "mean_token_accuracy": 0.8568063646554946, "num_tokens": 132640535.0, "step": 110320 }, { "entropy": 1.8333582818508147, "epoch": 0.34201334667329014, "grad_norm": 8.417710304260254, "learning_rate": 4.325866579471516e-06, "loss": 0.4775, "mean_token_accuracy": 0.839620991051197, "num_tokens": 132652723.0, "step": 110330 }, { "entropy": 1.7189094245433807, "epoch": 0.34204434579833987, "grad_norm": 6.682080268859863, "learning_rate": 4.325670548847847e-06, "loss": 0.3986, "mean_token_accuracy": 0.8616839155554772, "num_tokens": 132667020.0, "step": 110340 }, { "entropy": 1.9014250546693803, "epoch": 0.34207534492338953, "grad_norm": 9.291830062866211, "learning_rate": 4.325474544871687e-06, "loss": 0.4903, "mean_token_accuracy": 0.8418972924351692, "num_tokens": 132678104.0, "step": 110350 }, { "entropy": 1.8418912634253501, "epoch": 0.34210634404843926, "grad_norm": 9.592106819152832, "learning_rate": 4.325278567536999e-06, "loss": 0.4669, "mean_token_accuracy": 0.8414492189884186, "num_tokens": 132691635.0, "step": 110360 }, { "entropy": 1.8121645241975783, "epoch": 0.34213734317348893, "grad_norm": 10.184158325195312, "learning_rate": 4.325082616837749e-06, "loss": 0.4521, "mean_token_accuracy": 0.8527585849165916, "num_tokens": 132703725.0, "step": 110370 }, { "entropy": 1.8687279969453812, "epoch": 0.3421683422985386, "grad_norm": 7.617676734924316, "learning_rate": 4.324886692767904e-06, "loss": 0.5293, "mean_token_accuracy": 0.8405532166361809, "num_tokens": 132715491.0, "step": 110380 }, { "entropy": 1.8479404971003532, "epoch": 0.3421993414235883, "grad_norm": 8.61865520477295, "learning_rate": 4.324690795321433e-06, "loss": 0.4618, "mean_token_accuracy": 0.8454162836074829, "num_tokens": 132727389.0, "step": 110390 }, { "entropy": 1.8582603454589843, "epoch": 0.342230340548638, "grad_norm": 8.203682899475098, "learning_rate": 4.324494924492305e-06, "loss": 0.5359, "mean_token_accuracy": 0.8352208867669105, "num_tokens": 132739602.0, "step": 110400 }, { "entropy": 1.8665910184383392, "epoch": 0.3422613396736877, "grad_norm": 8.948098182678223, "learning_rate": 4.324299080274496e-06, "loss": 0.4703, "mean_token_accuracy": 0.8546964704990387, "num_tokens": 132751570.0, "step": 110410 }, { "entropy": 1.870258367061615, "epoch": 0.3422923387987374, "grad_norm": 7.890269756317139, "learning_rate": 4.32410326266198e-06, "loss": 0.4762, "mean_token_accuracy": 0.8475801780819893, "num_tokens": 132763398.0, "step": 110420 }, { "entropy": 1.841132828593254, "epoch": 0.3423233379237871, "grad_norm": 6.834152698516846, "learning_rate": 4.3239074716487314e-06, "loss": 0.4194, "mean_token_accuracy": 0.8546931356191635, "num_tokens": 132775491.0, "step": 110430 }, { "entropy": 1.9839861959218978, "epoch": 0.3423543370488368, "grad_norm": 9.827447891235352, "learning_rate": 4.323711707228732e-06, "loss": 0.5771, "mean_token_accuracy": 0.8222515046596527, "num_tokens": 132786260.0, "step": 110440 }, { "entropy": 1.869850617647171, "epoch": 0.3423853361738865, "grad_norm": 7.602901935577393, "learning_rate": 4.323515969395961e-06, "loss": 0.487, "mean_token_accuracy": 0.849778589606285, "num_tokens": 132798632.0, "step": 110450 }, { "entropy": 1.8364197805523872, "epoch": 0.34241633529893617, "grad_norm": 6.836818695068359, "learning_rate": 4.3233202581444e-06, "loss": 0.3919, "mean_token_accuracy": 0.8704248234629631, "num_tokens": 132811082.0, "step": 110460 }, { "entropy": 1.76078250259161, "epoch": 0.3424473344239859, "grad_norm": 3.4893312454223633, "learning_rate": 4.323124573468033e-06, "loss": 0.3575, "mean_token_accuracy": 0.8747459024190902, "num_tokens": 132824398.0, "step": 110470 }, { "entropy": 1.9431472271680832, "epoch": 0.34247833354903556, "grad_norm": 8.165310859680176, "learning_rate": 4.3229289153608484e-06, "loss": 0.5315, "mean_token_accuracy": 0.8350071504712104, "num_tokens": 132835017.0, "step": 110480 }, { "entropy": 1.828757031261921, "epoch": 0.3425093326740853, "grad_norm": 5.19187593460083, "learning_rate": 4.3227332838168335e-06, "loss": 0.4395, "mean_token_accuracy": 0.8579125210642815, "num_tokens": 132847063.0, "step": 110490 }, { "entropy": 1.907851468026638, "epoch": 0.34254033179913496, "grad_norm": 8.472184181213379, "learning_rate": 4.3225376788299765e-06, "loss": 0.4777, "mean_token_accuracy": 0.8507169172167778, "num_tokens": 132858770.0, "step": 110500 }, { "entropy": 1.9092920407652856, "epoch": 0.3425713309241847, "grad_norm": 8.181653022766113, "learning_rate": 4.322342100394272e-06, "loss": 0.5, "mean_token_accuracy": 0.8374480813741684, "num_tokens": 132869792.0, "step": 110510 }, { "entropy": 1.801915517449379, "epoch": 0.34260233004923435, "grad_norm": 9.726790428161621, "learning_rate": 4.322146548503712e-06, "loss": 0.37, "mean_token_accuracy": 0.8696004092693329, "num_tokens": 132882414.0, "step": 110520 }, { "entropy": 1.871196947991848, "epoch": 0.3426333291742841, "grad_norm": 8.217884063720703, "learning_rate": 4.321951023152293e-06, "loss": 0.4747, "mean_token_accuracy": 0.8464583426713943, "num_tokens": 132894145.0, "step": 110530 }, { "entropy": 1.8100114524364472, "epoch": 0.34266432829933374, "grad_norm": 9.168874740600586, "learning_rate": 4.321755524334014e-06, "loss": 0.5045, "mean_token_accuracy": 0.8399093985557556, "num_tokens": 132907133.0, "step": 110540 }, { "entropy": 1.8548692613840103, "epoch": 0.34269532742438347, "grad_norm": 8.339598655700684, "learning_rate": 4.3215600520428715e-06, "loss": 0.5182, "mean_token_accuracy": 0.8403585880994797, "num_tokens": 132918583.0, "step": 110550 }, { "entropy": 1.8597151398658753, "epoch": 0.34272632654943314, "grad_norm": 8.382675170898438, "learning_rate": 4.3213646062728695e-06, "loss": 0.4849, "mean_token_accuracy": 0.8416018903255462, "num_tokens": 132930506.0, "step": 110560 }, { "entropy": 1.8886392131447791, "epoch": 0.34275732567448286, "grad_norm": 4.080488204956055, "learning_rate": 4.321169187018011e-06, "loss": 0.4526, "mean_token_accuracy": 0.8549024015665054, "num_tokens": 132942640.0, "step": 110570 }, { "entropy": 1.7458569899201393, "epoch": 0.34278832479953253, "grad_norm": 5.599550724029541, "learning_rate": 4.3209737942722985e-06, "loss": 0.3801, "mean_token_accuracy": 0.8586847379803657, "num_tokens": 132956699.0, "step": 110580 }, { "entropy": 1.8726666495203972, "epoch": 0.34281932392458225, "grad_norm": 7.666844367980957, "learning_rate": 4.320778428029743e-06, "loss": 0.4654, "mean_token_accuracy": 0.8506060406565666, "num_tokens": 132967858.0, "step": 110590 }, { "entropy": 1.8701406121253967, "epoch": 0.3428503230496319, "grad_norm": 9.031076431274414, "learning_rate": 4.320583088284352e-06, "loss": 0.4853, "mean_token_accuracy": 0.8525993511080742, "num_tokens": 132979844.0, "step": 110600 }, { "entropy": 1.7444271817803383, "epoch": 0.34288132217468165, "grad_norm": 3.681898593902588, "learning_rate": 4.320387775030135e-06, "loss": 0.3891, "mean_token_accuracy": 0.8686041355133056, "num_tokens": 132993483.0, "step": 110610 }, { "entropy": 1.8315519198775292, "epoch": 0.3429123212997313, "grad_norm": 13.511602401733398, "learning_rate": 4.320192488261108e-06, "loss": 0.4676, "mean_token_accuracy": 0.8444625332951545, "num_tokens": 133006258.0, "step": 110620 }, { "entropy": 1.8297417625784873, "epoch": 0.342943320424781, "grad_norm": 4.069921016693115, "learning_rate": 4.319997227971282e-06, "loss": 0.4569, "mean_token_accuracy": 0.8498388364911079, "num_tokens": 133018876.0, "step": 110630 }, { "entropy": 1.9210706606507302, "epoch": 0.3429743195498307, "grad_norm": 9.114517211914062, "learning_rate": 4.319801994154677e-06, "loss": 0.536, "mean_token_accuracy": 0.8308271437883377, "num_tokens": 133030224.0, "step": 110640 }, { "entropy": 1.8908802688121795, "epoch": 0.3430053186748804, "grad_norm": 9.932719230651855, "learning_rate": 4.319606786805309e-06, "loss": 0.529, "mean_token_accuracy": 0.8441593378782273, "num_tokens": 133041073.0, "step": 110650 }, { "entropy": 1.8904657304286956, "epoch": 0.3430363177999301, "grad_norm": 7.985291957855225, "learning_rate": 4.3194116059172e-06, "loss": 0.5112, "mean_token_accuracy": 0.8325783342123032, "num_tokens": 133052277.0, "step": 110660 }, { "entropy": 1.9574284851551056, "epoch": 0.34306731692497977, "grad_norm": 7.235466957092285, "learning_rate": 4.319216451484371e-06, "loss": 0.5211, "mean_token_accuracy": 0.8355097994208336, "num_tokens": 133064090.0, "step": 110670 }, { "entropy": 1.816710241138935, "epoch": 0.3430983160500295, "grad_norm": 8.71338939666748, "learning_rate": 4.319021323500848e-06, "loss": 0.4738, "mean_token_accuracy": 0.8443104445934295, "num_tokens": 133076199.0, "step": 110680 }, { "entropy": 1.843747340142727, "epoch": 0.34312931517507916, "grad_norm": 6.804162502288818, "learning_rate": 4.318826221960655e-06, "loss": 0.4614, "mean_token_accuracy": 0.8444552600383759, "num_tokens": 133088227.0, "step": 110690 }, { "entropy": 1.8602527409791947, "epoch": 0.3431603143001289, "grad_norm": 7.936315059661865, "learning_rate": 4.318631146857822e-06, "loss": 0.4549, "mean_token_accuracy": 0.8574521571397782, "num_tokens": 133100175.0, "step": 110700 }, { "entropy": 1.8244513690471649, "epoch": 0.34319131342517856, "grad_norm": 11.78140926361084, "learning_rate": 4.318436098186376e-06, "loss": 0.416, "mean_token_accuracy": 0.8596523448824882, "num_tokens": 133111645.0, "step": 110710 }, { "entropy": 1.8955079719424248, "epoch": 0.3432223125502283, "grad_norm": 5.455853462219238, "learning_rate": 4.318241075940353e-06, "loss": 0.5204, "mean_token_accuracy": 0.8417312771081924, "num_tokens": 133122905.0, "step": 110720 }, { "entropy": 1.898301364481449, "epoch": 0.34325331167527795, "grad_norm": 8.922945976257324, "learning_rate": 4.318046080113783e-06, "loss": 0.4867, "mean_token_accuracy": 0.8424309939146042, "num_tokens": 133134095.0, "step": 110730 }, { "entropy": 1.8243930265307426, "epoch": 0.3432843108003277, "grad_norm": 4.409710884094238, "learning_rate": 4.317851110700703e-06, "loss": 0.4563, "mean_token_accuracy": 0.8556462466716767, "num_tokens": 133146666.0, "step": 110740 }, { "entropy": 1.9261058151721955, "epoch": 0.34331530992537734, "grad_norm": 12.174426078796387, "learning_rate": 4.31765616769515e-06, "loss": 0.4769, "mean_token_accuracy": 0.8526764348149299, "num_tokens": 133157255.0, "step": 110750 }, { "entropy": 1.911964663863182, "epoch": 0.34334630905042707, "grad_norm": 8.915404319763184, "learning_rate": 4.317461251091163e-06, "loss": 0.4823, "mean_token_accuracy": 0.8492590352892876, "num_tokens": 133168516.0, "step": 110760 }, { "entropy": 1.8462224438786508, "epoch": 0.34337730817547674, "grad_norm": 9.527721405029297, "learning_rate": 4.317266360882783e-06, "loss": 0.486, "mean_token_accuracy": 0.8471042737364769, "num_tokens": 133180707.0, "step": 110770 }, { "entropy": 1.821668528020382, "epoch": 0.34340830730052646, "grad_norm": 3.8620917797088623, "learning_rate": 4.3170714970640535e-06, "loss": 0.4187, "mean_token_accuracy": 0.859420596063137, "num_tokens": 133192497.0, "step": 110780 }, { "entropy": 1.875546859204769, "epoch": 0.34343930642557613, "grad_norm": 7.777998924255371, "learning_rate": 4.3168766596290205e-06, "loss": 0.4715, "mean_token_accuracy": 0.8493057772517204, "num_tokens": 133204389.0, "step": 110790 }, { "entropy": 1.807924547791481, "epoch": 0.34347030555062585, "grad_norm": 12.520805358886719, "learning_rate": 4.31668184857173e-06, "loss": 0.3938, "mean_token_accuracy": 0.8617511481046677, "num_tokens": 133217528.0, "step": 110800 }, { "entropy": 1.8352250516414643, "epoch": 0.3435013046756755, "grad_norm": 5.211875915527344, "learning_rate": 4.31648706388623e-06, "loss": 0.4489, "mean_token_accuracy": 0.8534446090459824, "num_tokens": 133229712.0, "step": 110810 }, { "entropy": 1.8670003831386566, "epoch": 0.34353230380072525, "grad_norm": 7.83875846862793, "learning_rate": 4.316292305566571e-06, "loss": 0.4622, "mean_token_accuracy": 0.850574080646038, "num_tokens": 133241353.0, "step": 110820 }, { "entropy": 1.887353539466858, "epoch": 0.3435633029257749, "grad_norm": 10.31457233428955, "learning_rate": 4.316097573606808e-06, "loss": 0.4903, "mean_token_accuracy": 0.8490018859505654, "num_tokens": 133252486.0, "step": 110830 }, { "entropy": 1.891994397342205, "epoch": 0.34359430205082464, "grad_norm": 8.610884666442871, "learning_rate": 4.315902868000992e-06, "loss": 0.4975, "mean_token_accuracy": 0.8397418484091759, "num_tokens": 133264441.0, "step": 110840 }, { "entropy": 1.8664520055055618, "epoch": 0.3436253011758743, "grad_norm": 7.726016044616699, "learning_rate": 4.3157081887431804e-06, "loss": 0.499, "mean_token_accuracy": 0.8486996784806251, "num_tokens": 133276898.0, "step": 110850 }, { "entropy": 1.884059876203537, "epoch": 0.34365630030092403, "grad_norm": 4.186914443969727, "learning_rate": 4.315513535827431e-06, "loss": 0.4701, "mean_token_accuracy": 0.8496600687503815, "num_tokens": 133288536.0, "step": 110860 }, { "entropy": 1.9221114248037339, "epoch": 0.3436872994259737, "grad_norm": 9.337282180786133, "learning_rate": 4.315318909247805e-06, "loss": 0.4669, "mean_token_accuracy": 0.842443785071373, "num_tokens": 133299978.0, "step": 110870 }, { "entropy": 1.8485377907752991, "epoch": 0.34371829855102337, "grad_norm": 9.948966979980469, "learning_rate": 4.315124308998364e-06, "loss": 0.4684, "mean_token_accuracy": 0.8506104618310928, "num_tokens": 133311745.0, "step": 110880 }, { "entropy": 1.8648578524589539, "epoch": 0.3437492976760731, "grad_norm": 7.327975749969482, "learning_rate": 4.31492973507317e-06, "loss": 0.4557, "mean_token_accuracy": 0.8568118900060654, "num_tokens": 133323824.0, "step": 110890 }, { "entropy": 1.9047629460692406, "epoch": 0.34378029680112276, "grad_norm": 8.159683227539062, "learning_rate": 4.314735187466291e-06, "loss": 0.4994, "mean_token_accuracy": 0.8451401859521865, "num_tokens": 133335255.0, "step": 110900 }, { "entropy": 1.925184278190136, "epoch": 0.3438112959261725, "grad_norm": 4.655861854553223, "learning_rate": 4.3145406661717925e-06, "loss": 0.5484, "mean_token_accuracy": 0.826390016078949, "num_tokens": 133347385.0, "step": 110910 }, { "entropy": 1.9013187378644942, "epoch": 0.34384229505122216, "grad_norm": 7.1715922355651855, "learning_rate": 4.3143461711837445e-06, "loss": 0.5173, "mean_token_accuracy": 0.8376141622662544, "num_tokens": 133358459.0, "step": 110920 }, { "entropy": 1.9203463315963745, "epoch": 0.3438732941762719, "grad_norm": 10.447772026062012, "learning_rate": 4.314151702496219e-06, "loss": 0.5415, "mean_token_accuracy": 0.8378437146544456, "num_tokens": 133369315.0, "step": 110930 }, { "entropy": 1.926878383755684, "epoch": 0.34390429330132155, "grad_norm": 7.588080406188965, "learning_rate": 4.313957260103287e-06, "loss": 0.4738, "mean_token_accuracy": 0.8456119611859322, "num_tokens": 133379860.0, "step": 110940 }, { "entropy": 1.714067880809307, "epoch": 0.3439352924263713, "grad_norm": 4.141031265258789, "learning_rate": 4.313762843999025e-06, "loss": 0.3662, "mean_token_accuracy": 0.8552543878555298, "num_tokens": 133394752.0, "step": 110950 }, { "entropy": 1.7716796442866325, "epoch": 0.34396629155142094, "grad_norm": 8.312151908874512, "learning_rate": 4.31356845417751e-06, "loss": 0.4151, "mean_token_accuracy": 0.8612663641571998, "num_tokens": 133407887.0, "step": 110960 }, { "entropy": 1.909244528412819, "epoch": 0.34399729067647067, "grad_norm": 9.186685562133789, "learning_rate": 4.31337409063282e-06, "loss": 0.5101, "mean_token_accuracy": 0.8425434127449989, "num_tokens": 133419016.0, "step": 110970 }, { "entropy": 1.831558196246624, "epoch": 0.34402828980152034, "grad_norm": 7.182348728179932, "learning_rate": 4.3131797533590354e-06, "loss": 0.4402, "mean_token_accuracy": 0.8547964975237846, "num_tokens": 133431120.0, "step": 110980 }, { "entropy": 1.923382543027401, "epoch": 0.34405928892657006, "grad_norm": 7.744353771209717, "learning_rate": 4.31298544235024e-06, "loss": 0.5213, "mean_token_accuracy": 0.8368471801280976, "num_tokens": 133442547.0, "step": 110990 }, { "entropy": 1.8427509516477585, "epoch": 0.34409028805161973, "grad_norm": 8.841486930847168, "learning_rate": 4.312791157600516e-06, "loss": 0.4723, "mean_token_accuracy": 0.8453793004155159, "num_tokens": 133455042.0, "step": 111000 }, { "entropy": 1.854433636367321, "epoch": 0.34412128717666945, "grad_norm": 8.208456039428711, "learning_rate": 4.312596899103951e-06, "loss": 0.4855, "mean_token_accuracy": 0.8534053891897202, "num_tokens": 133466808.0, "step": 111010 }, { "entropy": 1.936078244447708, "epoch": 0.3441522863017191, "grad_norm": 10.055807113647461, "learning_rate": 4.312402666854633e-06, "loss": 0.4524, "mean_token_accuracy": 0.8546758487820625, "num_tokens": 133478433.0, "step": 111020 }, { "entropy": 1.8237747445702552, "epoch": 0.34418328542676885, "grad_norm": 10.912522315979004, "learning_rate": 4.312208460846651e-06, "loss": 0.4265, "mean_token_accuracy": 0.848297068476677, "num_tokens": 133490903.0, "step": 111030 }, { "entropy": 1.8850932896137238, "epoch": 0.3442142845518185, "grad_norm": 8.489951133728027, "learning_rate": 4.312014281074098e-06, "loss": 0.4681, "mean_token_accuracy": 0.8428701400756836, "num_tokens": 133503256.0, "step": 111040 }, { "entropy": 1.80315043926239, "epoch": 0.34424528367686824, "grad_norm": 4.91754674911499, "learning_rate": 4.311820127531066e-06, "loss": 0.4476, "mean_token_accuracy": 0.8500543773174286, "num_tokens": 133515626.0, "step": 111050 }, { "entropy": 1.8291157126426696, "epoch": 0.3442762828019179, "grad_norm": 8.575767517089844, "learning_rate": 4.3116260002116505e-06, "loss": 0.4447, "mean_token_accuracy": 0.8480419516563416, "num_tokens": 133528210.0, "step": 111060 }, { "entropy": 1.8445888727903366, "epoch": 0.34430728192696763, "grad_norm": 7.410253524780273, "learning_rate": 4.3114318991099505e-06, "loss": 0.4645, "mean_token_accuracy": 0.8509875819087028, "num_tokens": 133539348.0, "step": 111070 }, { "entropy": 1.7617506772279738, "epoch": 0.3443382810520173, "grad_norm": 4.4981770515441895, "learning_rate": 4.311237824220064e-06, "loss": 0.4237, "mean_token_accuracy": 0.8568379059433937, "num_tokens": 133552472.0, "step": 111080 }, { "entropy": 1.845915214717388, "epoch": 0.344369280177067, "grad_norm": 9.03842544555664, "learning_rate": 4.311043775536092e-06, "loss": 0.5056, "mean_token_accuracy": 0.8473999381065369, "num_tokens": 133564707.0, "step": 111090 }, { "entropy": 1.685810850560665, "epoch": 0.3444002793021167, "grad_norm": 4.30578088760376, "learning_rate": 4.310849753052138e-06, "loss": 0.3662, "mean_token_accuracy": 0.864673687517643, "num_tokens": 133579259.0, "step": 111100 }, { "entropy": 1.8005015209317208, "epoch": 0.3444312784271664, "grad_norm": 9.097541809082031, "learning_rate": 4.310655756762307e-06, "loss": 0.4941, "mean_token_accuracy": 0.8453745514154434, "num_tokens": 133591953.0, "step": 111110 }, { "entropy": 1.8135478526353837, "epoch": 0.3444622775522161, "grad_norm": 3.2069530487060547, "learning_rate": 4.310461786660705e-06, "loss": 0.4532, "mean_token_accuracy": 0.8551574990153312, "num_tokens": 133604208.0, "step": 111120 }, { "entropy": 1.873196867108345, "epoch": 0.34449327667726576, "grad_norm": 8.344642639160156, "learning_rate": 4.310267842741439e-06, "loss": 0.4472, "mean_token_accuracy": 0.8539148271083832, "num_tokens": 133616056.0, "step": 111130 }, { "entropy": 1.9149486266076565, "epoch": 0.3445242758023155, "grad_norm": 8.610182762145996, "learning_rate": 4.3100739249986225e-06, "loss": 0.473, "mean_token_accuracy": 0.8442770466208458, "num_tokens": 133627877.0, "step": 111140 }, { "entropy": 1.910437923669815, "epoch": 0.34455527492736515, "grad_norm": 8.95980167388916, "learning_rate": 4.309880033426365e-06, "loss": 0.5054, "mean_token_accuracy": 0.8403397217392922, "num_tokens": 133639187.0, "step": 111150 }, { "entropy": 1.9167566150426865, "epoch": 0.3445862740524149, "grad_norm": 9.184028625488281, "learning_rate": 4.3096861680187815e-06, "loss": 0.4849, "mean_token_accuracy": 0.8474911168217659, "num_tokens": 133650200.0, "step": 111160 }, { "entropy": 1.9442434057593345, "epoch": 0.34461727317746454, "grad_norm": 9.074106216430664, "learning_rate": 4.309492328769988e-06, "loss": 0.5261, "mean_token_accuracy": 0.8301569879055023, "num_tokens": 133661615.0, "step": 111170 }, { "entropy": 1.8889770179986953, "epoch": 0.34464827230251427, "grad_norm": 11.795031547546387, "learning_rate": 4.309298515674102e-06, "loss": 0.5, "mean_token_accuracy": 0.8382586434483528, "num_tokens": 133672713.0, "step": 111180 }, { "entropy": 1.8732004672288896, "epoch": 0.34467927142756394, "grad_norm": 7.575119495391846, "learning_rate": 4.309104728725243e-06, "loss": 0.4804, "mean_token_accuracy": 0.8471902221441269, "num_tokens": 133683668.0, "step": 111190 }, { "entropy": 1.9257921800017357, "epoch": 0.34471027055261366, "grad_norm": 7.821493148803711, "learning_rate": 4.308910967917533e-06, "loss": 0.4817, "mean_token_accuracy": 0.8495068654417992, "num_tokens": 133694848.0, "step": 111200 }, { "entropy": 1.876696328818798, "epoch": 0.34474126967766333, "grad_norm": 9.46831226348877, "learning_rate": 4.308717233245096e-06, "loss": 0.4807, "mean_token_accuracy": 0.847463445365429, "num_tokens": 133706948.0, "step": 111210 }, { "entropy": 1.9034352615475654, "epoch": 0.34477226880271306, "grad_norm": 4.813274383544922, "learning_rate": 4.3085235247020545e-06, "loss": 0.4282, "mean_token_accuracy": 0.8571040198206902, "num_tokens": 133719003.0, "step": 111220 }, { "entropy": 1.8417223021388054, "epoch": 0.3448032679277627, "grad_norm": 8.651920318603516, "learning_rate": 4.3083298422825375e-06, "loss": 0.4393, "mean_token_accuracy": 0.8489273890852929, "num_tokens": 133731771.0, "step": 111230 }, { "entropy": 1.919362673163414, "epoch": 0.34483426705281245, "grad_norm": 7.571285724639893, "learning_rate": 4.308136185980673e-06, "loss": 0.5243, "mean_token_accuracy": 0.839247289299965, "num_tokens": 133742822.0, "step": 111240 }, { "entropy": 1.8016601279377937, "epoch": 0.3448652661778621, "grad_norm": 9.703665733337402, "learning_rate": 4.307942555790593e-06, "loss": 0.4382, "mean_token_accuracy": 0.8540796846151352, "num_tokens": 133755750.0, "step": 111250 }, { "entropy": 1.7813518926501275, "epoch": 0.34489626530291184, "grad_norm": 4.28598690032959, "learning_rate": 4.3077489517064285e-06, "loss": 0.3962, "mean_token_accuracy": 0.8613515332341194, "num_tokens": 133768622.0, "step": 111260 }, { "entropy": 1.8209292754530906, "epoch": 0.3449272644279615, "grad_norm": 8.168103218078613, "learning_rate": 4.307555373722316e-06, "loss": 0.4038, "mean_token_accuracy": 0.8652269795536995, "num_tokens": 133780564.0, "step": 111270 }, { "entropy": 1.8425551727414131, "epoch": 0.34495826355301124, "grad_norm": 8.19618034362793, "learning_rate": 4.307361821832388e-06, "loss": 0.4519, "mean_token_accuracy": 0.8445751652121544, "num_tokens": 133792677.0, "step": 111280 }, { "entropy": 1.824992810189724, "epoch": 0.3449892626780609, "grad_norm": 8.843597412109375, "learning_rate": 4.307168296030786e-06, "loss": 0.4662, "mean_token_accuracy": 0.8552077680826187, "num_tokens": 133805279.0, "step": 111290 }, { "entropy": 1.9060368582606315, "epoch": 0.34502026180311063, "grad_norm": 3.7033939361572266, "learning_rate": 4.306974796311647e-06, "loss": 0.4961, "mean_token_accuracy": 0.841067835688591, "num_tokens": 133817633.0, "step": 111300 }, { "entropy": 1.864403063058853, "epoch": 0.3450512609281603, "grad_norm": 7.459976673126221, "learning_rate": 4.306781322669116e-06, "loss": 0.4578, "mean_token_accuracy": 0.8469650357961654, "num_tokens": 133829133.0, "step": 111310 }, { "entropy": 1.8203039526939393, "epoch": 0.34508226005321, "grad_norm": 9.59398365020752, "learning_rate": 4.306587875097335e-06, "loss": 0.4271, "mean_token_accuracy": 0.8534867212176322, "num_tokens": 133841674.0, "step": 111320 }, { "entropy": 1.843678180873394, "epoch": 0.3451132591782597, "grad_norm": 11.09599781036377, "learning_rate": 4.306394453590449e-06, "loss": 0.4394, "mean_token_accuracy": 0.8509141758084298, "num_tokens": 133853073.0, "step": 111330 }, { "entropy": 1.837014827132225, "epoch": 0.3451442583033094, "grad_norm": 7.771301746368408, "learning_rate": 4.306201058142605e-06, "loss": 0.4294, "mean_token_accuracy": 0.8529363244771957, "num_tokens": 133865482.0, "step": 111340 }, { "entropy": 1.8316177785396577, "epoch": 0.3451752574283591, "grad_norm": 3.7739436626434326, "learning_rate": 4.3060076887479545e-06, "loss": 0.4391, "mean_token_accuracy": 0.8522952899336815, "num_tokens": 133877076.0, "step": 111350 }, { "entropy": 1.8375495672225952, "epoch": 0.34520625655340875, "grad_norm": 2.9696900844573975, "learning_rate": 4.305814345400645e-06, "loss": 0.4598, "mean_token_accuracy": 0.8476609244942666, "num_tokens": 133889360.0, "step": 111360 }, { "entropy": 1.783223459124565, "epoch": 0.3452372556784585, "grad_norm": 7.960699081420898, "learning_rate": 4.305621028094832e-06, "loss": 0.4627, "mean_token_accuracy": 0.8435281231999397, "num_tokens": 133903252.0, "step": 111370 }, { "entropy": 1.84706342369318, "epoch": 0.34526825480350815, "grad_norm": 7.111573696136475, "learning_rate": 4.305427736824668e-06, "loss": 0.4798, "mean_token_accuracy": 0.8439237996935844, "num_tokens": 133915237.0, "step": 111380 }, { "entropy": 1.7956417188048364, "epoch": 0.34529925392855787, "grad_norm": 7.185604572296143, "learning_rate": 4.305234471584312e-06, "loss": 0.3936, "mean_token_accuracy": 0.8640512511134147, "num_tokens": 133927538.0, "step": 111390 }, { "entropy": 1.8561489313840867, "epoch": 0.34533025305360754, "grad_norm": 8.284113883972168, "learning_rate": 4.3050412323679206e-06, "loss": 0.457, "mean_token_accuracy": 0.8489404112100601, "num_tokens": 133938940.0, "step": 111400 }, { "entropy": 1.8327260576188564, "epoch": 0.34536125217865726, "grad_norm": 3.422919750213623, "learning_rate": 4.3048480191696544e-06, "loss": 0.458, "mean_token_accuracy": 0.8489983096718788, "num_tokens": 133951108.0, "step": 111410 }, { "entropy": 1.8617496728897094, "epoch": 0.34539225130370693, "grad_norm": 7.909846305847168, "learning_rate": 4.304654831983675e-06, "loss": 0.4596, "mean_token_accuracy": 0.8476134285330772, "num_tokens": 133963396.0, "step": 111420 }, { "entropy": 1.9265403121709823, "epoch": 0.34542325042875666, "grad_norm": 7.380890369415283, "learning_rate": 4.304461670804146e-06, "loss": 0.5297, "mean_token_accuracy": 0.841274605691433, "num_tokens": 133974112.0, "step": 111430 }, { "entropy": 1.8644670337438582, "epoch": 0.3454542495538063, "grad_norm": 8.807491302490234, "learning_rate": 4.3042685356252335e-06, "loss": 0.4656, "mean_token_accuracy": 0.8540144443511963, "num_tokens": 133985279.0, "step": 111440 }, { "entropy": 1.9059779167175293, "epoch": 0.34548524867885605, "grad_norm": 7.666346073150635, "learning_rate": 4.304075426441105e-06, "loss": 0.5396, "mean_token_accuracy": 0.8371213942766189, "num_tokens": 133996376.0, "step": 111450 }, { "entropy": 1.8472643613815307, "epoch": 0.3455162478039057, "grad_norm": 7.403988361358643, "learning_rate": 4.3038823432459305e-06, "loss": 0.4338, "mean_token_accuracy": 0.8564211338758468, "num_tokens": 134008017.0, "step": 111460 }, { "entropy": 1.8477882623672486, "epoch": 0.34554724692895544, "grad_norm": 7.693085670471191, "learning_rate": 4.30368928603388e-06, "loss": 0.4726, "mean_token_accuracy": 0.8446563795208931, "num_tokens": 134020115.0, "step": 111470 }, { "entropy": 1.8344331562519074, "epoch": 0.3455782460540051, "grad_norm": 4.255249977111816, "learning_rate": 4.303496254799126e-06, "loss": 0.465, "mean_token_accuracy": 0.8438744112849236, "num_tokens": 134032355.0, "step": 111480 }, { "entropy": 1.8852899730205537, "epoch": 0.34560924517905484, "grad_norm": 9.30275821685791, "learning_rate": 4.303303249535845e-06, "loss": 0.4825, "mean_token_accuracy": 0.8437858000397682, "num_tokens": 134043633.0, "step": 111490 }, { "entropy": 1.7422870948910714, "epoch": 0.3456402443041045, "grad_norm": 7.4847941398620605, "learning_rate": 4.3031102702382125e-06, "loss": 0.3388, "mean_token_accuracy": 0.8664668053388596, "num_tokens": 134057736.0, "step": 111500 }, { "entropy": 1.7793061807751656, "epoch": 0.34567124342915423, "grad_norm": 3.7864532470703125, "learning_rate": 4.302917316900407e-06, "loss": 0.4038, "mean_token_accuracy": 0.8605924814939498, "num_tokens": 134070395.0, "step": 111510 }, { "entropy": 1.8568082675337791, "epoch": 0.3457022425542039, "grad_norm": 11.483708381652832, "learning_rate": 4.302724389516609e-06, "loss": 0.4593, "mean_token_accuracy": 0.8510003834962845, "num_tokens": 134082721.0, "step": 111520 }, { "entropy": 1.8132430225610734, "epoch": 0.3457332416792536, "grad_norm": 2.2380177974700928, "learning_rate": 4.302531488081001e-06, "loss": 0.4718, "mean_token_accuracy": 0.8539024695754052, "num_tokens": 134094940.0, "step": 111530 }, { "entropy": 1.7989331185817719, "epoch": 0.3457642408043033, "grad_norm": 3.9611406326293945, "learning_rate": 4.302338612587765e-06, "loss": 0.4067, "mean_token_accuracy": 0.856696180999279, "num_tokens": 134108140.0, "step": 111540 }, { "entropy": 1.85663383603096, "epoch": 0.345795239929353, "grad_norm": 3.9444892406463623, "learning_rate": 4.3021457630310894e-06, "loss": 0.4472, "mean_token_accuracy": 0.860635980963707, "num_tokens": 134119906.0, "step": 111550 }, { "entropy": 1.8466177895665168, "epoch": 0.3458262390544027, "grad_norm": 9.456862449645996, "learning_rate": 4.30195293940516e-06, "loss": 0.4452, "mean_token_accuracy": 0.8576301142573357, "num_tokens": 134132194.0, "step": 111560 }, { "entropy": 1.8773884311318398, "epoch": 0.3458572381794524, "grad_norm": 6.836810111999512, "learning_rate": 4.301760141704167e-06, "loss": 0.4855, "mean_token_accuracy": 0.8493657246232033, "num_tokens": 134143520.0, "step": 111570 }, { "entropy": 1.9150910288095475, "epoch": 0.3458882373045021, "grad_norm": 8.402644157409668, "learning_rate": 4.301567369922301e-06, "loss": 0.527, "mean_token_accuracy": 0.8446568265557289, "num_tokens": 134154428.0, "step": 111580 }, { "entropy": 1.911669085919857, "epoch": 0.3459192364295518, "grad_norm": 8.802173614501953, "learning_rate": 4.301374624053757e-06, "loss": 0.4807, "mean_token_accuracy": 0.849866247177124, "num_tokens": 134165966.0, "step": 111590 }, { "entropy": 1.8143538311123848, "epoch": 0.34595023555460147, "grad_norm": 8.931032180786133, "learning_rate": 4.301181904092727e-06, "loss": 0.4441, "mean_token_accuracy": 0.8532052963972092, "num_tokens": 134178567.0, "step": 111600 }, { "entropy": 1.8561271876096725, "epoch": 0.34598123467965114, "grad_norm": 9.355246543884277, "learning_rate": 4.300989210033409e-06, "loss": 0.4762, "mean_token_accuracy": 0.8413384512066842, "num_tokens": 134190432.0, "step": 111610 }, { "entropy": 1.8068694084882737, "epoch": 0.34601223380470086, "grad_norm": 8.892166137695312, "learning_rate": 4.3007965418700015e-06, "loss": 0.4047, "mean_token_accuracy": 0.8617716804146767, "num_tokens": 134202936.0, "step": 111620 }, { "entropy": 1.819658127427101, "epoch": 0.34604323292975053, "grad_norm": 4.105354309082031, "learning_rate": 4.300603899596706e-06, "loss": 0.4383, "mean_token_accuracy": 0.8501406639814377, "num_tokens": 134215290.0, "step": 111630 }, { "entropy": 1.7999876148998737, "epoch": 0.34607423205480026, "grad_norm": 3.328152894973755, "learning_rate": 4.300411283207722e-06, "loss": 0.4242, "mean_token_accuracy": 0.8598711341619492, "num_tokens": 134228492.0, "step": 111640 }, { "entropy": 1.8138612404465675, "epoch": 0.3461052311798499, "grad_norm": 8.897016525268555, "learning_rate": 4.300218692697255e-06, "loss": 0.4467, "mean_token_accuracy": 0.8562261596322059, "num_tokens": 134240367.0, "step": 111650 }, { "entropy": 1.8443049594759942, "epoch": 0.34613623030489965, "grad_norm": 7.959166049957275, "learning_rate": 4.300026128059511e-06, "loss": 0.4563, "mean_token_accuracy": 0.8506998106837272, "num_tokens": 134252476.0, "step": 111660 }, { "entropy": 1.8540190950036048, "epoch": 0.3461672294299493, "grad_norm": 4.293795585632324, "learning_rate": 4.2998335892886964e-06, "loss": 0.4592, "mean_token_accuracy": 0.8463447079062462, "num_tokens": 134264385.0, "step": 111670 }, { "entropy": 1.8193232595920563, "epoch": 0.34619822855499904, "grad_norm": 8.931365966796875, "learning_rate": 4.2996410763790225e-06, "loss": 0.4414, "mean_token_accuracy": 0.8522950485348701, "num_tokens": 134276602.0, "step": 111680 }, { "entropy": 1.8343895703554154, "epoch": 0.3462292276800487, "grad_norm": 9.969979286193848, "learning_rate": 4.2994485893247e-06, "loss": 0.4677, "mean_token_accuracy": 0.8539774760603904, "num_tokens": 134288782.0, "step": 111690 }, { "entropy": 1.8512503013014794, "epoch": 0.34626022680509844, "grad_norm": 2.8974452018737793, "learning_rate": 4.2992561281199405e-06, "loss": 0.4929, "mean_token_accuracy": 0.8387772515416145, "num_tokens": 134301209.0, "step": 111700 }, { "entropy": 1.8505238316953183, "epoch": 0.3462912259301481, "grad_norm": 3.372145175933838, "learning_rate": 4.29906369275896e-06, "loss": 0.4562, "mean_token_accuracy": 0.8435917362570763, "num_tokens": 134313368.0, "step": 111710 }, { "entropy": 1.8507800638675689, "epoch": 0.34632222505519783, "grad_norm": 7.913898944854736, "learning_rate": 4.2988712832359755e-06, "loss": 0.4267, "mean_token_accuracy": 0.8630237206816673, "num_tokens": 134325077.0, "step": 111720 }, { "entropy": 1.7628626979887485, "epoch": 0.3463532241802475, "grad_norm": 3.487839698791504, "learning_rate": 4.2986788995452044e-06, "loss": 0.3905, "mean_token_accuracy": 0.8562428668141365, "num_tokens": 134339036.0, "step": 111730 }, { "entropy": 1.843147909641266, "epoch": 0.3463842233052972, "grad_norm": 6.786396026611328, "learning_rate": 4.298486541680868e-06, "loss": 0.4768, "mean_token_accuracy": 0.8427472576498986, "num_tokens": 134351445.0, "step": 111740 }, { "entropy": 1.807756444811821, "epoch": 0.3464152224303469, "grad_norm": 5.164430141448975, "learning_rate": 4.298294209637186e-06, "loss": 0.4349, "mean_token_accuracy": 0.853118185698986, "num_tokens": 134365069.0, "step": 111750 }, { "entropy": 1.859677466750145, "epoch": 0.3464462215553966, "grad_norm": 9.513275146484375, "learning_rate": 4.298101903408386e-06, "loss": 0.4894, "mean_token_accuracy": 0.8469240546226502, "num_tokens": 134377457.0, "step": 111760 }, { "entropy": 1.8968846127390862, "epoch": 0.3464772206804463, "grad_norm": 4.018582344055176, "learning_rate": 4.297909622988691e-06, "loss": 0.4795, "mean_token_accuracy": 0.8519986256957054, "num_tokens": 134388907.0, "step": 111770 }, { "entropy": 1.9146791011095048, "epoch": 0.346508219805496, "grad_norm": 10.912361145019531, "learning_rate": 4.297717368372331e-06, "loss": 0.4754, "mean_token_accuracy": 0.8396982088685035, "num_tokens": 134400847.0, "step": 111780 }, { "entropy": 1.9298876509070397, "epoch": 0.3465392189305457, "grad_norm": 3.7877981662750244, "learning_rate": 4.2975251395535315e-06, "loss": 0.6157, "mean_token_accuracy": 0.8234444096684456, "num_tokens": 134412392.0, "step": 111790 }, { "entropy": 1.8450885564088821, "epoch": 0.3465702180555954, "grad_norm": 9.34116268157959, "learning_rate": 4.297332936526527e-06, "loss": 0.4455, "mean_token_accuracy": 0.8459417134523392, "num_tokens": 134424120.0, "step": 111800 }, { "entropy": 1.8955134615302085, "epoch": 0.34660121718064507, "grad_norm": 8.82641315460205, "learning_rate": 4.297140759285549e-06, "loss": 0.4741, "mean_token_accuracy": 0.852502977848053, "num_tokens": 134435233.0, "step": 111810 }, { "entropy": 1.91280208081007, "epoch": 0.3466322163056948, "grad_norm": 7.556041240692139, "learning_rate": 4.296948607824833e-06, "loss": 0.4761, "mean_token_accuracy": 0.8468595892190933, "num_tokens": 134446764.0, "step": 111820 }, { "entropy": 1.7918421171605587, "epoch": 0.34666321543074446, "grad_norm": 8.250411033630371, "learning_rate": 4.296756482138616e-06, "loss": 0.4091, "mean_token_accuracy": 0.8426727816462517, "num_tokens": 134460455.0, "step": 111830 }, { "entropy": 1.8062309354543686, "epoch": 0.3466942145557942, "grad_norm": 8.455044746398926, "learning_rate": 4.2965643822211335e-06, "loss": 0.4249, "mean_token_accuracy": 0.8552202418446541, "num_tokens": 134473028.0, "step": 111840 }, { "entropy": 1.8862827733159064, "epoch": 0.34672521368084386, "grad_norm": 9.205554962158203, "learning_rate": 4.2963723080666284e-06, "loss": 0.4578, "mean_token_accuracy": 0.8518849700689316, "num_tokens": 134484150.0, "step": 111850 }, { "entropy": 1.874978469312191, "epoch": 0.3467562128058935, "grad_norm": 7.733843803405762, "learning_rate": 4.2961802596693425e-06, "loss": 0.4926, "mean_token_accuracy": 0.8496404230594635, "num_tokens": 134495656.0, "step": 111860 }, { "entropy": 1.7182648986577989, "epoch": 0.34678721193094325, "grad_norm": 3.7245407104492188, "learning_rate": 4.295988237023518e-06, "loss": 0.3912, "mean_token_accuracy": 0.8628142789006233, "num_tokens": 134509703.0, "step": 111870 }, { "entropy": 1.8488290548324584, "epoch": 0.3468182110559929, "grad_norm": 9.35959529876709, "learning_rate": 4.295796240123402e-06, "loss": 0.485, "mean_token_accuracy": 0.8483680263161659, "num_tokens": 134522262.0, "step": 111880 }, { "entropy": 1.7668012008070946, "epoch": 0.34684921018104264, "grad_norm": 6.6548662185668945, "learning_rate": 4.2956042689632414e-06, "loss": 0.4355, "mean_token_accuracy": 0.8533818453550339, "num_tokens": 134534783.0, "step": 111890 }, { "entropy": 1.8939444333314897, "epoch": 0.3468802093060923, "grad_norm": 3.5856080055236816, "learning_rate": 4.295412323537284e-06, "loss": 0.4928, "mean_token_accuracy": 0.8458682015538216, "num_tokens": 134545993.0, "step": 111900 }, { "entropy": 1.8058057472109794, "epoch": 0.34691120843114204, "grad_norm": 8.451927185058594, "learning_rate": 4.295220403839784e-06, "loss": 0.4545, "mean_token_accuracy": 0.8482358440756798, "num_tokens": 134558581.0, "step": 111910 }, { "entropy": 1.92232054322958, "epoch": 0.3469422075561917, "grad_norm": 9.112445831298828, "learning_rate": 4.29502850986499e-06, "loss": 0.5232, "mean_token_accuracy": 0.8385997459292411, "num_tokens": 134570176.0, "step": 111920 }, { "entropy": 1.8965173974633216, "epoch": 0.34697320668124143, "grad_norm": 8.362327575683594, "learning_rate": 4.294836641607161e-06, "loss": 0.4858, "mean_token_accuracy": 0.8484830498695374, "num_tokens": 134581749.0, "step": 111930 }, { "entropy": 1.8018444642424583, "epoch": 0.3470042058062911, "grad_norm": 7.452921390533447, "learning_rate": 4.294644799060549e-06, "loss": 0.3927, "mean_token_accuracy": 0.861353749036789, "num_tokens": 134595028.0, "step": 111940 }, { "entropy": 1.8184639766812325, "epoch": 0.3470352049313408, "grad_norm": 6.740150451660156, "learning_rate": 4.2944529822194155e-06, "loss": 0.3941, "mean_token_accuracy": 0.8679483845829964, "num_tokens": 134607592.0, "step": 111950 }, { "entropy": 1.8601836189627647, "epoch": 0.3470662040563905, "grad_norm": 7.418848991394043, "learning_rate": 4.294261191078018e-06, "loss": 0.4602, "mean_token_accuracy": 0.847012946009636, "num_tokens": 134619234.0, "step": 111960 }, { "entropy": 1.8946197971701622, "epoch": 0.3470972031814402, "grad_norm": 10.559837341308594, "learning_rate": 4.29406942563062e-06, "loss": 0.4595, "mean_token_accuracy": 0.8531604185700417, "num_tokens": 134630843.0, "step": 111970 }, { "entropy": 1.8593143001198769, "epoch": 0.3471282023064899, "grad_norm": 9.078370094299316, "learning_rate": 4.293877685871484e-06, "loss": 0.4688, "mean_token_accuracy": 0.8558160826563835, "num_tokens": 134642923.0, "step": 111980 }, { "entropy": 1.8398296535015106, "epoch": 0.3471592014315396, "grad_norm": 7.763385772705078, "learning_rate": 4.293685971794876e-06, "loss": 0.4964, "mean_token_accuracy": 0.8360516101121902, "num_tokens": 134655091.0, "step": 111990 }, { "entropy": 1.893106034398079, "epoch": 0.3471902005565893, "grad_norm": 9.48266315460205, "learning_rate": 4.293494283395064e-06, "loss": 0.5324, "mean_token_accuracy": 0.8328988835215568, "num_tokens": 134666427.0, "step": 112000 }, { "entropy": 1.9177181079983712, "epoch": 0.347221199681639, "grad_norm": 7.473196029663086, "learning_rate": 4.293302620666314e-06, "loss": 0.4983, "mean_token_accuracy": 0.8428963899612427, "num_tokens": 134678226.0, "step": 112010 }, { "entropy": 1.903357920050621, "epoch": 0.34725219880668867, "grad_norm": 8.634038925170898, "learning_rate": 4.293110983602899e-06, "loss": 0.5214, "mean_token_accuracy": 0.8499746888875961, "num_tokens": 134688516.0, "step": 112020 }, { "entropy": 1.9164575353264808, "epoch": 0.3472831979317384, "grad_norm": 7.6614789962768555, "learning_rate": 4.29291937219909e-06, "loss": 0.5117, "mean_token_accuracy": 0.8393423616886139, "num_tokens": 134699742.0, "step": 112030 }, { "entropy": 1.8141165480017662, "epoch": 0.34731419705678807, "grad_norm": 3.817478656768799, "learning_rate": 4.292727786449164e-06, "loss": 0.4548, "mean_token_accuracy": 0.8489150524139404, "num_tokens": 134713148.0, "step": 112040 }, { "entropy": 1.9213459312915802, "epoch": 0.3473451961818378, "grad_norm": 10.819091796875, "learning_rate": 4.292536226347394e-06, "loss": 0.5138, "mean_token_accuracy": 0.8452482402324677, "num_tokens": 134723649.0, "step": 112050 }, { "entropy": 1.8041829138994216, "epoch": 0.34737619530688746, "grad_norm": 2.2280683517456055, "learning_rate": 4.29234469188806e-06, "loss": 0.4352, "mean_token_accuracy": 0.8564143598079681, "num_tokens": 134736057.0, "step": 112060 }, { "entropy": 1.8665026307106019, "epoch": 0.3474071944319372, "grad_norm": 8.091485023498535, "learning_rate": 4.2921531830654395e-06, "loss": 0.5063, "mean_token_accuracy": 0.8456338688731193, "num_tokens": 134746830.0, "step": 112070 }, { "entropy": 1.818691223859787, "epoch": 0.34743819355698685, "grad_norm": 9.684453964233398, "learning_rate": 4.291961699873817e-06, "loss": 0.4977, "mean_token_accuracy": 0.8366310462355614, "num_tokens": 134759170.0, "step": 112080 }, { "entropy": 1.8601401686668395, "epoch": 0.3474691926820366, "grad_norm": 8.760641098022461, "learning_rate": 4.291770242307472e-06, "loss": 0.4772, "mean_token_accuracy": 0.8496808990836143, "num_tokens": 134770727.0, "step": 112090 }, { "entropy": 1.782160858809948, "epoch": 0.34750019180708624, "grad_norm": 3.916487455368042, "learning_rate": 4.291578810360692e-06, "loss": 0.4125, "mean_token_accuracy": 0.859810471534729, "num_tokens": 134783191.0, "step": 112100 }, { "entropy": 1.8653858050704002, "epoch": 0.3475311909321359, "grad_norm": 3.788681983947754, "learning_rate": 4.291387404027763e-06, "loss": 0.4692, "mean_token_accuracy": 0.8446184307336807, "num_tokens": 134794470.0, "step": 112110 }, { "entropy": 1.816742117702961, "epoch": 0.34756219005718564, "grad_norm": 3.5885984897613525, "learning_rate": 4.2911960233029745e-06, "loss": 0.4449, "mean_token_accuracy": 0.8555623233318329, "num_tokens": 134806055.0, "step": 112120 }, { "entropy": 1.817148557305336, "epoch": 0.3475931891822353, "grad_norm": 9.98969841003418, "learning_rate": 4.291004668180616e-06, "loss": 0.4527, "mean_token_accuracy": 0.8531207337975502, "num_tokens": 134817396.0, "step": 112130 }, { "entropy": 1.916732382774353, "epoch": 0.34762418830728503, "grad_norm": 9.270211219787598, "learning_rate": 4.290813338654979e-06, "loss": 0.5333, "mean_token_accuracy": 0.8368429586291313, "num_tokens": 134828575.0, "step": 112140 }, { "entropy": 1.8983269765973092, "epoch": 0.3476551874323347, "grad_norm": 8.712531089782715, "learning_rate": 4.2906220347203585e-06, "loss": 0.4922, "mean_token_accuracy": 0.8402269974350929, "num_tokens": 134840282.0, "step": 112150 }, { "entropy": 1.8531476333737373, "epoch": 0.3476861865573844, "grad_norm": 8.996459007263184, "learning_rate": 4.290430756371049e-06, "loss": 0.4724, "mean_token_accuracy": 0.8387810304760933, "num_tokens": 134852480.0, "step": 112160 }, { "entropy": 1.9212437227368355, "epoch": 0.3477171856824341, "grad_norm": 7.855039119720459, "learning_rate": 4.290239503601349e-06, "loss": 0.5461, "mean_token_accuracy": 0.8299373790621758, "num_tokens": 134864007.0, "step": 112170 }, { "entropy": 1.8462037175893784, "epoch": 0.3477481848074838, "grad_norm": 8.122658729553223, "learning_rate": 4.290048276405558e-06, "loss": 0.4725, "mean_token_accuracy": 0.8515694797039032, "num_tokens": 134874756.0, "step": 112180 }, { "entropy": 1.8744586423039435, "epoch": 0.3477791839325335, "grad_norm": 8.11306381225586, "learning_rate": 4.289857074777977e-06, "loss": 0.4777, "mean_token_accuracy": 0.8434885591268539, "num_tokens": 134886296.0, "step": 112190 }, { "entropy": 1.8067267432808876, "epoch": 0.3478101830575832, "grad_norm": 4.103103160858154, "learning_rate": 4.289665898712908e-06, "loss": 0.4167, "mean_token_accuracy": 0.8561098381876946, "num_tokens": 134898467.0, "step": 112200 }, { "entropy": 1.9343467682600022, "epoch": 0.3478411821826329, "grad_norm": 6.8563923835754395, "learning_rate": 4.289474748204655e-06, "loss": 0.5171, "mean_token_accuracy": 0.8526286870241165, "num_tokens": 134909189.0, "step": 112210 }, { "entropy": 1.8394694164395333, "epoch": 0.3478721813076826, "grad_norm": 12.269657135009766, "learning_rate": 4.289283623247527e-06, "loss": 0.4493, "mean_token_accuracy": 0.8569044172763824, "num_tokens": 134920985.0, "step": 112220 }, { "entropy": 1.7924383148550986, "epoch": 0.3479031804327323, "grad_norm": 3.9235920906066895, "learning_rate": 4.289092523835829e-06, "loss": 0.4259, "mean_token_accuracy": 0.8516478061676025, "num_tokens": 134934467.0, "step": 112230 }, { "entropy": 1.9101425260305405, "epoch": 0.347934179557782, "grad_norm": 8.595767974853516, "learning_rate": 4.288901449963873e-06, "loss": 0.5025, "mean_token_accuracy": 0.8431054592132569, "num_tokens": 134945129.0, "step": 112240 }, { "entropy": 1.8743219137191773, "epoch": 0.34796517868283167, "grad_norm": 8.251272201538086, "learning_rate": 4.288710401625969e-06, "loss": 0.4904, "mean_token_accuracy": 0.8410423472523689, "num_tokens": 134957099.0, "step": 112250 }, { "entropy": 1.8412858352065087, "epoch": 0.3479961778078814, "grad_norm": 7.280477046966553, "learning_rate": 4.2885193788164325e-06, "loss": 0.4299, "mean_token_accuracy": 0.8605325534939766, "num_tokens": 134969309.0, "step": 112260 }, { "entropy": 1.8739237666130066, "epoch": 0.34802717693293106, "grad_norm": 7.483038902282715, "learning_rate": 4.288328381529578e-06, "loss": 0.4976, "mean_token_accuracy": 0.8409060567617417, "num_tokens": 134980937.0, "step": 112270 }, { "entropy": 1.7966715544462204, "epoch": 0.3480581760579808, "grad_norm": 3.68406081199646, "learning_rate": 4.288137409759721e-06, "loss": 0.4251, "mean_token_accuracy": 0.8603166654706002, "num_tokens": 134993822.0, "step": 112280 }, { "entropy": 1.9044744417071342, "epoch": 0.34808917518303045, "grad_norm": 9.015130043029785, "learning_rate": 4.287946463501182e-06, "loss": 0.4947, "mean_token_accuracy": 0.8462210029363633, "num_tokens": 135005423.0, "step": 112290 }, { "entropy": 1.8514644920825958, "epoch": 0.3481201743080802, "grad_norm": 4.599631309509277, "learning_rate": 4.287755542748281e-06, "loss": 0.4693, "mean_token_accuracy": 0.8454376772046089, "num_tokens": 135017386.0, "step": 112300 }, { "entropy": 1.8641838699579238, "epoch": 0.34815117343312985, "grad_norm": 9.161344528198242, "learning_rate": 4.287564647495341e-06, "loss": 0.4695, "mean_token_accuracy": 0.841671122610569, "num_tokens": 135029362.0, "step": 112310 }, { "entropy": 1.8425541408360004, "epoch": 0.34818217255817957, "grad_norm": 8.259307861328125, "learning_rate": 4.287373777736684e-06, "loss": 0.4328, "mean_token_accuracy": 0.8496177807450295, "num_tokens": 135042307.0, "step": 112320 }, { "entropy": 1.8292279705405234, "epoch": 0.34821317168322924, "grad_norm": 7.943572044372559, "learning_rate": 4.287182933466639e-06, "loss": 0.4415, "mean_token_accuracy": 0.8518629640340805, "num_tokens": 135054362.0, "step": 112330 }, { "entropy": 1.7942394033074378, "epoch": 0.34824417080827896, "grad_norm": 3.5839719772338867, "learning_rate": 4.286992114679531e-06, "loss": 0.3978, "mean_token_accuracy": 0.858057115972042, "num_tokens": 135067150.0, "step": 112340 }, { "entropy": 1.9045852065086364, "epoch": 0.34827516993332863, "grad_norm": 7.259175777435303, "learning_rate": 4.286801321369691e-06, "loss": 0.4925, "mean_token_accuracy": 0.8487705394625664, "num_tokens": 135078189.0, "step": 112350 }, { "entropy": 1.8571503296494485, "epoch": 0.3483061690583783, "grad_norm": 9.252874374389648, "learning_rate": 4.286610553531448e-06, "loss": 0.4521, "mean_token_accuracy": 0.8501718282699585, "num_tokens": 135089770.0, "step": 112360 }, { "entropy": 1.8037226751446724, "epoch": 0.348337168183428, "grad_norm": 7.917973041534424, "learning_rate": 4.286419811159137e-06, "loss": 0.485, "mean_token_accuracy": 0.8473261296749115, "num_tokens": 135102849.0, "step": 112370 }, { "entropy": 1.8692201957106591, "epoch": 0.3483681673084777, "grad_norm": 4.2562575340271, "learning_rate": 4.286229094247093e-06, "loss": 0.4657, "mean_token_accuracy": 0.8484464436769485, "num_tokens": 135114862.0, "step": 112380 }, { "entropy": 1.7813385412096978, "epoch": 0.3483991664335274, "grad_norm": 8.083758354187012, "learning_rate": 4.28603840278965e-06, "loss": 0.4436, "mean_token_accuracy": 0.8533005833625793, "num_tokens": 135127643.0, "step": 112390 }, { "entropy": 1.7049074426293374, "epoch": 0.3484301655585771, "grad_norm": 4.437235355377197, "learning_rate": 4.285847736781148e-06, "loss": 0.3249, "mean_token_accuracy": 0.871871954202652, "num_tokens": 135141468.0, "step": 112400 }, { "entropy": 1.8399584576487542, "epoch": 0.3484611646836268, "grad_norm": 7.485554218292236, "learning_rate": 4.285657096215928e-06, "loss": 0.4449, "mean_token_accuracy": 0.850048904120922, "num_tokens": 135153861.0, "step": 112410 }, { "entropy": 1.8359861955046655, "epoch": 0.3484921638086765, "grad_norm": 4.102065563201904, "learning_rate": 4.285466481088329e-06, "loss": 0.5005, "mean_token_accuracy": 0.8472228407859802, "num_tokens": 135166609.0, "step": 112420 }, { "entropy": 1.8454686462879182, "epoch": 0.3485231629337262, "grad_norm": 7.877338409423828, "learning_rate": 4.2852758913926965e-06, "loss": 0.4434, "mean_token_accuracy": 0.849373996257782, "num_tokens": 135179113.0, "step": 112430 }, { "entropy": 1.8713324323296547, "epoch": 0.3485541620587759, "grad_norm": 9.064533233642578, "learning_rate": 4.285085327123374e-06, "loss": 0.4783, "mean_token_accuracy": 0.8380694881081581, "num_tokens": 135191714.0, "step": 112440 }, { "entropy": 1.8374795719981194, "epoch": 0.3485851611838256, "grad_norm": 8.24960994720459, "learning_rate": 4.284894788274712e-06, "loss": 0.4526, "mean_token_accuracy": 0.8538482531905174, "num_tokens": 135203618.0, "step": 112450 }, { "entropy": 1.9125969141721726, "epoch": 0.34861616030887527, "grad_norm": 7.1157002449035645, "learning_rate": 4.284704274841055e-06, "loss": 0.4723, "mean_token_accuracy": 0.8587391778826714, "num_tokens": 135214093.0, "step": 112460 }, { "entropy": 1.7782380178570747, "epoch": 0.348647159433925, "grad_norm": 6.913045406341553, "learning_rate": 4.284513786816757e-06, "loss": 0.3724, "mean_token_accuracy": 0.865745086967945, "num_tokens": 135227964.0, "step": 112470 }, { "entropy": 1.8960969477891922, "epoch": 0.34867815855897466, "grad_norm": 8.845172882080078, "learning_rate": 4.284323324196168e-06, "loss": 0.5353, "mean_token_accuracy": 0.8316770166158676, "num_tokens": 135238940.0, "step": 112480 }, { "entropy": 1.8000614181160928, "epoch": 0.3487091576840244, "grad_norm": 9.656888008117676, "learning_rate": 4.284132886973643e-06, "loss": 0.4356, "mean_token_accuracy": 0.8554758876562119, "num_tokens": 135251553.0, "step": 112490 }, { "entropy": 1.8248675152659417, "epoch": 0.34874015680907405, "grad_norm": 8.482026100158691, "learning_rate": 4.283942475143537e-06, "loss": 0.4344, "mean_token_accuracy": 0.8560914367437362, "num_tokens": 135263125.0, "step": 112500 }, { "entropy": 1.830253078043461, "epoch": 0.3487711559341238, "grad_norm": 7.03892183303833, "learning_rate": 4.283752088700209e-06, "loss": 0.4362, "mean_token_accuracy": 0.8495120465755462, "num_tokens": 135275581.0, "step": 112510 }, { "entropy": 1.840060842782259, "epoch": 0.34880215505917345, "grad_norm": 8.70134449005127, "learning_rate": 4.283561727638018e-06, "loss": 0.4745, "mean_token_accuracy": 0.8449655011296272, "num_tokens": 135287871.0, "step": 112520 }, { "entropy": 1.7875398755073548, "epoch": 0.34883315418422317, "grad_norm": 8.763019561767578, "learning_rate": 4.283371391951324e-06, "loss": 0.4492, "mean_token_accuracy": 0.846840138733387, "num_tokens": 135300518.0, "step": 112530 }, { "entropy": 1.906669056415558, "epoch": 0.34886415330927284, "grad_norm": 8.401066780090332, "learning_rate": 4.2831810816344906e-06, "loss": 0.5297, "mean_token_accuracy": 0.8414530888199806, "num_tokens": 135311327.0, "step": 112540 }, { "entropy": 1.847528837621212, "epoch": 0.34889515243432256, "grad_norm": 9.117242813110352, "learning_rate": 4.282990796681881e-06, "loss": 0.4505, "mean_token_accuracy": 0.8574508696794509, "num_tokens": 135322556.0, "step": 112550 }, { "entropy": 1.8085899502038956, "epoch": 0.34892615155937223, "grad_norm": 8.368361473083496, "learning_rate": 4.282800537087866e-06, "loss": 0.4078, "mean_token_accuracy": 0.8587646409869194, "num_tokens": 135335243.0, "step": 112560 }, { "entropy": 1.8577345311641693, "epoch": 0.34895715068442196, "grad_norm": 8.698378562927246, "learning_rate": 4.282610302846807e-06, "loss": 0.4703, "mean_token_accuracy": 0.8500236317515373, "num_tokens": 135346757.0, "step": 112570 }, { "entropy": 1.8168217539787292, "epoch": 0.3489881498094716, "grad_norm": 9.063081741333008, "learning_rate": 4.2824200939530796e-06, "loss": 0.4817, "mean_token_accuracy": 0.8485912069678306, "num_tokens": 135359508.0, "step": 112580 }, { "entropy": 1.8818160638213157, "epoch": 0.34901914893452135, "grad_norm": 9.063243865966797, "learning_rate": 4.282229910401052e-06, "loss": 0.5124, "mean_token_accuracy": 0.8384617939591408, "num_tokens": 135371143.0, "step": 112590 }, { "entropy": 1.908516588807106, "epoch": 0.349050148059571, "grad_norm": 7.1072282791137695, "learning_rate": 4.282039752185099e-06, "loss": 0.4676, "mean_token_accuracy": 0.8510788649320602, "num_tokens": 135382304.0, "step": 112600 }, { "entropy": 1.9243893340229987, "epoch": 0.3490811471846207, "grad_norm": 8.661938667297363, "learning_rate": 4.2818496192995955e-06, "loss": 0.5046, "mean_token_accuracy": 0.8423077017068863, "num_tokens": 135393382.0, "step": 112610 }, { "entropy": 1.9478312745690345, "epoch": 0.3491121463096704, "grad_norm": 8.19333267211914, "learning_rate": 4.281659511738918e-06, "loss": 0.545, "mean_token_accuracy": 0.8402326017618179, "num_tokens": 135404664.0, "step": 112620 }, { "entropy": 1.9101561456918716, "epoch": 0.3491431454347201, "grad_norm": 11.05017375946045, "learning_rate": 4.281469429497445e-06, "loss": 0.5287, "mean_token_accuracy": 0.8389782354235649, "num_tokens": 135416146.0, "step": 112630 }, { "entropy": 1.8043391317129136, "epoch": 0.3491741445597698, "grad_norm": 9.59979248046875, "learning_rate": 4.281279372569557e-06, "loss": 0.4637, "mean_token_accuracy": 0.8521313741803169, "num_tokens": 135428371.0, "step": 112640 }, { "entropy": 1.822686144709587, "epoch": 0.3492051436848195, "grad_norm": 9.630875587463379, "learning_rate": 4.281089340949636e-06, "loss": 0.4281, "mean_token_accuracy": 0.8514738500118255, "num_tokens": 135440601.0, "step": 112650 }, { "entropy": 1.844109094142914, "epoch": 0.3492361428098692, "grad_norm": 3.6397085189819336, "learning_rate": 4.280899334632067e-06, "loss": 0.4534, "mean_token_accuracy": 0.8515435203909874, "num_tokens": 135452180.0, "step": 112660 }, { "entropy": 1.7884308710694312, "epoch": 0.34926714193491887, "grad_norm": 8.593812942504883, "learning_rate": 4.280709353611234e-06, "loss": 0.4016, "mean_token_accuracy": 0.8537843570113182, "num_tokens": 135465795.0, "step": 112670 }, { "entropy": 1.7283026084303856, "epoch": 0.3492981410599686, "grad_norm": 8.442497253417969, "learning_rate": 4.280519397881524e-06, "loss": 0.3646, "mean_token_accuracy": 0.8640974462032318, "num_tokens": 135478919.0, "step": 112680 }, { "entropy": 1.8432815566658973, "epoch": 0.34932914018501826, "grad_norm": 8.06016731262207, "learning_rate": 4.280329467437327e-06, "loss": 0.4117, "mean_token_accuracy": 0.8606173783540726, "num_tokens": 135490424.0, "step": 112690 }, { "entropy": 1.821248809993267, "epoch": 0.349360139310068, "grad_norm": 7.616511821746826, "learning_rate": 4.280139562273034e-06, "loss": 0.4417, "mean_token_accuracy": 0.847284109890461, "num_tokens": 135503303.0, "step": 112700 }, { "entropy": 1.8346032842993736, "epoch": 0.34939113843511765, "grad_norm": 8.359513282775879, "learning_rate": 4.279949682383039e-06, "loss": 0.4911, "mean_token_accuracy": 0.8445788651704789, "num_tokens": 135514959.0, "step": 112710 }, { "entropy": 1.8606849431991577, "epoch": 0.3494221375601674, "grad_norm": 9.360651016235352, "learning_rate": 4.279759827761733e-06, "loss": 0.5023, "mean_token_accuracy": 0.8447454944252968, "num_tokens": 135525909.0, "step": 112720 }, { "entropy": 1.8562923952937127, "epoch": 0.34945313668521705, "grad_norm": 7.43868350982666, "learning_rate": 4.279569998403512e-06, "loss": 0.5419, "mean_token_accuracy": 0.8330693423748017, "num_tokens": 135537774.0, "step": 112730 }, { "entropy": 1.7586598336696624, "epoch": 0.34948413581026677, "grad_norm": 4.424074649810791, "learning_rate": 4.279380194302777e-06, "loss": 0.4219, "mean_token_accuracy": 0.8576010316610336, "num_tokens": 135550831.0, "step": 112740 }, { "entropy": 1.7588186517357827, "epoch": 0.34951513493531644, "grad_norm": 8.668641090393066, "learning_rate": 4.279190415453926e-06, "loss": 0.3962, "mean_token_accuracy": 0.8586042076349258, "num_tokens": 135564325.0, "step": 112750 }, { "entropy": 1.814384798705578, "epoch": 0.34954613406036616, "grad_norm": 8.124059677124023, "learning_rate": 4.279000661851359e-06, "loss": 0.4319, "mean_token_accuracy": 0.8556953683495522, "num_tokens": 135576775.0, "step": 112760 }, { "entropy": 1.8137044474482535, "epoch": 0.34957713318541583, "grad_norm": 9.291952133178711, "learning_rate": 4.278810933489481e-06, "loss": 0.4802, "mean_token_accuracy": 0.8497354611754417, "num_tokens": 135589390.0, "step": 112770 }, { "entropy": 1.8097498089075088, "epoch": 0.34960813231046556, "grad_norm": 7.762622356414795, "learning_rate": 4.278621230362695e-06, "loss": 0.4427, "mean_token_accuracy": 0.8576091334223748, "num_tokens": 135602123.0, "step": 112780 }, { "entropy": 1.9097516283392906, "epoch": 0.3496391314355152, "grad_norm": 8.101143836975098, "learning_rate": 4.278431552465409e-06, "loss": 0.522, "mean_token_accuracy": 0.8374461770057678, "num_tokens": 135613911.0, "step": 112790 }, { "entropy": 1.8759268373250961, "epoch": 0.34967013056056495, "grad_norm": 8.917213439941406, "learning_rate": 4.27824189979203e-06, "loss": 0.4684, "mean_token_accuracy": 0.8517908856272698, "num_tokens": 135625727.0, "step": 112800 }, { "entropy": 1.8602227076888085, "epoch": 0.3497011296856146, "grad_norm": 5.823835372924805, "learning_rate": 4.278052272336967e-06, "loss": 0.4806, "mean_token_accuracy": 0.8478166356682777, "num_tokens": 135637869.0, "step": 112810 }, { "entropy": 1.921775433421135, "epoch": 0.34973212881066434, "grad_norm": 10.301258087158203, "learning_rate": 4.2778626700946335e-06, "loss": 0.5161, "mean_token_accuracy": 0.8499249458312989, "num_tokens": 135648405.0, "step": 112820 }, { "entropy": 1.877712282538414, "epoch": 0.349763127935714, "grad_norm": 7.107439994812012, "learning_rate": 4.2776730930594425e-06, "loss": 0.5527, "mean_token_accuracy": 0.8331371307373047, "num_tokens": 135660457.0, "step": 112830 }, { "entropy": 1.83882287889719, "epoch": 0.34979412706076374, "grad_norm": 12.444573402404785, "learning_rate": 4.277483541225809e-06, "loss": 0.5465, "mean_token_accuracy": 0.8394935175776481, "num_tokens": 135672611.0, "step": 112840 }, { "entropy": 1.8228483900427819, "epoch": 0.3498251261858134, "grad_norm": 7.8412933349609375, "learning_rate": 4.27729401458815e-06, "loss": 0.4319, "mean_token_accuracy": 0.8546199440956116, "num_tokens": 135684136.0, "step": 112850 }, { "entropy": 1.8391767397522927, "epoch": 0.3498561253108631, "grad_norm": 8.248656272888184, "learning_rate": 4.2771045131408834e-06, "loss": 0.4476, "mean_token_accuracy": 0.8546992540359497, "num_tokens": 135696190.0, "step": 112860 }, { "entropy": 1.8091487675905227, "epoch": 0.3498871244359128, "grad_norm": 3.6982574462890625, "learning_rate": 4.2769150368784295e-06, "loss": 0.4425, "mean_token_accuracy": 0.8531347304582596, "num_tokens": 135708890.0, "step": 112870 }, { "entropy": 1.7924383908510209, "epoch": 0.34991812356096247, "grad_norm": 8.61406135559082, "learning_rate": 4.2767255857952115e-06, "loss": 0.4149, "mean_token_accuracy": 0.8567244291305542, "num_tokens": 135721688.0, "step": 112880 }, { "entropy": 1.850319354236126, "epoch": 0.3499491226860122, "grad_norm": 7.656332015991211, "learning_rate": 4.2765361598856534e-06, "loss": 0.52, "mean_token_accuracy": 0.8434787318110466, "num_tokens": 135733821.0, "step": 112890 }, { "entropy": 1.8255574628710747, "epoch": 0.34998012181106186, "grad_norm": 3.7060792446136475, "learning_rate": 4.276346759144178e-06, "loss": 0.4367, "mean_token_accuracy": 0.8535738855600357, "num_tokens": 135745431.0, "step": 112900 }, { "entropy": 1.8570487424731255, "epoch": 0.3500111209361116, "grad_norm": 8.941734313964844, "learning_rate": 4.276157383565215e-06, "loss": 0.488, "mean_token_accuracy": 0.8438050165772438, "num_tokens": 135757174.0, "step": 112910 }, { "entropy": 1.683679609745741, "epoch": 0.35004212006116125, "grad_norm": 2.426884174346924, "learning_rate": 4.2759680331431915e-06, "loss": 0.3749, "mean_token_accuracy": 0.8656910866498947, "num_tokens": 135772071.0, "step": 112920 }, { "entropy": 1.8811020210385323, "epoch": 0.350073119186211, "grad_norm": 9.89200210571289, "learning_rate": 4.275778707872541e-06, "loss": 0.4854, "mean_token_accuracy": 0.8509863570332528, "num_tokens": 135783363.0, "step": 112930 }, { "entropy": 1.8929930627346039, "epoch": 0.35010411831126065, "grad_norm": 8.002644538879395, "learning_rate": 4.2755894077476925e-06, "loss": 0.5056, "mean_token_accuracy": 0.8419990673661232, "num_tokens": 135794454.0, "step": 112940 }, { "entropy": 1.8019443243741988, "epoch": 0.3501351174363104, "grad_norm": 7.989790916442871, "learning_rate": 4.275400132763083e-06, "loss": 0.4648, "mean_token_accuracy": 0.8515826135873794, "num_tokens": 135806472.0, "step": 112950 }, { "entropy": 1.8456357419490814, "epoch": 0.35016611656136004, "grad_norm": 4.0877556800842285, "learning_rate": 4.275210882913148e-06, "loss": 0.5089, "mean_token_accuracy": 0.8436346635222435, "num_tokens": 135818253.0, "step": 112960 }, { "entropy": 1.8550100848078728, "epoch": 0.35019711568640977, "grad_norm": 9.118851661682129, "learning_rate": 4.275021658192323e-06, "loss": 0.4907, "mean_token_accuracy": 0.8482651144266129, "num_tokens": 135829973.0, "step": 112970 }, { "entropy": 1.8620183497667313, "epoch": 0.35022811481145943, "grad_norm": 12.924306869506836, "learning_rate": 4.274832458595049e-06, "loss": 0.4881, "mean_token_accuracy": 0.8375033006072045, "num_tokens": 135840885.0, "step": 112980 }, { "entropy": 1.7792857438325882, "epoch": 0.35025911393650916, "grad_norm": 7.036866664886475, "learning_rate": 4.274643284115767e-06, "loss": 0.4054, "mean_token_accuracy": 0.848195219039917, "num_tokens": 135853598.0, "step": 112990 }, { "entropy": 1.8005570635199546, "epoch": 0.3502901130615588, "grad_norm": 8.605463027954102, "learning_rate": 4.274454134748919e-06, "loss": 0.4765, "mean_token_accuracy": 0.8483079835772515, "num_tokens": 135866796.0, "step": 113000 }, { "entropy": 1.867513844370842, "epoch": 0.35032111218660855, "grad_norm": 8.543618202209473, "learning_rate": 4.2742650104889496e-06, "loss": 0.4927, "mean_token_accuracy": 0.8421450614929199, "num_tokens": 135878432.0, "step": 113010 }, { "entropy": 1.7870407178997993, "epoch": 0.3503521113116582, "grad_norm": 8.509590148925781, "learning_rate": 4.274075911330306e-06, "loss": 0.437, "mean_token_accuracy": 0.8518823325634003, "num_tokens": 135891001.0, "step": 113020 }, { "entropy": 1.8838384568691253, "epoch": 0.35038311043670795, "grad_norm": 8.782331466674805, "learning_rate": 4.273886837267435e-06, "loss": 0.5032, "mean_token_accuracy": 0.8412837624549866, "num_tokens": 135902263.0, "step": 113030 }, { "entropy": 1.7536726333200932, "epoch": 0.3504141095617576, "grad_norm": 4.015054702758789, "learning_rate": 4.2736977882947855e-06, "loss": 0.3985, "mean_token_accuracy": 0.8617945462465286, "num_tokens": 135915709.0, "step": 113040 }, { "entropy": 1.8604958072304725, "epoch": 0.35044510868680734, "grad_norm": 7.668582916259766, "learning_rate": 4.273508764406811e-06, "loss": 0.4567, "mean_token_accuracy": 0.8354014694690705, "num_tokens": 135928153.0, "step": 113050 }, { "entropy": 1.7598976030945779, "epoch": 0.350476107811857, "grad_norm": 10.459297180175781, "learning_rate": 4.273319765597964e-06, "loss": 0.3907, "mean_token_accuracy": 0.8666064769029618, "num_tokens": 135940924.0, "step": 113060 }, { "entropy": 1.925863367319107, "epoch": 0.35050710693690673, "grad_norm": 4.1394853591918945, "learning_rate": 4.273130791862697e-06, "loss": 0.5217, "mean_token_accuracy": 0.839854308962822, "num_tokens": 135952237.0, "step": 113070 }, { "entropy": 1.8460562735795976, "epoch": 0.3505381060619564, "grad_norm": 8.900185585021973, "learning_rate": 4.27294184319547e-06, "loss": 0.4746, "mean_token_accuracy": 0.8506451055407525, "num_tokens": 135964374.0, "step": 113080 }, { "entropy": 1.8739062339067458, "epoch": 0.35056910518700607, "grad_norm": Infinity, "learning_rate": 4.272752919590739e-06, "loss": 0.4781, "mean_token_accuracy": 0.8531906709074975, "num_tokens": 135974841.0, "step": 113090 }, { "entropy": 1.9152967289090157, "epoch": 0.3506001043120558, "grad_norm": 9.287792205810547, "learning_rate": 4.272564021042964e-06, "loss": 0.5305, "mean_token_accuracy": 0.8334060996770859, "num_tokens": 135986242.0, "step": 113100 }, { "entropy": 1.779229535162449, "epoch": 0.35063110343710546, "grad_norm": 8.452899932861328, "learning_rate": 4.272375147546608e-06, "loss": 0.3765, "mean_token_accuracy": 0.8608766749501229, "num_tokens": 135998810.0, "step": 113110 }, { "entropy": 1.9012344628572464, "epoch": 0.3506621025621552, "grad_norm": 8.408564567565918, "learning_rate": 4.272186299096133e-06, "loss": 0.4997, "mean_token_accuracy": 0.8417745620012284, "num_tokens": 136009754.0, "step": 113120 }, { "entropy": 1.8403314396739006, "epoch": 0.35069310168720486, "grad_norm": 10.097549438476562, "learning_rate": 4.271997475686004e-06, "loss": 0.4692, "mean_token_accuracy": 0.8540255069732666, "num_tokens": 136021001.0, "step": 113130 }, { "entropy": 1.8598705619573592, "epoch": 0.3507241008122546, "grad_norm": 10.163576126098633, "learning_rate": 4.2718086773106895e-06, "loss": 0.4469, "mean_token_accuracy": 0.8600896030664444, "num_tokens": 136032506.0, "step": 113140 }, { "entropy": 1.8539843708276749, "epoch": 0.35075509993730425, "grad_norm": 9.255050659179688, "learning_rate": 4.271619903964656e-06, "loss": 0.4837, "mean_token_accuracy": 0.8301028832793236, "num_tokens": 136044950.0, "step": 113150 }, { "entropy": 1.7757720202207565, "epoch": 0.350786099062354, "grad_norm": 9.801958084106445, "learning_rate": 4.271431155642374e-06, "loss": 0.4251, "mean_token_accuracy": 0.8526146367192269, "num_tokens": 136057364.0, "step": 113160 }, { "entropy": 1.818256576359272, "epoch": 0.35081709818740364, "grad_norm": 8.34134578704834, "learning_rate": 4.271242432338316e-06, "loss": 0.4512, "mean_token_accuracy": 0.8509904339909553, "num_tokens": 136069897.0, "step": 113170 }, { "entropy": 1.7901548087596892, "epoch": 0.35084809731245337, "grad_norm": 10.111919403076172, "learning_rate": 4.271053734046957e-06, "loss": 0.428, "mean_token_accuracy": 0.8516396522521973, "num_tokens": 136081662.0, "step": 113180 }, { "entropy": 1.8030595824122428, "epoch": 0.35087909643750304, "grad_norm": 8.223822593688965, "learning_rate": 4.270865060762769e-06, "loss": 0.4416, "mean_token_accuracy": 0.8596414580941201, "num_tokens": 136094054.0, "step": 113190 }, { "entropy": 1.8742952436208724, "epoch": 0.35091009556255276, "grad_norm": 7.283956050872803, "learning_rate": 4.270676412480232e-06, "loss": 0.4801, "mean_token_accuracy": 0.8499576464295387, "num_tokens": 136104584.0, "step": 113200 }, { "entropy": 1.8488594472408295, "epoch": 0.35094109468760243, "grad_norm": 4.605078220367432, "learning_rate": 4.270487789193823e-06, "loss": 0.5479, "mean_token_accuracy": 0.829779888689518, "num_tokens": 136116396.0, "step": 113210 }, { "entropy": 1.7764787435531617, "epoch": 0.35097209381265215, "grad_norm": 4.046032905578613, "learning_rate": 4.270299190898024e-06, "loss": 0.4293, "mean_token_accuracy": 0.8495251774787903, "num_tokens": 136129027.0, "step": 113220 }, { "entropy": 1.8619268134236335, "epoch": 0.3510030929377018, "grad_norm": 7.183846473693848, "learning_rate": 4.2701106175873156e-06, "loss": 0.5151, "mean_token_accuracy": 0.8402719959616661, "num_tokens": 136141038.0, "step": 113230 }, { "entropy": 1.8102800309658051, "epoch": 0.35103409206275155, "grad_norm": 3.5368010997772217, "learning_rate": 4.2699220692561825e-06, "loss": 0.4151, "mean_token_accuracy": 0.8623706936836243, "num_tokens": 136153341.0, "step": 113240 }, { "entropy": 1.9274267673492431, "epoch": 0.3510650911878012, "grad_norm": 6.329407215118408, "learning_rate": 4.26973354589911e-06, "loss": 0.46, "mean_token_accuracy": 0.8464403167366982, "num_tokens": 136164572.0, "step": 113250 }, { "entropy": 1.859284907579422, "epoch": 0.35109609031285094, "grad_norm": 9.37317943572998, "learning_rate": 4.2695450475105855e-06, "loss": 0.5012, "mean_token_accuracy": 0.8453105211257934, "num_tokens": 136176202.0, "step": 113260 }, { "entropy": 1.8950539276003837, "epoch": 0.3511270894379006, "grad_norm": 8.385150909423828, "learning_rate": 4.269356574085098e-06, "loss": 0.4593, "mean_token_accuracy": 0.8488032355904579, "num_tokens": 136187307.0, "step": 113270 }, { "entropy": 1.844504640996456, "epoch": 0.35115808856295033, "grad_norm": 3.8115599155426025, "learning_rate": 4.269168125617139e-06, "loss": 0.4382, "mean_token_accuracy": 0.8605356469750405, "num_tokens": 136198798.0, "step": 113280 }, { "entropy": 1.8304932430386542, "epoch": 0.351189087688, "grad_norm": 6.604245185852051, "learning_rate": 4.2689797021012e-06, "loss": 0.4187, "mean_token_accuracy": 0.8543907523155212, "num_tokens": 136211146.0, "step": 113290 }, { "entropy": 1.7669322073459626, "epoch": 0.3512200868130497, "grad_norm": 9.710320472717285, "learning_rate": 4.268791303531774e-06, "loss": 0.4546, "mean_token_accuracy": 0.8585976973176003, "num_tokens": 136223754.0, "step": 113300 }, { "entropy": 1.8966532975435257, "epoch": 0.3512510859380994, "grad_norm": 7.919690132141113, "learning_rate": 4.268602929903359e-06, "loss": 0.5226, "mean_token_accuracy": 0.8372878462076188, "num_tokens": 136234793.0, "step": 113310 }, { "entropy": 1.8168953225016593, "epoch": 0.3512820850631491, "grad_norm": 9.95252513885498, "learning_rate": 4.26841458121045e-06, "loss": 0.4564, "mean_token_accuracy": 0.8458817571401596, "num_tokens": 136247165.0, "step": 113320 }, { "entropy": 1.8963631451129914, "epoch": 0.3513130841881988, "grad_norm": 8.91224193572998, "learning_rate": 4.26822625744755e-06, "loss": 0.5239, "mean_token_accuracy": 0.83456601947546, "num_tokens": 136259113.0, "step": 113330 }, { "entropy": 1.7711772859096526, "epoch": 0.35134408331324846, "grad_norm": 8.007668495178223, "learning_rate": 4.268037958609155e-06, "loss": 0.3735, "mean_token_accuracy": 0.8632699027657509, "num_tokens": 136272310.0, "step": 113340 }, { "entropy": 1.9127058327198028, "epoch": 0.3513750824382982, "grad_norm": 8.543416023254395, "learning_rate": 4.267849684689771e-06, "loss": 0.5151, "mean_token_accuracy": 0.8420788779854774, "num_tokens": 136283081.0, "step": 113350 }, { "entropy": 1.8306966736912726, "epoch": 0.35140608156334785, "grad_norm": 7.711248397827148, "learning_rate": 4.267661435683903e-06, "loss": 0.4486, "mean_token_accuracy": 0.8500320836901665, "num_tokens": 136295002.0, "step": 113360 }, { "entropy": 1.9049995988607407, "epoch": 0.3514370806883976, "grad_norm": 9.507410049438477, "learning_rate": 4.267473211586053e-06, "loss": 0.5309, "mean_token_accuracy": 0.8459584578871727, "num_tokens": 136305515.0, "step": 113370 }, { "entropy": 1.91897811293602, "epoch": 0.35146807981344724, "grad_norm": 8.550996780395508, "learning_rate": 4.267285012390732e-06, "loss": 0.5122, "mean_token_accuracy": 0.8397913321852684, "num_tokens": 136316379.0, "step": 113380 }, { "entropy": 1.837944434583187, "epoch": 0.35149907893849697, "grad_norm": 9.132309913635254, "learning_rate": 4.267096838092449e-06, "loss": 0.4261, "mean_token_accuracy": 0.8522770449519157, "num_tokens": 136328473.0, "step": 113390 }, { "entropy": 1.8238879337906837, "epoch": 0.35153007806354664, "grad_norm": 8.676041603088379, "learning_rate": 4.266908688685714e-06, "loss": 0.4717, "mean_token_accuracy": 0.8456574112176896, "num_tokens": 136340510.0, "step": 113400 }, { "entropy": 1.8029537141323089, "epoch": 0.35156107718859636, "grad_norm": 8.039926528930664, "learning_rate": 4.26672056416504e-06, "loss": 0.4531, "mean_token_accuracy": 0.8499104723334312, "num_tokens": 136352921.0, "step": 113410 }, { "entropy": 1.8220291912555695, "epoch": 0.35159207631364603, "grad_norm": 3.9269118309020996, "learning_rate": 4.2665324645249425e-06, "loss": 0.4366, "mean_token_accuracy": 0.8560845017433166, "num_tokens": 136365546.0, "step": 113420 }, { "entropy": 1.8378960967063904, "epoch": 0.35162307543869575, "grad_norm": 7.433810234069824, "learning_rate": 4.266344389759935e-06, "loss": 0.4517, "mean_token_accuracy": 0.8511560037732124, "num_tokens": 136377851.0, "step": 113430 }, { "entropy": 1.838972160220146, "epoch": 0.3516540745637454, "grad_norm": 8.42202091217041, "learning_rate": 4.2661563398645395e-06, "loss": 0.4176, "mean_token_accuracy": 0.8631669625639915, "num_tokens": 136389500.0, "step": 113440 }, { "entropy": 1.9216330140829085, "epoch": 0.35168507368879515, "grad_norm": 8.987174987792969, "learning_rate": 4.2659683148332716e-06, "loss": 0.5193, "mean_token_accuracy": 0.8396774441003799, "num_tokens": 136400087.0, "step": 113450 }, { "entropy": 1.814877037703991, "epoch": 0.3517160728138448, "grad_norm": 4.2126688957214355, "learning_rate": 4.265780314660655e-06, "loss": 0.4897, "mean_token_accuracy": 0.8484560027718544, "num_tokens": 136412750.0, "step": 113460 }, { "entropy": 1.675320317596197, "epoch": 0.35174707193889454, "grad_norm": 7.873843669891357, "learning_rate": 4.265592339341211e-06, "loss": 0.374, "mean_token_accuracy": 0.861806321144104, "num_tokens": 136428096.0, "step": 113470 }, { "entropy": 1.7690242916345595, "epoch": 0.3517780710639442, "grad_norm": 8.084175109863281, "learning_rate": 4.265404388869465e-06, "loss": 0.4054, "mean_token_accuracy": 0.8658703714609146, "num_tokens": 136440575.0, "step": 113480 }, { "entropy": 1.853781743347645, "epoch": 0.35180907018899393, "grad_norm": 3.831251621246338, "learning_rate": 4.265216463239944e-06, "loss": 0.4657, "mean_token_accuracy": 0.8527114719152451, "num_tokens": 136452228.0, "step": 113490 }, { "entropy": 1.7241961359977722, "epoch": 0.3518400693140436, "grad_norm": 3.6539437770843506, "learning_rate": 4.265028562447174e-06, "loss": 0.381, "mean_token_accuracy": 0.8586121633648872, "num_tokens": 136465487.0, "step": 113500 }, { "entropy": 1.763987348973751, "epoch": 0.3518710684390933, "grad_norm": 9.397258758544922, "learning_rate": 4.264840686485687e-06, "loss": 0.4463, "mean_token_accuracy": 0.8535782873630524, "num_tokens": 136477731.0, "step": 113510 }, { "entropy": 1.821959725022316, "epoch": 0.351902067564143, "grad_norm": 3.157595634460449, "learning_rate": 4.264652835350013e-06, "loss": 0.4798, "mean_token_accuracy": 0.8425436571240426, "num_tokens": 136489410.0, "step": 113520 }, { "entropy": 1.8831856340169906, "epoch": 0.3519330666891927, "grad_norm": 6.910709857940674, "learning_rate": 4.264465009034684e-06, "loss": 0.4742, "mean_token_accuracy": 0.8598052114248276, "num_tokens": 136500743.0, "step": 113530 }, { "entropy": 1.9352434664964675, "epoch": 0.3519640658142424, "grad_norm": 11.542353630065918, "learning_rate": 4.264277207534237e-06, "loss": 0.5419, "mean_token_accuracy": 0.8231759518384933, "num_tokens": 136511423.0, "step": 113540 }, { "entropy": 1.719076856970787, "epoch": 0.3519950649392921, "grad_norm": 8.80350112915039, "learning_rate": 4.264089430843207e-06, "loss": 0.3627, "mean_token_accuracy": 0.8623046159744263, "num_tokens": 136525514.0, "step": 113550 }, { "entropy": 1.8064083874225616, "epoch": 0.3520260640643418, "grad_norm": 9.935619354248047, "learning_rate": 4.263901678956134e-06, "loss": 0.4276, "mean_token_accuracy": 0.85721445530653, "num_tokens": 136537364.0, "step": 113560 }, { "entropy": 1.696343556046486, "epoch": 0.3520570631893915, "grad_norm": 3.945061445236206, "learning_rate": 4.263713951867554e-06, "loss": 0.3869, "mean_token_accuracy": 0.8635579511523247, "num_tokens": 136551460.0, "step": 113570 }, { "entropy": 1.8621020391583443, "epoch": 0.3520880623144412, "grad_norm": 10.784354209899902, "learning_rate": 4.263526249572011e-06, "loss": 0.4916, "mean_token_accuracy": 0.8448983728885651, "num_tokens": 136562725.0, "step": 113580 }, { "entropy": 1.8034757658839227, "epoch": 0.35211906143949084, "grad_norm": 7.975634574890137, "learning_rate": 4.263338572064049e-06, "loss": 0.4426, "mean_token_accuracy": 0.8480996668338776, "num_tokens": 136574947.0, "step": 113590 }, { "entropy": 1.7639338225126266, "epoch": 0.35215006056454057, "grad_norm": 2.267158031463623, "learning_rate": 4.263150919338211e-06, "loss": 0.4393, "mean_token_accuracy": 0.8544442996382713, "num_tokens": 136587812.0, "step": 113600 }, { "entropy": 1.7771714597940445, "epoch": 0.35218105968959024, "grad_norm": 4.19359016418457, "learning_rate": 4.262963291389045e-06, "loss": 0.4269, "mean_token_accuracy": 0.851497220993042, "num_tokens": 136601096.0, "step": 113610 }, { "entropy": 1.8166782572865485, "epoch": 0.35221205881463996, "grad_norm": 4.370350360870361, "learning_rate": 4.262775688211097e-06, "loss": 0.4615, "mean_token_accuracy": 0.8466724216938019, "num_tokens": 136613550.0, "step": 113620 }, { "entropy": 1.8014537960290908, "epoch": 0.35224305793968963, "grad_norm": 7.764684677124023, "learning_rate": 4.262588109798919e-06, "loss": 0.4062, "mean_token_accuracy": 0.8574507102370262, "num_tokens": 136626300.0, "step": 113630 }, { "entropy": 1.7783242926001548, "epoch": 0.35227405706473935, "grad_norm": 7.457787036895752, "learning_rate": 4.262400556147062e-06, "loss": 0.4071, "mean_token_accuracy": 0.8619847133755684, "num_tokens": 136638878.0, "step": 113640 }, { "entropy": 1.7466833159327506, "epoch": 0.352305056189789, "grad_norm": 4.234191417694092, "learning_rate": 4.2622130272500775e-06, "loss": 0.4237, "mean_token_accuracy": 0.8527040392160415, "num_tokens": 136652119.0, "step": 113650 }, { "entropy": 1.8685536488890648, "epoch": 0.35233605531483875, "grad_norm": 8.825045585632324, "learning_rate": 4.262025523102523e-06, "loss": 0.5118, "mean_token_accuracy": 0.8355319261550903, "num_tokens": 136663383.0, "step": 113660 }, { "entropy": 1.88068146109581, "epoch": 0.3523670544398884, "grad_norm": 10.0654296875, "learning_rate": 4.261838043698953e-06, "loss": 0.5218, "mean_token_accuracy": 0.8337692707777024, "num_tokens": 136674512.0, "step": 113670 }, { "entropy": 1.8553010821342468, "epoch": 0.35239805356493814, "grad_norm": 8.771764755249023, "learning_rate": 4.2616505890339275e-06, "loss": 0.4976, "mean_token_accuracy": 0.8464409559965134, "num_tokens": 136686081.0, "step": 113680 }, { "entropy": 1.8780170038342476, "epoch": 0.3524290526899878, "grad_norm": 7.942883014678955, "learning_rate": 4.261463159102005e-06, "loss": 0.5024, "mean_token_accuracy": 0.8412024825811386, "num_tokens": 136697359.0, "step": 113690 }, { "entropy": 1.82506482899189, "epoch": 0.35246005181503753, "grad_norm": 8.61766242980957, "learning_rate": 4.261275753897748e-06, "loss": 0.4501, "mean_token_accuracy": 0.8486872628331185, "num_tokens": 136709869.0, "step": 113700 }, { "entropy": 1.8322357684373856, "epoch": 0.3524910509400872, "grad_norm": 3.983318328857422, "learning_rate": 4.261088373415719e-06, "loss": 0.4879, "mean_token_accuracy": 0.8400922358036041, "num_tokens": 136721994.0, "step": 113710 }, { "entropy": 1.8604241654276847, "epoch": 0.3525220500651369, "grad_norm": 7.393711090087891, "learning_rate": 4.260901017650485e-06, "loss": 0.4855, "mean_token_accuracy": 0.8476014077663422, "num_tokens": 136732793.0, "step": 113720 }, { "entropy": 1.8291943043470382, "epoch": 0.3525530491901866, "grad_norm": 8.577530860900879, "learning_rate": 4.26071368659661e-06, "loss": 0.4601, "mean_token_accuracy": 0.8571088537573814, "num_tokens": 136744850.0, "step": 113730 }, { "entropy": 1.810156986117363, "epoch": 0.3525840483152363, "grad_norm": 7.944915294647217, "learning_rate": 4.260526380248662e-06, "loss": 0.4268, "mean_token_accuracy": 0.8527490749955178, "num_tokens": 136756757.0, "step": 113740 }, { "entropy": 1.8879317745566369, "epoch": 0.352615047440286, "grad_norm": 9.205636978149414, "learning_rate": 4.260339098601214e-06, "loss": 0.4998, "mean_token_accuracy": 0.8410181164741516, "num_tokens": 136768280.0, "step": 113750 }, { "entropy": 1.8330311551690102, "epoch": 0.3526460465653357, "grad_norm": 4.422030448913574, "learning_rate": 4.2601518416488344e-06, "loss": 0.4343, "mean_token_accuracy": 0.8590439051389694, "num_tokens": 136779826.0, "step": 113760 }, { "entropy": 1.8848232328891754, "epoch": 0.3526770456903854, "grad_norm": 8.074309349060059, "learning_rate": 4.259964609386099e-06, "loss": 0.4696, "mean_token_accuracy": 0.8494037300348282, "num_tokens": 136791347.0, "step": 113770 }, { "entropy": 1.862524962425232, "epoch": 0.3527080448154351, "grad_norm": 9.903372764587402, "learning_rate": 4.2597774018075815e-06, "loss": 0.4728, "mean_token_accuracy": 0.8448254838585854, "num_tokens": 136803506.0, "step": 113780 }, { "entropy": 1.787963542342186, "epoch": 0.3527390439404848, "grad_norm": 9.371294975280762, "learning_rate": 4.259590218907858e-06, "loss": 0.4379, "mean_token_accuracy": 0.8571085095405578, "num_tokens": 136816790.0, "step": 113790 }, { "entropy": 1.8950905337929727, "epoch": 0.3527700430655345, "grad_norm": 4.0718159675598145, "learning_rate": 4.259403060681509e-06, "loss": 0.4897, "mean_token_accuracy": 0.8508754774928093, "num_tokens": 136827775.0, "step": 113800 }, { "entropy": 1.8704029634594916, "epoch": 0.35280104219058417, "grad_norm": 8.028072357177734, "learning_rate": 4.259215927123112e-06, "loss": 0.477, "mean_token_accuracy": 0.8462028250098228, "num_tokens": 136839928.0, "step": 113810 }, { "entropy": 1.7776376932859421, "epoch": 0.3528320413156339, "grad_norm": 7.8810529708862305, "learning_rate": 4.25902881822725e-06, "loss": 0.3856, "mean_token_accuracy": 0.8655279219150543, "num_tokens": 136852948.0, "step": 113820 }, { "entropy": 1.886681966483593, "epoch": 0.35286304044068356, "grad_norm": 9.677278518676758, "learning_rate": 4.2588417339885055e-06, "loss": 0.4847, "mean_token_accuracy": 0.8440056949853897, "num_tokens": 136864663.0, "step": 113830 }, { "entropy": 1.860078476369381, "epoch": 0.35289403956573323, "grad_norm": 7.314884185791016, "learning_rate": 4.258654674401464e-06, "loss": 0.4945, "mean_token_accuracy": 0.8441386088728905, "num_tokens": 136875913.0, "step": 113840 }, { "entropy": 1.856418649852276, "epoch": 0.35292503869078296, "grad_norm": 6.926784515380859, "learning_rate": 4.258467639460713e-06, "loss": 0.4665, "mean_token_accuracy": 0.8466035217046738, "num_tokens": 136888046.0, "step": 113850 }, { "entropy": 1.8510633751749992, "epoch": 0.3529560378158326, "grad_norm": 8.287582397460938, "learning_rate": 4.258280629160838e-06, "loss": 0.5001, "mean_token_accuracy": 0.8363550141453743, "num_tokens": 136900496.0, "step": 113860 }, { "entropy": 1.8378751188516618, "epoch": 0.35298703694088235, "grad_norm": 7.225609302520752, "learning_rate": 4.258093643496433e-06, "loss": 0.4568, "mean_token_accuracy": 0.8513509348034859, "num_tokens": 136912943.0, "step": 113870 }, { "entropy": 1.8136658892035484, "epoch": 0.353018036065932, "grad_norm": 8.231322288513184, "learning_rate": 4.257906682462087e-06, "loss": 0.4455, "mean_token_accuracy": 0.8516188710927963, "num_tokens": 136925555.0, "step": 113880 }, { "entropy": 1.8374749287962913, "epoch": 0.35304903519098174, "grad_norm": 7.933830738067627, "learning_rate": 4.257719746052393e-06, "loss": 0.4492, "mean_token_accuracy": 0.8512130409479142, "num_tokens": 136937326.0, "step": 113890 }, { "entropy": 1.8850289553403854, "epoch": 0.3530800343160314, "grad_norm": 9.631787300109863, "learning_rate": 4.257532834261947e-06, "loss": 0.5034, "mean_token_accuracy": 0.8466032981872559, "num_tokens": 136947938.0, "step": 113900 }, { "entropy": 1.774746771156788, "epoch": 0.35311103344108113, "grad_norm": 3.474881172180176, "learning_rate": 4.257345947085346e-06, "loss": 0.4201, "mean_token_accuracy": 0.8617874652147293, "num_tokens": 136960564.0, "step": 113910 }, { "entropy": 1.8160261258482933, "epoch": 0.3531420325661308, "grad_norm": 8.455029487609863, "learning_rate": 4.257159084517186e-06, "loss": 0.4386, "mean_token_accuracy": 0.8601742401719094, "num_tokens": 136972540.0, "step": 113920 }, { "entropy": 1.8941467106342316, "epoch": 0.35317303169118053, "grad_norm": 4.235901355743408, "learning_rate": 4.25697224655207e-06, "loss": 0.5141, "mean_token_accuracy": 0.8378585115075111, "num_tokens": 136984312.0, "step": 113930 }, { "entropy": 1.8767295882105828, "epoch": 0.3532040308162302, "grad_norm": 9.515031814575195, "learning_rate": 4.256785433184598e-06, "loss": 0.5171, "mean_token_accuracy": 0.8369233354926109, "num_tokens": 136995625.0, "step": 113940 }, { "entropy": 1.81492610424757, "epoch": 0.3532350299412799, "grad_norm": 4.470407485961914, "learning_rate": 4.256598644409373e-06, "loss": 0.4293, "mean_token_accuracy": 0.8580437764525414, "num_tokens": 137008213.0, "step": 113950 }, { "entropy": 1.849051834642887, "epoch": 0.3532660290663296, "grad_norm": 7.72014045715332, "learning_rate": 4.2564118802210006e-06, "loss": 0.4815, "mean_token_accuracy": 0.8522540688514709, "num_tokens": 137020112.0, "step": 113960 }, { "entropy": 1.8805166974663734, "epoch": 0.3532970281913793, "grad_norm": 4.944484710693359, "learning_rate": 4.256225140614086e-06, "loss": 0.4837, "mean_token_accuracy": 0.8436195507645607, "num_tokens": 137031603.0, "step": 113970 }, { "entropy": 1.874251839518547, "epoch": 0.353328027316429, "grad_norm": 8.50593090057373, "learning_rate": 4.25603842558324e-06, "loss": 0.4732, "mean_token_accuracy": 0.8450019791722297, "num_tokens": 137043366.0, "step": 113980 }, { "entropy": 1.8801655188202857, "epoch": 0.3533590264414787, "grad_norm": 8.969855308532715, "learning_rate": 4.25585173512307e-06, "loss": 0.5296, "mean_token_accuracy": 0.8407104894518852, "num_tokens": 137054687.0, "step": 113990 }, { "entropy": 1.8593452662229537, "epoch": 0.3533900255665284, "grad_norm": 8.071375846862793, "learning_rate": 4.255665069228188e-06, "loss": 0.4094, "mean_token_accuracy": 0.8621448844671249, "num_tokens": 137066396.0, "step": 114000 }, { "entropy": 1.8180650487542152, "epoch": 0.3534210246915781, "grad_norm": 8.56116771697998, "learning_rate": 4.255478427893209e-06, "loss": 0.4487, "mean_token_accuracy": 0.8478880062699318, "num_tokens": 137077708.0, "step": 114010 }, { "entropy": 1.815793578326702, "epoch": 0.35345202381662777, "grad_norm": 9.843114852905273, "learning_rate": 4.255291811112745e-06, "loss": 0.4388, "mean_token_accuracy": 0.8555766463279724, "num_tokens": 137090459.0, "step": 114020 }, { "entropy": 1.8414412125945092, "epoch": 0.3534830229416775, "grad_norm": 7.240721225738525, "learning_rate": 4.255105218881416e-06, "loss": 0.5089, "mean_token_accuracy": 0.8408783197402954, "num_tokens": 137101852.0, "step": 114030 }, { "entropy": 1.8683375775814057, "epoch": 0.35351402206672716, "grad_norm": 7.452057361602783, "learning_rate": 4.2549186511938356e-06, "loss": 0.4929, "mean_token_accuracy": 0.8472641706466675, "num_tokens": 137113521.0, "step": 114040 }, { "entropy": 1.9232849180698395, "epoch": 0.3535450211917769, "grad_norm": 8.561758995056152, "learning_rate": 4.254732108044627e-06, "loss": 0.4893, "mean_token_accuracy": 0.8424664407968521, "num_tokens": 137124822.0, "step": 114050 }, { "entropy": 1.8383488565683366, "epoch": 0.35357602031682656, "grad_norm": 10.501875877380371, "learning_rate": 4.254545589428411e-06, "loss": 0.4763, "mean_token_accuracy": 0.836837200820446, "num_tokens": 137136758.0, "step": 114060 }, { "entropy": 1.873065246641636, "epoch": 0.3536070194418763, "grad_norm": 9.702674865722656, "learning_rate": 4.254359095339811e-06, "loss": 0.4836, "mean_token_accuracy": 0.8503431528806686, "num_tokens": 137148091.0, "step": 114070 }, { "entropy": 1.8993804574012756, "epoch": 0.35363801856692595, "grad_norm": 7.872958183288574, "learning_rate": 4.254172625773451e-06, "loss": 0.469, "mean_token_accuracy": 0.849186685681343, "num_tokens": 137159064.0, "step": 114080 }, { "entropy": 1.9097682803869247, "epoch": 0.3536690176919756, "grad_norm": 8.155599594116211, "learning_rate": 4.253986180723957e-06, "loss": 0.4959, "mean_token_accuracy": 0.8419617161154747, "num_tokens": 137170143.0, "step": 114090 }, { "entropy": 1.8655310958623885, "epoch": 0.35370001681702534, "grad_norm": 10.33460807800293, "learning_rate": 4.253799760185957e-06, "loss": 0.5043, "mean_token_accuracy": 0.8362808287143707, "num_tokens": 137181569.0, "step": 114100 }, { "entropy": 1.7499642267823219, "epoch": 0.353731015942075, "grad_norm": 3.961226224899292, "learning_rate": 4.253613364154082e-06, "loss": 0.4155, "mean_token_accuracy": 0.8528372913599014, "num_tokens": 137194428.0, "step": 114110 }, { "entropy": 1.8577033221721648, "epoch": 0.35376201506712474, "grad_norm": 8.127702713012695, "learning_rate": 4.2534269926229625e-06, "loss": 0.4483, "mean_token_accuracy": 0.8494561642408371, "num_tokens": 137206496.0, "step": 114120 }, { "entropy": 1.950427946448326, "epoch": 0.3537930141921744, "grad_norm": 7.7237043380737305, "learning_rate": 4.253240645587232e-06, "loss": 0.5027, "mean_token_accuracy": 0.847864530980587, "num_tokens": 137217616.0, "step": 114130 }, { "entropy": 1.8073005706071854, "epoch": 0.35382401331722413, "grad_norm": 8.634578704833984, "learning_rate": 4.253054323041525e-06, "loss": 0.4007, "mean_token_accuracy": 0.8615914180874824, "num_tokens": 137229842.0, "step": 114140 }, { "entropy": 1.9044538155198096, "epoch": 0.3538550124422738, "grad_norm": 10.229740142822266, "learning_rate": 4.252868024980477e-06, "loss": 0.5133, "mean_token_accuracy": 0.840241327881813, "num_tokens": 137241062.0, "step": 114150 }, { "entropy": 1.7597592651844025, "epoch": 0.3538860115673235, "grad_norm": 9.922557830810547, "learning_rate": 4.252681751398727e-06, "loss": 0.3942, "mean_token_accuracy": 0.8564444318413734, "num_tokens": 137254220.0, "step": 114160 }, { "entropy": 1.9016540557146073, "epoch": 0.3539170106923732, "grad_norm": 10.562433242797852, "learning_rate": 4.252495502290913e-06, "loss": 0.5396, "mean_token_accuracy": 0.832036292552948, "num_tokens": 137266517.0, "step": 114170 }, { "entropy": 1.9116821497678758, "epoch": 0.3539480098174229, "grad_norm": 8.220035552978516, "learning_rate": 4.252309277651677e-06, "loss": 0.511, "mean_token_accuracy": 0.8397183999419212, "num_tokens": 137277540.0, "step": 114180 }, { "entropy": 1.8361477851867676, "epoch": 0.3539790089424726, "grad_norm": 8.83745288848877, "learning_rate": 4.252123077475664e-06, "loss": 0.5102, "mean_token_accuracy": 0.8428474217653275, "num_tokens": 137289242.0, "step": 114190 }, { "entropy": 1.8084876164793968, "epoch": 0.3540100080675223, "grad_norm": 7.766282081604004, "learning_rate": 4.251936901757515e-06, "loss": 0.4502, "mean_token_accuracy": 0.8553501293063164, "num_tokens": 137301999.0, "step": 114200 }, { "entropy": 1.8107165440917015, "epoch": 0.354041007192572, "grad_norm": 4.7136993408203125, "learning_rate": 4.251750750491878e-06, "loss": 0.4923, "mean_token_accuracy": 0.8484808370471001, "num_tokens": 137314968.0, "step": 114210 }, { "entropy": 1.94393610060215, "epoch": 0.3540720063176217, "grad_norm": 9.238444328308105, "learning_rate": 4.2515646236734e-06, "loss": 0.5164, "mean_token_accuracy": 0.8350920498371124, "num_tokens": 137325889.0, "step": 114220 }, { "entropy": 1.782192386686802, "epoch": 0.35410300544267137, "grad_norm": 9.513643264770508, "learning_rate": 4.251378521296731e-06, "loss": 0.427, "mean_token_accuracy": 0.8538872867822647, "num_tokens": 137338800.0, "step": 114230 }, { "entropy": 1.830018326640129, "epoch": 0.3541340045677211, "grad_norm": 7.8421759605407715, "learning_rate": 4.251192443356523e-06, "loss": 0.468, "mean_token_accuracy": 0.843048295378685, "num_tokens": 137350877.0, "step": 114240 }, { "entropy": 1.8395485177636146, "epoch": 0.35416500369277076, "grad_norm": 8.42922592163086, "learning_rate": 4.251006389847427e-06, "loss": 0.4412, "mean_token_accuracy": 0.8559403479099273, "num_tokens": 137362938.0, "step": 114250 }, { "entropy": 1.8116194292902947, "epoch": 0.3541960028178205, "grad_norm": 4.232943534851074, "learning_rate": 4.250820360764097e-06, "loss": 0.4439, "mean_token_accuracy": 0.847702045738697, "num_tokens": 137375890.0, "step": 114260 }, { "entropy": 1.699189220368862, "epoch": 0.35422700194287016, "grad_norm": 6.99352502822876, "learning_rate": 4.25063435610119e-06, "loss": 0.3176, "mean_token_accuracy": 0.8732634499669075, "num_tokens": 137389460.0, "step": 114270 }, { "entropy": 1.7898903474211694, "epoch": 0.3542580010679199, "grad_norm": 9.566137313842773, "learning_rate": 4.250448375853365e-06, "loss": 0.4222, "mean_token_accuracy": 0.8538245901465416, "num_tokens": 137402759.0, "step": 114280 }, { "entropy": 1.8570916399359703, "epoch": 0.35428900019296955, "grad_norm": 3.8215625286102295, "learning_rate": 4.250262420015279e-06, "loss": 0.5149, "mean_token_accuracy": 0.8420146137475968, "num_tokens": 137414428.0, "step": 114290 }, { "entropy": 1.8959661558270455, "epoch": 0.3543199993180193, "grad_norm": 7.246954441070557, "learning_rate": 4.250076488581593e-06, "loss": 0.5078, "mean_token_accuracy": 0.8415970131754875, "num_tokens": 137425829.0, "step": 114300 }, { "entropy": 1.8812722265720367, "epoch": 0.35435099844306894, "grad_norm": 7.860531330108643, "learning_rate": 4.24989058154697e-06, "loss": 0.4598, "mean_token_accuracy": 0.8447592079639434, "num_tokens": 137437268.0, "step": 114310 }, { "entropy": 1.8524739906191825, "epoch": 0.35438199756811867, "grad_norm": 8.29738712310791, "learning_rate": 4.2497046989060754e-06, "loss": 0.4355, "mean_token_accuracy": 0.8458985432982444, "num_tokens": 137449877.0, "step": 114320 }, { "entropy": 1.8565015509724616, "epoch": 0.35441299669316834, "grad_norm": 8.345024108886719, "learning_rate": 4.2495188406535735e-06, "loss": 0.4799, "mean_token_accuracy": 0.8513003289699554, "num_tokens": 137461527.0, "step": 114330 }, { "entropy": 1.8561938509345055, "epoch": 0.354443995818218, "grad_norm": 9.380087852478027, "learning_rate": 4.249333006784131e-06, "loss": 0.4599, "mean_token_accuracy": 0.8521606922149658, "num_tokens": 137473760.0, "step": 114340 }, { "entropy": 1.8369782000780106, "epoch": 0.35447499494326773, "grad_norm": 8.772756576538086, "learning_rate": 4.249147197292419e-06, "loss": 0.4711, "mean_token_accuracy": 0.8478844106197357, "num_tokens": 137485545.0, "step": 114350 }, { "entropy": 1.9263944923877716, "epoch": 0.3545059940683174, "grad_norm": 7.329461574554443, "learning_rate": 4.248961412173107e-06, "loss": 0.5178, "mean_token_accuracy": 0.8424314111471176, "num_tokens": 137496566.0, "step": 114360 }, { "entropy": 1.8391064286231995, "epoch": 0.3545369931933671, "grad_norm": 8.21514892578125, "learning_rate": 4.2487756514208675e-06, "loss": 0.4321, "mean_token_accuracy": 0.8564809501171112, "num_tokens": 137508491.0, "step": 114370 }, { "entropy": 1.8533219590783119, "epoch": 0.3545679923184168, "grad_norm": 8.260367393493652, "learning_rate": 4.248589915030374e-06, "loss": 0.4759, "mean_token_accuracy": 0.8540527150034904, "num_tokens": 137520622.0, "step": 114380 }, { "entropy": 1.9417014926671983, "epoch": 0.3545989914434665, "grad_norm": 8.334314346313477, "learning_rate": 4.248404202996303e-06, "loss": 0.5266, "mean_token_accuracy": 0.8378186166286469, "num_tokens": 137531376.0, "step": 114390 }, { "entropy": 1.8331923067569733, "epoch": 0.3546299905685162, "grad_norm": 9.672205924987793, "learning_rate": 4.248218515313331e-06, "loss": 0.4463, "mean_token_accuracy": 0.8539971187710762, "num_tokens": 137543888.0, "step": 114400 }, { "entropy": 1.8398851856589318, "epoch": 0.3546609896935659, "grad_norm": 8.222822189331055, "learning_rate": 4.248032851976136e-06, "loss": 0.4917, "mean_token_accuracy": 0.8429914176464081, "num_tokens": 137555582.0, "step": 114410 }, { "entropy": 1.817541065812111, "epoch": 0.3546919888186156, "grad_norm": 7.350632190704346, "learning_rate": 4.2478472129794004e-06, "loss": 0.4313, "mean_token_accuracy": 0.8614602595567703, "num_tokens": 137567761.0, "step": 114420 }, { "entropy": 1.8555574625730515, "epoch": 0.3547229879436653, "grad_norm": 4.468617916107178, "learning_rate": 4.247661598317806e-06, "loss": 0.4489, "mean_token_accuracy": 0.8440041437745094, "num_tokens": 137579603.0, "step": 114430 }, { "entropy": 1.8921550765633584, "epoch": 0.35475398706871497, "grad_norm": 9.684183120727539, "learning_rate": 4.247476007986034e-06, "loss": 0.5208, "mean_token_accuracy": 0.8349121674895287, "num_tokens": 137591460.0, "step": 114440 }, { "entropy": 1.9489626735448837, "epoch": 0.3547849861937647, "grad_norm": 7.472236633300781, "learning_rate": 4.247290441978772e-06, "loss": 0.5164, "mean_token_accuracy": 0.8391944885253906, "num_tokens": 137602126.0, "step": 114450 }, { "entropy": 1.8100664362311363, "epoch": 0.35481598531881436, "grad_norm": 7.485744476318359, "learning_rate": 4.247104900290708e-06, "loss": 0.413, "mean_token_accuracy": 0.8578644886612892, "num_tokens": 137614832.0, "step": 114460 }, { "entropy": 1.8753397777676581, "epoch": 0.3548469844438641, "grad_norm": 7.755117893218994, "learning_rate": 4.246919382916528e-06, "loss": 0.4843, "mean_token_accuracy": 0.8451044067740441, "num_tokens": 137626735.0, "step": 114470 }, { "entropy": 1.851782961189747, "epoch": 0.35487798356891376, "grad_norm": 8.35659408569336, "learning_rate": 4.2467338898509225e-06, "loss": 0.456, "mean_token_accuracy": 0.8530733227729798, "num_tokens": 137638707.0, "step": 114480 }, { "entropy": 1.8803992569446564, "epoch": 0.3549089826939635, "grad_norm": 9.96238899230957, "learning_rate": 4.2465484210885845e-06, "loss": 0.4666, "mean_token_accuracy": 0.8485779255628586, "num_tokens": 137650395.0, "step": 114490 }, { "entropy": 1.8529548197984695, "epoch": 0.35493998181901315, "grad_norm": 7.451599597930908, "learning_rate": 4.2463629766242074e-06, "loss": 0.4776, "mean_token_accuracy": 0.8496254831552505, "num_tokens": 137661248.0, "step": 114500 }, { "entropy": 1.8691770792007447, "epoch": 0.3549709809440629, "grad_norm": 8.7528657913208, "learning_rate": 4.246177556452486e-06, "loss": 0.5205, "mean_token_accuracy": 0.8437319055199624, "num_tokens": 137672985.0, "step": 114510 }, { "entropy": 1.8243165865540505, "epoch": 0.35500198006911254, "grad_norm": 7.32008171081543, "learning_rate": 4.245992160568117e-06, "loss": 0.4583, "mean_token_accuracy": 0.841618250310421, "num_tokens": 137684795.0, "step": 114520 }, { "entropy": 1.8710746631026267, "epoch": 0.35503297919416227, "grad_norm": 8.433467864990234, "learning_rate": 4.245806788965798e-06, "loss": 0.5185, "mean_token_accuracy": 0.8461954742670059, "num_tokens": 137696877.0, "step": 114530 }, { "entropy": 1.9102421700954437, "epoch": 0.35506397831921194, "grad_norm": 8.911306381225586, "learning_rate": 4.245621441640229e-06, "loss": 0.5067, "mean_token_accuracy": 0.8445306703448295, "num_tokens": 137708323.0, "step": 114540 }, { "entropy": 1.704357886314392, "epoch": 0.35509497744426166, "grad_norm": 5.612151145935059, "learning_rate": 4.245436118586114e-06, "loss": 0.3443, "mean_token_accuracy": 0.8622689947485924, "num_tokens": 137722546.0, "step": 114550 }, { "entropy": 1.8894066840410233, "epoch": 0.35512597656931133, "grad_norm": 3.8784425258636475, "learning_rate": 4.245250819798153e-06, "loss": 0.4699, "mean_token_accuracy": 0.8490568831562996, "num_tokens": 137733876.0, "step": 114560 }, { "entropy": 1.7194712564349175, "epoch": 0.355156975694361, "grad_norm": 9.469301223754883, "learning_rate": 4.2450655452710515e-06, "loss": 0.4478, "mean_token_accuracy": 0.8465792283415794, "num_tokens": 137747364.0, "step": 114570 }, { "entropy": 1.7580340534448624, "epoch": 0.3551879748194107, "grad_norm": 8.43166732788086, "learning_rate": 4.244880294999517e-06, "loss": 0.4017, "mean_token_accuracy": 0.8530274391174316, "num_tokens": 137760169.0, "step": 114580 }, { "entropy": 1.798786661028862, "epoch": 0.3552189739444604, "grad_norm": 3.7261557579040527, "learning_rate": 4.2446950689782575e-06, "loss": 0.4434, "mean_token_accuracy": 0.8502959847450257, "num_tokens": 137772888.0, "step": 114590 }, { "entropy": 1.782324093580246, "epoch": 0.3552499730695101, "grad_norm": 3.953874349594116, "learning_rate": 4.244509867201982e-06, "loss": 0.397, "mean_token_accuracy": 0.8616974368691445, "num_tokens": 137786038.0, "step": 114600 }, { "entropy": 1.8710459485650062, "epoch": 0.3552809721945598, "grad_norm": 7.7761101722717285, "learning_rate": 4.244324689665401e-06, "loss": 0.4873, "mean_token_accuracy": 0.8453879669308663, "num_tokens": 137797978.0, "step": 114610 }, { "entropy": 1.8599458813667298, "epoch": 0.3553119713196095, "grad_norm": 7.9371747970581055, "learning_rate": 4.244139536363229e-06, "loss": 0.4237, "mean_token_accuracy": 0.8475207060575485, "num_tokens": 137810153.0, "step": 114620 }, { "entropy": 1.8876738995313644, "epoch": 0.3553429704446592, "grad_norm": 9.171344757080078, "learning_rate": 4.24395440729018e-06, "loss": 0.462, "mean_token_accuracy": 0.8501772657036781, "num_tokens": 137822116.0, "step": 114630 }, { "entropy": 1.91156704723835, "epoch": 0.3553739695697089, "grad_norm": 8.98309326171875, "learning_rate": 4.2437693024409685e-06, "loss": 0.4877, "mean_token_accuracy": 0.8459622815251351, "num_tokens": 137833476.0, "step": 114640 }, { "entropy": 1.855041791498661, "epoch": 0.35540496869475857, "grad_norm": 7.716914176940918, "learning_rate": 4.243584221810315e-06, "loss": 0.4479, "mean_token_accuracy": 0.8591550663113594, "num_tokens": 137845518.0, "step": 114650 }, { "entropy": 1.9073261603713036, "epoch": 0.3554359678198083, "grad_norm": 7.420741558074951, "learning_rate": 4.243399165392936e-06, "loss": 0.5075, "mean_token_accuracy": 0.842662687599659, "num_tokens": 137856959.0, "step": 114660 }, { "entropy": 1.9126224562525749, "epoch": 0.35546696694485796, "grad_norm": 8.25358772277832, "learning_rate": 4.243214133183554e-06, "loss": 0.5035, "mean_token_accuracy": 0.8389646530151367, "num_tokens": 137867845.0, "step": 114670 }, { "entropy": 1.908442348241806, "epoch": 0.3554979660699077, "grad_norm": 3.6674983501434326, "learning_rate": 4.243029125176892e-06, "loss": 0.4823, "mean_token_accuracy": 0.8443290576338768, "num_tokens": 137879689.0, "step": 114680 }, { "entropy": 1.7568098783493042, "epoch": 0.35552896519495736, "grad_norm": 3.361020088195801, "learning_rate": 4.242844141367674e-06, "loss": 0.389, "mean_token_accuracy": 0.8680830702185631, "num_tokens": 137892854.0, "step": 114690 }, { "entropy": 1.8590233013033868, "epoch": 0.3555599643200071, "grad_norm": 8.2853422164917, "learning_rate": 4.242659181750625e-06, "loss": 0.4697, "mean_token_accuracy": 0.8496688306331635, "num_tokens": 137904751.0, "step": 114700 }, { "entropy": 1.8025938093662262, "epoch": 0.35559096344505675, "grad_norm": 6.7893595695495605, "learning_rate": 4.242474246320471e-06, "loss": 0.4388, "mean_token_accuracy": 0.8490458041429519, "num_tokens": 137917185.0, "step": 114710 }, { "entropy": 1.8439732417464256, "epoch": 0.3556219625701065, "grad_norm": 4.053720951080322, "learning_rate": 4.2422893350719436e-06, "loss": 0.455, "mean_token_accuracy": 0.8445042937994003, "num_tokens": 137929018.0, "step": 114720 }, { "entropy": 1.8289795368909836, "epoch": 0.35565296169515614, "grad_norm": 7.735437393188477, "learning_rate": 4.2421044479997735e-06, "loss": 0.4398, "mean_token_accuracy": 0.8526295185089111, "num_tokens": 137941602.0, "step": 114730 }, { "entropy": 1.7583549529314042, "epoch": 0.35568396082020587, "grad_norm": 9.589922904968262, "learning_rate": 4.24191958509869e-06, "loss": 0.3761, "mean_token_accuracy": 0.8651691839098931, "num_tokens": 137954921.0, "step": 114740 }, { "entropy": 1.848554477095604, "epoch": 0.35571495994525554, "grad_norm": 8.365562438964844, "learning_rate": 4.24173474636343e-06, "loss": 0.4633, "mean_token_accuracy": 0.8425380006432533, "num_tokens": 137965949.0, "step": 114750 }, { "entropy": 1.9080881744623184, "epoch": 0.35574595907030526, "grad_norm": 8.790996551513672, "learning_rate": 4.241549931788727e-06, "loss": 0.507, "mean_token_accuracy": 0.8438135430216789, "num_tokens": 137976920.0, "step": 114760 }, { "entropy": 1.9018929034471512, "epoch": 0.35577695819535493, "grad_norm": 8.04598617553711, "learning_rate": 4.2413651413693185e-06, "loss": 0.4829, "mean_token_accuracy": 0.8404053092002869, "num_tokens": 137988280.0, "step": 114770 }, { "entropy": 1.8129638865590096, "epoch": 0.35580795732040466, "grad_norm": 3.7647767066955566, "learning_rate": 4.241180375099945e-06, "loss": 0.421, "mean_token_accuracy": 0.8564304232597351, "num_tokens": 138001123.0, "step": 114780 }, { "entropy": 1.821030892431736, "epoch": 0.3558389564454543, "grad_norm": 3.7905266284942627, "learning_rate": 4.240995632975342e-06, "loss": 0.4884, "mean_token_accuracy": 0.8544037476181984, "num_tokens": 138012902.0, "step": 114790 }, { "entropy": 1.8608674079179763, "epoch": 0.35586995557050405, "grad_norm": 8.616358757019043, "learning_rate": 4.240810914990257e-06, "loss": 0.5023, "mean_token_accuracy": 0.8417997151613236, "num_tokens": 138025108.0, "step": 114800 }, { "entropy": 1.8812236726284026, "epoch": 0.3559009546955537, "grad_norm": 8.76230239868164, "learning_rate": 4.240626221139429e-06, "loss": 0.5105, "mean_token_accuracy": 0.8435042083263398, "num_tokens": 138037279.0, "step": 114810 }, { "entropy": 1.7783361107110978, "epoch": 0.3559319538206034, "grad_norm": 3.4879698753356934, "learning_rate": 4.240441551417605e-06, "loss": 0.3984, "mean_token_accuracy": 0.8596335366368294, "num_tokens": 138050823.0, "step": 114820 }, { "entropy": 1.7827171936631203, "epoch": 0.3559629529456531, "grad_norm": 3.9974160194396973, "learning_rate": 4.240256905819533e-06, "loss": 0.3823, "mean_token_accuracy": 0.8612391158938408, "num_tokens": 138064142.0, "step": 114830 }, { "entropy": 1.8113099455833435, "epoch": 0.3559939520707028, "grad_norm": 7.9462666511535645, "learning_rate": 4.2400722843399585e-06, "loss": 0.4029, "mean_token_accuracy": 0.8514809906482697, "num_tokens": 138076703.0, "step": 114840 }, { "entropy": 1.818100643157959, "epoch": 0.3560249511957525, "grad_norm": 8.276450157165527, "learning_rate": 4.2398876869736325e-06, "loss": 0.4672, "mean_token_accuracy": 0.8574807778000831, "num_tokens": 138088592.0, "step": 114850 }, { "entropy": 1.899727213382721, "epoch": 0.3560559503208022, "grad_norm": 8.389613151550293, "learning_rate": 4.239703113715307e-06, "loss": 0.5211, "mean_token_accuracy": 0.8366639405488968, "num_tokens": 138099369.0, "step": 114860 }, { "entropy": 1.8642989337444305, "epoch": 0.3560869494458519, "grad_norm": 9.097145080566406, "learning_rate": 4.239518564559734e-06, "loss": 0.4706, "mean_token_accuracy": 0.8484190404415131, "num_tokens": 138111261.0, "step": 114870 }, { "entropy": 1.8235100641846658, "epoch": 0.35611794857090157, "grad_norm": 8.840439796447754, "learning_rate": 4.239334039501668e-06, "loss": 0.4041, "mean_token_accuracy": 0.8622523456811905, "num_tokens": 138124180.0, "step": 114880 }, { "entropy": 1.8383688524365425, "epoch": 0.3561489476959513, "grad_norm": 5.670360088348389, "learning_rate": 4.2391495385358675e-06, "loss": 0.4512, "mean_token_accuracy": 0.8513874992728233, "num_tokens": 138136327.0, "step": 114890 }, { "entropy": 1.8067172005772592, "epoch": 0.35617994682100096, "grad_norm": 9.207831382751465, "learning_rate": 4.238965061657087e-06, "loss": 0.4221, "mean_token_accuracy": 0.8570899114012718, "num_tokens": 138148730.0, "step": 114900 }, { "entropy": 1.917934286594391, "epoch": 0.3562109459460507, "grad_norm": 8.583123207092285, "learning_rate": 4.238780608860088e-06, "loss": 0.5284, "mean_token_accuracy": 0.8423309728503228, "num_tokens": 138159657.0, "step": 114910 }, { "entropy": 1.8137140288949012, "epoch": 0.35624194507110035, "grad_norm": 7.209733009338379, "learning_rate": 4.238596180139632e-06, "loss": 0.4447, "mean_token_accuracy": 0.8488231867551803, "num_tokens": 138172455.0, "step": 114920 }, { "entropy": 1.8409941777586938, "epoch": 0.3562729441961501, "grad_norm": 7.371655464172363, "learning_rate": 4.238411775490481e-06, "loss": 0.4485, "mean_token_accuracy": 0.8461980625987053, "num_tokens": 138184789.0, "step": 114930 }, { "entropy": 1.8432543560862542, "epoch": 0.35630394332119975, "grad_norm": 8.352115631103516, "learning_rate": 4.238227394907398e-06, "loss": 0.4777, "mean_token_accuracy": 0.8550651222467422, "num_tokens": 138196321.0, "step": 114940 }, { "entropy": 1.880720390379429, "epoch": 0.35633494244624947, "grad_norm": 7.84929084777832, "learning_rate": 4.23804303838515e-06, "loss": 0.515, "mean_token_accuracy": 0.8451797798275947, "num_tokens": 138207824.0, "step": 114950 }, { "entropy": 1.8087940603494643, "epoch": 0.35636594157129914, "grad_norm": 9.440030097961426, "learning_rate": 4.237858705918504e-06, "loss": 0.4492, "mean_token_accuracy": 0.8487761154770851, "num_tokens": 138220985.0, "step": 114960 }, { "entropy": 1.875385396182537, "epoch": 0.35639694069634886, "grad_norm": 7.9317474365234375, "learning_rate": 4.23767439750223e-06, "loss": 0.4614, "mean_token_accuracy": 0.8596486851572991, "num_tokens": 138232460.0, "step": 114970 }, { "entropy": 1.7997772373259067, "epoch": 0.35642793982139853, "grad_norm": 4.375933647155762, "learning_rate": 4.237490113131097e-06, "loss": 0.4026, "mean_token_accuracy": 0.8406259372830391, "num_tokens": 138246226.0, "step": 114980 }, { "entropy": 1.9087285965681076, "epoch": 0.35645893894644826, "grad_norm": 9.480292320251465, "learning_rate": 4.237305852799878e-06, "loss": 0.5354, "mean_token_accuracy": 0.8353115454316139, "num_tokens": 138258064.0, "step": 114990 }, { "entropy": 1.8310803532600404, "epoch": 0.3564899380714979, "grad_norm": 5.65321683883667, "learning_rate": 4.237121616503348e-06, "loss": 0.4133, "mean_token_accuracy": 0.8590890899300575, "num_tokens": 138271607.0, "step": 115000 }, { "entropy": 1.727584820985794, "epoch": 0.35652093719654765, "grad_norm": 4.688484191894531, "learning_rate": 4.2369374042362805e-06, "loss": 0.3905, "mean_token_accuracy": 0.8516164928674698, "num_tokens": 138286322.0, "step": 115010 }, { "entropy": 1.946674033999443, "epoch": 0.3565519363215973, "grad_norm": 7.794990062713623, "learning_rate": 4.236753215993452e-06, "loss": 0.4955, "mean_token_accuracy": 0.8492588356137276, "num_tokens": 138297078.0, "step": 115020 }, { "entropy": 1.7960212633013726, "epoch": 0.35658293544664704, "grad_norm": 4.166678428649902, "learning_rate": 4.236569051769643e-06, "loss": 0.4404, "mean_token_accuracy": 0.8482924640178681, "num_tokens": 138310205.0, "step": 115030 }, { "entropy": 1.8630914211273193, "epoch": 0.3566139345716967, "grad_norm": 7.438714504241943, "learning_rate": 4.236384911559633e-06, "loss": 0.4529, "mean_token_accuracy": 0.8541238859295845, "num_tokens": 138322264.0, "step": 115040 }, { "entropy": 1.8478296250104904, "epoch": 0.35664493369674644, "grad_norm": 3.994792938232422, "learning_rate": 4.236200795358203e-06, "loss": 0.438, "mean_token_accuracy": 0.853163267672062, "num_tokens": 138334432.0, "step": 115050 }, { "entropy": 1.840542307496071, "epoch": 0.3566759328217961, "grad_norm": 7.899695873260498, "learning_rate": 4.2360167031601366e-06, "loss": 0.4738, "mean_token_accuracy": 0.8437621504068374, "num_tokens": 138346236.0, "step": 115060 }, { "entropy": 1.9013837978243828, "epoch": 0.3567069319468458, "grad_norm": 8.970711708068848, "learning_rate": 4.235832634960219e-06, "loss": 0.4772, "mean_token_accuracy": 0.8377508997917176, "num_tokens": 138357432.0, "step": 115070 }, { "entropy": 1.8663888260722161, "epoch": 0.3567379310718955, "grad_norm": 8.907432556152344, "learning_rate": 4.235648590753237e-06, "loss": 0.4474, "mean_token_accuracy": 0.8535327434539794, "num_tokens": 138369028.0, "step": 115080 }, { "entropy": 1.8116501927375794, "epoch": 0.35676893019694517, "grad_norm": 7.725051403045654, "learning_rate": 4.235464570533978e-06, "loss": 0.3945, "mean_token_accuracy": 0.8590543121099472, "num_tokens": 138380629.0, "step": 115090 }, { "entropy": 1.8142176762223243, "epoch": 0.3567999293219949, "grad_norm": 10.798834800720215, "learning_rate": 4.2352805742972315e-06, "loss": 0.4315, "mean_token_accuracy": 0.848982447385788, "num_tokens": 138393683.0, "step": 115100 }, { "entropy": 1.8765118628740312, "epoch": 0.35683092844704456, "grad_norm": 11.132994651794434, "learning_rate": 4.23509660203779e-06, "loss": 0.4416, "mean_token_accuracy": 0.8448266923427582, "num_tokens": 138405992.0, "step": 115110 }, { "entropy": 1.895536944270134, "epoch": 0.3568619275720943, "grad_norm": 7.782128810882568, "learning_rate": 4.234912653750445e-06, "loss": 0.4735, "mean_token_accuracy": 0.8532333001494408, "num_tokens": 138417451.0, "step": 115120 }, { "entropy": 1.851093652844429, "epoch": 0.35689292669714395, "grad_norm": 3.747323751449585, "learning_rate": 4.23472872942999e-06, "loss": 0.51, "mean_token_accuracy": 0.8384227573871612, "num_tokens": 138428567.0, "step": 115130 }, { "entropy": 1.9262410998344421, "epoch": 0.3569239258221937, "grad_norm": 8.404274940490723, "learning_rate": 4.234544829071223e-06, "loss": 0.5125, "mean_token_accuracy": 0.8368908122181893, "num_tokens": 138439819.0, "step": 115140 }, { "entropy": 1.8113437429070474, "epoch": 0.35695492494724335, "grad_norm": 8.569499969482422, "learning_rate": 4.234360952668942e-06, "loss": 0.4249, "mean_token_accuracy": 0.8553946748375892, "num_tokens": 138452192.0, "step": 115150 }, { "entropy": 1.8376051411032677, "epoch": 0.35698592407229307, "grad_norm": 8.602355003356934, "learning_rate": 4.2341771002179445e-06, "loss": 0.422, "mean_token_accuracy": 0.8601582199335098, "num_tokens": 138464939.0, "step": 115160 }, { "entropy": 1.7371099531650542, "epoch": 0.35701692319734274, "grad_norm": 8.904439926147461, "learning_rate": 4.233993271713032e-06, "loss": 0.3651, "mean_token_accuracy": 0.8645968466997147, "num_tokens": 138477993.0, "step": 115170 }, { "entropy": 1.8617297038435936, "epoch": 0.35704792232239246, "grad_norm": 7.874594211578369, "learning_rate": 4.233809467149005e-06, "loss": 0.4488, "mean_token_accuracy": 0.852948172390461, "num_tokens": 138489670.0, "step": 115180 }, { "entropy": 1.8260050147771836, "epoch": 0.35707892144744213, "grad_norm": 3.6491997241973877, "learning_rate": 4.23362568652067e-06, "loss": 0.4346, "mean_token_accuracy": 0.8457119166851044, "num_tokens": 138501968.0, "step": 115190 }, { "entropy": 1.84189365953207, "epoch": 0.35710992057249186, "grad_norm": 8.175023078918457, "learning_rate": 4.2334419298228315e-06, "loss": 0.4542, "mean_token_accuracy": 0.8577545329928398, "num_tokens": 138514376.0, "step": 115200 }, { "entropy": 1.8731770426034928, "epoch": 0.3571409196975415, "grad_norm": 9.102734565734863, "learning_rate": 4.2332581970502965e-06, "loss": 0.4974, "mean_token_accuracy": 0.8443191021680831, "num_tokens": 138526194.0, "step": 115210 }, { "entropy": 1.8527789831161499, "epoch": 0.35717191882259125, "grad_norm": 7.554600238800049, "learning_rate": 4.233074488197873e-06, "loss": 0.4415, "mean_token_accuracy": 0.8490279093384743, "num_tokens": 138538253.0, "step": 115220 }, { "entropy": 1.9107401639223098, "epoch": 0.3572029179476409, "grad_norm": 9.52925968170166, "learning_rate": 4.232890803260372e-06, "loss": 0.527, "mean_token_accuracy": 0.8357208788394928, "num_tokens": 138549714.0, "step": 115230 }, { "entropy": 1.857765756547451, "epoch": 0.35723391707269064, "grad_norm": 8.061179161071777, "learning_rate": 4.232707142232605e-06, "loss": 0.4667, "mean_token_accuracy": 0.8525500863790512, "num_tokens": 138561400.0, "step": 115240 }, { "entropy": 1.852206926047802, "epoch": 0.3572649161977403, "grad_norm": 2.4236626625061035, "learning_rate": 4.232523505109386e-06, "loss": 0.4453, "mean_token_accuracy": 0.854184539616108, "num_tokens": 138573545.0, "step": 115250 }, { "entropy": 1.9055633306503297, "epoch": 0.35729591532279004, "grad_norm": 8.553743362426758, "learning_rate": 4.232339891885528e-06, "loss": 0.4845, "mean_token_accuracy": 0.8542478173971176, "num_tokens": 138584725.0, "step": 115260 }, { "entropy": 1.917627716064453, "epoch": 0.3573269144478397, "grad_norm": 8.1434326171875, "learning_rate": 4.23215630255585e-06, "loss": 0.5061, "mean_token_accuracy": 0.8403222784399986, "num_tokens": 138596131.0, "step": 115270 }, { "entropy": 1.7823843270540238, "epoch": 0.35735791357288943, "grad_norm": 8.837855339050293, "learning_rate": 4.2319727371151685e-06, "loss": 0.449, "mean_token_accuracy": 0.8501473888754845, "num_tokens": 138609457.0, "step": 115280 }, { "entropy": 1.802987203001976, "epoch": 0.3573889126979391, "grad_norm": 8.615238189697266, "learning_rate": 4.231789195558304e-06, "loss": 0.4009, "mean_token_accuracy": 0.8549714654684066, "num_tokens": 138621968.0, "step": 115290 }, { "entropy": 1.7702410399913788, "epoch": 0.3574199118229888, "grad_norm": 9.068475723266602, "learning_rate": 4.231605677880076e-06, "loss": 0.417, "mean_token_accuracy": 0.8504804894328117, "num_tokens": 138635487.0, "step": 115300 }, { "entropy": 1.7608641982078552, "epoch": 0.3574509109480385, "grad_norm": 3.600853204727173, "learning_rate": 4.2314221840753095e-06, "loss": 0.4207, "mean_token_accuracy": 0.8591883108019829, "num_tokens": 138649773.0, "step": 115310 }, { "entropy": 1.845149078965187, "epoch": 0.35748191007308816, "grad_norm": 8.9186372756958, "learning_rate": 4.231238714138827e-06, "loss": 0.4463, "mean_token_accuracy": 0.8526797413825988, "num_tokens": 138661604.0, "step": 115320 }, { "entropy": 1.7300834864377976, "epoch": 0.3575129091981379, "grad_norm": 3.3661861419677734, "learning_rate": 4.231055268065456e-06, "loss": 0.3556, "mean_token_accuracy": 0.8640945121645928, "num_tokens": 138675068.0, "step": 115330 }, { "entropy": 1.7648856535553932, "epoch": 0.35754390832318755, "grad_norm": 7.938429832458496, "learning_rate": 4.230871845850023e-06, "loss": 0.3813, "mean_token_accuracy": 0.8690492272377014, "num_tokens": 138687865.0, "step": 115340 }, { "entropy": 1.8311346605420113, "epoch": 0.3575749074482373, "grad_norm": 8.160514831542969, "learning_rate": 4.230688447487358e-06, "loss": 0.4429, "mean_token_accuracy": 0.8510325223207473, "num_tokens": 138700240.0, "step": 115350 }, { "entropy": 1.8927728280425071, "epoch": 0.35760590657328695, "grad_norm": 7.9980926513671875, "learning_rate": 4.230505072972291e-06, "loss": 0.4889, "mean_token_accuracy": 0.8445928290486335, "num_tokens": 138711415.0, "step": 115360 }, { "entropy": 1.9151074886322021, "epoch": 0.35763690569833667, "grad_norm": 8.005682945251465, "learning_rate": 4.230321722299654e-06, "loss": 0.4521, "mean_token_accuracy": 0.859444510936737, "num_tokens": 138722459.0, "step": 115370 }, { "entropy": 1.8123747482895851, "epoch": 0.35766790482338634, "grad_norm": 3.6913387775421143, "learning_rate": 4.23013839546428e-06, "loss": 0.4163, "mean_token_accuracy": 0.8595644026994705, "num_tokens": 138734633.0, "step": 115380 }, { "entropy": 1.8173100531101227, "epoch": 0.35769890394843606, "grad_norm": 4.180068016052246, "learning_rate": 4.2299550924610065e-06, "loss": 0.4591, "mean_token_accuracy": 0.8400069773197174, "num_tokens": 138747367.0, "step": 115390 }, { "entropy": 1.9122267067432404, "epoch": 0.35772990307348573, "grad_norm": 8.708627700805664, "learning_rate": 4.229771813284669e-06, "loss": 0.4755, "mean_token_accuracy": 0.8578638061881065, "num_tokens": 138758011.0, "step": 115400 }, { "entropy": 1.8789207234978675, "epoch": 0.35776090219853546, "grad_norm": 8.253546714782715, "learning_rate": 4.229588557930106e-06, "loss": 0.4652, "mean_token_accuracy": 0.8481252580881119, "num_tokens": 138770045.0, "step": 115410 }, { "entropy": 1.8555074393749238, "epoch": 0.3577919013235851, "grad_norm": 9.735289573669434, "learning_rate": 4.229405326392158e-06, "loss": 0.4524, "mean_token_accuracy": 0.850931105017662, "num_tokens": 138782324.0, "step": 115420 }, { "entropy": 1.857953730225563, "epoch": 0.35782290044863485, "grad_norm": 3.539726495742798, "learning_rate": 4.229222118665667e-06, "loss": 0.4617, "mean_token_accuracy": 0.8570713594555854, "num_tokens": 138793921.0, "step": 115430 }, { "entropy": 1.8376942738890647, "epoch": 0.3578538995736845, "grad_norm": 9.80213451385498, "learning_rate": 4.229038934745475e-06, "loss": 0.4324, "mean_token_accuracy": 0.8545270070433617, "num_tokens": 138806022.0, "step": 115440 }, { "entropy": 1.803570680320263, "epoch": 0.35788489869873424, "grad_norm": 9.964187622070312, "learning_rate": 4.228855774626427e-06, "loss": 0.4649, "mean_token_accuracy": 0.8454685240983963, "num_tokens": 138818578.0, "step": 115450 }, { "entropy": 1.7595643281936646, "epoch": 0.3579158978237839, "grad_norm": 8.82363224029541, "learning_rate": 4.22867263830337e-06, "loss": 0.4198, "mean_token_accuracy": 0.8574300840497017, "num_tokens": 138831318.0, "step": 115460 }, { "entropy": 1.8091361120343208, "epoch": 0.35794689694883364, "grad_norm": 7.342787742614746, "learning_rate": 4.228489525771151e-06, "loss": 0.4686, "mean_token_accuracy": 0.8519743844866753, "num_tokens": 138843358.0, "step": 115470 }, { "entropy": 1.878666016459465, "epoch": 0.3579778960738833, "grad_norm": 8.650647163391113, "learning_rate": 4.22830643702462e-06, "loss": 0.4983, "mean_token_accuracy": 0.8367117345333099, "num_tokens": 138854574.0, "step": 115480 }, { "entropy": 1.7836987793445587, "epoch": 0.35800889519893303, "grad_norm": 4.929512023925781, "learning_rate": 4.228123372058628e-06, "loss": 0.4007, "mean_token_accuracy": 0.8506701856851577, "num_tokens": 138867932.0, "step": 115490 }, { "entropy": 1.841169884800911, "epoch": 0.3580398943239827, "grad_norm": 8.031444549560547, "learning_rate": 4.227940330868028e-06, "loss": 0.4698, "mean_token_accuracy": 0.8463330894708634, "num_tokens": 138880884.0, "step": 115500 }, { "entropy": 1.8855138301849366, "epoch": 0.3580708934490324, "grad_norm": 7.73654317855835, "learning_rate": 4.227757313447673e-06, "loss": 0.5016, "mean_token_accuracy": 0.8478637263178825, "num_tokens": 138891775.0, "step": 115510 }, { "entropy": 1.8171266317367554, "epoch": 0.3581018925740821, "grad_norm": 4.404027462005615, "learning_rate": 4.2275743197924185e-06, "loss": 0.4329, "mean_token_accuracy": 0.8575972750782966, "num_tokens": 138904411.0, "step": 115520 }, { "entropy": 1.8890212267637252, "epoch": 0.3581328916991318, "grad_norm": 8.792807579040527, "learning_rate": 4.227391349897123e-06, "loss": 0.5362, "mean_token_accuracy": 0.8435036420822144, "num_tokens": 138917174.0, "step": 115530 }, { "entropy": 1.8580753847956657, "epoch": 0.3581638908241815, "grad_norm": 7.962569236755371, "learning_rate": 4.227208403756644e-06, "loss": 0.4431, "mean_token_accuracy": 0.8595779970288276, "num_tokens": 138928817.0, "step": 115540 }, { "entropy": 1.79182361215353, "epoch": 0.3581948899492312, "grad_norm": 8.742132186889648, "learning_rate": 4.227025481365844e-06, "loss": 0.4222, "mean_token_accuracy": 0.8503703057765961, "num_tokens": 138941364.0, "step": 115550 }, { "entropy": 1.9465411305427551, "epoch": 0.3582258890742809, "grad_norm": 9.272126197814941, "learning_rate": 4.226842582719583e-06, "loss": 0.5234, "mean_token_accuracy": 0.8437400907278061, "num_tokens": 138952471.0, "step": 115560 }, { "entropy": 1.8139133349061012, "epoch": 0.35825688819933055, "grad_norm": 3.899977684020996, "learning_rate": 4.226659707812723e-06, "loss": 0.3716, "mean_token_accuracy": 0.8707442149519921, "num_tokens": 138965676.0, "step": 115570 }, { "entropy": 1.8158785462379456, "epoch": 0.35828788732438027, "grad_norm": 9.818337440490723, "learning_rate": 4.2264768566401325e-06, "loss": 0.4877, "mean_token_accuracy": 0.8463491559028625, "num_tokens": 138977871.0, "step": 115580 }, { "entropy": 1.764494700729847, "epoch": 0.35831888644942994, "grad_norm": 8.205857276916504, "learning_rate": 4.226294029196676e-06, "loss": 0.3989, "mean_token_accuracy": 0.8628710359334946, "num_tokens": 138991007.0, "step": 115590 }, { "entropy": 1.8400152049958707, "epoch": 0.35834988557447967, "grad_norm": 6.771747589111328, "learning_rate": 4.226111225477222e-06, "loss": 0.4849, "mean_token_accuracy": 0.8476285025477409, "num_tokens": 139002964.0, "step": 115600 }, { "entropy": 1.899264845252037, "epoch": 0.35838088469952933, "grad_norm": 9.301812171936035, "learning_rate": 4.225928445476641e-06, "loss": 0.5283, "mean_token_accuracy": 0.8411499798297882, "num_tokens": 139014497.0, "step": 115610 }, { "entropy": 1.861107437312603, "epoch": 0.35841188382457906, "grad_norm": 3.8199284076690674, "learning_rate": 4.2257456891898015e-06, "loss": 0.518, "mean_token_accuracy": 0.8456135064363479, "num_tokens": 139026732.0, "step": 115620 }, { "entropy": 1.786482220888138, "epoch": 0.3584428829496287, "grad_norm": 7.765440940856934, "learning_rate": 4.2255629566115795e-06, "loss": 0.3853, "mean_token_accuracy": 0.8641697198152543, "num_tokens": 139039189.0, "step": 115630 }, { "entropy": 1.8395205929875373, "epoch": 0.35847388207467845, "grad_norm": 9.376654624938965, "learning_rate": 4.225380247736847e-06, "loss": 0.515, "mean_token_accuracy": 0.8413670718669891, "num_tokens": 139052067.0, "step": 115640 }, { "entropy": 1.9465155333280564, "epoch": 0.3585048811997281, "grad_norm": 8.243704795837402, "learning_rate": 4.225197562560482e-06, "loss": 0.4934, "mean_token_accuracy": 0.8416017279028892, "num_tokens": 139062764.0, "step": 115650 }, { "entropy": 1.842594537138939, "epoch": 0.35853588032477784, "grad_norm": 4.1204681396484375, "learning_rate": 4.22501490107736e-06, "loss": 0.5122, "mean_token_accuracy": 0.8348633021116256, "num_tokens": 139075044.0, "step": 115660 }, { "entropy": 1.8628608852624893, "epoch": 0.3585668794498275, "grad_norm": 7.374316215515137, "learning_rate": 4.2248322632823606e-06, "loss": 0.4423, "mean_token_accuracy": 0.8515262484550477, "num_tokens": 139086635.0, "step": 115670 }, { "entropy": 1.8487340614199639, "epoch": 0.35859787857487724, "grad_norm": 8.635334014892578, "learning_rate": 4.224649649170366e-06, "loss": 0.4594, "mean_token_accuracy": 0.8458402872085571, "num_tokens": 139098547.0, "step": 115680 }, { "entropy": 1.9221700012683869, "epoch": 0.3586288776999269, "grad_norm": 6.838761806488037, "learning_rate": 4.224467058736255e-06, "loss": 0.5657, "mean_token_accuracy": 0.8330105841159821, "num_tokens": 139109818.0, "step": 115690 }, { "entropy": 1.8932587698101997, "epoch": 0.35865987682497663, "grad_norm": 9.387928009033203, "learning_rate": 4.224284491974914e-06, "loss": 0.5134, "mean_token_accuracy": 0.8380125910043716, "num_tokens": 139121059.0, "step": 115700 }, { "entropy": 1.9230019852519036, "epoch": 0.3586908759500263, "grad_norm": 8.831636428833008, "learning_rate": 4.224101948881227e-06, "loss": 0.4914, "mean_token_accuracy": 0.8391887709498406, "num_tokens": 139132889.0, "step": 115710 }, { "entropy": 1.8802688077092171, "epoch": 0.358721875075076, "grad_norm": 3.8806021213531494, "learning_rate": 4.22391942945008e-06, "loss": 0.4532, "mean_token_accuracy": 0.8473272129893303, "num_tokens": 139145205.0, "step": 115720 }, { "entropy": 1.9340951204299928, "epoch": 0.3587528742001257, "grad_norm": 8.25346851348877, "learning_rate": 4.2237369336763625e-06, "loss": 0.4932, "mean_token_accuracy": 0.8459508880972862, "num_tokens": 139156135.0, "step": 115730 }, { "entropy": 1.900227214396, "epoch": 0.3587838733251754, "grad_norm": 8.245087623596191, "learning_rate": 4.223554461554964e-06, "loss": 0.473, "mean_token_accuracy": 0.8462023451924324, "num_tokens": 139167831.0, "step": 115740 }, { "entropy": 1.8334951400756836, "epoch": 0.3588148724502251, "grad_norm": 4.438302040100098, "learning_rate": 4.223372013080776e-06, "loss": 0.4653, "mean_token_accuracy": 0.8457862615585328, "num_tokens": 139180382.0, "step": 115750 }, { "entropy": 1.8711395308375358, "epoch": 0.3588458715752748, "grad_norm": 8.4141845703125, "learning_rate": 4.223189588248691e-06, "loss": 0.4886, "mean_token_accuracy": 0.8420770734548568, "num_tokens": 139192007.0, "step": 115760 }, { "entropy": 1.892673698067665, "epoch": 0.3588768707003245, "grad_norm": 8.64199161529541, "learning_rate": 4.223007187053604e-06, "loss": 0.4734, "mean_token_accuracy": 0.8470587819814682, "num_tokens": 139203626.0, "step": 115770 }, { "entropy": 1.8397964909672737, "epoch": 0.3589078698253742, "grad_norm": 8.469186782836914, "learning_rate": 4.222824809490409e-06, "loss": 0.4957, "mean_token_accuracy": 0.8453930094838142, "num_tokens": 139215766.0, "step": 115780 }, { "entropy": 1.932980051636696, "epoch": 0.3589388689504239, "grad_norm": 6.354795455932617, "learning_rate": 4.2226424555540065e-06, "loss": 0.4755, "mean_token_accuracy": 0.8456053122878074, "num_tokens": 139227681.0, "step": 115790 }, { "entropy": 1.8004991948604583, "epoch": 0.3589698680754736, "grad_norm": 6.467407703399658, "learning_rate": 4.2224601252392935e-06, "loss": 0.4131, "mean_token_accuracy": 0.8630038425326347, "num_tokens": 139239994.0, "step": 115800 }, { "entropy": 1.8661329224705696, "epoch": 0.35900086720052327, "grad_norm": 4.90010929107666, "learning_rate": 4.222277818541172e-06, "loss": 0.498, "mean_token_accuracy": 0.8436091437935829, "num_tokens": 139251297.0, "step": 115810 }, { "entropy": 1.8228035122156143, "epoch": 0.35903186632557293, "grad_norm": 6.741776943206787, "learning_rate": 4.2220955354545435e-06, "loss": 0.4671, "mean_token_accuracy": 0.8479675367474556, "num_tokens": 139263722.0, "step": 115820 }, { "entropy": 1.796131867170334, "epoch": 0.35906286545062266, "grad_norm": 6.297611713409424, "learning_rate": 4.221913275974311e-06, "loss": 0.4424, "mean_token_accuracy": 0.8649747148156166, "num_tokens": 139276144.0, "step": 115830 }, { "entropy": 1.873625811934471, "epoch": 0.35909386457567233, "grad_norm": 7.727682590484619, "learning_rate": 4.221731040095381e-06, "loss": 0.4792, "mean_token_accuracy": 0.8530295416712761, "num_tokens": 139287760.0, "step": 115840 }, { "entropy": 1.8328562811017037, "epoch": 0.35912486370072205, "grad_norm": 8.941142082214355, "learning_rate": 4.221548827812659e-06, "loss": 0.4475, "mean_token_accuracy": 0.8467715248465538, "num_tokens": 139300146.0, "step": 115850 }, { "entropy": 1.8431931123137475, "epoch": 0.3591558628257717, "grad_norm": 8.686592102050781, "learning_rate": 4.221366639121054e-06, "loss": 0.4923, "mean_token_accuracy": 0.8481633648276329, "num_tokens": 139312174.0, "step": 115860 }, { "entropy": 1.7692224755883217, "epoch": 0.35918686195082145, "grad_norm": 7.942878723144531, "learning_rate": 4.221184474015477e-06, "loss": 0.4286, "mean_token_accuracy": 0.8561850890517235, "num_tokens": 139325247.0, "step": 115870 }, { "entropy": 1.8758103132247925, "epoch": 0.3592178610758711, "grad_norm": 8.962884902954102, "learning_rate": 4.221002332490837e-06, "loss": 0.4467, "mean_token_accuracy": 0.8550149217247963, "num_tokens": 139336232.0, "step": 115880 }, { "entropy": 1.7588933199644088, "epoch": 0.35924886020092084, "grad_norm": 4.053327560424805, "learning_rate": 4.220820214542049e-06, "loss": 0.3706, "mean_token_accuracy": 0.8556968107819557, "num_tokens": 139350251.0, "step": 115890 }, { "entropy": 1.8734589472413063, "epoch": 0.3592798593259705, "grad_norm": 9.165533065795898, "learning_rate": 4.220638120164026e-06, "loss": 0.5239, "mean_token_accuracy": 0.8462863549590111, "num_tokens": 139361764.0, "step": 115900 }, { "entropy": 1.8266181737184524, "epoch": 0.35931085845102023, "grad_norm": 7.9356255531311035, "learning_rate": 4.220456049351685e-06, "loss": 0.45, "mean_token_accuracy": 0.8596974804997444, "num_tokens": 139373578.0, "step": 115910 }, { "entropy": 1.8083546802401542, "epoch": 0.3593418575760699, "grad_norm": 3.8307013511657715, "learning_rate": 4.220274002099943e-06, "loss": 0.4362, "mean_token_accuracy": 0.849444305896759, "num_tokens": 139386017.0, "step": 115920 }, { "entropy": 1.8357669502496718, "epoch": 0.3593728567011196, "grad_norm": 3.281771421432495, "learning_rate": 4.220091978403719e-06, "loss": 0.5271, "mean_token_accuracy": 0.848252309858799, "num_tokens": 139398172.0, "step": 115930 }, { "entropy": 1.818470537662506, "epoch": 0.3594038558261693, "grad_norm": 2.5742039680480957, "learning_rate": 4.219909978257934e-06, "loss": 0.4774, "mean_token_accuracy": 0.8499492272734642, "num_tokens": 139410862.0, "step": 115940 }, { "entropy": 1.8105147659778595, "epoch": 0.359434854951219, "grad_norm": 7.754059791564941, "learning_rate": 4.219728001657508e-06, "loss": 0.4111, "mean_token_accuracy": 0.8644904047250748, "num_tokens": 139422867.0, "step": 115950 }, { "entropy": 1.8963597849011422, "epoch": 0.3594658540762687, "grad_norm": 9.430159568786621, "learning_rate": 4.219546048597369e-06, "loss": 0.4856, "mean_token_accuracy": 0.8429613292217255, "num_tokens": 139433810.0, "step": 115960 }, { "entropy": 1.8394680365920066, "epoch": 0.3594968532013184, "grad_norm": 3.8290538787841797, "learning_rate": 4.219364119072439e-06, "loss": 0.4543, "mean_token_accuracy": 0.8480119571089745, "num_tokens": 139446229.0, "step": 115970 }, { "entropy": 1.8272418528795242, "epoch": 0.3595278523263681, "grad_norm": 7.950870990753174, "learning_rate": 4.219182213077646e-06, "loss": 0.4343, "mean_token_accuracy": 0.8526568979024887, "num_tokens": 139458579.0, "step": 115980 }, { "entropy": 1.887735801935196, "epoch": 0.3595588514514178, "grad_norm": 3.933110237121582, "learning_rate": 4.219000330607917e-06, "loss": 0.4695, "mean_token_accuracy": 0.8449479952454567, "num_tokens": 139470657.0, "step": 115990 }, { "entropy": 1.9549269363284112, "epoch": 0.3595898505764675, "grad_norm": 9.912589073181152, "learning_rate": 4.218818471658183e-06, "loss": 0.4899, "mean_token_accuracy": 0.841597031056881, "num_tokens": 139481544.0, "step": 116000 }, { "entropy": 1.904000848531723, "epoch": 0.3596208497015172, "grad_norm": 6.567648887634277, "learning_rate": 4.218636636223375e-06, "loss": 0.4788, "mean_token_accuracy": 0.8507451817393303, "num_tokens": 139492483.0, "step": 116010 }, { "entropy": 1.9398685723543168, "epoch": 0.35965184882656687, "grad_norm": 4.198793411254883, "learning_rate": 4.218454824298425e-06, "loss": 0.5066, "mean_token_accuracy": 0.8451578319072723, "num_tokens": 139504258.0, "step": 116020 }, { "entropy": 1.9017282500863075, "epoch": 0.3596828479516166, "grad_norm": 7.781552314758301, "learning_rate": 4.218273035878269e-06, "loss": 0.4881, "mean_token_accuracy": 0.8435791626572609, "num_tokens": 139515473.0, "step": 116030 }, { "entropy": 1.9017712712287902, "epoch": 0.35971384707666626, "grad_norm": 8.259926795959473, "learning_rate": 4.218091270957841e-06, "loss": 0.5256, "mean_token_accuracy": 0.8393255040049553, "num_tokens": 139526776.0, "step": 116040 }, { "entropy": 1.8337740018963813, "epoch": 0.359744846201716, "grad_norm": 9.133225440979004, "learning_rate": 4.21790952953208e-06, "loss": 0.4639, "mean_token_accuracy": 0.8523485392332077, "num_tokens": 139539083.0, "step": 116050 }, { "entropy": 1.7753349527716638, "epoch": 0.35977584532676565, "grad_norm": 7.402058124542236, "learning_rate": 4.217727811595925e-06, "loss": 0.4548, "mean_token_accuracy": 0.847336596250534, "num_tokens": 139552085.0, "step": 116060 }, { "entropy": 1.8660298094153405, "epoch": 0.3598068444518153, "grad_norm": 7.4676337242126465, "learning_rate": 4.217546117144314e-06, "loss": 0.4653, "mean_token_accuracy": 0.8392193883657455, "num_tokens": 139564096.0, "step": 116070 }, { "entropy": 1.8619601786136628, "epoch": 0.35983784357686505, "grad_norm": 9.07433795928955, "learning_rate": 4.217364446172193e-06, "loss": 0.4855, "mean_token_accuracy": 0.839654740691185, "num_tokens": 139576414.0, "step": 116080 }, { "entropy": 1.9037289932370185, "epoch": 0.3598688427019147, "grad_norm": 8.008439064025879, "learning_rate": 4.217182798674502e-06, "loss": 0.4998, "mean_token_accuracy": 0.8401681199669838, "num_tokens": 139587784.0, "step": 116090 }, { "entropy": 1.8978828579187392, "epoch": 0.35989984182696444, "grad_norm": 8.578210830688477, "learning_rate": 4.2170011746461886e-06, "loss": 0.4871, "mean_token_accuracy": 0.8408385306596756, "num_tokens": 139599171.0, "step": 116100 }, { "entropy": 1.84117241948843, "epoch": 0.3599308409520141, "grad_norm": 3.883363962173462, "learning_rate": 4.216819574082197e-06, "loss": 0.4476, "mean_token_accuracy": 0.8472616419196128, "num_tokens": 139611729.0, "step": 116110 }, { "entropy": 1.8370385035872459, "epoch": 0.35996184007706383, "grad_norm": 4.151669979095459, "learning_rate": 4.2166379969774775e-06, "loss": 0.4473, "mean_token_accuracy": 0.855852136015892, "num_tokens": 139623044.0, "step": 116120 }, { "entropy": 1.8387348473072052, "epoch": 0.3599928392021135, "grad_norm": 8.288407325744629, "learning_rate": 4.216456443326979e-06, "loss": 0.4327, "mean_token_accuracy": 0.8595815673470497, "num_tokens": 139634713.0, "step": 116130 }, { "entropy": 1.9357994318008422, "epoch": 0.3600238383271632, "grad_norm": 10.54449462890625, "learning_rate": 4.216274913125652e-06, "loss": 0.5153, "mean_token_accuracy": 0.8542236015200615, "num_tokens": 139645451.0, "step": 116140 }, { "entropy": 1.8762347102165222, "epoch": 0.3600548374522129, "grad_norm": 8.778923034667969, "learning_rate": 4.216093406368449e-06, "loss": 0.4745, "mean_token_accuracy": 0.8462166652083397, "num_tokens": 139657312.0, "step": 116150 }, { "entropy": 1.8865218967199326, "epoch": 0.3600858365772626, "grad_norm": 7.668148040771484, "learning_rate": 4.2159119230503255e-06, "loss": 0.5044, "mean_token_accuracy": 0.8440534695982933, "num_tokens": 139668202.0, "step": 116160 }, { "entropy": 1.8358159840106965, "epoch": 0.3601168357023123, "grad_norm": 8.311078071594238, "learning_rate": 4.215730463166237e-06, "loss": 0.4634, "mean_token_accuracy": 0.8523998752236366, "num_tokens": 139680084.0, "step": 116170 }, { "entropy": 1.925374338030815, "epoch": 0.360147834827362, "grad_norm": 8.066094398498535, "learning_rate": 4.21554902671114e-06, "loss": 0.5061, "mean_token_accuracy": 0.8462306708097458, "num_tokens": 139690862.0, "step": 116180 }, { "entropy": 1.8480087623000145, "epoch": 0.3601788339524117, "grad_norm": 4.514954090118408, "learning_rate": 4.2153676136799934e-06, "loss": 0.421, "mean_token_accuracy": 0.8582058116793633, "num_tokens": 139703017.0, "step": 116190 }, { "entropy": 1.7714590035378932, "epoch": 0.3602098330774614, "grad_norm": 2.609532356262207, "learning_rate": 4.215186224067758e-06, "loss": 0.3956, "mean_token_accuracy": 0.8530898556113243, "num_tokens": 139716890.0, "step": 116200 }, { "entropy": 1.9455319941043854, "epoch": 0.3602408322025111, "grad_norm": 7.525179862976074, "learning_rate": 4.215004857869394e-06, "loss": 0.5113, "mean_token_accuracy": 0.8397414952516555, "num_tokens": 139727863.0, "step": 116210 }, { "entropy": 1.766502921283245, "epoch": 0.3602718313275608, "grad_norm": 8.821333885192871, "learning_rate": 4.214823515079867e-06, "loss": 0.4441, "mean_token_accuracy": 0.8470347732305527, "num_tokens": 139740638.0, "step": 116220 }, { "entropy": 1.8291141256690024, "epoch": 0.36030283045261047, "grad_norm": 4.927423000335693, "learning_rate": 4.214642195694141e-06, "loss": 0.4251, "mean_token_accuracy": 0.8610336139798165, "num_tokens": 139753345.0, "step": 116230 }, { "entropy": 1.8271966442465781, "epoch": 0.3603338295776602, "grad_norm": 9.124969482421875, "learning_rate": 4.214460899707181e-06, "loss": 0.4365, "mean_token_accuracy": 0.8442619845271111, "num_tokens": 139765345.0, "step": 116240 }, { "entropy": 1.8535583779215812, "epoch": 0.36036482870270986, "grad_norm": 8.4541597366333, "learning_rate": 4.214279627113957e-06, "loss": 0.4227, "mean_token_accuracy": 0.8529452204704284, "num_tokens": 139777146.0, "step": 116250 }, { "entropy": 1.8414417818188666, "epoch": 0.3603958278277596, "grad_norm": 3.8833229541778564, "learning_rate": 4.214098377909436e-06, "loss": 0.421, "mean_token_accuracy": 0.852535355091095, "num_tokens": 139789410.0, "step": 116260 }, { "entropy": 1.8427911520004272, "epoch": 0.36042682695280925, "grad_norm": 3.8969404697418213, "learning_rate": 4.213917152088591e-06, "loss": 0.4381, "mean_token_accuracy": 0.8565546840429306, "num_tokens": 139801439.0, "step": 116270 }, { "entropy": 1.7546941101551057, "epoch": 0.360457826077859, "grad_norm": 8.13198184967041, "learning_rate": 4.2137359496463936e-06, "loss": 0.3821, "mean_token_accuracy": 0.8690331175923347, "num_tokens": 139814865.0, "step": 116280 }, { "entropy": 1.7768574252724647, "epoch": 0.36048882520290865, "grad_norm": 3.7505130767822266, "learning_rate": 4.213554770577818e-06, "loss": 0.3773, "mean_token_accuracy": 0.86257985830307, "num_tokens": 139828042.0, "step": 116290 }, { "entropy": 1.8828290045261382, "epoch": 0.3605198243279583, "grad_norm": 6.885351181030273, "learning_rate": 4.213373614877838e-06, "loss": 0.4649, "mean_token_accuracy": 0.8605077177286148, "num_tokens": 139838856.0, "step": 116300 }, { "entropy": 1.7729797944426537, "epoch": 0.36055082345300804, "grad_norm": 9.330283164978027, "learning_rate": 4.213192482541433e-06, "loss": 0.3842, "mean_token_accuracy": 0.8671557918190956, "num_tokens": 139852638.0, "step": 116310 }, { "entropy": 1.9487037807703018, "epoch": 0.3605818225780577, "grad_norm": 7.210131645202637, "learning_rate": 4.21301137356358e-06, "loss": 0.5066, "mean_token_accuracy": 0.8386420547962189, "num_tokens": 139863146.0, "step": 116320 }, { "entropy": 1.9695353388786316, "epoch": 0.36061282170310743, "grad_norm": 9.178292274475098, "learning_rate": 4.21283028793926e-06, "loss": 0.52, "mean_token_accuracy": 0.8453469514846802, "num_tokens": 139874014.0, "step": 116330 }, { "entropy": 1.8018761366605758, "epoch": 0.3606438208281571, "grad_norm": 7.328160285949707, "learning_rate": 4.212649225663452e-06, "loss": 0.4631, "mean_token_accuracy": 0.8488622546195984, "num_tokens": 139886434.0, "step": 116340 }, { "entropy": 1.9044070094823837, "epoch": 0.3606748199532068, "grad_norm": 7.514301776885986, "learning_rate": 4.212468186731141e-06, "loss": 0.5155, "mean_token_accuracy": 0.8392976179718972, "num_tokens": 139897996.0, "step": 116350 }, { "entropy": 1.8434274643659592, "epoch": 0.3607058190782565, "grad_norm": 8.972174644470215, "learning_rate": 4.212287171137313e-06, "loss": 0.4632, "mean_token_accuracy": 0.8550772503018379, "num_tokens": 139910313.0, "step": 116360 }, { "entropy": 1.803036929666996, "epoch": 0.3607368182033062, "grad_norm": 8.392457008361816, "learning_rate": 4.212106178876951e-06, "loss": 0.4493, "mean_token_accuracy": 0.8607822299003601, "num_tokens": 139922530.0, "step": 116370 }, { "entropy": 1.8568144857883453, "epoch": 0.3607678173283559, "grad_norm": 3.606626510620117, "learning_rate": 4.211925209945044e-06, "loss": 0.4578, "mean_token_accuracy": 0.8569135695695878, "num_tokens": 139933805.0, "step": 116380 }, { "entropy": 1.9102680534124374, "epoch": 0.3607988164534056, "grad_norm": 8.40206241607666, "learning_rate": 4.21174426433658e-06, "loss": 0.5215, "mean_token_accuracy": 0.8408423826098442, "num_tokens": 139945504.0, "step": 116390 }, { "entropy": 1.8889756113290788, "epoch": 0.3608298155784553, "grad_norm": 7.147616386413574, "learning_rate": 4.21156334204655e-06, "loss": 0.4508, "mean_token_accuracy": 0.8580622985959053, "num_tokens": 139956323.0, "step": 116400 }, { "entropy": 1.7778198927640916, "epoch": 0.360860814703505, "grad_norm": 8.543440818786621, "learning_rate": 4.211382443069949e-06, "loss": 0.4076, "mean_token_accuracy": 0.8652505591511727, "num_tokens": 139968634.0, "step": 116410 }, { "entropy": 1.877928839623928, "epoch": 0.3608918138285547, "grad_norm": 2.7431368827819824, "learning_rate": 4.2112015674017645e-06, "loss": 0.4852, "mean_token_accuracy": 0.8467942878603936, "num_tokens": 139981044.0, "step": 116420 }, { "entropy": 1.8687953114509583, "epoch": 0.3609228129536044, "grad_norm": 8.985494613647461, "learning_rate": 4.211020715036995e-06, "loss": 0.5351, "mean_token_accuracy": 0.844055813550949, "num_tokens": 139992856.0, "step": 116430 }, { "entropy": 1.8689031660556794, "epoch": 0.36095381207865407, "grad_norm": 9.813769340515137, "learning_rate": 4.210839885970638e-06, "loss": 0.4723, "mean_token_accuracy": 0.8516811087727547, "num_tokens": 140003419.0, "step": 116440 }, { "entropy": 1.8415801107883454, "epoch": 0.3609848112037038, "grad_norm": 3.827608346939087, "learning_rate": 4.210659080197691e-06, "loss": 0.5246, "mean_token_accuracy": 0.8390990689396858, "num_tokens": 140014798.0, "step": 116450 }, { "entropy": 1.847266887128353, "epoch": 0.36101581032875346, "grad_norm": 8.154986381530762, "learning_rate": 4.210478297713152e-06, "loss": 0.4156, "mean_token_accuracy": 0.8583290934562683, "num_tokens": 140026777.0, "step": 116460 }, { "entropy": 1.872428523004055, "epoch": 0.3610468094538032, "grad_norm": 9.429468154907227, "learning_rate": 4.210297538512023e-06, "loss": 0.4599, "mean_token_accuracy": 0.8427754417061806, "num_tokens": 140038193.0, "step": 116470 }, { "entropy": 1.815330520272255, "epoch": 0.36107780857885285, "grad_norm": 7.44341516494751, "learning_rate": 4.210116802589307e-06, "loss": 0.4494, "mean_token_accuracy": 0.8459246262907982, "num_tokens": 140049896.0, "step": 116480 }, { "entropy": 1.80218645632267, "epoch": 0.3611088077039026, "grad_norm": 4.414290428161621, "learning_rate": 4.209936089940008e-06, "loss": 0.4232, "mean_token_accuracy": 0.8553975149989128, "num_tokens": 140062012.0, "step": 116490 }, { "entropy": 1.8422569379210472, "epoch": 0.36113980682895225, "grad_norm": 7.908370494842529, "learning_rate": 4.20975540055913e-06, "loss": 0.4468, "mean_token_accuracy": 0.8528754249215126, "num_tokens": 140074154.0, "step": 116500 }, { "entropy": 1.7738358169794082, "epoch": 0.36117080595400197, "grad_norm": 7.086633682250977, "learning_rate": 4.2095747344416815e-06, "loss": 0.3905, "mean_token_accuracy": 0.8605238318443298, "num_tokens": 140086516.0, "step": 116510 }, { "entropy": 1.8560999408364296, "epoch": 0.36120180507905164, "grad_norm": 8.640069007873535, "learning_rate": 4.209394091582671e-06, "loss": 0.495, "mean_token_accuracy": 0.8428186282515526, "num_tokens": 140098344.0, "step": 116520 }, { "entropy": 1.8515635520219802, "epoch": 0.36123280420410137, "grad_norm": 9.717535972595215, "learning_rate": 4.209213471977109e-06, "loss": 0.4698, "mean_token_accuracy": 0.846831089258194, "num_tokens": 140110320.0, "step": 116530 }, { "entropy": 1.8465173259377479, "epoch": 0.36126380332915103, "grad_norm": 7.840369701385498, "learning_rate": 4.209032875620006e-06, "loss": 0.4722, "mean_token_accuracy": 0.8543838858604431, "num_tokens": 140122072.0, "step": 116540 }, { "entropy": 1.793773990869522, "epoch": 0.3612948024542007, "grad_norm": 7.085291385650635, "learning_rate": 4.2088523025063745e-06, "loss": 0.4702, "mean_token_accuracy": 0.8551607668399811, "num_tokens": 140134814.0, "step": 116550 }, { "entropy": 1.8004359588027001, "epoch": 0.3613258015792504, "grad_norm": 2.7206969261169434, "learning_rate": 4.208671752631231e-06, "loss": 0.4592, "mean_token_accuracy": 0.8510082602500916, "num_tokens": 140147230.0, "step": 116560 }, { "entropy": 1.838956043124199, "epoch": 0.3613568007043001, "grad_norm": 4.567168235778809, "learning_rate": 4.20849122598959e-06, "loss": 0.4362, "mean_token_accuracy": 0.8527957811951637, "num_tokens": 140159657.0, "step": 116570 }, { "entropy": 1.8510146543383599, "epoch": 0.3613877998293498, "grad_norm": 8.271007537841797, "learning_rate": 4.208310722576469e-06, "loss": 0.4629, "mean_token_accuracy": 0.8527455732226372, "num_tokens": 140171588.0, "step": 116580 }, { "entropy": 1.817806077003479, "epoch": 0.3614187989543995, "grad_norm": 8.602225303649902, "learning_rate": 4.208130242386889e-06, "loss": 0.4285, "mean_token_accuracy": 0.8533978506922721, "num_tokens": 140184012.0, "step": 116590 }, { "entropy": 1.8294128969311714, "epoch": 0.3614497980794492, "grad_norm": 7.451015949249268, "learning_rate": 4.207949785415868e-06, "loss": 0.4322, "mean_token_accuracy": 0.8549151733517647, "num_tokens": 140196072.0, "step": 116600 }, { "entropy": 1.9123011171817779, "epoch": 0.3614807972044989, "grad_norm": 10.21450138092041, "learning_rate": 4.207769351658429e-06, "loss": 0.4947, "mean_token_accuracy": 0.8501939132809639, "num_tokens": 140206777.0, "step": 116610 }, { "entropy": 1.898598951101303, "epoch": 0.3615117963295486, "grad_norm": 7.918070316314697, "learning_rate": 4.2075889411095965e-06, "loss": 0.4788, "mean_token_accuracy": 0.8529366075992584, "num_tokens": 140217957.0, "step": 116620 }, { "entropy": 1.871064305305481, "epoch": 0.3615427954545983, "grad_norm": 3.4128713607788086, "learning_rate": 4.2074085537643945e-06, "loss": 0.4896, "mean_token_accuracy": 0.8430089339613914, "num_tokens": 140229754.0, "step": 116630 }, { "entropy": 1.8388782501220704, "epoch": 0.361573794579648, "grad_norm": 3.9377996921539307, "learning_rate": 4.207228189617849e-06, "loss": 0.4381, "mean_token_accuracy": 0.855095773935318, "num_tokens": 140242273.0, "step": 116640 }, { "entropy": 1.7982016265392304, "epoch": 0.36160479370469767, "grad_norm": 4.408384323120117, "learning_rate": 4.20704784866499e-06, "loss": 0.4145, "mean_token_accuracy": 0.8601232185959816, "num_tokens": 140255003.0, "step": 116650 }, { "entropy": 1.822973382472992, "epoch": 0.3616357928297474, "grad_norm": 9.451082229614258, "learning_rate": 4.206867530900845e-06, "loss": 0.4069, "mean_token_accuracy": 0.8590276911854744, "num_tokens": 140267479.0, "step": 116660 }, { "entropy": 1.7897161670029162, "epoch": 0.36166679195479706, "grad_norm": 8.063048362731934, "learning_rate": 4.206687236320445e-06, "loss": 0.4422, "mean_token_accuracy": 0.8569493010640145, "num_tokens": 140280536.0, "step": 116670 }, { "entropy": 1.8633152276277543, "epoch": 0.3616977910798468, "grad_norm": 7.586259365081787, "learning_rate": 4.206506964918824e-06, "loss": 0.4814, "mean_token_accuracy": 0.84778813123703, "num_tokens": 140292788.0, "step": 116680 }, { "entropy": 1.9379972368478775, "epoch": 0.36172879020489646, "grad_norm": 8.198206901550293, "learning_rate": 4.206326716691015e-06, "loss": 0.5271, "mean_token_accuracy": 0.8326962515711784, "num_tokens": 140303768.0, "step": 116690 }, { "entropy": 1.8760272949934005, "epoch": 0.3617597893299462, "grad_norm": 7.511582851409912, "learning_rate": 4.206146491632053e-06, "loss": 0.4489, "mean_token_accuracy": 0.8527100086212158, "num_tokens": 140314927.0, "step": 116700 }, { "entropy": 1.8044881626963616, "epoch": 0.36179078845499585, "grad_norm": 7.948654651641846, "learning_rate": 4.205966289736976e-06, "loss": 0.4147, "mean_token_accuracy": 0.8563151493668556, "num_tokens": 140327161.0, "step": 116710 }, { "entropy": 1.9257992446422576, "epoch": 0.3618217875800456, "grad_norm": 9.45364761352539, "learning_rate": 4.205786111000822e-06, "loss": 0.5259, "mean_token_accuracy": 0.8418830558657646, "num_tokens": 140337921.0, "step": 116720 }, { "entropy": 1.8222204566001892, "epoch": 0.36185278670509524, "grad_norm": 3.7886269092559814, "learning_rate": 4.2056059554186305e-06, "loss": 0.4949, "mean_token_accuracy": 0.8379009708762168, "num_tokens": 140350153.0, "step": 116730 }, { "entropy": 1.8184516310691834, "epoch": 0.36188378583014497, "grad_norm": 4.151296138763428, "learning_rate": 4.2054258229854435e-06, "loss": 0.4617, "mean_token_accuracy": 0.8527412965893746, "num_tokens": 140363079.0, "step": 116740 }, { "entropy": 1.8538252338767052, "epoch": 0.36191478495519463, "grad_norm": 6.772287845611572, "learning_rate": 4.205245713696304e-06, "loss": 0.4964, "mean_token_accuracy": 0.8451321288943291, "num_tokens": 140375101.0, "step": 116750 }, { "entropy": 1.9226402550935746, "epoch": 0.36194578408024436, "grad_norm": 8.788530349731445, "learning_rate": 4.205065627546256e-06, "loss": 0.5342, "mean_token_accuracy": 0.8383546307682991, "num_tokens": 140386168.0, "step": 116760 }, { "entropy": 1.7971034452319146, "epoch": 0.36197678320529403, "grad_norm": 9.055315017700195, "learning_rate": 4.204885564530345e-06, "loss": 0.4369, "mean_token_accuracy": 0.8534369856119156, "num_tokens": 140399104.0, "step": 116770 }, { "entropy": 1.903681069612503, "epoch": 0.36200778233034375, "grad_norm": 8.377460479736328, "learning_rate": 4.204705524643619e-06, "loss": 0.5469, "mean_token_accuracy": 0.8296264916658401, "num_tokens": 140409966.0, "step": 116780 }, { "entropy": 1.8770312041044235, "epoch": 0.3620387814553934, "grad_norm": 7.98540735244751, "learning_rate": 4.204525507881126e-06, "loss": 0.4922, "mean_token_accuracy": 0.8401692688465119, "num_tokens": 140423005.0, "step": 116790 }, { "entropy": 1.8793371990323067, "epoch": 0.3620697805804431, "grad_norm": 8.905379295349121, "learning_rate": 4.204345514237917e-06, "loss": 0.4797, "mean_token_accuracy": 0.8462567001581192, "num_tokens": 140434436.0, "step": 116800 }, { "entropy": 1.8640392243862152, "epoch": 0.3621007797054928, "grad_norm": 4.022253036499023, "learning_rate": 4.204165543709043e-06, "loss": 0.4321, "mean_token_accuracy": 0.8513988614082336, "num_tokens": 140446710.0, "step": 116810 }, { "entropy": 1.8475000470876695, "epoch": 0.3621317788305425, "grad_norm": 8.355897903442383, "learning_rate": 4.2039855962895605e-06, "loss": 0.4696, "mean_token_accuracy": 0.8459990754723549, "num_tokens": 140458545.0, "step": 116820 }, { "entropy": 1.8094294220209122, "epoch": 0.3621627779555922, "grad_norm": 2.8843464851379395, "learning_rate": 4.203805671974519e-06, "loss": 0.4604, "mean_token_accuracy": 0.8503652885556221, "num_tokens": 140471070.0, "step": 116830 }, { "entropy": 1.8952600628137588, "epoch": 0.3621937770806419, "grad_norm": 7.821008682250977, "learning_rate": 4.203625770758979e-06, "loss": 0.5673, "mean_token_accuracy": 0.8331520974636077, "num_tokens": 140482920.0, "step": 116840 }, { "entropy": 1.827965249121189, "epoch": 0.3622247762056916, "grad_norm": 9.462203025817871, "learning_rate": 4.2034458926379965e-06, "loss": 0.4623, "mean_token_accuracy": 0.854870118200779, "num_tokens": 140495151.0, "step": 116850 }, { "entropy": 1.8622143000364304, "epoch": 0.36225577533074127, "grad_norm": 8.208244323730469, "learning_rate": 4.20326603760663e-06, "loss": 0.4878, "mean_token_accuracy": 0.8542701035737992, "num_tokens": 140507333.0, "step": 116860 }, { "entropy": 1.7744985669851303, "epoch": 0.362286774455791, "grad_norm": 8.417732238769531, "learning_rate": 4.2030862056599415e-06, "loss": 0.3831, "mean_token_accuracy": 0.8632864966988564, "num_tokens": 140520558.0, "step": 116870 }, { "entropy": 1.8826520085334777, "epoch": 0.36231777358084066, "grad_norm": 8.546829223632812, "learning_rate": 4.202906396792993e-06, "loss": 0.4866, "mean_token_accuracy": 0.8448991179466248, "num_tokens": 140532773.0, "step": 116880 }, { "entropy": 1.8119245707988738, "epoch": 0.3623487727058904, "grad_norm": 7.6031599044799805, "learning_rate": 4.2027266110008475e-06, "loss": 0.4156, "mean_token_accuracy": 0.8597419142723084, "num_tokens": 140544815.0, "step": 116890 }, { "entropy": 1.8004948943853378, "epoch": 0.36237977183094006, "grad_norm": 9.985618591308594, "learning_rate": 4.202546848278572e-06, "loss": 0.4757, "mean_token_accuracy": 0.8449292674660682, "num_tokens": 140557252.0, "step": 116900 }, { "entropy": 1.8618132412433623, "epoch": 0.3624107709559898, "grad_norm": 6.6439290046691895, "learning_rate": 4.2023671086212295e-06, "loss": 0.457, "mean_token_accuracy": 0.8493566080927849, "num_tokens": 140568584.0, "step": 116910 }, { "entropy": 1.7272257059812546, "epoch": 0.36244177008103945, "grad_norm": 5.679489612579346, "learning_rate": 4.202187392023891e-06, "loss": 0.3764, "mean_token_accuracy": 0.8584482952952385, "num_tokens": 140582578.0, "step": 116920 }, { "entropy": 1.9361666440963745, "epoch": 0.3624727692060892, "grad_norm": 7.492152690887451, "learning_rate": 4.202007698481626e-06, "loss": 0.4652, "mean_token_accuracy": 0.8505539804697037, "num_tokens": 140593156.0, "step": 116930 }, { "entropy": 1.9066250026226044, "epoch": 0.36250376833113884, "grad_norm": 9.594344139099121, "learning_rate": 4.201828027989504e-06, "loss": 0.4974, "mean_token_accuracy": 0.8503702595829964, "num_tokens": 140604659.0, "step": 116940 }, { "entropy": 1.8485996812582015, "epoch": 0.36253476745618857, "grad_norm": 9.402889251708984, "learning_rate": 4.201648380542599e-06, "loss": 0.4438, "mean_token_accuracy": 0.8464497730135918, "num_tokens": 140616897.0, "step": 116950 }, { "entropy": 1.8569966793060302, "epoch": 0.36256576658123824, "grad_norm": 8.817802429199219, "learning_rate": 4.201468756135983e-06, "loss": 0.4861, "mean_token_accuracy": 0.839309224486351, "num_tokens": 140628644.0, "step": 116960 }, { "entropy": 1.827142098546028, "epoch": 0.36259676570628796, "grad_norm": 7.352452278137207, "learning_rate": 4.201289154764733e-06, "loss": 0.477, "mean_token_accuracy": 0.8442728266119957, "num_tokens": 140640707.0, "step": 116970 }, { "entropy": 1.807940025627613, "epoch": 0.36262776483133763, "grad_norm": 4.695886135101318, "learning_rate": 4.201109576423926e-06, "loss": 0.4861, "mean_token_accuracy": 0.8374917089939118, "num_tokens": 140653668.0, "step": 116980 }, { "entropy": 1.8596377670764923, "epoch": 0.36265876395638735, "grad_norm": 9.165081024169922, "learning_rate": 4.20093002110864e-06, "loss": 0.4625, "mean_token_accuracy": 0.8492574363946914, "num_tokens": 140665982.0, "step": 116990 }, { "entropy": 1.8663609504699707, "epoch": 0.362689763081437, "grad_norm": 7.703036785125732, "learning_rate": 4.200750488813955e-06, "loss": 0.4468, "mean_token_accuracy": 0.8582181662321091, "num_tokens": 140677607.0, "step": 117000 }, { "entropy": 1.808670112490654, "epoch": 0.36272076220648675, "grad_norm": 3.881373405456543, "learning_rate": 4.200570979534951e-06, "loss": 0.3998, "mean_token_accuracy": 0.856096900999546, "num_tokens": 140690750.0, "step": 117010 }, { "entropy": 1.8351006895303725, "epoch": 0.3627517613315364, "grad_norm": 3.9870259761810303, "learning_rate": 4.200391493266714e-06, "loss": 0.4496, "mean_token_accuracy": 0.8495851859450341, "num_tokens": 140702826.0, "step": 117020 }, { "entropy": 1.8416362181305885, "epoch": 0.36278276045658614, "grad_norm": 7.493378639221191, "learning_rate": 4.200212030004326e-06, "loss": 0.4321, "mean_token_accuracy": 0.8568256065249443, "num_tokens": 140714882.0, "step": 117030 }, { "entropy": 1.8965220645070076, "epoch": 0.3628137595816358, "grad_norm": 7.964942455291748, "learning_rate": 4.200032589742872e-06, "loss": 0.4827, "mean_token_accuracy": 0.8496366932988166, "num_tokens": 140726128.0, "step": 117040 }, { "entropy": 1.835560789704323, "epoch": 0.3628447587066855, "grad_norm": 8.899863243103027, "learning_rate": 4.1998531724774405e-06, "loss": 0.4407, "mean_token_accuracy": 0.8567850336432457, "num_tokens": 140738179.0, "step": 117050 }, { "entropy": 1.8459725484251976, "epoch": 0.3628757578317352, "grad_norm": 6.545345306396484, "learning_rate": 4.19967377820312e-06, "loss": 0.4149, "mean_token_accuracy": 0.8515329957008362, "num_tokens": 140750630.0, "step": 117060 }, { "entropy": 1.9014078021049499, "epoch": 0.36290675695678487, "grad_norm": 4.256891250610352, "learning_rate": 4.199494406915001e-06, "loss": 0.4797, "mean_token_accuracy": 0.8485517993569374, "num_tokens": 140762145.0, "step": 117070 }, { "entropy": 1.8998582303524016, "epoch": 0.3629377560818346, "grad_norm": 7.5159430503845215, "learning_rate": 4.1993150586081755e-06, "loss": 0.478, "mean_token_accuracy": 0.845410218834877, "num_tokens": 140773817.0, "step": 117080 }, { "entropy": 1.8339273512363434, "epoch": 0.36296875520688426, "grad_norm": 3.6679487228393555, "learning_rate": 4.1991357332777346e-06, "loss": 0.4336, "mean_token_accuracy": 0.8533756047487259, "num_tokens": 140786301.0, "step": 117090 }, { "entropy": 1.8324194088578225, "epoch": 0.362999754331934, "grad_norm": 8.492642402648926, "learning_rate": 4.198956430918775e-06, "loss": 0.4395, "mean_token_accuracy": 0.8549712345004081, "num_tokens": 140798870.0, "step": 117100 }, { "entropy": 1.8803598403930664, "epoch": 0.36303075345698366, "grad_norm": 7.195762634277344, "learning_rate": 4.1987771515263905e-06, "loss": 0.5104, "mean_token_accuracy": 0.8416722387075424, "num_tokens": 140811461.0, "step": 117110 }, { "entropy": 1.9139654606580734, "epoch": 0.3630617525820334, "grad_norm": 4.575145244598389, "learning_rate": 4.198597895095681e-06, "loss": 0.4905, "mean_token_accuracy": 0.8445180609822274, "num_tokens": 140822762.0, "step": 117120 }, { "entropy": 1.9604573130607605, "epoch": 0.36309275170708305, "grad_norm": 8.761122703552246, "learning_rate": 4.198418661621745e-06, "loss": 0.4941, "mean_token_accuracy": 0.8457563772797585, "num_tokens": 140833186.0, "step": 117130 }, { "entropy": 1.8488843753933906, "epoch": 0.3631237508321328, "grad_norm": 8.668008804321289, "learning_rate": 4.198239451099681e-06, "loss": 0.4754, "mean_token_accuracy": 0.848555526137352, "num_tokens": 140845017.0, "step": 117140 }, { "entropy": 1.8061452731490135, "epoch": 0.36315474995718244, "grad_norm": 8.766512870788574, "learning_rate": 4.198060263524593e-06, "loss": 0.4393, "mean_token_accuracy": 0.8509079724550247, "num_tokens": 140857708.0, "step": 117150 }, { "entropy": 1.8900872439146041, "epoch": 0.36318574908223217, "grad_norm": 8.589384078979492, "learning_rate": 4.197881098891584e-06, "loss": 0.454, "mean_token_accuracy": 0.84776521474123, "num_tokens": 140869482.0, "step": 117160 }, { "entropy": 1.887955754995346, "epoch": 0.36321674820728184, "grad_norm": 8.895465850830078, "learning_rate": 4.197701957195758e-06, "loss": 0.4763, "mean_token_accuracy": 0.8418793961405754, "num_tokens": 140881457.0, "step": 117170 }, { "entropy": 1.9077037498354912, "epoch": 0.36324774733233156, "grad_norm": 8.192936897277832, "learning_rate": 4.197522838432221e-06, "loss": 0.4826, "mean_token_accuracy": 0.8485002934932708, "num_tokens": 140893109.0, "step": 117180 }, { "entropy": 1.9021349415183066, "epoch": 0.36327874645738123, "grad_norm": 8.755243301391602, "learning_rate": 4.197343742596083e-06, "loss": 0.5011, "mean_token_accuracy": 0.8397643223404885, "num_tokens": 140904533.0, "step": 117190 }, { "entropy": 1.8732059866189956, "epoch": 0.36330974558243095, "grad_norm": 8.205700874328613, "learning_rate": 4.19716466968245e-06, "loss": 0.4276, "mean_token_accuracy": 0.8494512528181076, "num_tokens": 140917332.0, "step": 117200 }, { "entropy": 1.8849352940917015, "epoch": 0.3633407447074806, "grad_norm": 3.967863082885742, "learning_rate": 4.196985619686435e-06, "loss": 0.4454, "mean_token_accuracy": 0.8578123107552529, "num_tokens": 140929314.0, "step": 117210 }, { "entropy": 1.8729075148701668, "epoch": 0.36337174383253035, "grad_norm": 8.107409477233887, "learning_rate": 4.196806592603149e-06, "loss": 0.4313, "mean_token_accuracy": 0.8556957468390465, "num_tokens": 140941339.0, "step": 117220 }, { "entropy": 1.9158906683325767, "epoch": 0.36340274295758, "grad_norm": 9.10888385772705, "learning_rate": 4.196627588427705e-06, "loss": 0.4664, "mean_token_accuracy": 0.849520568549633, "num_tokens": 140952013.0, "step": 117230 }, { "entropy": 1.8629322737455367, "epoch": 0.36343374208262974, "grad_norm": 7.760976314544678, "learning_rate": 4.196448607155221e-06, "loss": 0.4591, "mean_token_accuracy": 0.8583438560366631, "num_tokens": 140963744.0, "step": 117240 }, { "entropy": 1.7615326926112176, "epoch": 0.3634647412076794, "grad_norm": 8.887621879577637, "learning_rate": 4.19626964878081e-06, "loss": 0.3829, "mean_token_accuracy": 0.8631403967738152, "num_tokens": 140976932.0, "step": 117250 }, { "entropy": 1.8309187158942222, "epoch": 0.36349574033272913, "grad_norm": 7.472461223602295, "learning_rate": 4.196090713299592e-06, "loss": 0.4227, "mean_token_accuracy": 0.8512938529253006, "num_tokens": 140989695.0, "step": 117260 }, { "entropy": 1.8179723098874092, "epoch": 0.3635267394577788, "grad_norm": 8.319780349731445, "learning_rate": 4.195911800706686e-06, "loss": 0.4561, "mean_token_accuracy": 0.855165833234787, "num_tokens": 141001678.0, "step": 117270 }, { "entropy": 1.873963290452957, "epoch": 0.3635577385828285, "grad_norm": 8.117876052856445, "learning_rate": 4.195732910997212e-06, "loss": 0.5322, "mean_token_accuracy": 0.8410425320267677, "num_tokens": 141012566.0, "step": 117280 }, { "entropy": 1.8815976113080979, "epoch": 0.3635887377078782, "grad_norm": 9.272675514221191, "learning_rate": 4.195554044166294e-06, "loss": 0.4761, "mean_token_accuracy": 0.8484034299850464, "num_tokens": 141023991.0, "step": 117290 }, { "entropy": 1.8928146451711654, "epoch": 0.36361973683292786, "grad_norm": 10.41171646118164, "learning_rate": 4.195375200209055e-06, "loss": 0.4817, "mean_token_accuracy": 0.8498426347970962, "num_tokens": 141034844.0, "step": 117300 }, { "entropy": 1.9140311002731323, "epoch": 0.3636507359579776, "grad_norm": 9.667838096618652, "learning_rate": 4.195196379120619e-06, "loss": 0.5428, "mean_token_accuracy": 0.8315603911876679, "num_tokens": 141045930.0, "step": 117310 }, { "entropy": 1.817529346048832, "epoch": 0.36368173508302726, "grad_norm": 7.76594877243042, "learning_rate": 4.195017580896114e-06, "loss": 0.4412, "mean_token_accuracy": 0.8598337545990944, "num_tokens": 141058763.0, "step": 117320 }, { "entropy": 1.971425500512123, "epoch": 0.363712734208077, "grad_norm": 8.319217681884766, "learning_rate": 4.194838805530668e-06, "loss": 0.5262, "mean_token_accuracy": 0.8371591001749039, "num_tokens": 141068907.0, "step": 117330 }, { "entropy": 1.9198841854929924, "epoch": 0.36374373333312665, "grad_norm": 8.907166481018066, "learning_rate": 4.19466005301941e-06, "loss": 0.4783, "mean_token_accuracy": 0.8449474439024925, "num_tokens": 141080433.0, "step": 117340 }, { "entropy": 1.8737755030393601, "epoch": 0.3637747324581764, "grad_norm": 8.013126373291016, "learning_rate": 4.194481323357473e-06, "loss": 0.444, "mean_token_accuracy": 0.8571465671062469, "num_tokens": 141092637.0, "step": 117350 }, { "entropy": 1.9522608071565628, "epoch": 0.36380573158322604, "grad_norm": 6.9084296226501465, "learning_rate": 4.194302616539986e-06, "loss": 0.4702, "mean_token_accuracy": 0.8557817369699479, "num_tokens": 141103184.0, "step": 117360 }, { "entropy": 1.849492047727108, "epoch": 0.36383673070827577, "grad_norm": 9.977517127990723, "learning_rate": 4.194123932562086e-06, "loss": 0.438, "mean_token_accuracy": 0.8567962691187858, "num_tokens": 141115374.0, "step": 117370 }, { "entropy": 1.8683731004595756, "epoch": 0.36386772983332544, "grad_norm": 3.932969093322754, "learning_rate": 4.193945271418908e-06, "loss": 0.4906, "mean_token_accuracy": 0.8401071175932884, "num_tokens": 141127677.0, "step": 117380 }, { "entropy": 1.8759948313236237, "epoch": 0.36389872895837516, "grad_norm": 6.948093414306641, "learning_rate": 4.193766633105587e-06, "loss": 0.4355, "mean_token_accuracy": 0.8567280665040016, "num_tokens": 141139281.0, "step": 117390 }, { "entropy": 1.9030549958348275, "epoch": 0.36392972808342483, "grad_norm": 4.025951862335205, "learning_rate": 4.1935880176172626e-06, "loss": 0.5086, "mean_token_accuracy": 0.8426888778805732, "num_tokens": 141150437.0, "step": 117400 }, { "entropy": 1.790473000705242, "epoch": 0.36396072720847455, "grad_norm": 7.977076053619385, "learning_rate": 4.193409424949075e-06, "loss": 0.361, "mean_token_accuracy": 0.8608955994248391, "num_tokens": 141163304.0, "step": 117410 }, { "entropy": 1.9095865696668626, "epoch": 0.3639917263335242, "grad_norm": 6.860559940338135, "learning_rate": 4.193230855096164e-06, "loss": 0.4834, "mean_token_accuracy": 0.8461943343281746, "num_tokens": 141174839.0, "step": 117420 }, { "entropy": 1.8187424436211586, "epoch": 0.36402272545857395, "grad_norm": 10.87106704711914, "learning_rate": 4.193052308053674e-06, "loss": 0.4349, "mean_token_accuracy": 0.8446593895554543, "num_tokens": 141187933.0, "step": 117430 }, { "entropy": 1.8248173132538796, "epoch": 0.3640537245836236, "grad_norm": 8.197173118591309, "learning_rate": 4.192873783816748e-06, "loss": 0.4556, "mean_token_accuracy": 0.8538053125143051, "num_tokens": 141200760.0, "step": 117440 }, { "entropy": 1.7480739682912827, "epoch": 0.36408472370867334, "grad_norm": 5.014430046081543, "learning_rate": 4.192695282380531e-06, "loss": 0.3556, "mean_token_accuracy": 0.8659795328974724, "num_tokens": 141215184.0, "step": 117450 }, { "entropy": 1.9292996317148208, "epoch": 0.364115722833723, "grad_norm": 8.716943740844727, "learning_rate": 4.192516803740172e-06, "loss": 0.4974, "mean_token_accuracy": 0.8481010347604752, "num_tokens": 141226311.0, "step": 117460 }, { "entropy": 1.8196698293089866, "epoch": 0.36414672195877273, "grad_norm": 6.581869602203369, "learning_rate": 4.192338347890818e-06, "loss": 0.4392, "mean_token_accuracy": 0.8496568530797959, "num_tokens": 141238353.0, "step": 117470 }, { "entropy": 1.8798624217510223, "epoch": 0.3641777210838224, "grad_norm": 8.914942741394043, "learning_rate": 4.192159914827618e-06, "loss": 0.5287, "mean_token_accuracy": 0.8403584629297256, "num_tokens": 141250412.0, "step": 117480 }, { "entropy": 1.9054614737629891, "epoch": 0.3642087202088721, "grad_norm": 8.384186744689941, "learning_rate": 4.191981504545725e-06, "loss": 0.497, "mean_token_accuracy": 0.8418497174978257, "num_tokens": 141261399.0, "step": 117490 }, { "entropy": 1.8841155782341956, "epoch": 0.3642397193339218, "grad_norm": 9.407716751098633, "learning_rate": 4.191803117040292e-06, "loss": 0.5278, "mean_token_accuracy": 0.8366957977414131, "num_tokens": 141273160.0, "step": 117500 }, { "entropy": 1.82247234582901, "epoch": 0.3642707184589715, "grad_norm": 10.961734771728516, "learning_rate": 4.191624752306471e-06, "loss": 0.4697, "mean_token_accuracy": 0.8399570837616921, "num_tokens": 141285805.0, "step": 117510 }, { "entropy": 1.8197978034615516, "epoch": 0.3643017175840212, "grad_norm": 9.942026138305664, "learning_rate": 4.191446410339419e-06, "loss": 0.4457, "mean_token_accuracy": 0.8496342837810517, "num_tokens": 141297836.0, "step": 117520 }, { "entropy": 1.8120769903063774, "epoch": 0.3643327167090709, "grad_norm": 4.632111549377441, "learning_rate": 4.191268091134293e-06, "loss": 0.4391, "mean_token_accuracy": 0.855706425011158, "num_tokens": 141309962.0, "step": 117530 }, { "entropy": 1.9124917179346084, "epoch": 0.3643637158341206, "grad_norm": 8.424860000610352, "learning_rate": 4.191089794686252e-06, "loss": 0.5288, "mean_token_accuracy": 0.8378676295280456, "num_tokens": 141320584.0, "step": 117540 }, { "entropy": 1.871908935904503, "epoch": 0.36439471495917025, "grad_norm": 3.5765280723571777, "learning_rate": 4.190911520990456e-06, "loss": 0.4489, "mean_token_accuracy": 0.8616762563586235, "num_tokens": 141331690.0, "step": 117550 }, { "entropy": 1.827473521232605, "epoch": 0.36442571408422, "grad_norm": 8.839676856994629, "learning_rate": 4.190733270042066e-06, "loss": 0.4528, "mean_token_accuracy": 0.856446447968483, "num_tokens": 141344147.0, "step": 117560 }, { "entropy": 1.9029761299490928, "epoch": 0.36445671320926964, "grad_norm": 10.330524444580078, "learning_rate": 4.190555041836245e-06, "loss": 0.4899, "mean_token_accuracy": 0.8440795317292213, "num_tokens": 141355707.0, "step": 117570 }, { "entropy": 1.8510171189904212, "epoch": 0.36448771233431937, "grad_norm": 10.09470272064209, "learning_rate": 4.190376836368157e-06, "loss": 0.5337, "mean_token_accuracy": 0.8413840815424919, "num_tokens": 141367817.0, "step": 117580 }, { "entropy": 1.9088376432657241, "epoch": 0.36451871145936904, "grad_norm": 7.446246147155762, "learning_rate": 4.190198653632968e-06, "loss": 0.4873, "mean_token_accuracy": 0.850058002769947, "num_tokens": 141378606.0, "step": 117590 }, { "entropy": 1.7923038378357887, "epoch": 0.36454971058441876, "grad_norm": 4.018001556396484, "learning_rate": 4.190020493625845e-06, "loss": 0.4045, "mean_token_accuracy": 0.8695195659995079, "num_tokens": 141390643.0, "step": 117600 }, { "entropy": 1.9072920486330986, "epoch": 0.36458070970946843, "grad_norm": 7.586421489715576, "learning_rate": 4.1898423563419565e-06, "loss": 0.496, "mean_token_accuracy": 0.8380471915006638, "num_tokens": 141401942.0, "step": 117610 }, { "entropy": 1.8963149681687355, "epoch": 0.36461170883451816, "grad_norm": Infinity, "learning_rate": 4.1896642417764735e-06, "loss": 0.5418, "mean_token_accuracy": 0.8512624606490136, "num_tokens": 141414153.0, "step": 117620 }, { "entropy": 1.8850282415747643, "epoch": 0.3646427079595678, "grad_norm": 10.361580848693848, "learning_rate": 4.189486149924567e-06, "loss": 0.4836, "mean_token_accuracy": 0.8521965146064758, "num_tokens": 141425510.0, "step": 117630 }, { "entropy": 1.8708653166890143, "epoch": 0.36467370708461755, "grad_norm": 9.141669273376465, "learning_rate": 4.189308080781409e-06, "loss": 0.5113, "mean_token_accuracy": 0.8514803513884545, "num_tokens": 141436975.0, "step": 117640 }, { "entropy": 1.86157566010952, "epoch": 0.3647047062096672, "grad_norm": 7.51423454284668, "learning_rate": 4.189130034342174e-06, "loss": 0.4494, "mean_token_accuracy": 0.8555971041321755, "num_tokens": 141449229.0, "step": 117650 }, { "entropy": 1.8771350249648093, "epoch": 0.36473570533471694, "grad_norm": 4.558736801147461, "learning_rate": 4.18895201060204e-06, "loss": 0.5087, "mean_token_accuracy": 0.8356175437569618, "num_tokens": 141460708.0, "step": 117660 }, { "entropy": 1.884423953294754, "epoch": 0.3647667044597666, "grad_norm": 7.785421848297119, "learning_rate": 4.188774009556181e-06, "loss": 0.5019, "mean_token_accuracy": 0.845032088458538, "num_tokens": 141472534.0, "step": 117670 }, { "entropy": 1.8946044102311135, "epoch": 0.36479770358481634, "grad_norm": 8.261967658996582, "learning_rate": 4.188596031199778e-06, "loss": 0.4688, "mean_token_accuracy": 0.8496976479887962, "num_tokens": 141483714.0, "step": 117680 }, { "entropy": 1.7952809534966945, "epoch": 0.364828702709866, "grad_norm": 7.456904888153076, "learning_rate": 4.188418075528011e-06, "loss": 0.4413, "mean_token_accuracy": 0.8519541397690773, "num_tokens": 141497189.0, "step": 117690 }, { "entropy": 1.9262281000614165, "epoch": 0.36485970183491573, "grad_norm": 8.199373245239258, "learning_rate": 4.188240142536061e-06, "loss": 0.5315, "mean_token_accuracy": 0.8272052019834518, "num_tokens": 141508512.0, "step": 117700 }, { "entropy": 1.8607816338539123, "epoch": 0.3648907009599654, "grad_norm": 8.653282165527344, "learning_rate": 4.18806223221911e-06, "loss": 0.5164, "mean_token_accuracy": 0.8485778480768204, "num_tokens": 141521091.0, "step": 117710 }, { "entropy": 1.9451293349266052, "epoch": 0.3649217000850151, "grad_norm": 3.9272289276123047, "learning_rate": 4.1878843445723445e-06, "loss": 0.5076, "mean_token_accuracy": 0.8386497780680656, "num_tokens": 141532558.0, "step": 117720 }, { "entropy": 1.843569315969944, "epoch": 0.3649526992100648, "grad_norm": 9.565539360046387, "learning_rate": 4.187706479590949e-06, "loss": 0.4864, "mean_token_accuracy": 0.8429495915770531, "num_tokens": 141545391.0, "step": 117730 }, { "entropy": 1.8172935619950294, "epoch": 0.3649836983351145, "grad_norm": 3.4644248485565186, "learning_rate": 4.18752863727011e-06, "loss": 0.4049, "mean_token_accuracy": 0.8586100369691849, "num_tokens": 141557495.0, "step": 117740 }, { "entropy": 1.8373050913214684, "epoch": 0.3650146974601642, "grad_norm": 9.560324668884277, "learning_rate": 4.187350817605018e-06, "loss": 0.4405, "mean_token_accuracy": 0.8568969145417213, "num_tokens": 141569946.0, "step": 117750 }, { "entropy": 1.8132469549775123, "epoch": 0.3650456965852139, "grad_norm": 10.291223526000977, "learning_rate": 4.187173020590862e-06, "loss": 0.4115, "mean_token_accuracy": 0.8503494530916214, "num_tokens": 141583063.0, "step": 117760 }, { "entropy": 1.8738684132695198, "epoch": 0.3650766957102636, "grad_norm": 7.412810802459717, "learning_rate": 4.186995246222834e-06, "loss": 0.4488, "mean_token_accuracy": 0.8561365365982055, "num_tokens": 141595030.0, "step": 117770 }, { "entropy": 1.9303662449121475, "epoch": 0.36510769483531325, "grad_norm": 7.82151985168457, "learning_rate": 4.186817494496126e-06, "loss": 0.4914, "mean_token_accuracy": 0.84500802308321, "num_tokens": 141606726.0, "step": 117780 }, { "entropy": 1.8065059944987296, "epoch": 0.36513869396036297, "grad_norm": 7.31696081161499, "learning_rate": 4.186639765405935e-06, "loss": 0.4184, "mean_token_accuracy": 0.8545883074402809, "num_tokens": 141620009.0, "step": 117790 }, { "entropy": 1.8446330174803733, "epoch": 0.36516969308541264, "grad_norm": 8.894010543823242, "learning_rate": 4.186462058947453e-06, "loss": 0.4723, "mean_token_accuracy": 0.8492456763982773, "num_tokens": 141631772.0, "step": 117800 }, { "entropy": 1.773499608039856, "epoch": 0.36520069221046236, "grad_norm": 8.691332817077637, "learning_rate": 4.186284375115881e-06, "loss": 0.4516, "mean_token_accuracy": 0.8507530987262726, "num_tokens": 141646041.0, "step": 117810 }, { "entropy": 1.8811022609472274, "epoch": 0.36523169133551203, "grad_norm": 10.04056453704834, "learning_rate": 4.186106713906415e-06, "loss": 0.4177, "mean_token_accuracy": 0.8536274120211601, "num_tokens": 141657936.0, "step": 117820 }, { "entropy": 1.8523863427340985, "epoch": 0.36526269046056176, "grad_norm": 9.139519691467285, "learning_rate": 4.1859290753142566e-06, "loss": 0.4434, "mean_token_accuracy": 0.8525974780321122, "num_tokens": 141670704.0, "step": 117830 }, { "entropy": 1.8781090155243874, "epoch": 0.3652936895856114, "grad_norm": 11.72330379486084, "learning_rate": 4.185751459334607e-06, "loss": 0.5319, "mean_token_accuracy": 0.8432107910513877, "num_tokens": 141682045.0, "step": 117840 }, { "entropy": 1.7702817119657994, "epoch": 0.36532468871066115, "grad_norm": 7.2551188468933105, "learning_rate": 4.185573865962669e-06, "loss": 0.4034, "mean_token_accuracy": 0.8568816781044006, "num_tokens": 141696085.0, "step": 117850 }, { "entropy": 1.9511079430580138, "epoch": 0.3653556878357108, "grad_norm": 9.1026029586792, "learning_rate": 4.185396295193647e-06, "loss": 0.5668, "mean_token_accuracy": 0.8349045425653457, "num_tokens": 141707166.0, "step": 117860 }, { "entropy": 1.8783316642045975, "epoch": 0.36538668696076054, "grad_norm": 8.064900398254395, "learning_rate": 4.185218747022747e-06, "loss": 0.5292, "mean_token_accuracy": 0.8456951096653939, "num_tokens": 141719014.0, "step": 117870 }, { "entropy": 1.8309227600693703, "epoch": 0.3654176860858102, "grad_norm": 8.56662654876709, "learning_rate": 4.185041221445176e-06, "loss": 0.5239, "mean_token_accuracy": 0.8421471819281579, "num_tokens": 141732151.0, "step": 117880 }, { "entropy": 1.8460636466741562, "epoch": 0.36544868521085994, "grad_norm": 10.314547538757324, "learning_rate": 4.184863718456143e-06, "loss": 0.4106, "mean_token_accuracy": 0.8483897626399994, "num_tokens": 141744646.0, "step": 117890 }, { "entropy": 1.8618800938129425, "epoch": 0.3654796843359096, "grad_norm": 8.248429298400879, "learning_rate": 4.184686238050858e-06, "loss": 0.4772, "mean_token_accuracy": 0.8518645599484443, "num_tokens": 141756281.0, "step": 117900 }, { "entropy": 1.8451855972409248, "epoch": 0.36551068346095933, "grad_norm": 7.507343769073486, "learning_rate": 4.184508780224532e-06, "loss": 0.4514, "mean_token_accuracy": 0.8565283268690109, "num_tokens": 141768488.0, "step": 117910 }, { "entropy": 1.8886141359806061, "epoch": 0.365541682586009, "grad_norm": 8.9036865234375, "learning_rate": 4.1843313449723795e-06, "loss": 0.498, "mean_token_accuracy": 0.8407893583178521, "num_tokens": 141780324.0, "step": 117920 }, { "entropy": 1.8497443065047263, "epoch": 0.3655726817110587, "grad_norm": 4.070679187774658, "learning_rate": 4.184153932289612e-06, "loss": 0.4569, "mean_token_accuracy": 0.8470659956336022, "num_tokens": 141791515.0, "step": 117930 }, { "entropy": 1.8582046449184417, "epoch": 0.3656036808361084, "grad_norm": 8.340042114257812, "learning_rate": 4.183976542171449e-06, "loss": 0.445, "mean_token_accuracy": 0.8539603292942047, "num_tokens": 141803922.0, "step": 117940 }, { "entropy": 1.8717338189482688, "epoch": 0.3656346799611581, "grad_norm": 3.3975770473480225, "learning_rate": 4.183799174613104e-06, "loss": 0.4791, "mean_token_accuracy": 0.8473453000187874, "num_tokens": 141815267.0, "step": 117950 }, { "entropy": 1.821525427699089, "epoch": 0.3656656790862078, "grad_norm": 6.9430251121521, "learning_rate": 4.183621829609798e-06, "loss": 0.4447, "mean_token_accuracy": 0.8513611420989037, "num_tokens": 141827085.0, "step": 117960 }, { "entropy": 1.8612057372927666, "epoch": 0.3656966782112575, "grad_norm": 8.761266708374023, "learning_rate": 4.1834445071567505e-06, "loss": 0.4418, "mean_token_accuracy": 0.8551434949040413, "num_tokens": 141839369.0, "step": 117970 }, { "entropy": 1.7933984741568565, "epoch": 0.3657276773363072, "grad_norm": 8.286885261535645, "learning_rate": 4.183267207249182e-06, "loss": 0.4084, "mean_token_accuracy": 0.8545448705554008, "num_tokens": 141852553.0, "step": 117980 }, { "entropy": 1.702750214934349, "epoch": 0.3657586764613569, "grad_norm": 6.650256633758545, "learning_rate": 4.183089929882318e-06, "loss": 0.3437, "mean_token_accuracy": 0.8643985167145729, "num_tokens": 141867491.0, "step": 117990 }, { "entropy": 1.8753923997282982, "epoch": 0.36578967558640657, "grad_norm": 6.874874114990234, "learning_rate": 4.182912675051379e-06, "loss": 0.4207, "mean_token_accuracy": 0.8641251236200332, "num_tokens": 141879390.0, "step": 118000 }, { "entropy": 1.926875615119934, "epoch": 0.3658206747114563, "grad_norm": 6.646308422088623, "learning_rate": 4.182735442751594e-06, "loss": 0.5192, "mean_token_accuracy": 0.8443400964140892, "num_tokens": 141891040.0, "step": 118010 }, { "entropy": 1.8918773487210274, "epoch": 0.36585167383650596, "grad_norm": 6.332447528839111, "learning_rate": 4.182558232978188e-06, "loss": 0.492, "mean_token_accuracy": 0.8434045284986496, "num_tokens": 141902986.0, "step": 118020 }, { "entropy": 1.9184204563498497, "epoch": 0.36588267296155563, "grad_norm": 7.81022834777832, "learning_rate": 4.182381045726391e-06, "loss": 0.461, "mean_token_accuracy": 0.8520988777279854, "num_tokens": 141914126.0, "step": 118030 }, { "entropy": 1.9360093891620636, "epoch": 0.36591367208660536, "grad_norm": 8.875545501708984, "learning_rate": 4.182203880991431e-06, "loss": 0.5721, "mean_token_accuracy": 0.8364198818802834, "num_tokens": 141925360.0, "step": 118040 }, { "entropy": 1.7633348166942597, "epoch": 0.365944671211655, "grad_norm": 3.641873836517334, "learning_rate": 4.182026738768541e-06, "loss": 0.4041, "mean_token_accuracy": 0.8556516453623771, "num_tokens": 141939081.0, "step": 118050 }, { "entropy": 1.9223826959729196, "epoch": 0.36597567033670475, "grad_norm": 7.793611526489258, "learning_rate": 4.181849619052955e-06, "loss": 0.5248, "mean_token_accuracy": 0.8365512430667877, "num_tokens": 141950887.0, "step": 118060 }, { "entropy": 1.910523234307766, "epoch": 0.3660066694617544, "grad_norm": 3.47296404838562, "learning_rate": 4.181672521839904e-06, "loss": 0.4971, "mean_token_accuracy": 0.8425001531839371, "num_tokens": 141962913.0, "step": 118070 }, { "entropy": 1.8669334262609483, "epoch": 0.36603766858680414, "grad_norm": 7.858112812042236, "learning_rate": 4.1814954471246254e-06, "loss": 0.4452, "mean_token_accuracy": 0.8510735929012299, "num_tokens": 141974965.0, "step": 118080 }, { "entropy": 1.8188461996614933, "epoch": 0.3660686677118538, "grad_norm": 9.689606666564941, "learning_rate": 4.181318394902356e-06, "loss": 0.4232, "mean_token_accuracy": 0.8512760177254677, "num_tokens": 141987859.0, "step": 118090 }, { "entropy": 1.8165872499346734, "epoch": 0.36609966683690354, "grad_norm": 8.783658981323242, "learning_rate": 4.181141365168336e-06, "loss": 0.4307, "mean_token_accuracy": 0.8553719744086266, "num_tokens": 142000311.0, "step": 118100 }, { "entropy": 1.747716872394085, "epoch": 0.3661306659619532, "grad_norm": 3.489741802215576, "learning_rate": 4.1809643579178005e-06, "loss": 0.3822, "mean_token_accuracy": 0.8664152339100838, "num_tokens": 142013950.0, "step": 118110 }, { "entropy": 1.8506772994995118, "epoch": 0.36616166508700293, "grad_norm": 7.3857316970825195, "learning_rate": 4.180787373145996e-06, "loss": 0.4218, "mean_token_accuracy": 0.863612025976181, "num_tokens": 142025425.0, "step": 118120 }, { "entropy": 1.8373650014400482, "epoch": 0.3661926642120526, "grad_norm": 8.373377799987793, "learning_rate": 4.180610410848162e-06, "loss": 0.5219, "mean_token_accuracy": 0.8401133880019188, "num_tokens": 142038565.0, "step": 118130 }, { "entropy": 1.893815578520298, "epoch": 0.3662236633371023, "grad_norm": 3.8234965801239014, "learning_rate": 4.1804334710195425e-06, "loss": 0.4989, "mean_token_accuracy": 0.8400477096438408, "num_tokens": 142050090.0, "step": 118140 }, { "entropy": 1.825938382744789, "epoch": 0.366254662462152, "grad_norm": 4.16547966003418, "learning_rate": 4.180256553655385e-06, "loss": 0.43, "mean_token_accuracy": 0.8617624938488007, "num_tokens": 142062662.0, "step": 118150 }, { "entropy": 1.801938709616661, "epoch": 0.3662856615872017, "grad_norm": 3.787094831466675, "learning_rate": 4.180079658750934e-06, "loss": 0.4252, "mean_token_accuracy": 0.8575030252337456, "num_tokens": 142075475.0, "step": 118160 }, { "entropy": 1.830231237411499, "epoch": 0.3663166607122514, "grad_norm": 3.527515172958374, "learning_rate": 4.179902786301441e-06, "loss": 0.4372, "mean_token_accuracy": 0.8546881586313247, "num_tokens": 142087897.0, "step": 118170 }, { "entropy": 1.8849106505513191, "epoch": 0.3663476598373011, "grad_norm": 7.191827774047852, "learning_rate": 4.179725936302153e-06, "loss": 0.4749, "mean_token_accuracy": 0.8488413885235786, "num_tokens": 142099236.0, "step": 118180 }, { "entropy": 1.9353514075279237, "epoch": 0.3663786589623508, "grad_norm": 9.70740032196045, "learning_rate": 4.1795491087483225e-06, "loss": 0.5316, "mean_token_accuracy": 0.843434551358223, "num_tokens": 142110114.0, "step": 118190 }, { "entropy": 1.7677051618695259, "epoch": 0.3664096580874005, "grad_norm": 6.214950084686279, "learning_rate": 4.1793723036352e-06, "loss": 0.3575, "mean_token_accuracy": 0.8696540012955666, "num_tokens": 142123202.0, "step": 118200 }, { "entropy": 1.8973610028624535, "epoch": 0.36644065721245017, "grad_norm": 7.601381301879883, "learning_rate": 4.179195520958042e-06, "loss": 0.4985, "mean_token_accuracy": 0.8459401935338974, "num_tokens": 142134423.0, "step": 118210 }, { "entropy": 1.877578319609165, "epoch": 0.3664716563374999, "grad_norm": 6.392949104309082, "learning_rate": 4.179018760712103e-06, "loss": 0.4465, "mean_token_accuracy": 0.8500817194581032, "num_tokens": 142145725.0, "step": 118220 }, { "entropy": 1.8080515787005424, "epoch": 0.36650265546254956, "grad_norm": 2.9493696689605713, "learning_rate": 4.178842022892638e-06, "loss": 0.4435, "mean_token_accuracy": 0.8623009279370308, "num_tokens": 142157978.0, "step": 118230 }, { "entropy": 1.8072045862674713, "epoch": 0.3665336545875993, "grad_norm": 8.654043197631836, "learning_rate": 4.178665307494907e-06, "loss": 0.4583, "mean_token_accuracy": 0.8556650042533874, "num_tokens": 142170679.0, "step": 118240 }, { "entropy": 1.8863827347755433, "epoch": 0.36656465371264896, "grad_norm": 8.228194236755371, "learning_rate": 4.178488614514169e-06, "loss": 0.4769, "mean_token_accuracy": 0.843590895831585, "num_tokens": 142182700.0, "step": 118250 }, { "entropy": 1.8511935338377952, "epoch": 0.3665956528376987, "grad_norm": 8.606576919555664, "learning_rate": 4.1783119439456844e-06, "loss": 0.4523, "mean_token_accuracy": 0.8457527369260788, "num_tokens": 142194831.0, "step": 118260 }, { "entropy": 1.8353765562176705, "epoch": 0.36662665196274835, "grad_norm": 9.248912811279297, "learning_rate": 4.178135295784717e-06, "loss": 0.452, "mean_token_accuracy": 0.8504055172204972, "num_tokens": 142207211.0, "step": 118270 }, { "entropy": 1.883970457315445, "epoch": 0.366657651087798, "grad_norm": 7.958624362945557, "learning_rate": 4.17795867002653e-06, "loss": 0.5167, "mean_token_accuracy": 0.8472073882818222, "num_tokens": 142218770.0, "step": 118280 }, { "entropy": 1.866176003217697, "epoch": 0.36668865021284774, "grad_norm": 9.430354118347168, "learning_rate": 4.177782066666388e-06, "loss": 0.5162, "mean_token_accuracy": 0.8443237110972405, "num_tokens": 142230034.0, "step": 118290 }, { "entropy": 1.8722920432686805, "epoch": 0.3667196493378974, "grad_norm": 8.045259475708008, "learning_rate": 4.177605485699558e-06, "loss": 0.4178, "mean_token_accuracy": 0.8541638612747192, "num_tokens": 142242294.0, "step": 118300 }, { "entropy": 1.8110121667385102, "epoch": 0.36675064846294714, "grad_norm": 3.3392722606658936, "learning_rate": 4.177428927121307e-06, "loss": 0.4369, "mean_token_accuracy": 0.8573125541210175, "num_tokens": 142255463.0, "step": 118310 }, { "entropy": 1.826652705669403, "epoch": 0.3667816475879968, "grad_norm": 7.730444431304932, "learning_rate": 4.177252390926905e-06, "loss": 0.4119, "mean_token_accuracy": 0.8576511397957802, "num_tokens": 142267885.0, "step": 118320 }, { "entropy": 1.8853152304887772, "epoch": 0.36681264671304653, "grad_norm": 7.873587131500244, "learning_rate": 4.1770758771116235e-06, "loss": 0.4852, "mean_token_accuracy": 0.8457479134202004, "num_tokens": 142279026.0, "step": 118330 }, { "entropy": 1.7946649104356767, "epoch": 0.3668436458380962, "grad_norm": 3.6855735778808594, "learning_rate": 4.176899385670734e-06, "loss": 0.4067, "mean_token_accuracy": 0.8608448430895805, "num_tokens": 142292139.0, "step": 118340 }, { "entropy": 1.8492568552494049, "epoch": 0.3668746449631459, "grad_norm": 9.059473991394043, "learning_rate": 4.1767229165995095e-06, "loss": 0.4399, "mean_token_accuracy": 0.8556591719388962, "num_tokens": 142304080.0, "step": 118350 }, { "entropy": 1.8320197984576225, "epoch": 0.3669056440881956, "grad_norm": 7.440866470336914, "learning_rate": 4.176546469893225e-06, "loss": 0.4569, "mean_token_accuracy": 0.8550046220421791, "num_tokens": 142315495.0, "step": 118360 }, { "entropy": 1.9020672082901, "epoch": 0.3669366432132453, "grad_norm": 7.690849781036377, "learning_rate": 4.176370045547157e-06, "loss": 0.5199, "mean_token_accuracy": 0.8459215462207794, "num_tokens": 142326196.0, "step": 118370 }, { "entropy": 1.8766709178686143, "epoch": 0.366967642338295, "grad_norm": 7.724151134490967, "learning_rate": 4.176193643556584e-06, "loss": 0.4898, "mean_token_accuracy": 0.8508606314659118, "num_tokens": 142337696.0, "step": 118380 }, { "entropy": 1.8881130993366242, "epoch": 0.3669986414633447, "grad_norm": 8.17661190032959, "learning_rate": 4.176017263916784e-06, "loss": 0.4609, "mean_token_accuracy": 0.8483958140015602, "num_tokens": 142349838.0, "step": 118390 }, { "entropy": 1.9459423005580903, "epoch": 0.3670296405883944, "grad_norm": 8.967854499816895, "learning_rate": 4.17584090662304e-06, "loss": 0.5726, "mean_token_accuracy": 0.8301825195550918, "num_tokens": 142360518.0, "step": 118400 }, { "entropy": 1.889550267159939, "epoch": 0.3670606397134441, "grad_norm": 7.768711566925049, "learning_rate": 4.175664571670631e-06, "loss": 0.5122, "mean_token_accuracy": 0.8391052410006523, "num_tokens": 142372416.0, "step": 118410 }, { "entropy": 1.8851889297366142, "epoch": 0.3670916388384938, "grad_norm": 7.86566686630249, "learning_rate": 4.175488259054841e-06, "loss": 0.4989, "mean_token_accuracy": 0.857041896879673, "num_tokens": 142384483.0, "step": 118420 }, { "entropy": 1.8617109119892121, "epoch": 0.3671226379635435, "grad_norm": 3.7308735847473145, "learning_rate": 4.175311968770956e-06, "loss": 0.4545, "mean_token_accuracy": 0.8486345887184144, "num_tokens": 142396492.0, "step": 118430 }, { "entropy": 1.8434831380844117, "epoch": 0.36715363708859317, "grad_norm": 3.5595266819000244, "learning_rate": 4.175135700814261e-06, "loss": 0.4503, "mean_token_accuracy": 0.8539713755249977, "num_tokens": 142408880.0, "step": 118440 }, { "entropy": 1.8063078075647354, "epoch": 0.3671846362136429, "grad_norm": 7.024228096008301, "learning_rate": 4.174959455180043e-06, "loss": 0.4007, "mean_token_accuracy": 0.8617870509624481, "num_tokens": 142422170.0, "step": 118450 }, { "entropy": 1.9420251905918122, "epoch": 0.36721563533869256, "grad_norm": 7.5710039138793945, "learning_rate": 4.174783231863592e-06, "loss": 0.4772, "mean_token_accuracy": 0.8481424629688263, "num_tokens": 142432968.0, "step": 118460 }, { "entropy": 1.8173451155424118, "epoch": 0.3672466344637423, "grad_norm": 7.755475997924805, "learning_rate": 4.174607030860197e-06, "loss": 0.4012, "mean_token_accuracy": 0.8645861312747002, "num_tokens": 142446003.0, "step": 118470 }, { "entropy": 1.8867697641253471, "epoch": 0.36727763358879195, "grad_norm": 8.526572227478027, "learning_rate": 4.174430852165151e-06, "loss": 0.4801, "mean_token_accuracy": 0.8466065526008606, "num_tokens": 142457251.0, "step": 118480 }, { "entropy": 1.9442676544189452, "epoch": 0.3673086327138417, "grad_norm": 8.537225723266602, "learning_rate": 4.1742546957737465e-06, "loss": 0.4837, "mean_token_accuracy": 0.8544994726777076, "num_tokens": 142468748.0, "step": 118490 }, { "entropy": 1.846812443435192, "epoch": 0.36733963183889135, "grad_norm": 7.822627544403076, "learning_rate": 4.174078561681279e-06, "loss": 0.4134, "mean_token_accuracy": 0.8543079942464828, "num_tokens": 142480975.0, "step": 118500 }, { "entropy": 1.87229622900486, "epoch": 0.36737063096394107, "grad_norm": 4.54270076751709, "learning_rate": 4.173902449883043e-06, "loss": 0.4595, "mean_token_accuracy": 0.8584425121545791, "num_tokens": 142492336.0, "step": 118510 }, { "entropy": 1.889968466758728, "epoch": 0.36740163008899074, "grad_norm": 3.9654746055603027, "learning_rate": 4.173726360374335e-06, "loss": 0.4772, "mean_token_accuracy": 0.850946743786335, "num_tokens": 142504212.0, "step": 118520 }, { "entropy": 1.8861878275871278, "epoch": 0.3674326292140404, "grad_norm": 11.20728588104248, "learning_rate": 4.173550293150456e-06, "loss": 0.4677, "mean_token_accuracy": 0.8468590840697289, "num_tokens": 142516169.0, "step": 118530 }, { "entropy": 1.842411059141159, "epoch": 0.36746362833909013, "grad_norm": 8.36995792388916, "learning_rate": 4.1733742482067035e-06, "loss": 0.4267, "mean_token_accuracy": 0.8587794110178948, "num_tokens": 142528006.0, "step": 118540 }, { "entropy": 1.873109185695648, "epoch": 0.3674946274641398, "grad_norm": 8.865880012512207, "learning_rate": 4.173198225538381e-06, "loss": 0.4709, "mean_token_accuracy": 0.8478357747197152, "num_tokens": 142540361.0, "step": 118550 }, { "entropy": 1.861200602352619, "epoch": 0.3675256265891895, "grad_norm": 7.625993728637695, "learning_rate": 4.173022225140791e-06, "loss": 0.4406, "mean_token_accuracy": 0.8592444315552712, "num_tokens": 142553408.0, "step": 118560 }, { "entropy": 1.8547646909952165, "epoch": 0.3675566257142392, "grad_norm": 8.53388500213623, "learning_rate": 4.172846247009236e-06, "loss": 0.4991, "mean_token_accuracy": 0.8481455728411674, "num_tokens": 142566455.0, "step": 118570 }, { "entropy": 1.8456396833062172, "epoch": 0.3675876248392889, "grad_norm": 3.9087069034576416, "learning_rate": 4.1726702911390225e-06, "loss": 0.4416, "mean_token_accuracy": 0.8550665810704231, "num_tokens": 142578809.0, "step": 118580 }, { "entropy": 1.8159615725278855, "epoch": 0.3676186239643386, "grad_norm": 4.247369289398193, "learning_rate": 4.172494357525458e-06, "loss": 0.4041, "mean_token_accuracy": 0.8549291148781777, "num_tokens": 142591777.0, "step": 118590 }, { "entropy": 1.84511306732893, "epoch": 0.3676496230893883, "grad_norm": 4.138094425201416, "learning_rate": 4.17231844616385e-06, "loss": 0.3894, "mean_token_accuracy": 0.8563702628016472, "num_tokens": 142605209.0, "step": 118600 }, { "entropy": 1.9339896500110627, "epoch": 0.367680622214438, "grad_norm": 7.239840030670166, "learning_rate": 4.17214255704951e-06, "loss": 0.4541, "mean_token_accuracy": 0.8529000446200371, "num_tokens": 142616618.0, "step": 118610 }, { "entropy": 1.9016248881816864, "epoch": 0.3677116213394877, "grad_norm": 10.453241348266602, "learning_rate": 4.171966690177746e-06, "loss": 0.4942, "mean_token_accuracy": 0.8428125187754631, "num_tokens": 142628132.0, "step": 118620 }, { "entropy": 1.9518603295087815, "epoch": 0.3677426204645374, "grad_norm": 8.346458435058594, "learning_rate": 4.171790845543873e-06, "loss": 0.4534, "mean_token_accuracy": 0.8541428744792938, "num_tokens": 142639425.0, "step": 118630 }, { "entropy": 1.8354076534509658, "epoch": 0.3677736195895871, "grad_norm": 10.148712158203125, "learning_rate": 4.171615023143204e-06, "loss": 0.4178, "mean_token_accuracy": 0.855410099029541, "num_tokens": 142651425.0, "step": 118640 }, { "entropy": 1.9070971041917801, "epoch": 0.36780461871463677, "grad_norm": 8.946581840515137, "learning_rate": 4.1714392229710536e-06, "loss": 0.4988, "mean_token_accuracy": 0.8442860797047615, "num_tokens": 142662410.0, "step": 118650 }, { "entropy": 1.8595642820000648, "epoch": 0.3678356178396865, "grad_norm": 9.433381080627441, "learning_rate": 4.17126344502274e-06, "loss": 0.4847, "mean_token_accuracy": 0.8475638180971146, "num_tokens": 142674406.0, "step": 118660 }, { "entropy": 1.951467391848564, "epoch": 0.36786661696473616, "grad_norm": 7.69748592376709, "learning_rate": 4.17108768929358e-06, "loss": 0.5543, "mean_token_accuracy": 0.8346621260046959, "num_tokens": 142685520.0, "step": 118670 }, { "entropy": 1.8777378484606744, "epoch": 0.3678976160897859, "grad_norm": 3.798490285873413, "learning_rate": 4.170911955778893e-06, "loss": 0.4659, "mean_token_accuracy": 0.851957768201828, "num_tokens": 142696814.0, "step": 118680 }, { "entropy": 1.8550070881843568, "epoch": 0.36792861521483555, "grad_norm": 6.965719699859619, "learning_rate": 4.170736244474e-06, "loss": 0.4532, "mean_token_accuracy": 0.8539418116211891, "num_tokens": 142709691.0, "step": 118690 }, { "entropy": 1.8812433794140815, "epoch": 0.3679596143398853, "grad_norm": 8.439732551574707, "learning_rate": 4.170560555374224e-06, "loss": 0.4644, "mean_token_accuracy": 0.8429150357842445, "num_tokens": 142721133.0, "step": 118700 }, { "entropy": 1.8941943876445293, "epoch": 0.36799061346493495, "grad_norm": 3.9443578720092773, "learning_rate": 4.1703848884748875e-06, "loss": 0.4548, "mean_token_accuracy": 0.84272540807724, "num_tokens": 142734302.0, "step": 118710 }, { "entropy": 1.8599026456475258, "epoch": 0.36802161258998467, "grad_norm": 9.261316299438477, "learning_rate": 4.170209243771315e-06, "loss": 0.4548, "mean_token_accuracy": 0.8592417910695076, "num_tokens": 142745854.0, "step": 118720 }, { "entropy": 1.8526142731308937, "epoch": 0.36805261171503434, "grad_norm": 8.83598804473877, "learning_rate": 4.1700336212588336e-06, "loss": 0.4263, "mean_token_accuracy": 0.850617028772831, "num_tokens": 142758084.0, "step": 118730 }, { "entropy": 1.9292904302477836, "epoch": 0.36808361084008406, "grad_norm": 8.976339340209961, "learning_rate": 4.169858020932772e-06, "loss": 0.4837, "mean_token_accuracy": 0.8495315477252007, "num_tokens": 142769212.0, "step": 118740 }, { "entropy": 1.9031863331794738, "epoch": 0.36811460996513373, "grad_norm": 9.556485176086426, "learning_rate": 4.169682442788456e-06, "loss": 0.5131, "mean_token_accuracy": 0.8372549533843994, "num_tokens": 142780283.0, "step": 118750 }, { "entropy": 1.903608924150467, "epoch": 0.36814560909018346, "grad_norm": 8.73618221282959, "learning_rate": 4.16950688682122e-06, "loss": 0.4989, "mean_token_accuracy": 0.841778826713562, "num_tokens": 142791778.0, "step": 118760 }, { "entropy": 1.8728568017482758, "epoch": 0.3681766082152331, "grad_norm": 7.941903591156006, "learning_rate": 4.1693313530263924e-06, "loss": 0.4746, "mean_token_accuracy": 0.8445895060896873, "num_tokens": 142804320.0, "step": 118770 }, { "entropy": 1.822492091357708, "epoch": 0.3682076073402828, "grad_norm": 4.276817798614502, "learning_rate": 4.1691558413993075e-06, "loss": 0.3873, "mean_token_accuracy": 0.8638914600014687, "num_tokens": 142816773.0, "step": 118780 }, { "entropy": 1.846034236252308, "epoch": 0.3682386064653325, "grad_norm": 9.146262168884277, "learning_rate": 4.168980351935301e-06, "loss": 0.4903, "mean_token_accuracy": 0.8438366368412972, "num_tokens": 142828530.0, "step": 118790 }, { "entropy": 1.9032179087400436, "epoch": 0.3682696055903822, "grad_norm": 8.37747573852539, "learning_rate": 4.168804884629708e-06, "loss": 0.5004, "mean_token_accuracy": 0.8452135339379311, "num_tokens": 142839769.0, "step": 118800 }, { "entropy": 1.9008402451872826, "epoch": 0.3683006047154319, "grad_norm": 7.905856609344482, "learning_rate": 4.1686294394778654e-06, "loss": 0.4956, "mean_token_accuracy": 0.8504106491804123, "num_tokens": 142851380.0, "step": 118810 }, { "entropy": 1.8856339365243913, "epoch": 0.3683316038404816, "grad_norm": 6.864739894866943, "learning_rate": 4.168454016475113e-06, "loss": 0.4834, "mean_token_accuracy": 0.8480422243475914, "num_tokens": 142863896.0, "step": 118820 }, { "entropy": 1.872384211421013, "epoch": 0.3683626029655313, "grad_norm": 8.543014526367188, "learning_rate": 4.168278615616788e-06, "loss": 0.4822, "mean_token_accuracy": 0.8365708619356156, "num_tokens": 142876025.0, "step": 118830 }, { "entropy": 1.8250932022929192, "epoch": 0.368393602090581, "grad_norm": 3.684549570083618, "learning_rate": 4.168103236898236e-06, "loss": 0.4323, "mean_token_accuracy": 0.8542123079299927, "num_tokens": 142888811.0, "step": 118840 }, { "entropy": 1.9026092737913132, "epoch": 0.3684246012156307, "grad_norm": 7.081031322479248, "learning_rate": 4.167927880314796e-06, "loss": 0.4878, "mean_token_accuracy": 0.8558070927858352, "num_tokens": 142900186.0, "step": 118850 }, { "entropy": 1.9256568044424056, "epoch": 0.36845560034068037, "grad_norm": 7.484577178955078, "learning_rate": 4.167752545861815e-06, "loss": 0.4956, "mean_token_accuracy": 0.8413769558072091, "num_tokens": 142912507.0, "step": 118860 }, { "entropy": 1.7556425005197525, "epoch": 0.3684865994657301, "grad_norm": 8.272164344787598, "learning_rate": 4.167577233534637e-06, "loss": 0.3423, "mean_token_accuracy": 0.8660473376512527, "num_tokens": 142927368.0, "step": 118870 }, { "entropy": 1.9561314284801483, "epoch": 0.36851759859077976, "grad_norm": 8.683992385864258, "learning_rate": 4.167401943328609e-06, "loss": 0.5259, "mean_token_accuracy": 0.8412436872720719, "num_tokens": 142938818.0, "step": 118880 }, { "entropy": 1.854485747218132, "epoch": 0.3685485977158295, "grad_norm": 4.319868087768555, "learning_rate": 4.167226675239079e-06, "loss": 0.4168, "mean_token_accuracy": 0.8507488906383515, "num_tokens": 142951007.0, "step": 118890 }, { "entropy": 1.7686067208647729, "epoch": 0.36857959684087915, "grad_norm": 7.7503581047058105, "learning_rate": 4.167051429261398e-06, "loss": 0.4149, "mean_token_accuracy": 0.8598582178354264, "num_tokens": 142964704.0, "step": 118900 }, { "entropy": 1.898981074988842, "epoch": 0.3686105959659289, "grad_norm": 6.987488269805908, "learning_rate": 4.166876205390915e-06, "loss": 0.504, "mean_token_accuracy": 0.8474484965205192, "num_tokens": 142976441.0, "step": 118910 }, { "entropy": 1.8347061678767205, "epoch": 0.36864159509097855, "grad_norm": 7.668900489807129, "learning_rate": 4.166701003622984e-06, "loss": 0.4401, "mean_token_accuracy": 0.8599162891507148, "num_tokens": 142989393.0, "step": 118920 }, { "entropy": 1.8358816392719746, "epoch": 0.36867259421602827, "grad_norm": 8.171165466308594, "learning_rate": 4.166525823952959e-06, "loss": 0.4274, "mean_token_accuracy": 0.8551807060837746, "num_tokens": 143001626.0, "step": 118930 }, { "entropy": 1.7875201195478438, "epoch": 0.36870359334107794, "grad_norm": 4.123532295227051, "learning_rate": 4.166350666376194e-06, "loss": 0.4216, "mean_token_accuracy": 0.8510026276111603, "num_tokens": 143014393.0, "step": 118940 }, { "entropy": 1.9123052790760995, "epoch": 0.36873459246612766, "grad_norm": 9.459202766418457, "learning_rate": 4.1661755308880446e-06, "loss": 0.5015, "mean_token_accuracy": 0.8408966720104217, "num_tokens": 143025392.0, "step": 118950 }, { "entropy": 1.8176440641283989, "epoch": 0.36876559159117733, "grad_norm": 7.813144683837891, "learning_rate": 4.166000417483871e-06, "loss": 0.4342, "mean_token_accuracy": 0.8539251640439034, "num_tokens": 143038069.0, "step": 118960 }, { "entropy": 1.8900042787194251, "epoch": 0.36879659071622706, "grad_norm": 3.9542362689971924, "learning_rate": 4.16582532615903e-06, "loss": 0.4541, "mean_token_accuracy": 0.8539608538150787, "num_tokens": 143049603.0, "step": 118970 }, { "entropy": 1.83581335991621, "epoch": 0.3688275898412767, "grad_norm": 3.050680160522461, "learning_rate": 4.165650256908884e-06, "loss": 0.4276, "mean_token_accuracy": 0.8514253705739975, "num_tokens": 143062623.0, "step": 118980 }, { "entropy": 1.7662447020411491, "epoch": 0.36885858896632645, "grad_norm": 5.801089763641357, "learning_rate": 4.165475209728794e-06, "loss": 0.3947, "mean_token_accuracy": 0.8633951663970947, "num_tokens": 143075670.0, "step": 118990 }, { "entropy": 1.8178039237856864, "epoch": 0.3688895880913761, "grad_norm": 3.8043034076690674, "learning_rate": 4.165300184614124e-06, "loss": 0.4101, "mean_token_accuracy": 0.8604209214448929, "num_tokens": 143088556.0, "step": 119000 }, { "entropy": 1.841179284453392, "epoch": 0.36892058721642584, "grad_norm": 7.6272292137146, "learning_rate": 4.1651251815602385e-06, "loss": 0.4022, "mean_token_accuracy": 0.8553322166204452, "num_tokens": 143100756.0, "step": 119010 }, { "entropy": 1.8461539566516876, "epoch": 0.3689515863414755, "grad_norm": 9.249650955200195, "learning_rate": 4.1649502005625024e-06, "loss": 0.4493, "mean_token_accuracy": 0.85612553358078, "num_tokens": 143112605.0, "step": 119020 }, { "entropy": 1.902879323065281, "epoch": 0.3689825854665252, "grad_norm": 7.911184310913086, "learning_rate": 4.164775241616285e-06, "loss": 0.5353, "mean_token_accuracy": 0.835160045325756, "num_tokens": 143124403.0, "step": 119030 }, { "entropy": 1.9350175976753234, "epoch": 0.3690135845915749, "grad_norm": 9.767508506774902, "learning_rate": 4.164600304716953e-06, "loss": 0.5188, "mean_token_accuracy": 0.8444019317626953, "num_tokens": 143135579.0, "step": 119040 }, { "entropy": 1.797468328475952, "epoch": 0.3690445837166246, "grad_norm": 8.605935096740723, "learning_rate": 4.164425389859878e-06, "loss": 0.4181, "mean_token_accuracy": 0.8529829382896423, "num_tokens": 143148800.0, "step": 119050 }, { "entropy": 1.8765917882323264, "epoch": 0.3690755828416743, "grad_norm": 6.596861839294434, "learning_rate": 4.164250497040431e-06, "loss": 0.4719, "mean_token_accuracy": 0.8517785176634789, "num_tokens": 143160367.0, "step": 119060 }, { "entropy": 1.8928829789161683, "epoch": 0.36910658196672397, "grad_norm": 7.310998916625977, "learning_rate": 4.164075626253985e-06, "loss": 0.4928, "mean_token_accuracy": 0.8375116720795631, "num_tokens": 143172139.0, "step": 119070 }, { "entropy": 1.8701190561056138, "epoch": 0.3691375810917737, "grad_norm": 9.065203666687012, "learning_rate": 4.163900777495915e-06, "loss": 0.4576, "mean_token_accuracy": 0.8435375481843949, "num_tokens": 143184281.0, "step": 119080 }, { "entropy": 1.8811526045203208, "epoch": 0.36916858021682336, "grad_norm": 3.9514009952545166, "learning_rate": 4.1637259507615935e-06, "loss": 0.4425, "mean_token_accuracy": 0.8543221414089203, "num_tokens": 143196848.0, "step": 119090 }, { "entropy": 1.8915190115571021, "epoch": 0.3691995793418731, "grad_norm": 9.159911155700684, "learning_rate": 4.163551146046401e-06, "loss": 0.4463, "mean_token_accuracy": 0.8477994874119759, "num_tokens": 143207993.0, "step": 119100 }, { "entropy": 1.9243160128593444, "epoch": 0.36923057846692275, "grad_norm": 8.874519348144531, "learning_rate": 4.163376363345714e-06, "loss": 0.4569, "mean_token_accuracy": 0.8604633152484894, "num_tokens": 143218596.0, "step": 119110 }, { "entropy": 1.9454612672328948, "epoch": 0.3692615775919725, "grad_norm": 9.145526885986328, "learning_rate": 4.163201602654912e-06, "loss": 0.5398, "mean_token_accuracy": 0.8401903316378594, "num_tokens": 143230338.0, "step": 119120 }, { "entropy": 1.896520721912384, "epoch": 0.36929257671702215, "grad_norm": 8.464309692382812, "learning_rate": 4.163026863969376e-06, "loss": 0.4793, "mean_token_accuracy": 0.8491705074906349, "num_tokens": 143242088.0, "step": 119130 }, { "entropy": 1.943563875555992, "epoch": 0.36932357584207187, "grad_norm": 7.9828033447265625, "learning_rate": 4.162852147284489e-06, "loss": 0.5092, "mean_token_accuracy": 0.8447094395756721, "num_tokens": 143253099.0, "step": 119140 }, { "entropy": 1.9148243635892868, "epoch": 0.36935457496712154, "grad_norm": 8.171924591064453, "learning_rate": 4.162677452595636e-06, "loss": 0.5437, "mean_token_accuracy": 0.846064169704914, "num_tokens": 143264931.0, "step": 119150 }, { "entropy": 1.9138666808605194, "epoch": 0.36938557409217126, "grad_norm": 9.261260986328125, "learning_rate": 4.162502779898198e-06, "loss": 0.512, "mean_token_accuracy": 0.8433482199907303, "num_tokens": 143275598.0, "step": 119160 }, { "entropy": 1.8780044361948967, "epoch": 0.36941657321722093, "grad_norm": 8.321491241455078, "learning_rate": 4.162328129187566e-06, "loss": 0.4593, "mean_token_accuracy": 0.8502790480852127, "num_tokens": 143288132.0, "step": 119170 }, { "entropy": 1.9145966663956642, "epoch": 0.36944757234227066, "grad_norm": 7.993964672088623, "learning_rate": 4.162153500459123e-06, "loss": 0.4806, "mean_token_accuracy": 0.8421851694583893, "num_tokens": 143300040.0, "step": 119180 }, { "entropy": 1.8805766895413398, "epoch": 0.3694785714673203, "grad_norm": 8.223487854003906, "learning_rate": 4.161978893708263e-06, "loss": 0.4664, "mean_token_accuracy": 0.8541955664753914, "num_tokens": 143311847.0, "step": 119190 }, { "entropy": 1.93892260491848, "epoch": 0.36950957059237005, "grad_norm": 9.87500286102295, "learning_rate": 4.161804308930374e-06, "loss": 0.548, "mean_token_accuracy": 0.8372417896986007, "num_tokens": 143324118.0, "step": 119200 }, { "entropy": 1.892756275832653, "epoch": 0.3695405697174197, "grad_norm": 7.675792694091797, "learning_rate": 4.161629746120847e-06, "loss": 0.4608, "mean_token_accuracy": 0.849764832854271, "num_tokens": 143335130.0, "step": 119210 }, { "entropy": 1.9276916295289994, "epoch": 0.36957156884246944, "grad_norm": 8.37989330291748, "learning_rate": 4.161455205275077e-06, "loss": 0.5287, "mean_token_accuracy": 0.8419989302754403, "num_tokens": 143346444.0, "step": 119220 }, { "entropy": 1.846215435862541, "epoch": 0.3696025679675191, "grad_norm": 8.054888725280762, "learning_rate": 4.161280686388458e-06, "loss": 0.4651, "mean_token_accuracy": 0.8463585451245308, "num_tokens": 143358709.0, "step": 119230 }, { "entropy": 1.8919729992747307, "epoch": 0.36963356709256884, "grad_norm": 8.983673095703125, "learning_rate": 4.161106189456385e-06, "loss": 0.4864, "mean_token_accuracy": 0.8458945691585541, "num_tokens": 143371208.0, "step": 119240 }, { "entropy": 1.9590601921081543, "epoch": 0.3696645662176185, "grad_norm": 9.514650344848633, "learning_rate": 4.160931714474256e-06, "loss": 0.5253, "mean_token_accuracy": 0.8396427750587463, "num_tokens": 143381837.0, "step": 119250 }, { "entropy": 1.8351401954889297, "epoch": 0.36969556534266823, "grad_norm": 7.223183631896973, "learning_rate": 4.1607572614374696e-06, "loss": 0.4197, "mean_token_accuracy": 0.863115793466568, "num_tokens": 143393900.0, "step": 119260 }, { "entropy": 1.7584014266729355, "epoch": 0.3697265644677179, "grad_norm": 3.2340762615203857, "learning_rate": 4.160582830341426e-06, "loss": 0.3708, "mean_token_accuracy": 0.8620016917586326, "num_tokens": 143408131.0, "step": 119270 }, { "entropy": 1.9106255337595939, "epoch": 0.36975756359276757, "grad_norm": 8.975741386413574, "learning_rate": 4.160408421181527e-06, "loss": 0.493, "mean_token_accuracy": 0.8459363922476768, "num_tokens": 143419497.0, "step": 119280 }, { "entropy": 1.9070470571517943, "epoch": 0.3697885627178173, "grad_norm": 9.103129386901855, "learning_rate": 4.160234033953172e-06, "loss": 0.4853, "mean_token_accuracy": 0.848831920325756, "num_tokens": 143430671.0, "step": 119290 }, { "entropy": 1.8115619271993637, "epoch": 0.36981956184286696, "grad_norm": 9.283622741699219, "learning_rate": 4.160059668651768e-06, "loss": 0.3954, "mean_token_accuracy": 0.8591952756047249, "num_tokens": 143442619.0, "step": 119300 }, { "entropy": 1.8160853952169418, "epoch": 0.3698505609679167, "grad_norm": 3.4729106426239014, "learning_rate": 4.15988532527272e-06, "loss": 0.4294, "mean_token_accuracy": 0.8528380259871483, "num_tokens": 143454996.0, "step": 119310 }, { "entropy": 1.9566965460777284, "epoch": 0.36988156009296635, "grad_norm": 8.186453819274902, "learning_rate": 4.159711003811434e-06, "loss": 0.5251, "mean_token_accuracy": 0.8404311820864677, "num_tokens": 143466235.0, "step": 119320 }, { "entropy": 1.770469543337822, "epoch": 0.3699125592180161, "grad_norm": 4.339020729064941, "learning_rate": 4.1595367042633196e-06, "loss": 0.3832, "mean_token_accuracy": 0.8623701587319375, "num_tokens": 143479903.0, "step": 119330 }, { "entropy": 1.8619060233235358, "epoch": 0.36994355834306575, "grad_norm": 7.124758243560791, "learning_rate": 4.159362426623783e-06, "loss": 0.4594, "mean_token_accuracy": 0.8501885548233986, "num_tokens": 143492204.0, "step": 119340 }, { "entropy": 1.781793449819088, "epoch": 0.3699745574681155, "grad_norm": 6.788565635681152, "learning_rate": 4.159188170888238e-06, "loss": 0.4397, "mean_token_accuracy": 0.8653283238410949, "num_tokens": 143504832.0, "step": 119350 }, { "entropy": 1.8910044834017754, "epoch": 0.37000555659316514, "grad_norm": 9.051020622253418, "learning_rate": 4.159013937052096e-06, "loss": 0.5119, "mean_token_accuracy": 0.8442646443843842, "num_tokens": 143516104.0, "step": 119360 }, { "entropy": 1.9117089003324508, "epoch": 0.37003655571821487, "grad_norm": 6.4968390464782715, "learning_rate": 4.158839725110769e-06, "loss": 0.4627, "mean_token_accuracy": 0.852028714120388, "num_tokens": 143527838.0, "step": 119370 }, { "entropy": 1.80558120906353, "epoch": 0.37006755484326453, "grad_norm": 8.500289916992188, "learning_rate": 4.158665535059673e-06, "loss": 0.4231, "mean_token_accuracy": 0.8530550315976143, "num_tokens": 143541211.0, "step": 119380 }, { "entropy": 1.8069082364439963, "epoch": 0.37009855396831426, "grad_norm": 6.210768699645996, "learning_rate": 4.158491366894224e-06, "loss": 0.4508, "mean_token_accuracy": 0.8599958032369613, "num_tokens": 143553620.0, "step": 119390 }, { "entropy": 1.7626231014728546, "epoch": 0.3701295530933639, "grad_norm": 8.250438690185547, "learning_rate": 4.158317220609839e-06, "loss": 0.405, "mean_token_accuracy": 0.860263791680336, "num_tokens": 143567049.0, "step": 119400 }, { "entropy": 1.888748326897621, "epoch": 0.37016055221841365, "grad_norm": 9.457777976989746, "learning_rate": 4.158143096201936e-06, "loss": 0.486, "mean_token_accuracy": 0.8503834918141365, "num_tokens": 143578643.0, "step": 119410 }, { "entropy": 1.910116845369339, "epoch": 0.3701915513434633, "grad_norm": 6.411842346191406, "learning_rate": 4.157968993665937e-06, "loss": 0.4937, "mean_token_accuracy": 0.8387831076979637, "num_tokens": 143590634.0, "step": 119420 }, { "entropy": 1.8767773866653443, "epoch": 0.37022255046851305, "grad_norm": 9.648838996887207, "learning_rate": 4.157794912997263e-06, "loss": 0.4942, "mean_token_accuracy": 0.843813206255436, "num_tokens": 143602357.0, "step": 119430 }, { "entropy": 1.9029111877083777, "epoch": 0.3702535495935627, "grad_norm": 8.24268913269043, "learning_rate": 4.157620854191336e-06, "loss": 0.5262, "mean_token_accuracy": 0.8355750843882561, "num_tokens": 143613750.0, "step": 119440 }, { "entropy": 1.960300487279892, "epoch": 0.37028454871861244, "grad_norm": 7.390927791595459, "learning_rate": 4.1574468172435805e-06, "loss": 0.5408, "mean_token_accuracy": 0.8368476778268814, "num_tokens": 143624528.0, "step": 119450 }, { "entropy": 1.9035738706588745, "epoch": 0.3703155478436621, "grad_norm": 7.557567596435547, "learning_rate": 4.157272802149423e-06, "loss": 0.509, "mean_token_accuracy": 0.8415401116013527, "num_tokens": 143636080.0, "step": 119460 }, { "entropy": 1.9506605803966521, "epoch": 0.37034654696871183, "grad_norm": 7.982449054718018, "learning_rate": 4.157098808904288e-06, "loss": 0.5286, "mean_token_accuracy": 0.8373087286949158, "num_tokens": 143646846.0, "step": 119470 }, { "entropy": 1.8253500118851662, "epoch": 0.3703775460937615, "grad_norm": 4.673305034637451, "learning_rate": 4.156924837503605e-06, "loss": 0.4425, "mean_token_accuracy": 0.8508133858442306, "num_tokens": 143659349.0, "step": 119480 }, { "entropy": 1.8085382550954818, "epoch": 0.3704085452188112, "grad_norm": 3.9146220684051514, "learning_rate": 4.156750887942804e-06, "loss": 0.4161, "mean_token_accuracy": 0.8524488687515259, "num_tokens": 143672699.0, "step": 119490 }, { "entropy": 1.8396509125828744, "epoch": 0.3704395443438609, "grad_norm": 9.417864799499512, "learning_rate": 4.156576960217317e-06, "loss": 0.4473, "mean_token_accuracy": 0.8548582971096039, "num_tokens": 143684766.0, "step": 119500 }, { "entropy": 1.8102483958005906, "epoch": 0.37047054346891056, "grad_norm": 8.302236557006836, "learning_rate": 4.156403054322573e-06, "loss": 0.3942, "mean_token_accuracy": 0.8558089345693588, "num_tokens": 143696483.0, "step": 119510 }, { "entropy": 1.8374242320656777, "epoch": 0.3705015425939603, "grad_norm": 8.9642972946167, "learning_rate": 4.156229170254007e-06, "loss": 0.434, "mean_token_accuracy": 0.8551341906189919, "num_tokens": 143709585.0, "step": 119520 }, { "entropy": 1.7632934302091599, "epoch": 0.37053254171900996, "grad_norm": 8.926246643066406, "learning_rate": 4.156055308007056e-06, "loss": 0.4146, "mean_token_accuracy": 0.858980093896389, "num_tokens": 143722796.0, "step": 119530 }, { "entropy": 1.7985145077109337, "epoch": 0.3705635408440597, "grad_norm": 3.6693923473358154, "learning_rate": 4.155881467577151e-06, "loss": 0.4514, "mean_token_accuracy": 0.8529761865735054, "num_tokens": 143735885.0, "step": 119540 }, { "entropy": 1.8460396811366082, "epoch": 0.37059453996910935, "grad_norm": 8.484942436218262, "learning_rate": 4.1557076489597354e-06, "loss": 0.4554, "mean_token_accuracy": 0.8540690630674362, "num_tokens": 143747844.0, "step": 119550 }, { "entropy": 1.8268393889069556, "epoch": 0.3706255390941591, "grad_norm": 10.119608879089355, "learning_rate": 4.155533852150245e-06, "loss": 0.4451, "mean_token_accuracy": 0.8529582008719444, "num_tokens": 143760082.0, "step": 119560 }, { "entropy": 1.8313589990139008, "epoch": 0.37065653821920874, "grad_norm": 10.065152168273926, "learning_rate": 4.155360077144119e-06, "loss": 0.4413, "mean_token_accuracy": 0.8478084012866021, "num_tokens": 143772556.0, "step": 119570 }, { "entropy": 1.7658352941274642, "epoch": 0.37068753734425847, "grad_norm": 4.34824800491333, "learning_rate": 4.155186323936802e-06, "loss": 0.3614, "mean_token_accuracy": 0.8594838708639145, "num_tokens": 143786868.0, "step": 119580 }, { "entropy": 1.9014542356133461, "epoch": 0.37071853646930814, "grad_norm": 8.814896583557129, "learning_rate": 4.155012592523735e-06, "loss": 0.4831, "mean_token_accuracy": 0.844634436070919, "num_tokens": 143799003.0, "step": 119590 }, { "entropy": 1.9206546247005463, "epoch": 0.37074953559435786, "grad_norm": 7.594090461730957, "learning_rate": 4.154838882900362e-06, "loss": 0.498, "mean_token_accuracy": 0.8516721650958061, "num_tokens": 143809724.0, "step": 119600 }, { "entropy": 1.920139655470848, "epoch": 0.37078053471940753, "grad_norm": 6.954849720001221, "learning_rate": 4.154665195062129e-06, "loss": 0.474, "mean_token_accuracy": 0.8434002518653869, "num_tokens": 143821147.0, "step": 119610 }, { "entropy": 1.8785637602210046, "epoch": 0.37081153384445725, "grad_norm": 7.791470527648926, "learning_rate": 4.154491529004484e-06, "loss": 0.4904, "mean_token_accuracy": 0.8430327326059341, "num_tokens": 143832843.0, "step": 119620 }, { "entropy": 1.8350661814212799, "epoch": 0.3708425329695069, "grad_norm": 9.10515308380127, "learning_rate": 4.1543178847228734e-06, "loss": 0.4286, "mean_token_accuracy": 0.8528729796409606, "num_tokens": 143844895.0, "step": 119630 }, { "entropy": 1.8383238837122917, "epoch": 0.37087353209455665, "grad_norm": 7.8374834060668945, "learning_rate": 4.154144262212747e-06, "loss": 0.4538, "mean_token_accuracy": 0.8570844009518623, "num_tokens": 143856936.0, "step": 119640 }, { "entropy": 1.8358725041151047, "epoch": 0.3709045312196063, "grad_norm": 10.492681503295898, "learning_rate": 4.153970661469557e-06, "loss": 0.4451, "mean_token_accuracy": 0.8490477934479713, "num_tokens": 143869530.0, "step": 119650 }, { "entropy": 1.889818374812603, "epoch": 0.37093553034465604, "grad_norm": 8.570408821105957, "learning_rate": 4.153797082488753e-06, "loss": 0.4735, "mean_token_accuracy": 0.8489762708544731, "num_tokens": 143880824.0, "step": 119660 }, { "entropy": 1.827413833141327, "epoch": 0.3709665294697057, "grad_norm": 8.959956169128418, "learning_rate": 4.153623525265792e-06, "loss": 0.4455, "mean_token_accuracy": 0.8504405677318573, "num_tokens": 143893214.0, "step": 119670 }, { "entropy": 1.8852188110351562, "epoch": 0.37099752859475543, "grad_norm": 10.366182327270508, "learning_rate": 4.153449989796128e-06, "loss": 0.5028, "mean_token_accuracy": 0.8433555126190185, "num_tokens": 143904446.0, "step": 119680 }, { "entropy": 1.9464125022292138, "epoch": 0.3710285277198051, "grad_norm": 10.189809799194336, "learning_rate": 4.153276476075214e-06, "loss": 0.5475, "mean_token_accuracy": 0.8357294231653214, "num_tokens": 143915556.0, "step": 119690 }, { "entropy": 1.825826181471348, "epoch": 0.3710595268448548, "grad_norm": 8.288381576538086, "learning_rate": 4.1531029840985115e-06, "loss": 0.4332, "mean_token_accuracy": 0.8584688425064086, "num_tokens": 143928024.0, "step": 119700 }, { "entropy": 1.8943134620785713, "epoch": 0.3710905259699045, "grad_norm": 7.004955768585205, "learning_rate": 4.152929513861477e-06, "loss": 0.4718, "mean_token_accuracy": 0.8523566782474518, "num_tokens": 143940068.0, "step": 119710 }, { "entropy": 1.8529562443494796, "epoch": 0.3711215250949542, "grad_norm": 3.9424831867218018, "learning_rate": 4.152756065359572e-06, "loss": 0.4032, "mean_token_accuracy": 0.859981244802475, "num_tokens": 143952445.0, "step": 119720 }, { "entropy": 1.8910843506455421, "epoch": 0.3711525242200039, "grad_norm": 9.168560028076172, "learning_rate": 4.1525826385882565e-06, "loss": 0.4802, "mean_token_accuracy": 0.8453802585601806, "num_tokens": 143964559.0, "step": 119730 }, { "entropy": 1.8304832607507706, "epoch": 0.3711835233450536, "grad_norm": 7.770691871643066, "learning_rate": 4.152409233542995e-06, "loss": 0.481, "mean_token_accuracy": 0.8464579433202744, "num_tokens": 143976473.0, "step": 119740 }, { "entropy": 1.8643721297383309, "epoch": 0.3712145224701033, "grad_norm": 9.255126953125, "learning_rate": 4.15223585021925e-06, "loss": 0.4753, "mean_token_accuracy": 0.8462480574846267, "num_tokens": 143987917.0, "step": 119750 }, { "entropy": 1.9292099431157113, "epoch": 0.37124552159515295, "grad_norm": 10.51768684387207, "learning_rate": 4.1520624886124886e-06, "loss": 0.5321, "mean_token_accuracy": 0.8432277202606201, "num_tokens": 144000155.0, "step": 119760 }, { "entropy": 1.8904988139867782, "epoch": 0.3712765207202027, "grad_norm": 6.968792915344238, "learning_rate": 4.151889148718177e-06, "loss": 0.4317, "mean_token_accuracy": 0.8615191966295243, "num_tokens": 144012142.0, "step": 119770 }, { "entropy": 1.9146782085299492, "epoch": 0.37130751984525234, "grad_norm": 8.367558479309082, "learning_rate": 4.151715830531783e-06, "loss": 0.4812, "mean_token_accuracy": 0.8470437869429588, "num_tokens": 144023416.0, "step": 119780 }, { "entropy": 1.8538981348276138, "epoch": 0.37133851897030207, "grad_norm": 8.344622611999512, "learning_rate": 4.151542534048775e-06, "loss": 0.4413, "mean_token_accuracy": 0.8540352538228035, "num_tokens": 144035624.0, "step": 119790 }, { "entropy": 1.9207333639264106, "epoch": 0.37136951809535174, "grad_norm": 7.7748942375183105, "learning_rate": 4.151369259264627e-06, "loss": 0.505, "mean_token_accuracy": 0.8414986193180084, "num_tokens": 144047105.0, "step": 119800 }, { "entropy": 1.9819932192564012, "epoch": 0.37140051722040146, "grad_norm": 9.306598663330078, "learning_rate": 4.151196006174808e-06, "loss": 0.5185, "mean_token_accuracy": 0.845866845548153, "num_tokens": 144058111.0, "step": 119810 }, { "entropy": 1.94317606985569, "epoch": 0.37143151634545113, "grad_norm": 8.76233196258545, "learning_rate": 4.151022774774793e-06, "loss": 0.4921, "mean_token_accuracy": 0.8466348692774772, "num_tokens": 144068820.0, "step": 119820 }, { "entropy": 1.8876767694950103, "epoch": 0.37146251547050085, "grad_norm": 7.308814525604248, "learning_rate": 4.150849565060057e-06, "loss": 0.4707, "mean_token_accuracy": 0.8494908258318901, "num_tokens": 144080689.0, "step": 119830 }, { "entropy": 1.8951317608356475, "epoch": 0.3714935145955505, "grad_norm": 9.297974586486816, "learning_rate": 4.1506763770260735e-06, "loss": 0.4687, "mean_token_accuracy": 0.8519510939717293, "num_tokens": 144092194.0, "step": 119840 }, { "entropy": 1.9246593773365022, "epoch": 0.37152451372060025, "grad_norm": 7.682300090789795, "learning_rate": 4.150503210668323e-06, "loss": 0.4773, "mean_token_accuracy": 0.84807148873806, "num_tokens": 144103270.0, "step": 119850 }, { "entropy": 1.911957061290741, "epoch": 0.3715555128456499, "grad_norm": 6.910633087158203, "learning_rate": 4.150330065982283e-06, "loss": 0.5411, "mean_token_accuracy": 0.8236610382795334, "num_tokens": 144115512.0, "step": 119860 }, { "entropy": 1.7777829378843308, "epoch": 0.37158651197069964, "grad_norm": 8.125778198242188, "learning_rate": 4.1501569429634335e-06, "loss": 0.3631, "mean_token_accuracy": 0.8673719853162766, "num_tokens": 144128091.0, "step": 119870 }, { "entropy": 1.8605340436100959, "epoch": 0.3716175110957493, "grad_norm": 9.269027709960938, "learning_rate": 4.149983841607256e-06, "loss": 0.4442, "mean_token_accuracy": 0.8562753826379776, "num_tokens": 144139595.0, "step": 119880 }, { "entropy": 1.8629012137651444, "epoch": 0.37164851022079903, "grad_norm": 8.08617877960205, "learning_rate": 4.149810761909232e-06, "loss": 0.4161, "mean_token_accuracy": 0.8640567243099213, "num_tokens": 144151997.0, "step": 119890 }, { "entropy": 1.9436404347419738, "epoch": 0.3716795093458487, "grad_norm": 9.821636199951172, "learning_rate": 4.149637703864848e-06, "loss": 0.4963, "mean_token_accuracy": 0.848535805940628, "num_tokens": 144162535.0, "step": 119900 }, { "entropy": 1.890118558704853, "epoch": 0.3717105084708984, "grad_norm": 8.111401557922363, "learning_rate": 4.149464667469588e-06, "loss": 0.4725, "mean_token_accuracy": 0.846766683459282, "num_tokens": 144174680.0, "step": 119910 }, { "entropy": 1.8793727621436118, "epoch": 0.3717415075959481, "grad_norm": 9.598828315734863, "learning_rate": 4.149291652718937e-06, "loss": 0.5008, "mean_token_accuracy": 0.8412741586565972, "num_tokens": 144186534.0, "step": 119920 }, { "entropy": 1.8049995869398117, "epoch": 0.3717725067209978, "grad_norm": 4.190101623535156, "learning_rate": 4.149118659608385e-06, "loss": 0.413, "mean_token_accuracy": 0.8464117422699928, "num_tokens": 144200420.0, "step": 119930 }, { "entropy": 1.9019189208745957, "epoch": 0.3718035058460475, "grad_norm": 9.553948402404785, "learning_rate": 4.14894568813342e-06, "loss": 0.4836, "mean_token_accuracy": 0.8418327271938324, "num_tokens": 144212107.0, "step": 119940 }, { "entropy": 1.9129536800086497, "epoch": 0.3718345049710972, "grad_norm": 8.531197547912598, "learning_rate": 4.1487727382895345e-06, "loss": 0.4828, "mean_token_accuracy": 0.8501315474510193, "num_tokens": 144223594.0, "step": 119950 }, { "entropy": 1.896179696917534, "epoch": 0.3718655040961469, "grad_norm": 8.905537605285645, "learning_rate": 4.148599810072218e-06, "loss": 0.5049, "mean_token_accuracy": 0.8426758080720902, "num_tokens": 144235324.0, "step": 119960 }, { "entropy": 1.9160192415118218, "epoch": 0.3718965032211966, "grad_norm": 9.644220352172852, "learning_rate": 4.148426903476966e-06, "loss": 0.4903, "mean_token_accuracy": 0.8502436727285385, "num_tokens": 144246974.0, "step": 119970 }, { "entropy": 1.8606981292366982, "epoch": 0.3719275023462463, "grad_norm": 3.5319061279296875, "learning_rate": 4.148254018499271e-06, "loss": 0.4265, "mean_token_accuracy": 0.8643532186746598, "num_tokens": 144259328.0, "step": 119980 }, { "entropy": 1.8820769846439362, "epoch": 0.371958501471296, "grad_norm": 10.861024856567383, "learning_rate": 4.1480811551346305e-06, "loss": 0.4884, "mean_token_accuracy": 0.8372126758098603, "num_tokens": 144270870.0, "step": 119990 }, { "entropy": 1.8569459572434426, "epoch": 0.37198950059634567, "grad_norm": 8.557576179504395, "learning_rate": 4.14790831337854e-06, "loss": 0.4211, "mean_token_accuracy": 0.8561231017112731, "num_tokens": 144282985.0, "step": 120000 }, { "entropy": 1.8007396288216113, "epoch": 0.37202049972139534, "grad_norm": 8.916121482849121, "learning_rate": 4.147735493226499e-06, "loss": 0.3946, "mean_token_accuracy": 0.8646645426750184, "num_tokens": 144296275.0, "step": 120010 }, { "entropy": 1.8762442633509635, "epoch": 0.37205149884644506, "grad_norm": 10.017187118530273, "learning_rate": 4.147562694674007e-06, "loss": 0.4473, "mean_token_accuracy": 0.8562163576483727, "num_tokens": 144307269.0, "step": 120020 }, { "entropy": 1.945700818300247, "epoch": 0.37208249797149473, "grad_norm": 9.23812198638916, "learning_rate": 4.147389917716566e-06, "loss": 0.5345, "mean_token_accuracy": 0.840525084733963, "num_tokens": 144318155.0, "step": 120030 }, { "entropy": 1.9071541115641595, "epoch": 0.37211349709654445, "grad_norm": 6.633033752441406, "learning_rate": 4.1472171623496765e-06, "loss": 0.4695, "mean_token_accuracy": 0.8516213580965996, "num_tokens": 144330070.0, "step": 120040 }, { "entropy": 1.8605321899056435, "epoch": 0.3721444962215941, "grad_norm": 7.755608081817627, "learning_rate": 4.147044428568844e-06, "loss": 0.463, "mean_token_accuracy": 0.8484739094972611, "num_tokens": 144342082.0, "step": 120050 }, { "entropy": 1.8393002033233643, "epoch": 0.37217549534664385, "grad_norm": 3.898113489151001, "learning_rate": 4.146871716369573e-06, "loss": 0.4465, "mean_token_accuracy": 0.8561500370502472, "num_tokens": 144353537.0, "step": 120060 }, { "entropy": 1.9425443708896637, "epoch": 0.3722064944716935, "grad_norm": 7.680639743804932, "learning_rate": 4.146699025747368e-06, "loss": 0.4847, "mean_token_accuracy": 0.8522611662745476, "num_tokens": 144364505.0, "step": 120070 }, { "entropy": 1.8757713958621025, "epoch": 0.37223749359674324, "grad_norm": 8.204666137695312, "learning_rate": 4.14652635669774e-06, "loss": 0.463, "mean_token_accuracy": 0.8506177857518196, "num_tokens": 144375941.0, "step": 120080 }, { "entropy": 1.9103585094213487, "epoch": 0.3722684927217929, "grad_norm": 9.457573890686035, "learning_rate": 4.146353709216196e-06, "loss": 0.5536, "mean_token_accuracy": 0.8371702343225479, "num_tokens": 144386996.0, "step": 120090 }, { "entropy": 1.7752160847187042, "epoch": 0.37229949184684263, "grad_norm": 9.947811126708984, "learning_rate": 4.146181083298246e-06, "loss": 0.4366, "mean_token_accuracy": 0.8600396499037742, "num_tokens": 144400645.0, "step": 120100 }, { "entropy": 1.8245454356074333, "epoch": 0.3723304909718923, "grad_norm": 10.876919746398926, "learning_rate": 4.146008478939402e-06, "loss": 0.4198, "mean_token_accuracy": 0.8561910197138787, "num_tokens": 144412719.0, "step": 120110 }, { "entropy": 1.8593166559934615, "epoch": 0.372361490096942, "grad_norm": 7.8572587966918945, "learning_rate": 4.145835896135177e-06, "loss": 0.4549, "mean_token_accuracy": 0.8545159190893173, "num_tokens": 144424962.0, "step": 120120 }, { "entropy": 1.9504047334194183, "epoch": 0.3723924892219917, "grad_norm": 8.473873138427734, "learning_rate": 4.1456633348810855e-06, "loss": 0.5375, "mean_token_accuracy": 0.8311512500047684, "num_tokens": 144436324.0, "step": 120130 }, { "entropy": 1.8700911119580268, "epoch": 0.3724234883470414, "grad_norm": 4.052376747131348, "learning_rate": 4.145490795172642e-06, "loss": 0.4624, "mean_token_accuracy": 0.8466508388519287, "num_tokens": 144447723.0, "step": 120140 }, { "entropy": 1.8484340474009513, "epoch": 0.3724544874720911, "grad_norm": 3.679072141647339, "learning_rate": 4.145318277005364e-06, "loss": 0.4588, "mean_token_accuracy": 0.8553288698196411, "num_tokens": 144459623.0, "step": 120150 }, { "entropy": 1.8352990061044694, "epoch": 0.3724854865971408, "grad_norm": 10.292163848876953, "learning_rate": 4.1451457803747705e-06, "loss": 0.4709, "mean_token_accuracy": 0.8461218729615212, "num_tokens": 144471544.0, "step": 120160 }, { "entropy": 1.9243695348501206, "epoch": 0.3725164857221905, "grad_norm": 8.172408103942871, "learning_rate": 4.1449733052763785e-06, "loss": 0.4811, "mean_token_accuracy": 0.8539075449109077, "num_tokens": 144482528.0, "step": 120170 }, { "entropy": 1.8598498031497002, "epoch": 0.3725474848472402, "grad_norm": 7.883946895599365, "learning_rate": 4.144800851705711e-06, "loss": 0.4759, "mean_token_accuracy": 0.8527406945824623, "num_tokens": 144493717.0, "step": 120180 }, { "entropy": 1.8283675089478493, "epoch": 0.3725784839722899, "grad_norm": 8.100120544433594, "learning_rate": 4.1446284196582885e-06, "loss": 0.4359, "mean_token_accuracy": 0.8576804473996162, "num_tokens": 144505962.0, "step": 120190 }, { "entropy": 1.910760723054409, "epoch": 0.3726094830973396, "grad_norm": 4.125510215759277, "learning_rate": 4.144456009129636e-06, "loss": 0.4787, "mean_token_accuracy": 0.846273559331894, "num_tokens": 144517019.0, "step": 120200 }, { "entropy": 1.83195867985487, "epoch": 0.37264048222238927, "grad_norm": 7.632051944732666, "learning_rate": 4.144283620115277e-06, "loss": 0.4404, "mean_token_accuracy": 0.8484606102108956, "num_tokens": 144529928.0, "step": 120210 }, { "entropy": 1.913992887735367, "epoch": 0.372671481347439, "grad_norm": 8.024348258972168, "learning_rate": 4.144111252610736e-06, "loss": 0.4761, "mean_token_accuracy": 0.856801763176918, "num_tokens": 144541124.0, "step": 120220 }, { "entropy": 1.7974503114819527, "epoch": 0.37270248047248866, "grad_norm": 4.225975036621094, "learning_rate": 4.143938906611542e-06, "loss": 0.4, "mean_token_accuracy": 0.852984607219696, "num_tokens": 144554372.0, "step": 120230 }, { "entropy": 1.8398534119129182, "epoch": 0.3727334795975384, "grad_norm": 7.547396183013916, "learning_rate": 4.143766582113225e-06, "loss": 0.45, "mean_token_accuracy": 0.848320834338665, "num_tokens": 144567059.0, "step": 120240 }, { "entropy": 1.79450566470623, "epoch": 0.37276447872258806, "grad_norm": 7.509959697723389, "learning_rate": 4.143594279111312e-06, "loss": 0.4259, "mean_token_accuracy": 0.8508245050907135, "num_tokens": 144579668.0, "step": 120250 }, { "entropy": 1.9366710186004639, "epoch": 0.3727954778476377, "grad_norm": 7.8771209716796875, "learning_rate": 4.143421997601335e-06, "loss": 0.5205, "mean_token_accuracy": 0.8401233479380608, "num_tokens": 144590869.0, "step": 120260 }, { "entropy": 1.7975213006138802, "epoch": 0.37282647697268745, "grad_norm": 10.612153053283691, "learning_rate": 4.1432497375788275e-06, "loss": 0.417, "mean_token_accuracy": 0.8648232862353324, "num_tokens": 144603350.0, "step": 120270 }, { "entropy": 1.882333090901375, "epoch": 0.3728574760977371, "grad_norm": 9.56601619720459, "learning_rate": 4.143077499039321e-06, "loss": 0.488, "mean_token_accuracy": 0.8536757841706276, "num_tokens": 144614705.0, "step": 120280 }, { "entropy": 1.8980722784996034, "epoch": 0.37288847522278684, "grad_norm": 8.230175018310547, "learning_rate": 4.142905281978353e-06, "loss": 0.4813, "mean_token_accuracy": 0.8509363621473313, "num_tokens": 144625834.0, "step": 120290 }, { "entropy": 1.881382980942726, "epoch": 0.3729194743478365, "grad_norm": 8.221864700317383, "learning_rate": 4.14273308639146e-06, "loss": 0.4796, "mean_token_accuracy": 0.8483862280845642, "num_tokens": 144636679.0, "step": 120300 }, { "entropy": 1.879370318353176, "epoch": 0.37295047347288623, "grad_norm": 7.582620620727539, "learning_rate": 4.142560912274176e-06, "loss": 0.4857, "mean_token_accuracy": 0.8436816439032555, "num_tokens": 144648082.0, "step": 120310 }, { "entropy": 1.8741892874240875, "epoch": 0.3729814725979359, "grad_norm": 8.145028114318848, "learning_rate": 4.142388759622044e-06, "loss": 0.4767, "mean_token_accuracy": 0.8376132413744927, "num_tokens": 144659565.0, "step": 120320 }, { "entropy": 1.8751545161008836, "epoch": 0.37301247172298563, "grad_norm": 8.928181648254395, "learning_rate": 4.1422166284306016e-06, "loss": 0.5052, "mean_token_accuracy": 0.8437012255191803, "num_tokens": 144670975.0, "step": 120330 }, { "entropy": 1.8685587406158448, "epoch": 0.3730434708480353, "grad_norm": 9.061539649963379, "learning_rate": 4.142044518695391e-06, "loss": 0.4559, "mean_token_accuracy": 0.852770148217678, "num_tokens": 144682788.0, "step": 120340 }, { "entropy": 1.7655042618513108, "epoch": 0.373074469973085, "grad_norm": 6.830787181854248, "learning_rate": 4.141872430411956e-06, "loss": 0.4142, "mean_token_accuracy": 0.8638226523995399, "num_tokens": 144695970.0, "step": 120350 }, { "entropy": 1.8461510226130486, "epoch": 0.3731054690981347, "grad_norm": 6.8738298416137695, "learning_rate": 4.14170036357584e-06, "loss": 0.4545, "mean_token_accuracy": 0.8553184866905212, "num_tokens": 144707528.0, "step": 120360 }, { "entropy": 1.903184102475643, "epoch": 0.3731364682231844, "grad_norm": 7.4232025146484375, "learning_rate": 4.1415283181825895e-06, "loss": 0.4897, "mean_token_accuracy": 0.8506990998983384, "num_tokens": 144718816.0, "step": 120370 }, { "entropy": 1.9077295437455177, "epoch": 0.3731674673482341, "grad_norm": 8.14012622833252, "learning_rate": 4.1413562942277484e-06, "loss": 0.5117, "mean_token_accuracy": 0.8307053163647652, "num_tokens": 144730281.0, "step": 120380 }, { "entropy": 1.8349354296922684, "epoch": 0.3731984664732838, "grad_norm": 8.339512825012207, "learning_rate": 4.141184291706868e-06, "loss": 0.4454, "mean_token_accuracy": 0.8565431639552117, "num_tokens": 144742205.0, "step": 120390 }, { "entropy": 1.8753593668341637, "epoch": 0.3732294655983335, "grad_norm": 9.116205215454102, "learning_rate": 4.141012310615495e-06, "loss": 0.4764, "mean_token_accuracy": 0.8513768821954727, "num_tokens": 144753297.0, "step": 120400 }, { "entropy": 1.8986528918147088, "epoch": 0.3732604647233832, "grad_norm": 7.670609951019287, "learning_rate": 4.140840350949181e-06, "loss": 0.4765, "mean_token_accuracy": 0.8434357807040215, "num_tokens": 144764661.0, "step": 120410 }, { "entropy": 1.7499748080968858, "epoch": 0.37329146384843287, "grad_norm": 8.888646125793457, "learning_rate": 4.140668412703478e-06, "loss": 0.3657, "mean_token_accuracy": 0.862440787255764, "num_tokens": 144778571.0, "step": 120420 }, { "entropy": 1.76868616938591, "epoch": 0.3733224629734826, "grad_norm": 9.065146446228027, "learning_rate": 4.14049649587394e-06, "loss": 0.3731, "mean_token_accuracy": 0.8667063027620315, "num_tokens": 144791265.0, "step": 120430 }, { "entropy": 1.8670168176293374, "epoch": 0.37335346209853226, "grad_norm": 12.608692169189453, "learning_rate": 4.140324600456119e-06, "loss": 0.4971, "mean_token_accuracy": 0.8426472395658493, "num_tokens": 144802626.0, "step": 120440 }, { "entropy": 1.7094080224633217, "epoch": 0.373384461223582, "grad_norm": 9.83113956451416, "learning_rate": 4.140152726445574e-06, "loss": 0.3783, "mean_token_accuracy": 0.8663820832967758, "num_tokens": 144816924.0, "step": 120450 }, { "entropy": 1.88383856266737, "epoch": 0.37341546034863166, "grad_norm": 8.397429466247559, "learning_rate": 4.1399808738378585e-06, "loss": 0.4993, "mean_token_accuracy": 0.8410907715559006, "num_tokens": 144829203.0, "step": 120460 }, { "entropy": 1.768805581331253, "epoch": 0.3734464594736814, "grad_norm": 4.13175106048584, "learning_rate": 4.139809042628535e-06, "loss": 0.3802, "mean_token_accuracy": 0.8638075411319732, "num_tokens": 144841621.0, "step": 120470 }, { "entropy": 1.869265778362751, "epoch": 0.37347745859873105, "grad_norm": 10.438434600830078, "learning_rate": 4.139637232813159e-06, "loss": 0.5482, "mean_token_accuracy": 0.830617117881775, "num_tokens": 144853808.0, "step": 120480 }, { "entropy": 1.8952225580811501, "epoch": 0.3735084577237808, "grad_norm": 7.919762134552002, "learning_rate": 4.139465444387294e-06, "loss": 0.4617, "mean_token_accuracy": 0.851693132519722, "num_tokens": 144865079.0, "step": 120490 }, { "entropy": 1.7868720442056656, "epoch": 0.37353945684883044, "grad_norm": 8.96243953704834, "learning_rate": 4.139293677346502e-06, "loss": 0.4258, "mean_token_accuracy": 0.8544852897524834, "num_tokens": 144878075.0, "step": 120500 }, { "entropy": 1.8284976735711098, "epoch": 0.3735704559738801, "grad_norm": 8.363344192504883, "learning_rate": 4.139121931686345e-06, "loss": 0.4361, "mean_token_accuracy": 0.85543752014637, "num_tokens": 144890326.0, "step": 120510 }, { "entropy": 1.870447462797165, "epoch": 0.37360145509892984, "grad_norm": 7.410390853881836, "learning_rate": 4.13895020740239e-06, "loss": 0.4693, "mean_token_accuracy": 0.8517989814281464, "num_tokens": 144902157.0, "step": 120520 }, { "entropy": 1.800754214823246, "epoch": 0.3736324542239795, "grad_norm": 4.441434383392334, "learning_rate": 4.138778504490201e-06, "loss": 0.4912, "mean_token_accuracy": 0.8548402667045594, "num_tokens": 144915102.0, "step": 120530 }, { "entropy": 1.8087909892201424, "epoch": 0.37366345334902923, "grad_norm": 7.617093086242676, "learning_rate": 4.138606822945347e-06, "loss": 0.4519, "mean_token_accuracy": 0.8564250499010087, "num_tokens": 144927518.0, "step": 120540 }, { "entropy": 1.8769372016191483, "epoch": 0.3736944524740789, "grad_norm": 8.41362476348877, "learning_rate": 4.138435162763395e-06, "loss": 0.4812, "mean_token_accuracy": 0.8450376495718956, "num_tokens": 144939489.0, "step": 120550 }, { "entropy": 1.8020145073533058, "epoch": 0.3737254515991286, "grad_norm": 9.323637008666992, "learning_rate": 4.138263523939916e-06, "loss": 0.4424, "mean_token_accuracy": 0.8448305040597915, "num_tokens": 144952661.0, "step": 120560 }, { "entropy": 1.84602689743042, "epoch": 0.3737564507241783, "grad_norm": 8.894927024841309, "learning_rate": 4.138091906470481e-06, "loss": 0.4314, "mean_token_accuracy": 0.8503866240382194, "num_tokens": 144964533.0, "step": 120570 }, { "entropy": 1.7196414768695831, "epoch": 0.373787449849228, "grad_norm": 4.194610118865967, "learning_rate": 4.137920310350664e-06, "loss": 0.3677, "mean_token_accuracy": 0.8610586509108543, "num_tokens": 144978621.0, "step": 120580 }, { "entropy": 1.8428490251302718, "epoch": 0.3738184489742777, "grad_norm": 4.064561367034912, "learning_rate": 4.137748735576036e-06, "loss": 0.4538, "mean_token_accuracy": 0.8580682083964348, "num_tokens": 144990822.0, "step": 120590 }, { "entropy": 1.8954029515385629, "epoch": 0.3738494480993274, "grad_norm": 8.305089950561523, "learning_rate": 4.137577182142173e-06, "loss": 0.511, "mean_token_accuracy": 0.8431953400373459, "num_tokens": 145002259.0, "step": 120600 }, { "entropy": 1.7457600995898246, "epoch": 0.3738804472243771, "grad_norm": 9.313241958618164, "learning_rate": 4.137405650044653e-06, "loss": 0.3778, "mean_token_accuracy": 0.8654039755463601, "num_tokens": 145016219.0, "step": 120610 }, { "entropy": 1.8425143510103226, "epoch": 0.3739114463494268, "grad_norm": 7.391794204711914, "learning_rate": 4.137234139279052e-06, "loss": 0.4997, "mean_token_accuracy": 0.8488540187478065, "num_tokens": 145028857.0, "step": 120620 }, { "entropy": 1.7952088013291359, "epoch": 0.37394244547447647, "grad_norm": 9.129136085510254, "learning_rate": 4.13706264984095e-06, "loss": 0.3903, "mean_token_accuracy": 0.8728530630469322, "num_tokens": 145041106.0, "step": 120630 }, { "entropy": 1.9029112622141837, "epoch": 0.3739734445995262, "grad_norm": 7.712813377380371, "learning_rate": 4.136891181725925e-06, "loss": 0.5076, "mean_token_accuracy": 0.8385331735014916, "num_tokens": 145052581.0, "step": 120640 }, { "entropy": 1.8401472687721252, "epoch": 0.37400444372457586, "grad_norm": 9.776946067810059, "learning_rate": 4.136719734929561e-06, "loss": 0.4884, "mean_token_accuracy": 0.8453155905008316, "num_tokens": 145064652.0, "step": 120650 }, { "entropy": 1.815567010641098, "epoch": 0.3740354428496256, "grad_norm": 9.336750984191895, "learning_rate": 4.136548309447441e-06, "loss": 0.434, "mean_token_accuracy": 0.8552684485912323, "num_tokens": 145077064.0, "step": 120660 }, { "entropy": 1.8268222376704215, "epoch": 0.37406644197467526, "grad_norm": 8.70843505859375, "learning_rate": 4.136376905275147e-06, "loss": 0.4439, "mean_token_accuracy": 0.8503636911511421, "num_tokens": 145089585.0, "step": 120670 }, { "entropy": 1.8132446378469467, "epoch": 0.374097441099725, "grad_norm": 8.954428672790527, "learning_rate": 4.1362055224082654e-06, "loss": 0.4294, "mean_token_accuracy": 0.8630944639444351, "num_tokens": 145101852.0, "step": 120680 }, { "entropy": 1.8896322906017304, "epoch": 0.37412844022477465, "grad_norm": 6.831114768981934, "learning_rate": 4.136034160842382e-06, "loss": 0.4903, "mean_token_accuracy": 0.84442989975214, "num_tokens": 145113578.0, "step": 120690 }, { "entropy": 1.8678228601813316, "epoch": 0.3741594393498244, "grad_norm": 11.830345153808594, "learning_rate": 4.135862820573086e-06, "loss": 0.4919, "mean_token_accuracy": 0.8500665947794914, "num_tokens": 145125041.0, "step": 120700 }, { "entropy": 1.7717063777148723, "epoch": 0.37419043847487404, "grad_norm": 2.549680233001709, "learning_rate": 4.135691501595966e-06, "loss": 0.3644, "mean_token_accuracy": 0.8615061908960342, "num_tokens": 145138854.0, "step": 120710 }, { "entropy": 1.8354929715394974, "epoch": 0.37422143759992377, "grad_norm": 3.5742509365081787, "learning_rate": 4.1355202039066126e-06, "loss": 0.4324, "mean_token_accuracy": 0.8558676704764366, "num_tokens": 145150497.0, "step": 120720 }, { "entropy": 1.8448586538434029, "epoch": 0.37425243672497344, "grad_norm": 8.689122200012207, "learning_rate": 4.135348927500618e-06, "loss": 0.4805, "mean_token_accuracy": 0.8484915763139724, "num_tokens": 145162443.0, "step": 120730 }, { "entropy": 1.8981183648109436, "epoch": 0.37428343585002316, "grad_norm": 8.72005844116211, "learning_rate": 4.135177672373573e-06, "loss": 0.4603, "mean_token_accuracy": 0.8429804682731629, "num_tokens": 145174166.0, "step": 120740 }, { "entropy": 1.9157382816076278, "epoch": 0.37431443497507283, "grad_norm": 3.737830400466919, "learning_rate": 4.135006438521074e-06, "loss": 0.5013, "mean_token_accuracy": 0.8430665746331215, "num_tokens": 145185172.0, "step": 120750 }, { "entropy": 1.9020906642079354, "epoch": 0.3743454341001225, "grad_norm": 7.51852560043335, "learning_rate": 4.134835225938717e-06, "loss": 0.4872, "mean_token_accuracy": 0.8433557212352752, "num_tokens": 145196378.0, "step": 120760 }, { "entropy": 1.8677937388420105, "epoch": 0.3743764332251722, "grad_norm": 3.965975284576416, "learning_rate": 4.134664034622098e-06, "loss": 0.4406, "mean_token_accuracy": 0.8616374880075455, "num_tokens": 145207527.0, "step": 120770 }, { "entropy": 1.8505346715450286, "epoch": 0.3744074323502219, "grad_norm": 4.419517993927002, "learning_rate": 4.134492864566814e-06, "loss": 0.4014, "mean_token_accuracy": 0.8657907411456108, "num_tokens": 145219338.0, "step": 120780 }, { "entropy": 1.7792392835021018, "epoch": 0.3744384314752716, "grad_norm": 4.039781093597412, "learning_rate": 4.134321715768466e-06, "loss": 0.4888, "mean_token_accuracy": 0.8500527799129486, "num_tokens": 145232885.0, "step": 120790 }, { "entropy": 1.870184451341629, "epoch": 0.3744694306003213, "grad_norm": 3.6062183380126953, "learning_rate": 4.134150588222653e-06, "loss": 0.4869, "mean_token_accuracy": 0.8536598861217499, "num_tokens": 145244298.0, "step": 120800 }, { "entropy": 1.7577032819390297, "epoch": 0.374500429725371, "grad_norm": 3.414278030395508, "learning_rate": 4.13397948192498e-06, "loss": 0.391, "mean_token_accuracy": 0.8604967936873436, "num_tokens": 145257392.0, "step": 120810 }, { "entropy": 1.8007031008601189, "epoch": 0.3745314288504207, "grad_norm": 4.170672416687012, "learning_rate": 4.133808396871046e-06, "loss": 0.4069, "mean_token_accuracy": 0.8616224005818367, "num_tokens": 145270196.0, "step": 120820 }, { "entropy": 1.9025542974472045, "epoch": 0.3745624279754704, "grad_norm": 7.770366191864014, "learning_rate": 4.133637333056459e-06, "loss": 0.4694, "mean_token_accuracy": 0.8512863472104073, "num_tokens": 145281494.0, "step": 120830 }, { "entropy": 1.8452581293880939, "epoch": 0.37459342710052007, "grad_norm": 8.3046293258667, "learning_rate": 4.1334662904768234e-06, "loss": 0.4547, "mean_token_accuracy": 0.8516708582639694, "num_tokens": 145294224.0, "step": 120840 }, { "entropy": 1.7171327024698257, "epoch": 0.3746244262255698, "grad_norm": 3.3472342491149902, "learning_rate": 4.1332952691277465e-06, "loss": 0.3925, "mean_token_accuracy": 0.8607262581586838, "num_tokens": 145308338.0, "step": 120850 }, { "entropy": 1.9564589619636537, "epoch": 0.37465542535061946, "grad_norm": 7.92958402633667, "learning_rate": 4.133124269004837e-06, "loss": 0.5199, "mean_token_accuracy": 0.8395091354846954, "num_tokens": 145319683.0, "step": 120860 }, { "entropy": 1.8489731594920158, "epoch": 0.3746864244756692, "grad_norm": 8.664722442626953, "learning_rate": 4.132953290103704e-06, "loss": 0.4731, "mean_token_accuracy": 0.8534695088863373, "num_tokens": 145332281.0, "step": 120870 }, { "entropy": 1.8806229501962661, "epoch": 0.37471742360071886, "grad_norm": 9.805295944213867, "learning_rate": 4.132782332419957e-06, "loss": 0.4259, "mean_token_accuracy": 0.8593287214636802, "num_tokens": 145343989.0, "step": 120880 }, { "entropy": 1.8413976445794105, "epoch": 0.3747484227257686, "grad_norm": 3.8515052795410156, "learning_rate": 4.13261139594921e-06, "loss": 0.4867, "mean_token_accuracy": 0.8467993855476379, "num_tokens": 145356520.0, "step": 120890 }, { "entropy": 1.9287862733006478, "epoch": 0.37477942185081825, "grad_norm": 8.918717384338379, "learning_rate": 4.132440480687076e-06, "loss": 0.4549, "mean_token_accuracy": 0.854530693590641, "num_tokens": 145367592.0, "step": 120900 }, { "entropy": 1.909818661212921, "epoch": 0.374810420975868, "grad_norm": 11.631319999694824, "learning_rate": 4.13226958662917e-06, "loss": 0.4791, "mean_token_accuracy": 0.8506747603416442, "num_tokens": 145378561.0, "step": 120910 }, { "entropy": 1.8912340737879276, "epoch": 0.37484142010091764, "grad_norm": 9.222434997558594, "learning_rate": 4.132098713771106e-06, "loss": 0.4455, "mean_token_accuracy": 0.8446800738573075, "num_tokens": 145390493.0, "step": 120920 }, { "entropy": 1.891737161576748, "epoch": 0.37487241922596737, "grad_norm": 9.400554656982422, "learning_rate": 4.131927862108504e-06, "loss": 0.4685, "mean_token_accuracy": 0.8476646468043327, "num_tokens": 145401638.0, "step": 120930 }, { "entropy": 1.8229492217302323, "epoch": 0.37490341835101704, "grad_norm": 8.848627090454102, "learning_rate": 4.13175703163698e-06, "loss": 0.4408, "mean_token_accuracy": 0.8519455164670944, "num_tokens": 145414244.0, "step": 120940 }, { "entropy": 1.8781214132905006, "epoch": 0.37493441747606676, "grad_norm": 9.196588516235352, "learning_rate": 4.131586222352156e-06, "loss": 0.4809, "mean_token_accuracy": 0.8479282841086387, "num_tokens": 145425934.0, "step": 120950 }, { "entropy": 1.8320988327264787, "epoch": 0.37496541660111643, "grad_norm": 8.844073295593262, "learning_rate": 4.131415434249652e-06, "loss": 0.436, "mean_token_accuracy": 0.8547179445624351, "num_tokens": 145438172.0, "step": 120960 }, { "entropy": 1.8629610985517502, "epoch": 0.37499641572616615, "grad_norm": 4.892116546630859, "learning_rate": 4.131244667325089e-06, "loss": 0.441, "mean_token_accuracy": 0.8539141744375229, "num_tokens": 145449705.0, "step": 120970 }, { "entropy": 1.8123175263404847, "epoch": 0.3750274148512158, "grad_norm": 3.7633779048919678, "learning_rate": 4.131073921574093e-06, "loss": 0.44, "mean_token_accuracy": 0.8505129277706146, "num_tokens": 145463148.0, "step": 120980 }, { "entropy": 1.9299910217523575, "epoch": 0.3750584139762655, "grad_norm": 10.863778114318848, "learning_rate": 4.130903196992287e-06, "loss": 0.5118, "mean_token_accuracy": 0.8390725657343865, "num_tokens": 145474645.0, "step": 120990 }, { "entropy": 1.8196517676115036, "epoch": 0.3750894131013152, "grad_norm": 4.3333210945129395, "learning_rate": 4.1307324935752964e-06, "loss": 0.4161, "mean_token_accuracy": 0.8509810373187066, "num_tokens": 145487342.0, "step": 121000 }, { "entropy": 1.8983831241726876, "epoch": 0.3751204122263649, "grad_norm": 3.389133930206299, "learning_rate": 4.130561811318752e-06, "loss": 0.5603, "mean_token_accuracy": 0.839329156279564, "num_tokens": 145499254.0, "step": 121010 }, { "entropy": 1.7906238347291947, "epoch": 0.3751514113514146, "grad_norm": 8.32974624633789, "learning_rate": 4.1303911502182784e-06, "loss": 0.4388, "mean_token_accuracy": 0.8561747640371322, "num_tokens": 145511933.0, "step": 121020 }, { "entropy": 1.943038222193718, "epoch": 0.3751824104764643, "grad_norm": 8.438972473144531, "learning_rate": 4.130220510269507e-06, "loss": 0.4776, "mean_token_accuracy": 0.8507806181907653, "num_tokens": 145522873.0, "step": 121030 }, { "entropy": 1.9753375679254532, "epoch": 0.375213409601514, "grad_norm": 6.82116174697876, "learning_rate": 4.1300498914680705e-06, "loss": 0.5384, "mean_token_accuracy": 0.8418274581432342, "num_tokens": 145533983.0, "step": 121040 }, { "entropy": 1.8580348297953606, "epoch": 0.37524440872656367, "grad_norm": 9.025100708007812, "learning_rate": 4.129879293809599e-06, "loss": 0.4782, "mean_token_accuracy": 0.840544268488884, "num_tokens": 145546293.0, "step": 121050 }, { "entropy": 1.8665331095457076, "epoch": 0.3752754078516134, "grad_norm": 4.244259834289551, "learning_rate": 4.129708717289727e-06, "loss": 0.4255, "mean_token_accuracy": 0.856557646393776, "num_tokens": 145557776.0, "step": 121060 }, { "entropy": 1.8694847792387008, "epoch": 0.37530640697666307, "grad_norm": 8.377169609069824, "learning_rate": 4.12953816190409e-06, "loss": 0.4456, "mean_token_accuracy": 0.857828502357006, "num_tokens": 145569154.0, "step": 121070 }, { "entropy": 1.7808043763041497, "epoch": 0.3753374061017128, "grad_norm": 4.1414570808410645, "learning_rate": 4.1293676276483235e-06, "loss": 0.3795, "mean_token_accuracy": 0.8570607796311378, "num_tokens": 145583082.0, "step": 121080 }, { "entropy": 1.9220749616622925, "epoch": 0.37536840522676246, "grad_norm": 8.00906753540039, "learning_rate": 4.1291971145180645e-06, "loss": 0.4529, "mean_token_accuracy": 0.8506890282034874, "num_tokens": 145594524.0, "step": 121090 }, { "entropy": 1.9267291516065597, "epoch": 0.3753994043518122, "grad_norm": 8.382123947143555, "learning_rate": 4.129026622508953e-06, "loss": 0.4762, "mean_token_accuracy": 0.8558269619941712, "num_tokens": 145605078.0, "step": 121100 }, { "entropy": 1.7520041272044182, "epoch": 0.37543040347686185, "grad_norm": 8.029170989990234, "learning_rate": 4.128856151616628e-06, "loss": 0.4173, "mean_token_accuracy": 0.8598437607288361, "num_tokens": 145618913.0, "step": 121110 }, { "entropy": 1.8347510501742363, "epoch": 0.3754614026019116, "grad_norm": 9.550764083862305, "learning_rate": 4.128685701836731e-06, "loss": 0.4574, "mean_token_accuracy": 0.8477082312107086, "num_tokens": 145631433.0, "step": 121120 }, { "entropy": 1.8798089861869811, "epoch": 0.37549240172696124, "grad_norm": 8.3656005859375, "learning_rate": 4.128515273164905e-06, "loss": 0.478, "mean_token_accuracy": 0.8539829984307289, "num_tokens": 145642408.0, "step": 121130 }, { "entropy": 1.7175431214272976, "epoch": 0.37552340085201097, "grad_norm": 8.398488998413086, "learning_rate": 4.128344865596793e-06, "loss": 0.3904, "mean_token_accuracy": 0.8567556262016296, "num_tokens": 145656156.0, "step": 121140 }, { "entropy": 1.7876494243741035, "epoch": 0.37555439997706064, "grad_norm": 9.021903038024902, "learning_rate": 4.12817447912804e-06, "loss": 0.4456, "mean_token_accuracy": 0.8566848263144493, "num_tokens": 145667977.0, "step": 121150 }, { "entropy": 1.7991645127534865, "epoch": 0.37558539910211036, "grad_norm": 8.89357852935791, "learning_rate": 4.128004113754292e-06, "loss": 0.4469, "mean_token_accuracy": 0.8592987477779388, "num_tokens": 145680773.0, "step": 121160 }, { "entropy": 1.8806771039962769, "epoch": 0.37561639822716003, "grad_norm": 7.232542514801025, "learning_rate": 4.127833769471198e-06, "loss": 0.5171, "mean_token_accuracy": 0.8432414412498475, "num_tokens": 145692228.0, "step": 121170 }, { "entropy": 1.8988723203539848, "epoch": 0.37564739735220976, "grad_norm": 9.036412239074707, "learning_rate": 4.127663446274406e-06, "loss": 0.4509, "mean_token_accuracy": 0.8560154557228088, "num_tokens": 145703505.0, "step": 121180 }, { "entropy": 1.876578164100647, "epoch": 0.3756783964772594, "grad_norm": 8.154003143310547, "learning_rate": 4.1274931441595645e-06, "loss": 0.4372, "mean_token_accuracy": 0.8526700213551521, "num_tokens": 145714806.0, "step": 121190 }, { "entropy": 1.831437037885189, "epoch": 0.37570939560230915, "grad_norm": 8.418209075927734, "learning_rate": 4.1273228631223275e-06, "loss": 0.4525, "mean_token_accuracy": 0.8543977767229081, "num_tokens": 145726566.0, "step": 121200 }, { "entropy": 1.7670956924557686, "epoch": 0.3757403947273588, "grad_norm": 8.465004920959473, "learning_rate": 4.1271526031583445e-06, "loss": 0.4109, "mean_token_accuracy": 0.8540250107645988, "num_tokens": 145739210.0, "step": 121210 }, { "entropy": 1.7944332778453826, "epoch": 0.37577139385240854, "grad_norm": 8.313122749328613, "learning_rate": 4.126982364263272e-06, "loss": 0.467, "mean_token_accuracy": 0.8499791830778122, "num_tokens": 145751513.0, "step": 121220 }, { "entropy": 1.8116312876343728, "epoch": 0.3758023929774582, "grad_norm": 3.7453999519348145, "learning_rate": 4.126812146432764e-06, "loss": 0.3914, "mean_token_accuracy": 0.8603421568870544, "num_tokens": 145763916.0, "step": 121230 }, { "entropy": 1.9280547127127647, "epoch": 0.3758333921025079, "grad_norm": 8.784612655639648, "learning_rate": 4.126641949662477e-06, "loss": 0.4732, "mean_token_accuracy": 0.8478071898221969, "num_tokens": 145774772.0, "step": 121240 }, { "entropy": 1.9599653363227845, "epoch": 0.3758643912275576, "grad_norm": 9.786565780639648, "learning_rate": 4.126471773948068e-06, "loss": 0.5111, "mean_token_accuracy": 0.8443946361541748, "num_tokens": 145785993.0, "step": 121250 }, { "entropy": 1.8358087345957756, "epoch": 0.3758953903526073, "grad_norm": 8.388903617858887, "learning_rate": 4.126301619285196e-06, "loss": 0.4631, "mean_token_accuracy": 0.8493312045931816, "num_tokens": 145798151.0, "step": 121260 }, { "entropy": 1.8409698456525803, "epoch": 0.375926389477657, "grad_norm": 9.696656227111816, "learning_rate": 4.126131485669522e-06, "loss": 0.4737, "mean_token_accuracy": 0.8447024121880531, "num_tokens": 145810340.0, "step": 121270 }, { "entropy": 1.9255991280078888, "epoch": 0.37595738860270667, "grad_norm": 9.015790939331055, "learning_rate": 4.125961373096706e-06, "loss": 0.5156, "mean_token_accuracy": 0.8415984019637108, "num_tokens": 145821505.0, "step": 121280 }, { "entropy": 1.9274761855602265, "epoch": 0.3759883877277564, "grad_norm": 7.261139392852783, "learning_rate": 4.125791281562412e-06, "loss": 0.5009, "mean_token_accuracy": 0.853644534945488, "num_tokens": 145832550.0, "step": 121290 }, { "entropy": 1.838864254951477, "epoch": 0.37601938685280606, "grad_norm": 4.556879043579102, "learning_rate": 4.125621211062301e-06, "loss": 0.4268, "mean_token_accuracy": 0.8556353718042373, "num_tokens": 145844415.0, "step": 121300 }, { "entropy": 1.9384997293353081, "epoch": 0.3760503859778558, "grad_norm": 6.320328712463379, "learning_rate": 4.125451161592041e-06, "loss": 0.4963, "mean_token_accuracy": 0.8399920925498009, "num_tokens": 145855907.0, "step": 121310 }, { "entropy": 1.8460743427276611, "epoch": 0.37608138510290545, "grad_norm": 8.746512413024902, "learning_rate": 4.125281133147298e-06, "loss": 0.4219, "mean_token_accuracy": 0.865053367614746, "num_tokens": 145867251.0, "step": 121320 }, { "entropy": 1.8502772450447083, "epoch": 0.3761123842279552, "grad_norm": 4.59258508682251, "learning_rate": 4.125111125723738e-06, "loss": 0.4174, "mean_token_accuracy": 0.8603545278310776, "num_tokens": 145879356.0, "step": 121330 }, { "entropy": 1.8849507763981819, "epoch": 0.37614338335300485, "grad_norm": 7.458291530609131, "learning_rate": 4.124941139317031e-06, "loss": 0.4773, "mean_token_accuracy": 0.8583846285939216, "num_tokens": 145891689.0, "step": 121340 }, { "entropy": 1.8615009844303132, "epoch": 0.37617438247805457, "grad_norm": 8.805126190185547, "learning_rate": 4.124771173922848e-06, "loss": 0.4526, "mean_token_accuracy": 0.8532016187906265, "num_tokens": 145903559.0, "step": 121350 }, { "entropy": 1.81989781036973, "epoch": 0.37620538160310424, "grad_norm": 7.289465427398682, "learning_rate": 4.1246012295368575e-06, "loss": 0.425, "mean_token_accuracy": 0.8597825333476067, "num_tokens": 145916356.0, "step": 121360 }, { "entropy": 1.8147448897361755, "epoch": 0.37623638072815396, "grad_norm": 7.9857497215271, "learning_rate": 4.124431306154735e-06, "loss": 0.423, "mean_token_accuracy": 0.8589125216007233, "num_tokens": 145928998.0, "step": 121370 }, { "entropy": 1.9180129885673523, "epoch": 0.37626737985320363, "grad_norm": 9.49499797821045, "learning_rate": 4.124261403772152e-06, "loss": 0.5094, "mean_token_accuracy": 0.8353124216198922, "num_tokens": 145940800.0, "step": 121380 }, { "entropy": 1.920377929508686, "epoch": 0.37629837897825336, "grad_norm": 9.39726448059082, "learning_rate": 4.124091522384785e-06, "loss": 0.5386, "mean_token_accuracy": 0.834184144437313, "num_tokens": 145952846.0, "step": 121390 }, { "entropy": 1.9227221325039863, "epoch": 0.376329378103303, "grad_norm": 8.036456108093262, "learning_rate": 4.1239216619883106e-06, "loss": 0.4837, "mean_token_accuracy": 0.85260841101408, "num_tokens": 145964633.0, "step": 121400 }, { "entropy": 1.880432690680027, "epoch": 0.37636037722835275, "grad_norm": 9.900726318359375, "learning_rate": 4.123751822578405e-06, "loss": 0.4538, "mean_token_accuracy": 0.8541592597961426, "num_tokens": 145976465.0, "step": 121410 }, { "entropy": 1.845744264125824, "epoch": 0.3763913763534024, "grad_norm": 8.883930206298828, "learning_rate": 4.123582004150748e-06, "loss": 0.4641, "mean_token_accuracy": 0.8476935580372811, "num_tokens": 145988619.0, "step": 121420 }, { "entropy": 1.8941200897097588, "epoch": 0.37642237547845214, "grad_norm": 7.9326910972595215, "learning_rate": 4.123412206701019e-06, "loss": 0.4714, "mean_token_accuracy": 0.8494398832321167, "num_tokens": 145999813.0, "step": 121430 }, { "entropy": 1.950559636950493, "epoch": 0.3764533746035018, "grad_norm": 7.880441665649414, "learning_rate": 4.1232424302249e-06, "loss": 0.5265, "mean_token_accuracy": 0.8358272299170494, "num_tokens": 146010044.0, "step": 121440 }, { "entropy": 1.8937064185738564, "epoch": 0.37648437372855154, "grad_norm": 7.3297438621521, "learning_rate": 4.123072674718073e-06, "loss": 0.4991, "mean_token_accuracy": 0.8427557229995728, "num_tokens": 146021669.0, "step": 121450 }, { "entropy": 1.7439885810017586, "epoch": 0.3765153728536012, "grad_norm": 2.4287400245666504, "learning_rate": 4.122902940176222e-06, "loss": 0.3597, "mean_token_accuracy": 0.8698983415961266, "num_tokens": 146035437.0, "step": 121460 }, { "entropy": 1.8418650522828102, "epoch": 0.37654637197865093, "grad_norm": 8.513551712036133, "learning_rate": 4.122733226595032e-06, "loss": 0.434, "mean_token_accuracy": 0.856321020424366, "num_tokens": 146047669.0, "step": 121470 }, { "entropy": 1.897158458828926, "epoch": 0.3765773711037006, "grad_norm": 8.884608268737793, "learning_rate": 4.122563533970189e-06, "loss": 0.4874, "mean_token_accuracy": 0.8471438035368919, "num_tokens": 146058873.0, "step": 121480 }, { "entropy": 1.8239916279911994, "epoch": 0.37660837022875027, "grad_norm": 7.655562877655029, "learning_rate": 4.122393862297381e-06, "loss": 0.4532, "mean_token_accuracy": 0.8447648495435714, "num_tokens": 146071178.0, "step": 121490 }, { "entropy": 1.8941697224974632, "epoch": 0.3766393693538, "grad_norm": 3.823326349258423, "learning_rate": 4.122224211572297e-06, "loss": 0.4325, "mean_token_accuracy": 0.8591320559382438, "num_tokens": 146082489.0, "step": 121500 }, { "entropy": 1.8416035562753676, "epoch": 0.37667036847884966, "grad_norm": 7.697302341461182, "learning_rate": 4.122054581790626e-06, "loss": 0.4573, "mean_token_accuracy": 0.8484307572245597, "num_tokens": 146094870.0, "step": 121510 }, { "entropy": 1.7722488626837731, "epoch": 0.3767013676038994, "grad_norm": 6.725828170776367, "learning_rate": 4.121884972948061e-06, "loss": 0.3802, "mean_token_accuracy": 0.8639165312051773, "num_tokens": 146108574.0, "step": 121520 }, { "entropy": 1.8973381593823433, "epoch": 0.37673236672894905, "grad_norm": 8.68867301940918, "learning_rate": 4.121715385040292e-06, "loss": 0.5001, "mean_token_accuracy": 0.8411045700311661, "num_tokens": 146120800.0, "step": 121530 }, { "entropy": 1.929002758860588, "epoch": 0.3767633658539988, "grad_norm": 7.320843696594238, "learning_rate": 4.1215458180630136e-06, "loss": 0.4931, "mean_token_accuracy": 0.8451986953616142, "num_tokens": 146132050.0, "step": 121540 }, { "entropy": 1.850935024023056, "epoch": 0.37679436497904845, "grad_norm": 3.584969997406006, "learning_rate": 4.121376272011922e-06, "loss": 0.4786, "mean_token_accuracy": 0.8525848358869552, "num_tokens": 146144075.0, "step": 121550 }, { "entropy": 1.8184596046805381, "epoch": 0.37682536410409817, "grad_norm": 8.334314346313477, "learning_rate": 4.121206746882713e-06, "loss": 0.4425, "mean_token_accuracy": 0.8496242105960846, "num_tokens": 146156909.0, "step": 121560 }, { "entropy": 1.845918568968773, "epoch": 0.37685636322914784, "grad_norm": 7.1232805252075195, "learning_rate": 4.121037242671082e-06, "loss": 0.5159, "mean_token_accuracy": 0.8491025045514107, "num_tokens": 146169187.0, "step": 121570 }, { "entropy": 1.8747231513261795, "epoch": 0.37688736235419756, "grad_norm": 8.403467178344727, "learning_rate": 4.12086775937273e-06, "loss": 0.4414, "mean_token_accuracy": 0.8543328627943992, "num_tokens": 146181009.0, "step": 121580 }, { "entropy": 1.9061955615878106, "epoch": 0.37691836147924723, "grad_norm": 9.928679466247559, "learning_rate": 4.120698296983356e-06, "loss": 0.4784, "mean_token_accuracy": 0.8431771412491799, "num_tokens": 146192524.0, "step": 121590 }, { "entropy": 1.8643248841166495, "epoch": 0.37694936060429696, "grad_norm": 6.862342834472656, "learning_rate": 4.120528855498663e-06, "loss": 0.3942, "mean_token_accuracy": 0.8685376390814781, "num_tokens": 146204768.0, "step": 121600 }, { "entropy": 1.8592729791998863, "epoch": 0.3769803597293466, "grad_norm": 8.456074714660645, "learning_rate": 4.12035943491435e-06, "loss": 0.4934, "mean_token_accuracy": 0.8440333098173142, "num_tokens": 146217180.0, "step": 121610 }, { "entropy": 1.801856230199337, "epoch": 0.37701135885439635, "grad_norm": 4.020124435424805, "learning_rate": 4.120190035226123e-06, "loss": 0.408, "mean_token_accuracy": 0.859604449570179, "num_tokens": 146230138.0, "step": 121620 }, { "entropy": 1.8506782591342925, "epoch": 0.377042357979446, "grad_norm": 7.4072041511535645, "learning_rate": 4.120020656429685e-06, "loss": 0.4412, "mean_token_accuracy": 0.8540009498596192, "num_tokens": 146242786.0, "step": 121630 }, { "entropy": 1.914864283800125, "epoch": 0.37707335710449574, "grad_norm": 7.623665809631348, "learning_rate": 4.119851298520746e-06, "loss": 0.4683, "mean_token_accuracy": 0.8540341034531593, "num_tokens": 146253407.0, "step": 121640 }, { "entropy": 1.9302624300122262, "epoch": 0.3771043562295454, "grad_norm": 9.16896915435791, "learning_rate": 4.119681961495008e-06, "loss": 0.447, "mean_token_accuracy": 0.8566766142845154, "num_tokens": 146264715.0, "step": 121650 }, { "entropy": 1.8666496634483338, "epoch": 0.37713535535459514, "grad_norm": 3.110356092453003, "learning_rate": 4.119512645348184e-06, "loss": 0.4695, "mean_token_accuracy": 0.8514965951442719, "num_tokens": 146276968.0, "step": 121660 }, { "entropy": 1.8799085780978202, "epoch": 0.3771663544796448, "grad_norm": 4.057291030883789, "learning_rate": 4.119343350075981e-06, "loss": 0.447, "mean_token_accuracy": 0.8615951225161552, "num_tokens": 146288576.0, "step": 121670 }, { "entropy": 1.818842075765133, "epoch": 0.37719735360469453, "grad_norm": 10.600461959838867, "learning_rate": 4.119174075674112e-06, "loss": 0.4499, "mean_token_accuracy": 0.852581399679184, "num_tokens": 146300167.0, "step": 121680 }, { "entropy": 1.8106214493513106, "epoch": 0.3772283527297442, "grad_norm": 3.652275323867798, "learning_rate": 4.119004822138288e-06, "loss": 0.4137, "mean_token_accuracy": 0.854650741815567, "num_tokens": 146312797.0, "step": 121690 }, { "entropy": 1.783032974600792, "epoch": 0.3772593518547939, "grad_norm": 2.3988730907440186, "learning_rate": 4.118835589464222e-06, "loss": 0.3963, "mean_token_accuracy": 0.8633334279060364, "num_tokens": 146325854.0, "step": 121700 }, { "entropy": 1.8655279122292996, "epoch": 0.3772903509798436, "grad_norm": 3.8433516025543213, "learning_rate": 4.118666377647631e-06, "loss": 0.459, "mean_token_accuracy": 0.8409992963075638, "num_tokens": 146338473.0, "step": 121710 }, { "entropy": 1.9374163687229156, "epoch": 0.3773213501048933, "grad_norm": 7.3235249519348145, "learning_rate": 4.118497186684229e-06, "loss": 0.504, "mean_token_accuracy": 0.8448733389377594, "num_tokens": 146349721.0, "step": 121720 }, { "entropy": 1.8571631446480752, "epoch": 0.377352349229943, "grad_norm": 4.183323860168457, "learning_rate": 4.118328016569734e-06, "loss": 0.47, "mean_token_accuracy": 0.8527215197682381, "num_tokens": 146362284.0, "step": 121730 }, { "entropy": 1.9317335307598114, "epoch": 0.37738334835499265, "grad_norm": 7.954257965087891, "learning_rate": 4.118158867299864e-06, "loss": 0.537, "mean_token_accuracy": 0.835789829492569, "num_tokens": 146373357.0, "step": 121740 }, { "entropy": 1.8835597395896913, "epoch": 0.3774143474800424, "grad_norm": 8.603527069091797, "learning_rate": 4.117989738870338e-06, "loss": 0.4575, "mean_token_accuracy": 0.8524842366576195, "num_tokens": 146384815.0, "step": 121750 }, { "entropy": 1.8527083411812781, "epoch": 0.37744534660509205, "grad_norm": 15.195161819458008, "learning_rate": 4.117820631276879e-06, "loss": 0.4141, "mean_token_accuracy": 0.8620884269475937, "num_tokens": 146396638.0, "step": 121760 }, { "entropy": 1.8090012684464454, "epoch": 0.37747634573014177, "grad_norm": 9.310328483581543, "learning_rate": 4.117651544515207e-06, "loss": 0.3812, "mean_token_accuracy": 0.8572055116295815, "num_tokens": 146409088.0, "step": 121770 }, { "entropy": 1.8714495450258255, "epoch": 0.37750734485519144, "grad_norm": 8.089369773864746, "learning_rate": 4.117482478581047e-06, "loss": 0.4199, "mean_token_accuracy": 0.8630315214395523, "num_tokens": 146421044.0, "step": 121780 }, { "entropy": 1.8696634009480477, "epoch": 0.37753834398024116, "grad_norm": 4.10288667678833, "learning_rate": 4.117313433470123e-06, "loss": 0.4542, "mean_token_accuracy": 0.8499879941344262, "num_tokens": 146432640.0, "step": 121790 }, { "entropy": 1.8695901393890382, "epoch": 0.37756934310529083, "grad_norm": 7.528837203979492, "learning_rate": 4.11714440917816e-06, "loss": 0.4675, "mean_token_accuracy": 0.8490506067872048, "num_tokens": 146444477.0, "step": 121800 }, { "entropy": 1.8624702721834183, "epoch": 0.37760034223034056, "grad_norm": 7.586561679840088, "learning_rate": 4.116975405700887e-06, "loss": 0.4175, "mean_token_accuracy": 0.853815546631813, "num_tokens": 146457039.0, "step": 121810 }, { "entropy": 1.917418046295643, "epoch": 0.3776313413553902, "grad_norm": 7.109292030334473, "learning_rate": 4.116806423034029e-06, "loss": 0.4573, "mean_token_accuracy": 0.854109600186348, "num_tokens": 146469106.0, "step": 121820 }, { "entropy": 1.8877900682389737, "epoch": 0.37766234048043995, "grad_norm": 2.658243417739868, "learning_rate": 4.116637461173319e-06, "loss": 0.4289, "mean_token_accuracy": 0.8463023707270623, "num_tokens": 146482224.0, "step": 121830 }, { "entropy": 1.9093093752861023, "epoch": 0.3776933396054896, "grad_norm": 7.953685283660889, "learning_rate": 4.116468520114486e-06, "loss": 0.4749, "mean_token_accuracy": 0.8459949418902397, "num_tokens": 146494016.0, "step": 121840 }, { "entropy": 1.8482676953077317, "epoch": 0.37772433873053934, "grad_norm": 5.989774703979492, "learning_rate": 4.116299599853262e-06, "loss": 0.4319, "mean_token_accuracy": 0.8606431394815445, "num_tokens": 146507063.0, "step": 121850 }, { "entropy": 1.6848430618643762, "epoch": 0.377755337855589, "grad_norm": 9.330160140991211, "learning_rate": 4.11613070038538e-06, "loss": 0.3608, "mean_token_accuracy": 0.874959084391594, "num_tokens": 146521265.0, "step": 121860 }, { "entropy": 1.8134713634848594, "epoch": 0.37778633698063874, "grad_norm": 9.139564514160156, "learning_rate": 4.115961821706575e-06, "loss": 0.4115, "mean_token_accuracy": 0.8542681246995926, "num_tokens": 146533625.0, "step": 121870 }, { "entropy": 1.8329403929412365, "epoch": 0.3778173361056884, "grad_norm": 2.2994585037231445, "learning_rate": 4.115792963812582e-06, "loss": 0.4243, "mean_token_accuracy": 0.8562861248850823, "num_tokens": 146546747.0, "step": 121880 }, { "entropy": 1.8927996829152107, "epoch": 0.37784833523073813, "grad_norm": 8.54615306854248, "learning_rate": 4.115624126699139e-06, "loss": 0.4712, "mean_token_accuracy": 0.8517010971903801, "num_tokens": 146559042.0, "step": 121890 }, { "entropy": 1.878790758550167, "epoch": 0.3778793343557878, "grad_norm": 10.510698318481445, "learning_rate": 4.1154553103619835e-06, "loss": 0.5119, "mean_token_accuracy": 0.8425199687480927, "num_tokens": 146570240.0, "step": 121900 }, { "entropy": 1.8623479381203651, "epoch": 0.3779103334808375, "grad_norm": 8.871630668640137, "learning_rate": 4.115286514796853e-06, "loss": 0.4428, "mean_token_accuracy": 0.8559099569916725, "num_tokens": 146583049.0, "step": 121910 }, { "entropy": 1.8767142370343208, "epoch": 0.3779413326058872, "grad_norm": 3.710120677947998, "learning_rate": 4.115117739999491e-06, "loss": 0.5138, "mean_token_accuracy": 0.847675909101963, "num_tokens": 146595103.0, "step": 121920 }, { "entropy": 1.960743510723114, "epoch": 0.3779723317309369, "grad_norm": 7.444425582885742, "learning_rate": 4.114948985965637e-06, "loss": 0.5077, "mean_token_accuracy": 0.8415925487875938, "num_tokens": 146605922.0, "step": 121930 }, { "entropy": 1.8839008912444115, "epoch": 0.3780033308559866, "grad_norm": 9.009261131286621, "learning_rate": 4.114780252691036e-06, "loss": 0.4788, "mean_token_accuracy": 0.8500188961625099, "num_tokens": 146617318.0, "step": 121940 }, { "entropy": 1.89123537838459, "epoch": 0.3780343299810363, "grad_norm": 7.541473865509033, "learning_rate": 4.11461154017143e-06, "loss": 0.4364, "mean_token_accuracy": 0.8585240036249161, "num_tokens": 146628643.0, "step": 121950 }, { "entropy": 1.9025135144591332, "epoch": 0.378065329106086, "grad_norm": 6.690996170043945, "learning_rate": 4.114442848402565e-06, "loss": 0.5119, "mean_token_accuracy": 0.8481958448886872, "num_tokens": 146640271.0, "step": 121960 }, { "entropy": 1.956543865799904, "epoch": 0.3780963282311357, "grad_norm": 8.179557800292969, "learning_rate": 4.1142741773801894e-06, "loss": 0.5092, "mean_token_accuracy": 0.8471057340502739, "num_tokens": 146650293.0, "step": 121970 }, { "entropy": 1.8972681686282158, "epoch": 0.37812732735618537, "grad_norm": 8.407889366149902, "learning_rate": 4.1141055271000485e-06, "loss": 0.4785, "mean_token_accuracy": 0.845854164659977, "num_tokens": 146662226.0, "step": 121980 }, { "entropy": 1.9091035678982735, "epoch": 0.37815832648123504, "grad_norm": 7.07802152633667, "learning_rate": 4.113936897557893e-06, "loss": 0.4828, "mean_token_accuracy": 0.8480123952031136, "num_tokens": 146674239.0, "step": 121990 }, { "entropy": 1.906779918074608, "epoch": 0.37818932560628477, "grad_norm": 10.43696403503418, "learning_rate": 4.113768288749473e-06, "loss": 0.4509, "mean_token_accuracy": 0.8478205010294915, "num_tokens": 146686512.0, "step": 122000 }, { "entropy": 1.9320502176880836, "epoch": 0.37822032473133443, "grad_norm": 9.740495681762695, "learning_rate": 4.113599700670539e-06, "loss": 0.4757, "mean_token_accuracy": 0.8459899872541428, "num_tokens": 146697859.0, "step": 122010 }, { "entropy": 1.9591059625148772, "epoch": 0.37825132385638416, "grad_norm": 8.180789947509766, "learning_rate": 4.113431133316846e-06, "loss": 0.4797, "mean_token_accuracy": 0.8437874868512154, "num_tokens": 146709083.0, "step": 122020 }, { "entropy": 1.802576979994774, "epoch": 0.3782823229814338, "grad_norm": 9.705636024475098, "learning_rate": 4.113262586684146e-06, "loss": 0.4406, "mean_token_accuracy": 0.8546473920345307, "num_tokens": 146722550.0, "step": 122030 }, { "entropy": 1.8501103572547435, "epoch": 0.37831332210648355, "grad_norm": 7.220241069793701, "learning_rate": 4.113094060768193e-06, "loss": 0.4619, "mean_token_accuracy": 0.8407644376158714, "num_tokens": 146736130.0, "step": 122040 }, { "entropy": 1.847523419559002, "epoch": 0.3783443212315332, "grad_norm": 7.426035404205322, "learning_rate": 4.112925555564747e-06, "loss": 0.4282, "mean_token_accuracy": 0.8541855216026306, "num_tokens": 146748100.0, "step": 122050 }, { "entropy": 1.7743535205721854, "epoch": 0.37837532035658294, "grad_norm": 4.075657367706299, "learning_rate": 4.112757071069562e-06, "loss": 0.4231, "mean_token_accuracy": 0.8489448487758636, "num_tokens": 146760893.0, "step": 122060 }, { "entropy": 1.853407160937786, "epoch": 0.3784063194816326, "grad_norm": 10.630528450012207, "learning_rate": 4.1125886072784e-06, "loss": 0.4918, "mean_token_accuracy": 0.8410938188433648, "num_tokens": 146773477.0, "step": 122070 }, { "entropy": 1.831977953016758, "epoch": 0.37843731860668234, "grad_norm": 9.587197303771973, "learning_rate": 4.112420164187019e-06, "loss": 0.461, "mean_token_accuracy": 0.856691500544548, "num_tokens": 146784991.0, "step": 122080 }, { "entropy": 1.8454594373703004, "epoch": 0.378468317731732, "grad_norm": 8.84549331665039, "learning_rate": 4.11225174179118e-06, "loss": 0.4513, "mean_token_accuracy": 0.8553299933671952, "num_tokens": 146797158.0, "step": 122090 }, { "entropy": 1.889687070250511, "epoch": 0.37849931685678173, "grad_norm": 8.103165626525879, "learning_rate": 4.1120833400866465e-06, "loss": 0.4901, "mean_token_accuracy": 0.8467686668038368, "num_tokens": 146808540.0, "step": 122100 }, { "entropy": 1.7636478379368783, "epoch": 0.3785303159818314, "grad_norm": 8.541621208190918, "learning_rate": 4.111914959069182e-06, "loss": 0.4046, "mean_token_accuracy": 0.8589817076921463, "num_tokens": 146822024.0, "step": 122110 }, { "entropy": 1.8948402941226958, "epoch": 0.3785613151068811, "grad_norm": 7.753279685974121, "learning_rate": 4.111746598734551e-06, "loss": 0.4575, "mean_token_accuracy": 0.8480223119258881, "num_tokens": 146833733.0, "step": 122120 }, { "entropy": 1.8317367523908614, "epoch": 0.3785923142319308, "grad_norm": 9.554505348205566, "learning_rate": 4.11157825907852e-06, "loss": 0.4395, "mean_token_accuracy": 0.853913576900959, "num_tokens": 146846482.0, "step": 122130 }, { "entropy": 1.8885809898376464, "epoch": 0.3786233133569805, "grad_norm": 8.104804039001465, "learning_rate": 4.111409940096856e-06, "loss": 0.4745, "mean_token_accuracy": 0.8428619146347046, "num_tokens": 146857645.0, "step": 122140 }, { "entropy": 1.9016202330589294, "epoch": 0.3786543124820302, "grad_norm": 8.457221031188965, "learning_rate": 4.111241641785328e-06, "loss": 0.4571, "mean_token_accuracy": 0.8547043219208718, "num_tokens": 146868691.0, "step": 122150 }, { "entropy": 1.8326540157198905, "epoch": 0.3786853116070799, "grad_norm": 8.81411361694336, "learning_rate": 4.111073364139704e-06, "loss": 0.4304, "mean_token_accuracy": 0.8510780692100525, "num_tokens": 146881779.0, "step": 122160 }, { "entropy": 1.8807669505476952, "epoch": 0.3787163107321296, "grad_norm": 6.89314603805542, "learning_rate": 4.110905107155758e-06, "loss": 0.4651, "mean_token_accuracy": 0.8529252395033836, "num_tokens": 146893089.0, "step": 122170 }, { "entropy": 1.8773617178201676, "epoch": 0.3787473098571793, "grad_norm": 9.011570930480957, "learning_rate": 4.110736870829259e-06, "loss": 0.4395, "mean_token_accuracy": 0.8588549971580506, "num_tokens": 146904814.0, "step": 122180 }, { "entropy": 1.8098601162433625, "epoch": 0.378778308982229, "grad_norm": 4.172004222869873, "learning_rate": 4.110568655155984e-06, "loss": 0.4148, "mean_token_accuracy": 0.8595098748803138, "num_tokens": 146917001.0, "step": 122190 }, { "entropy": 1.9448596596717835, "epoch": 0.3788093081072787, "grad_norm": 9.438422203063965, "learning_rate": 4.110400460131704e-06, "loss": 0.5126, "mean_token_accuracy": 0.8462162002921104, "num_tokens": 146928045.0, "step": 122200 }, { "entropy": 1.8534026801586152, "epoch": 0.37884030723232837, "grad_norm": 7.917478561401367, "learning_rate": 4.110232285752197e-06, "loss": 0.457, "mean_token_accuracy": 0.8487755730748177, "num_tokens": 146939455.0, "step": 122210 }, { "entropy": 1.88424232006073, "epoch": 0.3788713063573781, "grad_norm": 9.62708568572998, "learning_rate": 4.110064132013238e-06, "loss": 0.4474, "mean_token_accuracy": 0.8592986583709716, "num_tokens": 146951083.0, "step": 122220 }, { "entropy": 1.9062071546912194, "epoch": 0.37890230548242776, "grad_norm": 7.392994403839111, "learning_rate": 4.109895998910608e-06, "loss": 0.4531, "mean_token_accuracy": 0.8547699064016342, "num_tokens": 146962391.0, "step": 122230 }, { "entropy": 1.8592015653848648, "epoch": 0.37893330460747743, "grad_norm": 8.539904594421387, "learning_rate": 4.109727886440085e-06, "loss": 0.4418, "mean_token_accuracy": 0.8498929470777512, "num_tokens": 146974481.0, "step": 122240 }, { "entropy": 1.8191335678100586, "epoch": 0.37896430373252715, "grad_norm": 8.98744010925293, "learning_rate": 4.109559794597449e-06, "loss": 0.4187, "mean_token_accuracy": 0.8560615256428719, "num_tokens": 146986678.0, "step": 122250 }, { "entropy": 1.8230956450104714, "epoch": 0.3789953028575768, "grad_norm": 5.308132171630859, "learning_rate": 4.109391723378483e-06, "loss": 0.4226, "mean_token_accuracy": 0.8582112580537796, "num_tokens": 146999024.0, "step": 122260 }, { "entropy": 1.890698865056038, "epoch": 0.37902630198262655, "grad_norm": 8.043314933776855, "learning_rate": 4.109223672778969e-06, "loss": 0.4737, "mean_token_accuracy": 0.851884001493454, "num_tokens": 147011194.0, "step": 122270 }, { "entropy": 1.8781690523028374, "epoch": 0.3790573011076762, "grad_norm": 3.952792167663574, "learning_rate": 4.1090556427946905e-06, "loss": 0.4586, "mean_token_accuracy": 0.8536352217197418, "num_tokens": 147022928.0, "step": 122280 }, { "entropy": 1.8398768171668052, "epoch": 0.37908830023272594, "grad_norm": 4.229760646820068, "learning_rate": 4.108887633421435e-06, "loss": 0.4094, "mean_token_accuracy": 0.8582636058330536, "num_tokens": 147035202.0, "step": 122290 }, { "entropy": 1.8784176617860795, "epoch": 0.3791192993577756, "grad_norm": 4.562318801879883, "learning_rate": 4.10871964465499e-06, "loss": 0.3901, "mean_token_accuracy": 0.8587144047021866, "num_tokens": 147047405.0, "step": 122300 }, { "entropy": 1.8330136485397817, "epoch": 0.37915029848282533, "grad_norm": 7.896421432495117, "learning_rate": 4.108551676491141e-06, "loss": 0.3953, "mean_token_accuracy": 0.8706734910607338, "num_tokens": 147060914.0, "step": 122310 }, { "entropy": 1.9075976356863975, "epoch": 0.379181297607875, "grad_norm": 9.339140892028809, "learning_rate": 4.108383728925676e-06, "loss": 0.4792, "mean_token_accuracy": 0.8520648717880249, "num_tokens": 147072427.0, "step": 122320 }, { "entropy": 1.8779551640152932, "epoch": 0.3792122967329247, "grad_norm": 8.027301788330078, "learning_rate": 4.108215801954389e-06, "loss": 0.4604, "mean_token_accuracy": 0.8487935498356819, "num_tokens": 147085063.0, "step": 122330 }, { "entropy": 1.9310110673308372, "epoch": 0.3792432958579744, "grad_norm": 8.90977668762207, "learning_rate": 4.108047895573069e-06, "loss": 0.4744, "mean_token_accuracy": 0.8506239086389542, "num_tokens": 147096264.0, "step": 122340 }, { "entropy": 1.9274428904056549, "epoch": 0.3792742949830241, "grad_norm": 8.3607759475708, "learning_rate": 4.107880009777509e-06, "loss": 0.4596, "mean_token_accuracy": 0.8582290455698967, "num_tokens": 147108064.0, "step": 122350 }, { "entropy": 1.931325614452362, "epoch": 0.3793052941080738, "grad_norm": 7.63694429397583, "learning_rate": 4.1077121445635036e-06, "loss": 0.4714, "mean_token_accuracy": 0.8499775350093841, "num_tokens": 147120037.0, "step": 122360 }, { "entropy": 1.8099196195602416, "epoch": 0.3793362932331235, "grad_norm": 3.8295421600341797, "learning_rate": 4.107544299926848e-06, "loss": 0.4353, "mean_token_accuracy": 0.8536288380622864, "num_tokens": 147132561.0, "step": 122370 }, { "entropy": 1.8952490270137787, "epoch": 0.3793672923581732, "grad_norm": 3.9062981605529785, "learning_rate": 4.107376475863337e-06, "loss": 0.4945, "mean_token_accuracy": 0.845606592297554, "num_tokens": 147144257.0, "step": 122380 }, { "entropy": 1.8194826439023017, "epoch": 0.3793982914832229, "grad_norm": 2.3818023204803467, "learning_rate": 4.10720867236877e-06, "loss": 0.5514, "mean_token_accuracy": 0.8439658433198929, "num_tokens": 147158022.0, "step": 122390 }, { "entropy": 1.8628859788179397, "epoch": 0.3794292906082726, "grad_norm": 9.438950538635254, "learning_rate": 4.107040889438944e-06, "loss": 0.4349, "mean_token_accuracy": 0.8520619332790375, "num_tokens": 147171213.0, "step": 122400 }, { "entropy": 1.8283814415335655, "epoch": 0.3794602897333223, "grad_norm": 8.620655059814453, "learning_rate": 4.10687312706966e-06, "loss": 0.4471, "mean_token_accuracy": 0.8559184551239014, "num_tokens": 147183776.0, "step": 122410 }, { "entropy": 1.9575203657150269, "epoch": 0.37949128885837197, "grad_norm": 9.457103729248047, "learning_rate": 4.106705385256718e-06, "loss": 0.4955, "mean_token_accuracy": 0.8479184478521347, "num_tokens": 147194326.0, "step": 122420 }, { "entropy": 1.832918418943882, "epoch": 0.3795222879834217, "grad_norm": 4.032721519470215, "learning_rate": 4.106537663995922e-06, "loss": 0.4266, "mean_token_accuracy": 0.857351616024971, "num_tokens": 147207032.0, "step": 122430 }, { "entropy": 1.7993885815143584, "epoch": 0.37955328710847136, "grad_norm": 4.224924087524414, "learning_rate": 4.106369963283075e-06, "loss": 0.4184, "mean_token_accuracy": 0.8557244941592217, "num_tokens": 147220135.0, "step": 122440 }, { "entropy": 1.8836615636944771, "epoch": 0.3795842862335211, "grad_norm": 7.664379596710205, "learning_rate": 4.106202283113981e-06, "loss": 0.4577, "mean_token_accuracy": 0.8581904530525207, "num_tokens": 147232750.0, "step": 122450 }, { "entropy": 1.9607893019914626, "epoch": 0.37961528535857075, "grad_norm": 8.335966110229492, "learning_rate": 4.106034623484447e-06, "loss": 0.5075, "mean_token_accuracy": 0.8552387475967407, "num_tokens": 147244068.0, "step": 122460 }, { "entropy": 1.8356236606836318, "epoch": 0.3796462844836205, "grad_norm": 6.946023941040039, "learning_rate": 4.105866984390278e-06, "loss": 0.4068, "mean_token_accuracy": 0.8576385736465454, "num_tokens": 147256513.0, "step": 122470 }, { "entropy": 1.9411771401762963, "epoch": 0.37967728360867015, "grad_norm": 7.646897792816162, "learning_rate": 4.105699365827284e-06, "loss": 0.4842, "mean_token_accuracy": 0.8406426265835762, "num_tokens": 147267715.0, "step": 122480 }, { "entropy": 1.8944382056593896, "epoch": 0.3797082827337198, "grad_norm": 2.604159355163574, "learning_rate": 4.105531767791274e-06, "loss": 0.4951, "mean_token_accuracy": 0.8485357999801636, "num_tokens": 147279090.0, "step": 122490 }, { "entropy": 1.8582454279065133, "epoch": 0.37973928185876954, "grad_norm": 7.28471565246582, "learning_rate": 4.105364190278059e-06, "loss": 0.4241, "mean_token_accuracy": 0.8588834837079048, "num_tokens": 147291484.0, "step": 122500 }, { "entropy": 1.9929537415504455, "epoch": 0.3797702809838192, "grad_norm": 10.385540962219238, "learning_rate": 4.105196633283452e-06, "loss": 0.5533, "mean_token_accuracy": 0.8405931279063225, "num_tokens": 147302573.0, "step": 122510 }, { "entropy": 1.8923503875732421, "epoch": 0.37980128010886893, "grad_norm": 8.61189079284668, "learning_rate": 4.1050290968032635e-06, "loss": 0.441, "mean_token_accuracy": 0.8514659553766251, "num_tokens": 147314906.0, "step": 122520 }, { "entropy": 1.8648876428604126, "epoch": 0.3798322792339186, "grad_norm": 8.03481388092041, "learning_rate": 4.10486158083331e-06, "loss": 0.4817, "mean_token_accuracy": 0.8421670094132423, "num_tokens": 147327317.0, "step": 122530 }, { "entropy": 1.780473567545414, "epoch": 0.3798632783589683, "grad_norm": 8.313706398010254, "learning_rate": 4.104694085369405e-06, "loss": 0.3914, "mean_token_accuracy": 0.8570443943142891, "num_tokens": 147340984.0, "step": 122540 }, { "entropy": 1.9261644035577774, "epoch": 0.379894277484018, "grad_norm": 7.854968070983887, "learning_rate": 4.104526610407367e-06, "loss": 0.4716, "mean_token_accuracy": 0.8521699935197831, "num_tokens": 147352717.0, "step": 122550 }, { "entropy": 1.8588500022888184, "epoch": 0.3799252766090677, "grad_norm": 3.6017353534698486, "learning_rate": 4.104359155943014e-06, "loss": 0.4202, "mean_token_accuracy": 0.8543045312166214, "num_tokens": 147365365.0, "step": 122560 }, { "entropy": 1.8084479227662087, "epoch": 0.3799562757341174, "grad_norm": 3.5251076221466064, "learning_rate": 4.104191721972163e-06, "loss": 0.4114, "mean_token_accuracy": 0.8611663445830345, "num_tokens": 147378567.0, "step": 122570 }, { "entropy": 1.849086406826973, "epoch": 0.3799872748591671, "grad_norm": 4.066585063934326, "learning_rate": 4.104024308490636e-06, "loss": 0.4579, "mean_token_accuracy": 0.8478951990604401, "num_tokens": 147390692.0, "step": 122580 }, { "entropy": 1.900680673122406, "epoch": 0.3800182739842168, "grad_norm": 10.499907493591309, "learning_rate": 4.103856915494254e-06, "loss": 0.4607, "mean_token_accuracy": 0.8512833088636398, "num_tokens": 147402469.0, "step": 122590 }, { "entropy": 1.8157877206802369, "epoch": 0.3800492731092665, "grad_norm": 8.330013275146484, "learning_rate": 4.1036895429788395e-06, "loss": 0.4072, "mean_token_accuracy": 0.8638851836323738, "num_tokens": 147415369.0, "step": 122600 }, { "entropy": 1.9261372715234757, "epoch": 0.3800802722343162, "grad_norm": 8.978737831115723, "learning_rate": 4.103522190940217e-06, "loss": 0.5264, "mean_token_accuracy": 0.8390441820025444, "num_tokens": 147426432.0, "step": 122610 }, { "entropy": 1.9050227865576743, "epoch": 0.3801112713593659, "grad_norm": 8.126078605651855, "learning_rate": 4.10335485937421e-06, "loss": 0.5085, "mean_token_accuracy": 0.844034643471241, "num_tokens": 147437847.0, "step": 122620 }, { "entropy": 1.9048843890428544, "epoch": 0.38014227048441557, "grad_norm": 7.828693389892578, "learning_rate": 4.103187548276646e-06, "loss": 0.4734, "mean_token_accuracy": 0.8543620184063911, "num_tokens": 147449041.0, "step": 122630 }, { "entropy": 1.8402218729257585, "epoch": 0.3801732696094653, "grad_norm": 8.167677879333496, "learning_rate": 4.103020257643353e-06, "loss": 0.4197, "mean_token_accuracy": 0.8610726460814476, "num_tokens": 147461307.0, "step": 122640 }, { "entropy": 1.8394718445837497, "epoch": 0.38020426873451496, "grad_norm": 4.112548828125, "learning_rate": 4.1028529874701575e-06, "loss": 0.4175, "mean_token_accuracy": 0.8556538373231888, "num_tokens": 147474356.0, "step": 122650 }, { "entropy": 1.833746100962162, "epoch": 0.3802352678595647, "grad_norm": 6.661306858062744, "learning_rate": 4.102685737752892e-06, "loss": 0.4167, "mean_token_accuracy": 0.855821980535984, "num_tokens": 147487546.0, "step": 122660 }, { "entropy": 1.8778820380568504, "epoch": 0.38026626698461435, "grad_norm": 7.601857662200928, "learning_rate": 4.102518508487384e-06, "loss": 0.4151, "mean_token_accuracy": 0.8578482121229172, "num_tokens": 147500179.0, "step": 122670 }, { "entropy": 1.8702596746385098, "epoch": 0.3802972661096641, "grad_norm": 8.159707069396973, "learning_rate": 4.10235129966947e-06, "loss": 0.4358, "mean_token_accuracy": 0.8527193248271943, "num_tokens": 147512342.0, "step": 122680 }, { "entropy": 1.9300275981426238, "epoch": 0.38032826523471375, "grad_norm": 9.76501750946045, "learning_rate": 4.10218411129498e-06, "loss": 0.5027, "mean_token_accuracy": 0.842798325419426, "num_tokens": 147523781.0, "step": 122690 }, { "entropy": 1.8886654764413833, "epoch": 0.38035926435976347, "grad_norm": 8.383800506591797, "learning_rate": 4.10201694335975e-06, "loss": 0.5191, "mean_token_accuracy": 0.8469223186373711, "num_tokens": 147536454.0, "step": 122700 }, { "entropy": 1.9181490674614907, "epoch": 0.38039026348481314, "grad_norm": 7.91489315032959, "learning_rate": 4.1018497958596145e-06, "loss": 0.4989, "mean_token_accuracy": 0.8422677874565124, "num_tokens": 147548041.0, "step": 122710 }, { "entropy": 1.7913452178239821, "epoch": 0.3804212626098628, "grad_norm": 8.690677642822266, "learning_rate": 4.1016826687904125e-06, "loss": 0.4008, "mean_token_accuracy": 0.8631450653076171, "num_tokens": 147561955.0, "step": 122720 }, { "entropy": 1.8757226839661598, "epoch": 0.38045226173491253, "grad_norm": 8.690422058105469, "learning_rate": 4.10151556214798e-06, "loss": 0.4643, "mean_token_accuracy": 0.8484616339206695, "num_tokens": 147574345.0, "step": 122730 }, { "entropy": 1.9869503617286681, "epoch": 0.3804832608599622, "grad_norm": 9.068852424621582, "learning_rate": 4.101348475928157e-06, "loss": 0.4854, "mean_token_accuracy": 0.8462048456072807, "num_tokens": 147584992.0, "step": 122740 }, { "entropy": 1.8192487761378289, "epoch": 0.3805142599850119, "grad_norm": 8.221050262451172, "learning_rate": 4.101181410126785e-06, "loss": 0.4065, "mean_token_accuracy": 0.8553620144724846, "num_tokens": 147598434.0, "step": 122750 }, { "entropy": 1.9016779504716397, "epoch": 0.3805452591100616, "grad_norm": 8.561354637145996, "learning_rate": 4.101014364739705e-06, "loss": 0.4814, "mean_token_accuracy": 0.8509173199534417, "num_tokens": 147610587.0, "step": 122760 }, { "entropy": 1.9240503638982773, "epoch": 0.3805762582351113, "grad_norm": 8.848665237426758, "learning_rate": 4.100847339762759e-06, "loss": 0.481, "mean_token_accuracy": 0.8474395364522934, "num_tokens": 147622419.0, "step": 122770 }, { "entropy": 1.9462111860513687, "epoch": 0.380607257360161, "grad_norm": 8.207417488098145, "learning_rate": 4.100680335191792e-06, "loss": 0.4987, "mean_token_accuracy": 0.8494761645793915, "num_tokens": 147632828.0, "step": 122780 }, { "entropy": 1.8405832841992378, "epoch": 0.3806382564852107, "grad_norm": 4.011960506439209, "learning_rate": 4.1005133510226495e-06, "loss": 0.4005, "mean_token_accuracy": 0.8583981603384018, "num_tokens": 147645476.0, "step": 122790 }, { "entropy": 1.9420069836080074, "epoch": 0.3806692556102604, "grad_norm": 6.821649074554443, "learning_rate": 4.100346387251177e-06, "loss": 0.5369, "mean_token_accuracy": 0.8337976217269898, "num_tokens": 147657157.0, "step": 122800 }, { "entropy": 1.9372777387499809, "epoch": 0.3807002547353101, "grad_norm": 8.612418174743652, "learning_rate": 4.100179443873223e-06, "loss": 0.4738, "mean_token_accuracy": 0.8544177770614624, "num_tokens": 147668847.0, "step": 122810 }, { "entropy": 1.840282154083252, "epoch": 0.3807312538603598, "grad_norm": 4.8995041847229, "learning_rate": 4.100012520884635e-06, "loss": 0.5072, "mean_token_accuracy": 0.8471588909626007, "num_tokens": 147682236.0, "step": 122820 }, { "entropy": 1.9169215068221093, "epoch": 0.3807622529854095, "grad_norm": 8.391762733459473, "learning_rate": 4.0998456182812634e-06, "loss": 0.4782, "mean_token_accuracy": 0.8483142048120499, "num_tokens": 147693794.0, "step": 122830 }, { "entropy": 1.8828682228922844, "epoch": 0.38079325211045917, "grad_norm": 3.6466331481933594, "learning_rate": 4.099678736058961e-06, "loss": 0.4414, "mean_token_accuracy": 0.8551005318760871, "num_tokens": 147706128.0, "step": 122840 }, { "entropy": 1.894105489552021, "epoch": 0.3808242512355089, "grad_norm": 9.506867408752441, "learning_rate": 4.0995118742135785e-06, "loss": 0.4382, "mean_token_accuracy": 0.8543181657791138, "num_tokens": 147717697.0, "step": 122850 }, { "entropy": 1.8872636377811431, "epoch": 0.38085525036055856, "grad_norm": 9.389772415161133, "learning_rate": 4.09934503274097e-06, "loss": 0.4521, "mean_token_accuracy": 0.8546540632843971, "num_tokens": 147730381.0, "step": 122860 }, { "entropy": 1.8735971391201018, "epoch": 0.3808862494856083, "grad_norm": 10.220447540283203, "learning_rate": 4.09917821163699e-06, "loss": 0.448, "mean_token_accuracy": 0.8552726373076439, "num_tokens": 147742566.0, "step": 122870 }, { "entropy": 1.8962013125419617, "epoch": 0.38091724861065795, "grad_norm": 7.584221363067627, "learning_rate": 4.099011410897494e-06, "loss": 0.4982, "mean_token_accuracy": 0.8471159264445305, "num_tokens": 147755055.0, "step": 122880 }, { "entropy": 1.9144986018538475, "epoch": 0.3809482477357077, "grad_norm": 8.150506019592285, "learning_rate": 4.098844630518339e-06, "loss": 0.439, "mean_token_accuracy": 0.8621457502245903, "num_tokens": 147766218.0, "step": 122890 }, { "entropy": 1.8816324099898338, "epoch": 0.38097924686075735, "grad_norm": 8.89142894744873, "learning_rate": 4.0986778704953845e-06, "loss": 0.4577, "mean_token_accuracy": 0.8532285436987876, "num_tokens": 147778543.0, "step": 122900 }, { "entropy": 1.9272763848304748, "epoch": 0.3810102459858071, "grad_norm": 8.55495548248291, "learning_rate": 4.098511130824489e-06, "loss": 0.4988, "mean_token_accuracy": 0.845673693716526, "num_tokens": 147789726.0, "step": 122910 }, { "entropy": 1.7462919235229493, "epoch": 0.38104124511085674, "grad_norm": 9.728035926818848, "learning_rate": 4.0983444115015134e-06, "loss": 0.3724, "mean_token_accuracy": 0.8587867051362992, "num_tokens": 147804271.0, "step": 122920 }, { "entropy": 1.8114602223038674, "epoch": 0.38107224423590647, "grad_norm": 5.929993152618408, "learning_rate": 4.098177712522317e-06, "loss": 0.3789, "mean_token_accuracy": 0.860235296189785, "num_tokens": 147817072.0, "step": 122930 }, { "entropy": 1.846523255109787, "epoch": 0.38110324336095613, "grad_norm": 8.399463653564453, "learning_rate": 4.098011033882767e-06, "loss": 0.4305, "mean_token_accuracy": 0.8544706001877784, "num_tokens": 147830099.0, "step": 122940 }, { "entropy": 1.9178942322731019, "epoch": 0.38113424248600586, "grad_norm": 4.426633358001709, "learning_rate": 4.0978443755787265e-06, "loss": 0.4682, "mean_token_accuracy": 0.8501872152090073, "num_tokens": 147841437.0, "step": 122950 }, { "entropy": 1.881843727827072, "epoch": 0.3811652416110555, "grad_norm": 4.205018043518066, "learning_rate": 4.097677737606057e-06, "loss": 0.4347, "mean_token_accuracy": 0.8623296022415161, "num_tokens": 147852742.0, "step": 122960 }, { "entropy": 1.8286479495465755, "epoch": 0.3811962407361052, "grad_norm": 10.341126441955566, "learning_rate": 4.09751111996063e-06, "loss": 0.3951, "mean_token_accuracy": 0.8560968965291977, "num_tokens": 147865179.0, "step": 122970 }, { "entropy": 1.8264545932412148, "epoch": 0.3812272398611549, "grad_norm": 8.994978904724121, "learning_rate": 4.09734452263831e-06, "loss": 0.4299, "mean_token_accuracy": 0.8656197428703308, "num_tokens": 147877567.0, "step": 122980 }, { "entropy": 1.9247392192482948, "epoch": 0.3812582389862046, "grad_norm": 7.663559913635254, "learning_rate": 4.097177945634967e-06, "loss": 0.4954, "mean_token_accuracy": 0.8439255699515342, "num_tokens": 147889528.0, "step": 122990 }, { "entropy": 1.8423993363976479, "epoch": 0.3812892381112543, "grad_norm": 3.762521266937256, "learning_rate": 4.0970113889464705e-06, "loss": 0.3907, "mean_token_accuracy": 0.8593509882688523, "num_tokens": 147902458.0, "step": 123000 }, { "entropy": 1.9567664802074431, "epoch": 0.381320237236304, "grad_norm": 8.625659942626953, "learning_rate": 4.096844852568692e-06, "loss": 0.5569, "mean_token_accuracy": 0.8298911958932876, "num_tokens": 147913729.0, "step": 123010 }, { "entropy": 1.8710352510213852, "epoch": 0.3813512363613537, "grad_norm": 8.000286102294922, "learning_rate": 4.096678336497503e-06, "loss": 0.4474, "mean_token_accuracy": 0.8588433533906936, "num_tokens": 147925548.0, "step": 123020 }, { "entropy": 1.9936707019805908, "epoch": 0.3813822354864034, "grad_norm": 9.606999397277832, "learning_rate": 4.09651184072878e-06, "loss": 0.5462, "mean_token_accuracy": 0.8335342779755592, "num_tokens": 147936901.0, "step": 123030 }, { "entropy": 1.9054792627692223, "epoch": 0.3814132346114531, "grad_norm": 9.011368751525879, "learning_rate": 4.096345365258394e-06, "loss": 0.4861, "mean_token_accuracy": 0.8416555240750313, "num_tokens": 147948698.0, "step": 123040 }, { "entropy": 1.9143636167049407, "epoch": 0.38144423373650277, "grad_norm": 9.330082893371582, "learning_rate": 4.096178910082223e-06, "loss": 0.4695, "mean_token_accuracy": 0.8574577882885933, "num_tokens": 147960109.0, "step": 123050 }, { "entropy": 1.8986399203538895, "epoch": 0.3814752328615525, "grad_norm": 8.78416919708252, "learning_rate": 4.096012475196143e-06, "loss": 0.4591, "mean_token_accuracy": 0.8498771920800209, "num_tokens": 147972193.0, "step": 123060 }, { "entropy": 1.8185602709650994, "epoch": 0.38150623198660216, "grad_norm": 8.202901840209961, "learning_rate": 4.095846060596033e-06, "loss": 0.4556, "mean_token_accuracy": 0.8550837740302086, "num_tokens": 147985079.0, "step": 123070 }, { "entropy": 1.9409471869468689, "epoch": 0.3815372311116519, "grad_norm": 7.777408123016357, "learning_rate": 4.095679666277773e-06, "loss": 0.4654, "mean_token_accuracy": 0.8446749314665795, "num_tokens": 147996703.0, "step": 123080 }, { "entropy": 1.9127967476844787, "epoch": 0.38156823023670156, "grad_norm": 8.144055366516113, "learning_rate": 4.095513292237241e-06, "loss": 0.5223, "mean_token_accuracy": 0.8405533611774445, "num_tokens": 148007728.0, "step": 123090 }, { "entropy": 1.8132119834423066, "epoch": 0.3815992293617513, "grad_norm": 4.817179203033447, "learning_rate": 4.095346938470322e-06, "loss": 0.4972, "mean_token_accuracy": 0.8394502356648446, "num_tokens": 148021176.0, "step": 123100 }, { "entropy": 1.8567951753735543, "epoch": 0.38163022848680095, "grad_norm": 8.94315242767334, "learning_rate": 4.095180604972897e-06, "loss": 0.4276, "mean_token_accuracy": 0.8610012695193291, "num_tokens": 148033775.0, "step": 123110 }, { "entropy": 1.7960670098662377, "epoch": 0.3816612276118507, "grad_norm": 8.07936954498291, "learning_rate": 4.095014291740849e-06, "loss": 0.3632, "mean_token_accuracy": 0.8699286490678787, "num_tokens": 148046444.0, "step": 123120 }, { "entropy": 1.880557769536972, "epoch": 0.38169222673690034, "grad_norm": 8.235962867736816, "learning_rate": 4.094847998770066e-06, "loss": 0.4425, "mean_token_accuracy": 0.859109228849411, "num_tokens": 148058765.0, "step": 123130 }, { "entropy": 1.8710905313491821, "epoch": 0.38172322586195007, "grad_norm": 8.953729629516602, "learning_rate": 4.094681726056433e-06, "loss": 0.4518, "mean_token_accuracy": 0.8571123152971267, "num_tokens": 148071128.0, "step": 123140 }, { "entropy": 1.8630672127008439, "epoch": 0.38175422498699974, "grad_norm": 8.592804908752441, "learning_rate": 4.094515473595838e-06, "loss": 0.4443, "mean_token_accuracy": 0.8458248943090438, "num_tokens": 148082981.0, "step": 123150 }, { "entropy": 1.900773896276951, "epoch": 0.38178522411204946, "grad_norm": 7.337060928344727, "learning_rate": 4.0943492413841685e-06, "loss": 0.4902, "mean_token_accuracy": 0.8392305105924607, "num_tokens": 148095109.0, "step": 123160 }, { "entropy": 1.827962300926447, "epoch": 0.38181622323709913, "grad_norm": 7.826770305633545, "learning_rate": 4.094183029417316e-06, "loss": 0.3842, "mean_token_accuracy": 0.8655768454074859, "num_tokens": 148108413.0, "step": 123170 }, { "entropy": 1.8747796788811684, "epoch": 0.38184722236214885, "grad_norm": 8.268677711486816, "learning_rate": 4.0940168376911705e-06, "loss": 0.4595, "mean_token_accuracy": 0.8500054642558098, "num_tokens": 148120152.0, "step": 123180 }, { "entropy": 1.8802484720945358, "epoch": 0.3818782214871985, "grad_norm": 9.56521987915039, "learning_rate": 4.093850666201625e-06, "loss": 0.4321, "mean_token_accuracy": 0.8550689592957497, "num_tokens": 148131543.0, "step": 123190 }, { "entropy": 1.9417632609605788, "epoch": 0.38190922061224825, "grad_norm": 8.511123657226562, "learning_rate": 4.093684514944573e-06, "loss": 0.5105, "mean_token_accuracy": 0.8440429598093033, "num_tokens": 148142246.0, "step": 123200 }, { "entropy": 1.862280185520649, "epoch": 0.3819402197372979, "grad_norm": 9.24006175994873, "learning_rate": 4.093518383915908e-06, "loss": 0.4417, "mean_token_accuracy": 0.8556727141141891, "num_tokens": 148154604.0, "step": 123210 }, { "entropy": 1.90358504652977, "epoch": 0.3819712188623476, "grad_norm": 7.789550304412842, "learning_rate": 4.093352273111527e-06, "loss": 0.4723, "mean_token_accuracy": 0.8538018524646759, "num_tokens": 148166038.0, "step": 123220 }, { "entropy": 1.8469286099076272, "epoch": 0.3820022179873973, "grad_norm": 8.740273475646973, "learning_rate": 4.093186182527327e-06, "loss": 0.4235, "mean_token_accuracy": 0.858376681804657, "num_tokens": 148178188.0, "step": 123230 }, { "entropy": 1.8784661173820496, "epoch": 0.382033217112447, "grad_norm": 8.926239013671875, "learning_rate": 4.093020112159205e-06, "loss": 0.616, "mean_token_accuracy": 0.8364622831344605, "num_tokens": 148190734.0, "step": 123240 }, { "entropy": 1.9453864216804504, "epoch": 0.3820642162374967, "grad_norm": 7.907076358795166, "learning_rate": 4.092854062003061e-06, "loss": 0.47, "mean_token_accuracy": 0.8546721920371055, "num_tokens": 148201633.0, "step": 123250 }, { "entropy": 1.8376804277300836, "epoch": 0.38209521536254637, "grad_norm": 9.020318031311035, "learning_rate": 4.092688032054795e-06, "loss": 0.4549, "mean_token_accuracy": 0.8542071163654328, "num_tokens": 148214535.0, "step": 123260 }, { "entropy": 1.8767757460474968, "epoch": 0.3821262144875961, "grad_norm": 8.229818344116211, "learning_rate": 4.092522022310309e-06, "loss": 0.5102, "mean_token_accuracy": 0.8497373566031456, "num_tokens": 148226463.0, "step": 123270 }, { "entropy": 1.8976902469992638, "epoch": 0.38215721361264576, "grad_norm": 8.769594192504883, "learning_rate": 4.092356032765506e-06, "loss": 0.5086, "mean_token_accuracy": 0.8375331163406372, "num_tokens": 148238620.0, "step": 123280 }, { "entropy": 1.9701337844133378, "epoch": 0.3821882127376955, "grad_norm": 9.727083206176758, "learning_rate": 4.092190063416288e-06, "loss": 0.5527, "mean_token_accuracy": 0.8314969301223755, "num_tokens": 148249082.0, "step": 123290 }, { "entropy": 1.8411318197846414, "epoch": 0.38221921186274516, "grad_norm": 7.282608985900879, "learning_rate": 4.0920241142585636e-06, "loss": 0.39, "mean_token_accuracy": 0.8594763651490211, "num_tokens": 148261567.0, "step": 123300 }, { "entropy": 1.9043711677193642, "epoch": 0.3822502109877949, "grad_norm": 7.864800453186035, "learning_rate": 4.091858185288235e-06, "loss": 0.5022, "mean_token_accuracy": 0.8414738133549691, "num_tokens": 148273132.0, "step": 123310 }, { "entropy": 1.809215198457241, "epoch": 0.38228121011284455, "grad_norm": 8.225022315979004, "learning_rate": 4.091692276501213e-06, "loss": 0.3809, "mean_token_accuracy": 0.8564652130007744, "num_tokens": 148286238.0, "step": 123320 }, { "entropy": 1.7878570154309272, "epoch": 0.3823122092378943, "grad_norm": 4.167350769042969, "learning_rate": 4.0915263878934044e-06, "loss": 0.4234, "mean_token_accuracy": 0.8619178295135498, "num_tokens": 148298922.0, "step": 123330 }, { "entropy": 1.850788153707981, "epoch": 0.38234320836294394, "grad_norm": 7.613214492797852, "learning_rate": 4.091360519460719e-06, "loss": 0.4202, "mean_token_accuracy": 0.8597221150994301, "num_tokens": 148311479.0, "step": 123340 }, { "entropy": 1.8307570219039917, "epoch": 0.38237420748799367, "grad_norm": 4.395393371582031, "learning_rate": 4.0911946711990686e-06, "loss": 0.4586, "mean_token_accuracy": 0.8481859222054482, "num_tokens": 148324806.0, "step": 123350 }, { "entropy": 1.9396772772073745, "epoch": 0.38240520661304334, "grad_norm": 3.719348430633545, "learning_rate": 4.091028843104365e-06, "loss": 0.4781, "mean_token_accuracy": 0.8467726603150367, "num_tokens": 148336424.0, "step": 123360 }, { "entropy": 1.8743967324495316, "epoch": 0.38243620573809306, "grad_norm": 3.7305939197540283, "learning_rate": 4.090863035172519e-06, "loss": 0.443, "mean_token_accuracy": 0.8555470928549767, "num_tokens": 148348774.0, "step": 123370 }, { "entropy": 1.9252464964985847, "epoch": 0.38246720486314273, "grad_norm": 8.94243335723877, "learning_rate": 4.090697247399448e-06, "loss": 0.476, "mean_token_accuracy": 0.8391307726502418, "num_tokens": 148360798.0, "step": 123380 }, { "entropy": 1.9863820880651475, "epoch": 0.38249820398819245, "grad_norm": 9.898303031921387, "learning_rate": 4.090531479781067e-06, "loss": 0.5399, "mean_token_accuracy": 0.8426146388053894, "num_tokens": 148371596.0, "step": 123390 }, { "entropy": 1.8762628570199014, "epoch": 0.3825292031132421, "grad_norm": 7.574060440063477, "learning_rate": 4.0903657323132925e-06, "loss": 0.4402, "mean_token_accuracy": 0.8638320744037629, "num_tokens": 148383122.0, "step": 123400 }, { "entropy": 1.9100775212049483, "epoch": 0.38256020223829185, "grad_norm": 4.106917858123779, "learning_rate": 4.0902000049920414e-06, "loss": 0.5017, "mean_token_accuracy": 0.8347994074225425, "num_tokens": 148394952.0, "step": 123410 }, { "entropy": 1.9066940754652024, "epoch": 0.3825912013633415, "grad_norm": 7.716053009033203, "learning_rate": 4.090034297813234e-06, "loss": 0.4522, "mean_token_accuracy": 0.849234452843666, "num_tokens": 148407262.0, "step": 123420 }, { "entropy": 1.7800323203206063, "epoch": 0.38262220048839124, "grad_norm": 4.275322914123535, "learning_rate": 4.089868610772788e-06, "loss": 0.4183, "mean_token_accuracy": 0.8655282810330391, "num_tokens": 148421221.0, "step": 123430 }, { "entropy": 1.823851725459099, "epoch": 0.3826531996134409, "grad_norm": 8.498555183410645, "learning_rate": 4.089702943866628e-06, "loss": 0.3806, "mean_token_accuracy": 0.8620473012328148, "num_tokens": 148434376.0, "step": 123440 }, { "entropy": 1.9421150475740432, "epoch": 0.38268419873849063, "grad_norm": 8.584688186645508, "learning_rate": 4.0895372970906745e-06, "loss": 0.5023, "mean_token_accuracy": 0.8448976293206215, "num_tokens": 148445096.0, "step": 123450 }, { "entropy": 1.8550694674253463, "epoch": 0.3827151978635403, "grad_norm": 8.82845401763916, "learning_rate": 4.089371670440852e-06, "loss": 0.4385, "mean_token_accuracy": 0.8511300772428513, "num_tokens": 148456927.0, "step": 123460 }, { "entropy": 1.92039655148983, "epoch": 0.38274619698858997, "grad_norm": 8.373238563537598, "learning_rate": 4.089206063913085e-06, "loss": 0.4981, "mean_token_accuracy": 0.8491540059447289, "num_tokens": 148468499.0, "step": 123470 }, { "entropy": 1.8469054311513902, "epoch": 0.3827771961136397, "grad_norm": 8.064623832702637, "learning_rate": 4.089040477503299e-06, "loss": 0.4028, "mean_token_accuracy": 0.8639935791492462, "num_tokens": 148480307.0, "step": 123480 }, { "entropy": 1.961165651679039, "epoch": 0.38280819523868936, "grad_norm": 7.094374656677246, "learning_rate": 4.088874911207421e-06, "loss": 0.5031, "mean_token_accuracy": 0.8461368843913079, "num_tokens": 148490982.0, "step": 123490 }, { "entropy": 1.8972523733973503, "epoch": 0.3828391943637391, "grad_norm": 9.41102123260498, "learning_rate": 4.08870936502138e-06, "loss": 0.4796, "mean_token_accuracy": 0.8459867879748344, "num_tokens": 148503085.0, "step": 123500 }, { "entropy": 1.8196463674306869, "epoch": 0.38287019348878876, "grad_norm": 8.395617485046387, "learning_rate": 4.088543838941105e-06, "loss": 0.4227, "mean_token_accuracy": 0.8633744567632675, "num_tokens": 148515132.0, "step": 123510 }, { "entropy": 1.933710703253746, "epoch": 0.3829011926138385, "grad_norm": 9.440218925476074, "learning_rate": 4.088378332962527e-06, "loss": 0.5686, "mean_token_accuracy": 0.8320772811770439, "num_tokens": 148526393.0, "step": 123520 }, { "entropy": 1.8306830495595932, "epoch": 0.38293219173888815, "grad_norm": 3.874952554702759, "learning_rate": 4.088212847081577e-06, "loss": 0.4136, "mean_token_accuracy": 0.8493449553847313, "num_tokens": 148539616.0, "step": 123530 }, { "entropy": 1.8871184498071671, "epoch": 0.3829631908639379, "grad_norm": 4.710858345031738, "learning_rate": 4.0880473812941876e-06, "loss": 0.4812, "mean_token_accuracy": 0.848838959634304, "num_tokens": 148551814.0, "step": 123540 }, { "entropy": 1.8882265403866767, "epoch": 0.38299418998898754, "grad_norm": 7.315601825714111, "learning_rate": 4.087881935596294e-06, "loss": 0.4169, "mean_token_accuracy": 0.8602377876639367, "num_tokens": 148563466.0, "step": 123550 }, { "entropy": 1.9142208576202393, "epoch": 0.38302518911403727, "grad_norm": 8.4950590133667, "learning_rate": 4.08771650998383e-06, "loss": 0.4305, "mean_token_accuracy": 0.8584097474813461, "num_tokens": 148575212.0, "step": 123560 }, { "entropy": 1.9166872769594192, "epoch": 0.38305618823908694, "grad_norm": 8.157064437866211, "learning_rate": 4.087551104452733e-06, "loss": 0.5462, "mean_token_accuracy": 0.8363913476467133, "num_tokens": 148587345.0, "step": 123570 }, { "entropy": 1.852193793654442, "epoch": 0.38308718736413666, "grad_norm": 5.7400031089782715, "learning_rate": 4.08738571899894e-06, "loss": 0.4275, "mean_token_accuracy": 0.8614915952086448, "num_tokens": 148599766.0, "step": 123580 }, { "entropy": 1.6889735147356988, "epoch": 0.38311818648918633, "grad_norm": 9.48442554473877, "learning_rate": 4.08722035361839e-06, "loss": 0.3805, "mean_token_accuracy": 0.8619127184152603, "num_tokens": 148613775.0, "step": 123590 }, { "entropy": 1.8661672458052636, "epoch": 0.38314918561423605, "grad_norm": 3.9152071475982666, "learning_rate": 4.087055008307023e-06, "loss": 0.4521, "mean_token_accuracy": 0.8527839332818985, "num_tokens": 148625883.0, "step": 123600 }, { "entropy": 1.9174211084842683, "epoch": 0.3831801847392857, "grad_norm": 4.289289951324463, "learning_rate": 4.086889683060778e-06, "loss": 0.466, "mean_token_accuracy": 0.8489474713802337, "num_tokens": 148637944.0, "step": 123610 }, { "entropy": 1.9397675469517708, "epoch": 0.38321118386433545, "grad_norm": 8.902761459350586, "learning_rate": 4.086724377875599e-06, "loss": 0.5397, "mean_token_accuracy": 0.8369091704487801, "num_tokens": 148649564.0, "step": 123620 }, { "entropy": 1.8931380987167359, "epoch": 0.3832421829893851, "grad_norm": 7.973553657531738, "learning_rate": 4.08655909274743e-06, "loss": 0.4915, "mean_token_accuracy": 0.8456487700343132, "num_tokens": 148661427.0, "step": 123630 }, { "entropy": 1.779283882677555, "epoch": 0.38327318211443484, "grad_norm": 9.01836109161377, "learning_rate": 4.086393827672212e-06, "loss": 0.449, "mean_token_accuracy": 0.8560737937688827, "num_tokens": 148675109.0, "step": 123640 }, { "entropy": 1.899433010816574, "epoch": 0.3833041812394845, "grad_norm": 9.137697219848633, "learning_rate": 4.086228582645893e-06, "loss": 0.445, "mean_token_accuracy": 0.8533209547400474, "num_tokens": 148686499.0, "step": 123650 }, { "entropy": 1.9183125644922256, "epoch": 0.38333518036453423, "grad_norm": 8.239258766174316, "learning_rate": 4.08606335766442e-06, "loss": 0.4644, "mean_token_accuracy": 0.8569030150771141, "num_tokens": 148697749.0, "step": 123660 }, { "entropy": 1.7693776428699493, "epoch": 0.3833661794895839, "grad_norm": 3.299410343170166, "learning_rate": 4.08589815272374e-06, "loss": 0.3586, "mean_token_accuracy": 0.8623643219470978, "num_tokens": 148712318.0, "step": 123670 }, { "entropy": 1.9084981709718705, "epoch": 0.3833971786146336, "grad_norm": 7.353041172027588, "learning_rate": 4.085732967819801e-06, "loss": 0.4917, "mean_token_accuracy": 0.847068789601326, "num_tokens": 148724494.0, "step": 123680 }, { "entropy": 1.9519029945135116, "epoch": 0.3834281777396833, "grad_norm": 8.309041976928711, "learning_rate": 4.085567802948554e-06, "loss": 0.4379, "mean_token_accuracy": 0.8656853973865509, "num_tokens": 148735242.0, "step": 123690 }, { "entropy": 1.8672381609678268, "epoch": 0.383459176864733, "grad_norm": 8.3343505859375, "learning_rate": 4.085402658105951e-06, "loss": 0.4244, "mean_token_accuracy": 0.8520313948392868, "num_tokens": 148747370.0, "step": 123700 }, { "entropy": 1.9503202199935914, "epoch": 0.3834901759897827, "grad_norm": 7.8455729484558105, "learning_rate": 4.085237533287944e-06, "loss": 0.4949, "mean_token_accuracy": 0.8526298850774765, "num_tokens": 148757963.0, "step": 123710 }, { "entropy": 1.861949661374092, "epoch": 0.38352117511483236, "grad_norm": 3.735707998275757, "learning_rate": 4.085072428490485e-06, "loss": 0.4121, "mean_token_accuracy": 0.8578638583421707, "num_tokens": 148770099.0, "step": 123720 }, { "entropy": 1.8470420002937318, "epoch": 0.3835521742398821, "grad_norm": 10.177236557006836, "learning_rate": 4.0849073437095295e-06, "loss": 0.4692, "mean_token_accuracy": 0.8429055437445641, "num_tokens": 148782367.0, "step": 123730 }, { "entropy": 1.9221158519387245, "epoch": 0.38358317336493175, "grad_norm": 4.895500183105469, "learning_rate": 4.0847422789410344e-06, "loss": 0.4672, "mean_token_accuracy": 0.8465067788958549, "num_tokens": 148794003.0, "step": 123740 }, { "entropy": 1.970751866698265, "epoch": 0.3836141724899815, "grad_norm": 7.53320837020874, "learning_rate": 4.084577234180957e-06, "loss": 0.48, "mean_token_accuracy": 0.8521478682756424, "num_tokens": 148804935.0, "step": 123750 }, { "entropy": 1.7943205565214158, "epoch": 0.38364517161503114, "grad_norm": 6.433713436126709, "learning_rate": 4.084412209425253e-06, "loss": 0.3881, "mean_token_accuracy": 0.8569139391183853, "num_tokens": 148818096.0, "step": 123760 }, { "entropy": 1.7551213905215264, "epoch": 0.38367617074008087, "grad_norm": 7.526129245758057, "learning_rate": 4.084247204669883e-06, "loss": 0.3707, "mean_token_accuracy": 0.8625258266925812, "num_tokens": 148831912.0, "step": 123770 }, { "entropy": 1.862623292207718, "epoch": 0.38370716986513054, "grad_norm": 8.402950286865234, "learning_rate": 4.084082219910807e-06, "loss": 0.3984, "mean_token_accuracy": 0.8564006343483925, "num_tokens": 148844690.0, "step": 123780 }, { "entropy": 1.90057223290205, "epoch": 0.38373816899018026, "grad_norm": 3.392810344696045, "learning_rate": 4.083917255143988e-06, "loss": 0.4533, "mean_token_accuracy": 0.8512577295303345, "num_tokens": 148856761.0, "step": 123790 }, { "entropy": 1.9783919692039489, "epoch": 0.38376916811522993, "grad_norm": 9.549758911132812, "learning_rate": 4.083752310365388e-06, "loss": 0.5152, "mean_token_accuracy": 0.8406032085418701, "num_tokens": 148867423.0, "step": 123800 }, { "entropy": 1.8232928544282914, "epoch": 0.38380016724027965, "grad_norm": 4.337226867675781, "learning_rate": 4.083587385570969e-06, "loss": 0.4175, "mean_token_accuracy": 0.852597390115261, "num_tokens": 148879925.0, "step": 123810 }, { "entropy": 1.8824557334184646, "epoch": 0.3838311663653293, "grad_norm": 7.782412528991699, "learning_rate": 4.083422480756698e-06, "loss": 0.4413, "mean_token_accuracy": 0.856006121635437, "num_tokens": 148891322.0, "step": 123820 }, { "entropy": 1.903056225925684, "epoch": 0.38386216549037905, "grad_norm": 8.749157905578613, "learning_rate": 4.083257595918541e-06, "loss": 0.454, "mean_token_accuracy": 0.845898973941803, "num_tokens": 148903673.0, "step": 123830 }, { "entropy": 1.9123461306095124, "epoch": 0.3838931646154287, "grad_norm": 8.949573516845703, "learning_rate": 4.083092731052464e-06, "loss": 0.4843, "mean_token_accuracy": 0.8477382779121398, "num_tokens": 148915982.0, "step": 123840 }, { "entropy": 1.867120423913002, "epoch": 0.38392416374047844, "grad_norm": 8.769627571105957, "learning_rate": 4.082927886154436e-06, "loss": 0.4548, "mean_token_accuracy": 0.8450190261006355, "num_tokens": 148928151.0, "step": 123850 }, { "entropy": 1.9295039504766465, "epoch": 0.3839551628655281, "grad_norm": 10.065016746520996, "learning_rate": 4.0827630612204275e-06, "loss": 0.5505, "mean_token_accuracy": 0.8388926431536674, "num_tokens": 148939030.0, "step": 123860 }, { "entropy": 1.8576544880867005, "epoch": 0.38398616199057783, "grad_norm": 11.196900367736816, "learning_rate": 4.082598256246408e-06, "loss": 0.479, "mean_token_accuracy": 0.8566042557358742, "num_tokens": 148950841.0, "step": 123870 }, { "entropy": 1.8984556332230569, "epoch": 0.3840171611156275, "grad_norm": 9.907042503356934, "learning_rate": 4.082433471228349e-06, "loss": 0.5076, "mean_token_accuracy": 0.8443589746952057, "num_tokens": 148962371.0, "step": 123880 }, { "entropy": 1.7866959184408189, "epoch": 0.3840481602406772, "grad_norm": 7.650696754455566, "learning_rate": 4.082268706162224e-06, "loss": 0.4578, "mean_token_accuracy": 0.8526209697127343, "num_tokens": 148975931.0, "step": 123890 }, { "entropy": 1.8315860643982886, "epoch": 0.3840791593657269, "grad_norm": 2.8190009593963623, "learning_rate": 4.082103961044008e-06, "loss": 0.4691, "mean_token_accuracy": 0.8488441839814186, "num_tokens": 148989418.0, "step": 123900 }, { "entropy": 1.803328700363636, "epoch": 0.3841101584907766, "grad_norm": 8.294083595275879, "learning_rate": 4.081939235869675e-06, "loss": 0.4175, "mean_token_accuracy": 0.8606008633971214, "num_tokens": 149003217.0, "step": 123910 }, { "entropy": 1.8937855035066604, "epoch": 0.3841411576158263, "grad_norm": 8.08438777923584, "learning_rate": 4.081774530635201e-06, "loss": 0.511, "mean_token_accuracy": 0.8416438445448875, "num_tokens": 149014790.0, "step": 123920 }, { "entropy": 1.9089516907930375, "epoch": 0.384172156740876, "grad_norm": 8.53679370880127, "learning_rate": 4.081609845336565e-06, "loss": 0.4647, "mean_token_accuracy": 0.848609185218811, "num_tokens": 149025831.0, "step": 123930 }, { "entropy": 1.7949181117117405, "epoch": 0.3842031558659257, "grad_norm": 4.385751247406006, "learning_rate": 4.0814451799697436e-06, "loss": 0.4284, "mean_token_accuracy": 0.8539316862821579, "num_tokens": 149038872.0, "step": 123940 }, { "entropy": 1.8139029011130332, "epoch": 0.3842341549909754, "grad_norm": 3.748425006866455, "learning_rate": 4.081280534530718e-06, "loss": 0.4179, "mean_token_accuracy": 0.8566805422306061, "num_tokens": 149051871.0, "step": 123950 }, { "entropy": 1.7684552431106568, "epoch": 0.3842651541160251, "grad_norm": 10.310443878173828, "learning_rate": 4.081115909015469e-06, "loss": 0.4372, "mean_token_accuracy": 0.8510794118046761, "num_tokens": 149065114.0, "step": 123960 }, { "entropy": 1.8360266655683517, "epoch": 0.38429615324107474, "grad_norm": 9.245149612426758, "learning_rate": 4.080951303419979e-06, "loss": 0.4395, "mean_token_accuracy": 0.844825841486454, "num_tokens": 149077418.0, "step": 123970 }, { "entropy": 1.7973667308688164, "epoch": 0.38432715236612447, "grad_norm": 8.100232124328613, "learning_rate": 4.0807867177402305e-06, "loss": 0.4194, "mean_token_accuracy": 0.8598975196480751, "num_tokens": 149090213.0, "step": 123980 }, { "entropy": 1.8818108469247818, "epoch": 0.38435815149117414, "grad_norm": 8.730313301086426, "learning_rate": 4.080622151972207e-06, "loss": 0.4529, "mean_token_accuracy": 0.8592151805758477, "num_tokens": 149102306.0, "step": 123990 }, { "entropy": 1.9396295577287674, "epoch": 0.38438915061622386, "grad_norm": 8.592621803283691, "learning_rate": 4.080457606111895e-06, "loss": 0.5319, "mean_token_accuracy": 0.8406242683529854, "num_tokens": 149114284.0, "step": 124000 }, { "entropy": 1.8134768947958946, "epoch": 0.38442014974127353, "grad_norm": 8.235865592956543, "learning_rate": 4.080293080155281e-06, "loss": 0.422, "mean_token_accuracy": 0.8475548833608627, "num_tokens": 149127288.0, "step": 124010 }, { "entropy": 1.7447491750121116, "epoch": 0.38445114886632326, "grad_norm": 8.99509334564209, "learning_rate": 4.080128574098353e-06, "loss": 0.3478, "mean_token_accuracy": 0.8710707902908326, "num_tokens": 149140350.0, "step": 124020 }, { "entropy": 1.9281997457146645, "epoch": 0.3844821479913729, "grad_norm": 9.224446296691895, "learning_rate": 4.0799640879370986e-06, "loss": 0.5329, "mean_token_accuracy": 0.8362716525793076, "num_tokens": 149151526.0, "step": 124030 }, { "entropy": 1.907413001358509, "epoch": 0.38451314711642265, "grad_norm": 8.881566047668457, "learning_rate": 4.079799621667508e-06, "loss": 0.5119, "mean_token_accuracy": 0.841308769583702, "num_tokens": 149162744.0, "step": 124040 }, { "entropy": 1.764004696905613, "epoch": 0.3845441462414723, "grad_norm": 8.439988136291504, "learning_rate": 4.079635175285575e-06, "loss": 0.4012, "mean_token_accuracy": 0.8685743331909179, "num_tokens": 149175813.0, "step": 124050 }, { "entropy": 1.8820173025131226, "epoch": 0.38457514536652204, "grad_norm": 7.626965522766113, "learning_rate": 4.079470748787288e-06, "loss": 0.438, "mean_token_accuracy": 0.8496981009840965, "num_tokens": 149188028.0, "step": 124060 }, { "entropy": 1.9048197478055955, "epoch": 0.3846061444915717, "grad_norm": 4.152190685272217, "learning_rate": 4.079306342168641e-06, "loss": 0.4677, "mean_token_accuracy": 0.8482609540224075, "num_tokens": 149199701.0, "step": 124070 }, { "entropy": 1.9153966382145882, "epoch": 0.38463714361662144, "grad_norm": 7.222431659698486, "learning_rate": 4.079141955425631e-06, "loss": 0.4673, "mean_token_accuracy": 0.8490887343883514, "num_tokens": 149211482.0, "step": 124080 }, { "entropy": 1.7790709897875785, "epoch": 0.3846681427416711, "grad_norm": 3.140486001968384, "learning_rate": 4.0789775885542525e-06, "loss": 0.417, "mean_token_accuracy": 0.8636253029108047, "num_tokens": 149224774.0, "step": 124090 }, { "entropy": 1.9251772895455361, "epoch": 0.38469914186672083, "grad_norm": 4.184020519256592, "learning_rate": 4.0788132415505e-06, "loss": 0.4724, "mean_token_accuracy": 0.8495764017105103, "num_tokens": 149236068.0, "step": 124100 }, { "entropy": 1.8736867174506187, "epoch": 0.3847301409917705, "grad_norm": 9.371009826660156, "learning_rate": 4.078648914410375e-06, "loss": 0.4437, "mean_token_accuracy": 0.8543329164385796, "num_tokens": 149248502.0, "step": 124110 }, { "entropy": 1.8708575084805488, "epoch": 0.3847611401168202, "grad_norm": 3.769465684890747, "learning_rate": 4.078484607129874e-06, "loss": 0.4439, "mean_token_accuracy": 0.8484385311603546, "num_tokens": 149260240.0, "step": 124120 }, { "entropy": 1.9300208121538163, "epoch": 0.3847921392418699, "grad_norm": 7.285602569580078, "learning_rate": 4.078320319704997e-06, "loss": 0.492, "mean_token_accuracy": 0.8484133720397949, "num_tokens": 149271397.0, "step": 124130 }, { "entropy": 1.8357270821928977, "epoch": 0.3848231383669196, "grad_norm": 3.4529945850372314, "learning_rate": 4.078156052131747e-06, "loss": 0.4362, "mean_token_accuracy": 0.8534119576215744, "num_tokens": 149283833.0, "step": 124140 }, { "entropy": 1.9487372577190398, "epoch": 0.3848541374919693, "grad_norm": 7.477880001068115, "learning_rate": 4.0779918044061244e-06, "loss": 0.5247, "mean_token_accuracy": 0.8423938781023026, "num_tokens": 149294571.0, "step": 124150 }, { "entropy": 1.8556806325912476, "epoch": 0.384885136617019, "grad_norm": 4.033416748046875, "learning_rate": 4.077827576524136e-06, "loss": 0.4794, "mean_token_accuracy": 0.84943907558918, "num_tokens": 149307020.0, "step": 124160 }, { "entropy": 1.877730706334114, "epoch": 0.3849161357420687, "grad_norm": 8.740376472473145, "learning_rate": 4.077663368481781e-06, "loss": 0.442, "mean_token_accuracy": 0.8621541142463685, "num_tokens": 149318207.0, "step": 124170 }, { "entropy": 1.9129869103431703, "epoch": 0.3849471348671184, "grad_norm": 8.78154468536377, "learning_rate": 4.07749918027507e-06, "loss": 0.4759, "mean_token_accuracy": 0.8474374935030937, "num_tokens": 149329638.0, "step": 124180 }, { "entropy": 1.8814658731222154, "epoch": 0.38497813399216807, "grad_norm": 8.555291175842285, "learning_rate": 4.077335011900008e-06, "loss": 0.4554, "mean_token_accuracy": 0.8533140555024147, "num_tokens": 149341341.0, "step": 124190 }, { "entropy": 1.8951398521661758, "epoch": 0.38500913311721774, "grad_norm": 7.886125564575195, "learning_rate": 4.077170863352603e-06, "loss": 0.5017, "mean_token_accuracy": 0.8445818021893501, "num_tokens": 149353637.0, "step": 124200 }, { "entropy": 1.8607033982872963, "epoch": 0.38504013224226746, "grad_norm": 6.102626323699951, "learning_rate": 4.0770067346288645e-06, "loss": 0.4808, "mean_token_accuracy": 0.8472836226224899, "num_tokens": 149365535.0, "step": 124210 }, { "entropy": 1.9797284454107285, "epoch": 0.38507113136731713, "grad_norm": 8.020270347595215, "learning_rate": 4.076842625724803e-06, "loss": 0.5767, "mean_token_accuracy": 0.8357729539275169, "num_tokens": 149376980.0, "step": 124220 }, { "entropy": 1.8790086820721625, "epoch": 0.38510213049236686, "grad_norm": 8.278203010559082, "learning_rate": 4.076678536636429e-06, "loss": 0.4368, "mean_token_accuracy": 0.8577861353754997, "num_tokens": 149388865.0, "step": 124230 }, { "entropy": 1.8905736938118936, "epoch": 0.3851331296174165, "grad_norm": 9.991707801818848, "learning_rate": 4.076514467359756e-06, "loss": 0.4826, "mean_token_accuracy": 0.857266665995121, "num_tokens": 149400675.0, "step": 124240 }, { "entropy": 1.9448306143283844, "epoch": 0.38516412874246625, "grad_norm": 8.666476249694824, "learning_rate": 4.076350417890796e-06, "loss": 0.4914, "mean_token_accuracy": 0.8469108313322067, "num_tokens": 149411540.0, "step": 124250 }, { "entropy": 1.8579439014196395, "epoch": 0.3851951278675159, "grad_norm": 4.379420280456543, "learning_rate": 4.076186388225566e-06, "loss": 0.4502, "mean_token_accuracy": 0.8480482071638107, "num_tokens": 149424172.0, "step": 124260 }, { "entropy": 1.7416525185108185, "epoch": 0.38522612699256564, "grad_norm": 4.337442874908447, "learning_rate": 4.076022378360081e-06, "loss": 0.3671, "mean_token_accuracy": 0.87484562844038, "num_tokens": 149438737.0, "step": 124270 }, { "entropy": 1.8429612591862679, "epoch": 0.3852571261176153, "grad_norm": 3.8294622898101807, "learning_rate": 4.075858388290356e-06, "loss": 0.4457, "mean_token_accuracy": 0.855830529332161, "num_tokens": 149451375.0, "step": 124280 }, { "entropy": 1.8703719988465308, "epoch": 0.38528812524266504, "grad_norm": 8.686786651611328, "learning_rate": 4.075694418012412e-06, "loss": 0.4051, "mean_token_accuracy": 0.8558905765414238, "num_tokens": 149463577.0, "step": 124290 }, { "entropy": 1.9531716659665108, "epoch": 0.3853191243677147, "grad_norm": 10.40597915649414, "learning_rate": 4.075530467522267e-06, "loss": 0.5363, "mean_token_accuracy": 0.8417243003845215, "num_tokens": 149474986.0, "step": 124300 }, { "entropy": 1.9056412920355796, "epoch": 0.38535012349276443, "grad_norm": 9.468035697937012, "learning_rate": 4.0753665368159415e-06, "loss": 0.4469, "mean_token_accuracy": 0.8571117267012596, "num_tokens": 149486586.0, "step": 124310 }, { "entropy": 1.8688831850886345, "epoch": 0.3853811226178141, "grad_norm": 7.645292282104492, "learning_rate": 4.0752026258894575e-06, "loss": 0.4171, "mean_token_accuracy": 0.8533623024821282, "num_tokens": 149499014.0, "step": 124320 }, { "entropy": 1.8921454787254333, "epoch": 0.3854121217428638, "grad_norm": 3.7551565170288086, "learning_rate": 4.0750387347388356e-06, "loss": 0.4491, "mean_token_accuracy": 0.8515683576464653, "num_tokens": 149511579.0, "step": 124330 }, { "entropy": 1.933222909271717, "epoch": 0.3854431208679135, "grad_norm": 9.027485847473145, "learning_rate": 4.074874863360102e-06, "loss": 0.4913, "mean_token_accuracy": 0.8418224409222603, "num_tokens": 149522694.0, "step": 124340 }, { "entropy": 1.88301939368248, "epoch": 0.3854741199929632, "grad_norm": 7.786672115325928, "learning_rate": 4.074711011749281e-06, "loss": 0.4449, "mean_token_accuracy": 0.8608944162726402, "num_tokens": 149534481.0, "step": 124350 }, { "entropy": 1.870416359603405, "epoch": 0.3855051191180129, "grad_norm": 7.477562427520752, "learning_rate": 4.0745471799023966e-06, "loss": 0.4254, "mean_token_accuracy": 0.8581738561391831, "num_tokens": 149546190.0, "step": 124360 }, { "entropy": 1.8137023776769639, "epoch": 0.3855361182430626, "grad_norm": 8.037935256958008, "learning_rate": 4.074383367815478e-06, "loss": 0.4451, "mean_token_accuracy": 0.855655774474144, "num_tokens": 149559214.0, "step": 124370 }, { "entropy": 1.890501284599304, "epoch": 0.3855671173681123, "grad_norm": 4.326991558074951, "learning_rate": 4.074219575484553e-06, "loss": 0.4686, "mean_token_accuracy": 0.8478201061487198, "num_tokens": 149570897.0, "step": 124380 }, { "entropy": 1.8549866631627083, "epoch": 0.385598116493162, "grad_norm": 9.21172046661377, "learning_rate": 4.07405580290565e-06, "loss": 0.5014, "mean_token_accuracy": 0.8404435843229294, "num_tokens": 149583130.0, "step": 124390 }, { "entropy": 1.7634894296526908, "epoch": 0.38562911561821167, "grad_norm": 2.975172996520996, "learning_rate": 4.073892050074802e-06, "loss": 0.3846, "mean_token_accuracy": 0.8556309074163437, "num_tokens": 149597170.0, "step": 124400 }, { "entropy": 1.9400777205824853, "epoch": 0.3856601147432614, "grad_norm": 4.058889389038086, "learning_rate": 4.073728316988036e-06, "loss": 0.4736, "mean_token_accuracy": 0.8479943916201591, "num_tokens": 149608439.0, "step": 124410 }, { "entropy": 1.9313759356737137, "epoch": 0.38569111386831106, "grad_norm": 9.538460731506348, "learning_rate": 4.073564603641389e-06, "loss": 0.5035, "mean_token_accuracy": 0.8419632911682129, "num_tokens": 149619530.0, "step": 124420 }, { "entropy": 1.8018861994147302, "epoch": 0.3857221129933608, "grad_norm": 3.8183743953704834, "learning_rate": 4.073400910030892e-06, "loss": 0.3968, "mean_token_accuracy": 0.8584095433354377, "num_tokens": 149632347.0, "step": 124430 }, { "entropy": 1.8204200729727744, "epoch": 0.38575311211841046, "grad_norm": 4.118322849273682, "learning_rate": 4.073237236152582e-06, "loss": 0.39, "mean_token_accuracy": 0.8663435325026512, "num_tokens": 149645135.0, "step": 124440 }, { "entropy": 1.8887554615736009, "epoch": 0.3857841112434601, "grad_norm": 2.7827911376953125, "learning_rate": 4.073073582002494e-06, "loss": 0.465, "mean_token_accuracy": 0.8540782734751702, "num_tokens": 149657595.0, "step": 124450 }, { "entropy": 1.8850025668740273, "epoch": 0.38581511036850985, "grad_norm": 8.43443489074707, "learning_rate": 4.0729099475766645e-06, "loss": 0.4813, "mean_token_accuracy": 0.8449831783771515, "num_tokens": 149669584.0, "step": 124460 }, { "entropy": 1.873554064333439, "epoch": 0.3858461094935595, "grad_norm": 7.510732173919678, "learning_rate": 4.072746332871133e-06, "loss": 0.4565, "mean_token_accuracy": 0.8536177009344101, "num_tokens": 149681728.0, "step": 124470 }, { "entropy": 1.9078126922249794, "epoch": 0.38587710861860924, "grad_norm": 7.718918800354004, "learning_rate": 4.072582737881938e-06, "loss": 0.4752, "mean_token_accuracy": 0.8453854739665985, "num_tokens": 149693031.0, "step": 124480 }, { "entropy": 1.8811730653047563, "epoch": 0.3859081077436589, "grad_norm": 7.272776126861572, "learning_rate": 4.0724191626051195e-06, "loss": 0.4922, "mean_token_accuracy": 0.8460710808634758, "num_tokens": 149704960.0, "step": 124490 }, { "entropy": 1.930656510591507, "epoch": 0.38593910686870864, "grad_norm": 7.962609767913818, "learning_rate": 4.07225560703672e-06, "loss": 0.5241, "mean_token_accuracy": 0.8429863288998604, "num_tokens": 149716392.0, "step": 124500 }, { "entropy": 1.9329011127352715, "epoch": 0.3859701059937583, "grad_norm": 9.818818092346191, "learning_rate": 4.072092071172782e-06, "loss": 0.5077, "mean_token_accuracy": 0.8381221756339073, "num_tokens": 149727625.0, "step": 124510 }, { "entropy": 1.841926720738411, "epoch": 0.38600110511880803, "grad_norm": 9.0508394241333, "learning_rate": 4.071928555009349e-06, "loss": 0.4744, "mean_token_accuracy": 0.8424027681350708, "num_tokens": 149740643.0, "step": 124520 }, { "entropy": 1.8008276581764222, "epoch": 0.3860321042438577, "grad_norm": 7.965822219848633, "learning_rate": 4.071765058542466e-06, "loss": 0.4137, "mean_token_accuracy": 0.8529940769076347, "num_tokens": 149753969.0, "step": 124530 }, { "entropy": 1.9138699561357497, "epoch": 0.3860631033689074, "grad_norm": 4.05026388168335, "learning_rate": 4.07160158176818e-06, "loss": 0.4546, "mean_token_accuracy": 0.8561643600463867, "num_tokens": 149765733.0, "step": 124540 }, { "entropy": 1.9274681612849236, "epoch": 0.3860941024939571, "grad_norm": 8.39354133605957, "learning_rate": 4.071438124682538e-06, "loss": 0.4649, "mean_token_accuracy": 0.8445206269621849, "num_tokens": 149777474.0, "step": 124550 }, { "entropy": 1.8434103056788445, "epoch": 0.3861251016190068, "grad_norm": 3.958385944366455, "learning_rate": 4.071274687281586e-06, "loss": 0.4663, "mean_token_accuracy": 0.8522407054901123, "num_tokens": 149789629.0, "step": 124560 }, { "entropy": 1.9501070573925972, "epoch": 0.3861561007440565, "grad_norm": 3.8399457931518555, "learning_rate": 4.071111269561375e-06, "loss": 0.4496, "mean_token_accuracy": 0.8507745549082756, "num_tokens": 149800781.0, "step": 124570 }, { "entropy": 1.8660836547613144, "epoch": 0.3861870998691062, "grad_norm": 3.3907036781311035, "learning_rate": 4.0709478715179555e-06, "loss": 0.441, "mean_token_accuracy": 0.852207650244236, "num_tokens": 149813258.0, "step": 124580 }, { "entropy": 1.9248910203576088, "epoch": 0.3862180989941559, "grad_norm": 3.793597459793091, "learning_rate": 4.070784493147379e-06, "loss": 0.4541, "mean_token_accuracy": 0.8523493394255638, "num_tokens": 149824551.0, "step": 124590 }, { "entropy": 1.913597546517849, "epoch": 0.3862490981192056, "grad_norm": 8.392363548278809, "learning_rate": 4.0706211344457e-06, "loss": 0.506, "mean_token_accuracy": 0.846868097782135, "num_tokens": 149836161.0, "step": 124600 }, { "entropy": 1.8576731622219085, "epoch": 0.38628009724425527, "grad_norm": 8.112271308898926, "learning_rate": 4.070457795408968e-06, "loss": 0.4333, "mean_token_accuracy": 0.8608527705073357, "num_tokens": 149848097.0, "step": 124610 }, { "entropy": 1.8224573642015458, "epoch": 0.386311096369305, "grad_norm": 3.8690743446350098, "learning_rate": 4.0702944760332415e-06, "loss": 0.4221, "mean_token_accuracy": 0.8590767070651054, "num_tokens": 149860778.0, "step": 124620 }, { "entropy": 1.8621004924178124, "epoch": 0.38634209549435466, "grad_norm": 8.412699699401855, "learning_rate": 4.070131176314576e-06, "loss": 0.4381, "mean_token_accuracy": 0.8528893545269967, "num_tokens": 149872659.0, "step": 124630 }, { "entropy": 1.8653160884976387, "epoch": 0.3863730946194044, "grad_norm": 4.710209846496582, "learning_rate": 4.069967896249026e-06, "loss": 0.4242, "mean_token_accuracy": 0.8521018907427788, "num_tokens": 149885265.0, "step": 124640 }, { "entropy": 1.8452029392123221, "epoch": 0.38640409374445406, "grad_norm": 7.326101779937744, "learning_rate": 4.069804635832654e-06, "loss": 0.4809, "mean_token_accuracy": 0.8424705550074577, "num_tokens": 149897692.0, "step": 124650 }, { "entropy": 1.8934701785445214, "epoch": 0.3864350928695038, "grad_norm": 9.25935173034668, "learning_rate": 4.069641395061516e-06, "loss": 0.515, "mean_token_accuracy": 0.8364736765623093, "num_tokens": 149910077.0, "step": 124660 }, { "entropy": 1.900476099550724, "epoch": 0.38646609199455345, "grad_norm": 11.954851150512695, "learning_rate": 4.069478173931674e-06, "loss": 0.4723, "mean_token_accuracy": 0.8435856163501739, "num_tokens": 149921585.0, "step": 124670 }, { "entropy": 1.9049529254436492, "epoch": 0.3864970911196032, "grad_norm": 8.252809524536133, "learning_rate": 4.069314972439188e-06, "loss": 0.4684, "mean_token_accuracy": 0.8509178534150124, "num_tokens": 149933471.0, "step": 124680 }, { "entropy": 1.914153940975666, "epoch": 0.38652809024465284, "grad_norm": 8.16724681854248, "learning_rate": 4.0691517905801225e-06, "loss": 0.4896, "mean_token_accuracy": 0.8479727208614349, "num_tokens": 149944962.0, "step": 124690 }, { "entropy": 1.9179544657468797, "epoch": 0.3865590893697025, "grad_norm": 8.549003601074219, "learning_rate": 4.0689886283505405e-06, "loss": 0.4904, "mean_token_accuracy": 0.8387503355741501, "num_tokens": 149956453.0, "step": 124700 }, { "entropy": 1.944225938618183, "epoch": 0.38659008849475224, "grad_norm": 8.267438888549805, "learning_rate": 4.068825485746507e-06, "loss": 0.4917, "mean_token_accuracy": 0.8414188399910927, "num_tokens": 149968058.0, "step": 124710 }, { "entropy": 1.8530795663595199, "epoch": 0.3866210876198019, "grad_norm": 3.6572959423065186, "learning_rate": 4.068662362764087e-06, "loss": 0.4355, "mean_token_accuracy": 0.8553397417068481, "num_tokens": 149980786.0, "step": 124720 }, { "entropy": 1.8606258913874627, "epoch": 0.38665208674485163, "grad_norm": 9.493535995483398, "learning_rate": 4.06849925939935e-06, "loss": 0.4338, "mean_token_accuracy": 0.8513687327504158, "num_tokens": 149994145.0, "step": 124730 }, { "entropy": 1.9374653339385985, "epoch": 0.3866830858699013, "grad_norm": 8.496316909790039, "learning_rate": 4.0683361756483615e-06, "loss": 0.514, "mean_token_accuracy": 0.8495217755436897, "num_tokens": 150005113.0, "step": 124740 }, { "entropy": 1.8425881132483481, "epoch": 0.386714084994951, "grad_norm": 8.17953872680664, "learning_rate": 4.068173111507192e-06, "loss": 0.4268, "mean_token_accuracy": 0.8537844985723495, "num_tokens": 150018495.0, "step": 124750 }, { "entropy": 1.8482965901494026, "epoch": 0.3867450841200007, "grad_norm": 5.024632453918457, "learning_rate": 4.068010066971912e-06, "loss": 0.4146, "mean_token_accuracy": 0.8522414952516556, "num_tokens": 150031464.0, "step": 124760 }, { "entropy": 1.934024366736412, "epoch": 0.3867760832450504, "grad_norm": 8.86674976348877, "learning_rate": 4.067847042038591e-06, "loss": 0.4842, "mean_token_accuracy": 0.84689302444458, "num_tokens": 150043092.0, "step": 124770 }, { "entropy": 1.871715322136879, "epoch": 0.3868070823701001, "grad_norm": 3.58791184425354, "learning_rate": 4.067684036703305e-06, "loss": 0.4503, "mean_token_accuracy": 0.8503245860338211, "num_tokens": 150055771.0, "step": 124780 }, { "entropy": 1.833419594168663, "epoch": 0.3868380814951498, "grad_norm": 7.293220043182373, "learning_rate": 4.067521050962126e-06, "loss": 0.4072, "mean_token_accuracy": 0.8605907499790192, "num_tokens": 150068131.0, "step": 124790 }, { "entropy": 1.8409696131944657, "epoch": 0.3868690806201995, "grad_norm": 7.163329124450684, "learning_rate": 4.067358084811128e-06, "loss": 0.4406, "mean_token_accuracy": 0.853056974709034, "num_tokens": 150080793.0, "step": 124800 }, { "entropy": 1.8847892254590988, "epoch": 0.3869000797452492, "grad_norm": 8.353142738342285, "learning_rate": 4.067195138246388e-06, "loss": 0.4377, "mean_token_accuracy": 0.8540623351931572, "num_tokens": 150093526.0, "step": 124810 }, { "entropy": 2.0080907315015795, "epoch": 0.3869310788702989, "grad_norm": 8.072431564331055, "learning_rate": 4.067032211263983e-06, "loss": 0.5497, "mean_token_accuracy": 0.8375670999288559, "num_tokens": 150104134.0, "step": 124820 }, { "entropy": 1.8393555164337159, "epoch": 0.3869620779953486, "grad_norm": 2.448547840118408, "learning_rate": 4.06686930385999e-06, "loss": 0.431, "mean_token_accuracy": 0.8578300714492798, "num_tokens": 150116750.0, "step": 124830 }, { "entropy": 1.8307070322334766, "epoch": 0.38699307712039827, "grad_norm": 3.879143476486206, "learning_rate": 4.06670641603049e-06, "loss": 0.4256, "mean_token_accuracy": 0.8474024429917335, "num_tokens": 150130258.0, "step": 124840 }, { "entropy": 1.9666341543197632, "epoch": 0.387024076245448, "grad_norm": 7.913427352905273, "learning_rate": 4.066543547771561e-06, "loss": 0.5647, "mean_token_accuracy": 0.8228250458836556, "num_tokens": 150141397.0, "step": 124850 }, { "entropy": 1.9169279381632804, "epoch": 0.38705507537049766, "grad_norm": 7.764703750610352, "learning_rate": 4.066380699079287e-06, "loss": 0.4324, "mean_token_accuracy": 0.8474949941039085, "num_tokens": 150153315.0, "step": 124860 }, { "entropy": 1.9108419820666314, "epoch": 0.3870860744955474, "grad_norm": 8.689712524414062, "learning_rate": 4.066217869949748e-06, "loss": 0.4448, "mean_token_accuracy": 0.8523913621902466, "num_tokens": 150165774.0, "step": 124870 }, { "entropy": 1.9573454082012176, "epoch": 0.38711707362059705, "grad_norm": 7.767167091369629, "learning_rate": 4.0660550603790286e-06, "loss": 0.4841, "mean_token_accuracy": 0.8440973252058029, "num_tokens": 150177549.0, "step": 124880 }, { "entropy": 1.8977736786007882, "epoch": 0.3871480727456468, "grad_norm": 4.0380425453186035, "learning_rate": 4.065892270363214e-06, "loss": 0.4515, "mean_token_accuracy": 0.8582114532589913, "num_tokens": 150189097.0, "step": 124890 }, { "entropy": 1.832365168631077, "epoch": 0.38717907187069645, "grad_norm": 4.158474922180176, "learning_rate": 4.06572949989839e-06, "loss": 0.4398, "mean_token_accuracy": 0.8578600034117698, "num_tokens": 150202332.0, "step": 124900 }, { "entropy": 1.9247950717806817, "epoch": 0.38721007099574617, "grad_norm": 8.511116027832031, "learning_rate": 4.065566748980642e-06, "loss": 0.4413, "mean_token_accuracy": 0.8546933025121689, "num_tokens": 150213722.0, "step": 124910 }, { "entropy": 1.8911121636629105, "epoch": 0.38724107012079584, "grad_norm": 7.204141616821289, "learning_rate": 4.065404017606059e-06, "loss": 0.4546, "mean_token_accuracy": 0.8537615105509758, "num_tokens": 150225589.0, "step": 124920 }, { "entropy": 1.8968706488609315, "epoch": 0.38727206924584556, "grad_norm": 7.295899868011475, "learning_rate": 4.06524130577073e-06, "loss": 0.4531, "mean_token_accuracy": 0.8530194133520126, "num_tokens": 150236807.0, "step": 124930 }, { "entropy": 1.8373442202806474, "epoch": 0.38730306837089523, "grad_norm": 8.396409034729004, "learning_rate": 4.065078613470747e-06, "loss": 0.3856, "mean_token_accuracy": 0.8561060458421708, "num_tokens": 150249132.0, "step": 124940 }, { "entropy": 1.834753280878067, "epoch": 0.3873340674959449, "grad_norm": 8.531126022338867, "learning_rate": 4.0649159407021976e-06, "loss": 0.4259, "mean_token_accuracy": 0.8525745943188667, "num_tokens": 150261614.0, "step": 124950 }, { "entropy": 1.9869435787200929, "epoch": 0.3873650666209946, "grad_norm": 8.008123397827148, "learning_rate": 4.064753287461177e-06, "loss": 0.4939, "mean_token_accuracy": 0.8471731215715408, "num_tokens": 150272837.0, "step": 124960 }, { "entropy": 1.9212356433272362, "epoch": 0.3873960657460443, "grad_norm": 9.139792442321777, "learning_rate": 4.064590653743777e-06, "loss": 0.4965, "mean_token_accuracy": 0.8533764079213142, "num_tokens": 150284178.0, "step": 124970 }, { "entropy": 1.9715091735124588, "epoch": 0.387427064871094, "grad_norm": 7.504796504974365, "learning_rate": 4.0644280395460925e-06, "loss": 0.5083, "mean_token_accuracy": 0.8431293860077858, "num_tokens": 150295307.0, "step": 124980 }, { "entropy": 1.9665077716112136, "epoch": 0.3874580639961437, "grad_norm": 7.719574928283691, "learning_rate": 4.064265444864221e-06, "loss": 0.5193, "mean_token_accuracy": 0.8392296627163887, "num_tokens": 150306726.0, "step": 124990 }, { "entropy": 1.8853733599185944, "epoch": 0.3874890631211934, "grad_norm": 7.892232894897461, "learning_rate": 4.064102869694256e-06, "loss": 0.494, "mean_token_accuracy": 0.84963990598917, "num_tokens": 150318729.0, "step": 125000 }, { "entropy": 1.8874075815081597, "epoch": 0.3875200622462431, "grad_norm": 8.320926666259766, "learning_rate": 4.063940314032298e-06, "loss": 0.4802, "mean_token_accuracy": 0.8477603197097778, "num_tokens": 150329931.0, "step": 125010 }, { "entropy": 1.9008696138858796, "epoch": 0.3875510613712928, "grad_norm": 8.944568634033203, "learning_rate": 4.063777777874444e-06, "loss": 0.4467, "mean_token_accuracy": 0.8521299719810486, "num_tokens": 150341393.0, "step": 125020 }, { "entropy": 1.9297545284032822, "epoch": 0.3875820604963425, "grad_norm": 8.812226295471191, "learning_rate": 4.063615261216795e-06, "loss": 0.4696, "mean_token_accuracy": 0.8559518858790398, "num_tokens": 150352453.0, "step": 125030 }, { "entropy": 1.9456378057599069, "epoch": 0.3876130596213922, "grad_norm": 7.481411933898926, "learning_rate": 4.063452764055453e-06, "loss": 0.4765, "mean_token_accuracy": 0.8542866796255112, "num_tokens": 150363663.0, "step": 125040 }, { "entropy": 1.843149345368147, "epoch": 0.38764405874644187, "grad_norm": 7.64381742477417, "learning_rate": 4.063290286386518e-06, "loss": 0.4247, "mean_token_accuracy": 0.8573990285396575, "num_tokens": 150376498.0, "step": 125050 }, { "entropy": 1.926984567940235, "epoch": 0.3876750578714916, "grad_norm": 7.688129901885986, "learning_rate": 4.063127828206096e-06, "loss": 0.4657, "mean_token_accuracy": 0.8490166947245598, "num_tokens": 150387561.0, "step": 125060 }, { "entropy": 1.8717382565140723, "epoch": 0.38770605699654126, "grad_norm": 3.5536370277404785, "learning_rate": 4.062965389510289e-06, "loss": 0.4053, "mean_token_accuracy": 0.8601777657866478, "num_tokens": 150399917.0, "step": 125070 }, { "entropy": 1.8823929965496062, "epoch": 0.387737056121591, "grad_norm": 4.017459869384766, "learning_rate": 4.062802970295203e-06, "loss": 0.3831, "mean_token_accuracy": 0.8708006635308265, "num_tokens": 150412228.0, "step": 125080 }, { "entropy": 1.9448561638593673, "epoch": 0.38776805524664065, "grad_norm": 9.202611923217773, "learning_rate": 4.062640570556946e-06, "loss": 0.4823, "mean_token_accuracy": 0.842719966173172, "num_tokens": 150423234.0, "step": 125090 }, { "entropy": 1.9790111839771272, "epoch": 0.3877990543716904, "grad_norm": 8.097685813903809, "learning_rate": 4.062478190291623e-06, "loss": 0.5268, "mean_token_accuracy": 0.8447407379746437, "num_tokens": 150434200.0, "step": 125100 }, { "entropy": 1.9440268695354461, "epoch": 0.38783005349674005, "grad_norm": 6.974407196044922, "learning_rate": 4.062315829495345e-06, "loss": 0.5078, "mean_token_accuracy": 0.8498797222971917, "num_tokens": 150446535.0, "step": 125110 }, { "entropy": 1.7851212821900844, "epoch": 0.38786105262178977, "grad_norm": 3.838473320007324, "learning_rate": 4.062153488164221e-06, "loss": 0.4113, "mean_token_accuracy": 0.8568575367331505, "num_tokens": 150459866.0, "step": 125120 }, { "entropy": 1.8512066915631293, "epoch": 0.38789205174683944, "grad_norm": 8.369192123413086, "learning_rate": 4.0619911662943615e-06, "loss": 0.4093, "mean_token_accuracy": 0.8748730286955834, "num_tokens": 150471449.0, "step": 125130 }, { "entropy": 1.9556255728006362, "epoch": 0.38792305087188916, "grad_norm": 8.41415023803711, "learning_rate": 4.06182886388188e-06, "loss": 0.5092, "mean_token_accuracy": 0.8445400312542916, "num_tokens": 150482247.0, "step": 125140 }, { "entropy": 1.9603990465402603, "epoch": 0.38795404999693883, "grad_norm": 11.196761131286621, "learning_rate": 4.061666580922887e-06, "loss": 0.5023, "mean_token_accuracy": 0.842097707092762, "num_tokens": 150492674.0, "step": 125150 }, { "entropy": 1.9363124072551727, "epoch": 0.38798504912198856, "grad_norm": 7.762144565582275, "learning_rate": 4.0615043174135e-06, "loss": 0.4877, "mean_token_accuracy": 0.8462110877037048, "num_tokens": 150503435.0, "step": 125160 }, { "entropy": 1.8757888361811639, "epoch": 0.3880160482470382, "grad_norm": 7.726643085479736, "learning_rate": 4.061342073349831e-06, "loss": 0.428, "mean_token_accuracy": 0.8505197063088417, "num_tokens": 150515945.0, "step": 125170 }, { "entropy": 1.903496977686882, "epoch": 0.38804704737208795, "grad_norm": 8.79697036743164, "learning_rate": 4.061179848727998e-06, "loss": 0.4493, "mean_token_accuracy": 0.8483191832900048, "num_tokens": 150527375.0, "step": 125180 }, { "entropy": 1.8405410438776015, "epoch": 0.3880780464971376, "grad_norm": 7.9003586769104, "learning_rate": 4.061017643544118e-06, "loss": 0.4483, "mean_token_accuracy": 0.854458573460579, "num_tokens": 150540114.0, "step": 125190 }, { "entropy": 1.94074095338583, "epoch": 0.3881090456221873, "grad_norm": 7.372513294219971, "learning_rate": 4.06085545779431e-06, "loss": 0.4543, "mean_token_accuracy": 0.8478065714240074, "num_tokens": 150551950.0, "step": 125200 }, { "entropy": 1.9185340121388434, "epoch": 0.388140044747237, "grad_norm": 8.795543670654297, "learning_rate": 4.060693291474694e-06, "loss": 0.4689, "mean_token_accuracy": 0.8613968342542648, "num_tokens": 150563662.0, "step": 125210 }, { "entropy": 1.780056294798851, "epoch": 0.3881710438722867, "grad_norm": 6.95949125289917, "learning_rate": 4.06053114458139e-06, "loss": 0.3812, "mean_token_accuracy": 0.8654720067977906, "num_tokens": 150576950.0, "step": 125220 }, { "entropy": 1.856193946301937, "epoch": 0.3882020429973364, "grad_norm": 6.373475074768066, "learning_rate": 4.060369017110518e-06, "loss": 0.4502, "mean_token_accuracy": 0.854848501086235, "num_tokens": 150589326.0, "step": 125230 }, { "entropy": 1.8003525391221047, "epoch": 0.3882330421223861, "grad_norm": 10.690299034118652, "learning_rate": 4.060206909058204e-06, "loss": 0.4474, "mean_token_accuracy": 0.8460852265357971, "num_tokens": 150602928.0, "step": 125240 }, { "entropy": 1.7654723808169366, "epoch": 0.3882640412474358, "grad_norm": 3.999009132385254, "learning_rate": 4.060044820420571e-06, "loss": 0.3632, "mean_token_accuracy": 0.8616338670253754, "num_tokens": 150616871.0, "step": 125250 }, { "entropy": 1.8826753467321395, "epoch": 0.38829504037248547, "grad_norm": 8.977141380310059, "learning_rate": 4.059882751193742e-06, "loss": 0.466, "mean_token_accuracy": 0.8482603415846824, "num_tokens": 150628654.0, "step": 125260 }, { "entropy": 1.904711978137493, "epoch": 0.3883260394975352, "grad_norm": 7.154592990875244, "learning_rate": 4.059720701373846e-06, "loss": 0.4669, "mean_token_accuracy": 0.8563105076551437, "num_tokens": 150639652.0, "step": 125270 }, { "entropy": 1.9130603343248367, "epoch": 0.38835703862258486, "grad_norm": 7.394834518432617, "learning_rate": 4.059558670957009e-06, "loss": 0.5137, "mean_token_accuracy": 0.8381910189986229, "num_tokens": 150651907.0, "step": 125280 }, { "entropy": 1.8639254540205001, "epoch": 0.3883880377476346, "grad_norm": 7.72260856628418, "learning_rate": 4.059396659939359e-06, "loss": 0.4821, "mean_token_accuracy": 0.8464089751243591, "num_tokens": 150664885.0, "step": 125290 }, { "entropy": 1.8498515799641608, "epoch": 0.38841903687268425, "grad_norm": 4.0547871589660645, "learning_rate": 4.059234668317025e-06, "loss": 0.4409, "mean_token_accuracy": 0.8645502269268036, "num_tokens": 150677100.0, "step": 125300 }, { "entropy": 1.844531959295273, "epoch": 0.388450035997734, "grad_norm": 8.105052947998047, "learning_rate": 4.059072696086137e-06, "loss": 0.4685, "mean_token_accuracy": 0.8442863315343857, "num_tokens": 150689825.0, "step": 125310 }, { "entropy": 1.894164504110813, "epoch": 0.38848103512278365, "grad_norm": 7.457192897796631, "learning_rate": 4.058910743242828e-06, "loss": 0.4526, "mean_token_accuracy": 0.8596911758184433, "num_tokens": 150701538.0, "step": 125320 }, { "entropy": 1.7797346115112305, "epoch": 0.38851203424783337, "grad_norm": 8.076257705688477, "learning_rate": 4.05874880978323e-06, "loss": 0.3842, "mean_token_accuracy": 0.8683399274945259, "num_tokens": 150713771.0, "step": 125330 }, { "entropy": 1.8412994965910912, "epoch": 0.38854303337288304, "grad_norm": 8.09342098236084, "learning_rate": 4.058586895703477e-06, "loss": 0.4193, "mean_token_accuracy": 0.8563569858670235, "num_tokens": 150726360.0, "step": 125340 }, { "entropy": 1.8202768236398696, "epoch": 0.38857403249793276, "grad_norm": 8.33877182006836, "learning_rate": 4.058425000999703e-06, "loss": 0.3963, "mean_token_accuracy": 0.8674609377980232, "num_tokens": 150738959.0, "step": 125350 }, { "entropy": 1.8868903383612632, "epoch": 0.38860503162298243, "grad_norm": 6.489454746246338, "learning_rate": 4.058263125668045e-06, "loss": 0.4395, "mean_token_accuracy": 0.8557512402534485, "num_tokens": 150750974.0, "step": 125360 }, { "entropy": 1.8816938310861588, "epoch": 0.38863603074803216, "grad_norm": 3.8871240615844727, "learning_rate": 4.058101269704639e-06, "loss": 0.5132, "mean_token_accuracy": 0.8465664908289909, "num_tokens": 150763017.0, "step": 125370 }, { "entropy": 1.967480507493019, "epoch": 0.3886670298730818, "grad_norm": 8.47203254699707, "learning_rate": 4.0579394331056216e-06, "loss": 0.5202, "mean_token_accuracy": 0.8347558289766311, "num_tokens": 150773877.0, "step": 125380 }, { "entropy": 1.846261352300644, "epoch": 0.38869802899813155, "grad_norm": 8.301435470581055, "learning_rate": 4.057777615867134e-06, "loss": 0.4412, "mean_token_accuracy": 0.8551396802067757, "num_tokens": 150786304.0, "step": 125390 }, { "entropy": 1.8747814357280732, "epoch": 0.3887290281231812, "grad_norm": 4.218428134918213, "learning_rate": 4.057615817985316e-06, "loss": 0.4316, "mean_token_accuracy": 0.8523815110325813, "num_tokens": 150797931.0, "step": 125400 }, { "entropy": 1.90915547311306, "epoch": 0.38876002724823094, "grad_norm": 4.607741355895996, "learning_rate": 4.057454039456308e-06, "loss": 0.4806, "mean_token_accuracy": 0.8459027215838433, "num_tokens": 150809552.0, "step": 125410 }, { "entropy": 1.9121482491493225, "epoch": 0.3887910263732806, "grad_norm": 3.7354090213775635, "learning_rate": 4.057292280276254e-06, "loss": 0.4652, "mean_token_accuracy": 0.8472837626934051, "num_tokens": 150820587.0, "step": 125420 }, { "entropy": 1.8476616084575652, "epoch": 0.38882202549833034, "grad_norm": 8.451462745666504, "learning_rate": 4.057130540441295e-06, "loss": 0.4871, "mean_token_accuracy": 0.8486483857035637, "num_tokens": 150832835.0, "step": 125430 }, { "entropy": 1.8385225757956505, "epoch": 0.38885302462338, "grad_norm": 10.56385612487793, "learning_rate": 4.0569688199475765e-06, "loss": 0.4569, "mean_token_accuracy": 0.8505441263318062, "num_tokens": 150845471.0, "step": 125440 }, { "entropy": 1.8741918548941612, "epoch": 0.3888840237484297, "grad_norm": 7.1380085945129395, "learning_rate": 4.056807118791245e-06, "loss": 0.4526, "mean_token_accuracy": 0.8532653734087944, "num_tokens": 150857775.0, "step": 125450 }, { "entropy": 1.8643349304795265, "epoch": 0.3889150228734794, "grad_norm": 7.431187152862549, "learning_rate": 4.056645436968446e-06, "loss": 0.4259, "mean_token_accuracy": 0.8581862285733223, "num_tokens": 150869444.0, "step": 125460 }, { "entropy": 1.892384371161461, "epoch": 0.38894602199852907, "grad_norm": 5.618492603302002, "learning_rate": 4.056483774475327e-06, "loss": 0.4758, "mean_token_accuracy": 0.8496564209461213, "num_tokens": 150881306.0, "step": 125470 }, { "entropy": 1.9383496299386025, "epoch": 0.3889770211235788, "grad_norm": 9.300127029418945, "learning_rate": 4.0563221313080375e-06, "loss": 0.482, "mean_token_accuracy": 0.8460114181041718, "num_tokens": 150892540.0, "step": 125480 }, { "entropy": 1.969851815700531, "epoch": 0.38900802024862846, "grad_norm": 10.012271881103516, "learning_rate": 4.056160507462727e-06, "loss": 0.5231, "mean_token_accuracy": 0.8429014101624489, "num_tokens": 150902915.0, "step": 125490 }, { "entropy": 1.801128013432026, "epoch": 0.3890390193736782, "grad_norm": 7.085505962371826, "learning_rate": 4.055998902935546e-06, "loss": 0.441, "mean_token_accuracy": 0.8595685541629792, "num_tokens": 150915697.0, "step": 125500 }, { "entropy": 1.899875347316265, "epoch": 0.38907001849872785, "grad_norm": 7.814671993255615, "learning_rate": 4.055837317722647e-06, "loss": 0.4748, "mean_token_accuracy": 0.8444872334599495, "num_tokens": 150927597.0, "step": 125510 }, { "entropy": 1.9567907482385636, "epoch": 0.3891010176237776, "grad_norm": 8.426041603088379, "learning_rate": 4.055675751820183e-06, "loss": 0.5164, "mean_token_accuracy": 0.8452389568090439, "num_tokens": 150938164.0, "step": 125520 }, { "entropy": 1.9653800070285796, "epoch": 0.38913201674882725, "grad_norm": 8.481525421142578, "learning_rate": 4.055514205224307e-06, "loss": 0.544, "mean_token_accuracy": 0.8337672173976898, "num_tokens": 150948930.0, "step": 125530 }, { "entropy": 1.8959671720862388, "epoch": 0.38916301587387697, "grad_norm": 3.948054075241089, "learning_rate": 4.055352677931176e-06, "loss": 0.4379, "mean_token_accuracy": 0.8531327813863754, "num_tokens": 150960961.0, "step": 125540 }, { "entropy": 1.934750607609749, "epoch": 0.38919401499892664, "grad_norm": 10.166702270507812, "learning_rate": 4.055191169936945e-06, "loss": 0.5107, "mean_token_accuracy": 0.8438746586441994, "num_tokens": 150971612.0, "step": 125550 }, { "entropy": 1.9563965529203415, "epoch": 0.38922501412397636, "grad_norm": 9.370492935180664, "learning_rate": 4.055029681237772e-06, "loss": 0.5295, "mean_token_accuracy": 0.8403390854597091, "num_tokens": 150982858.0, "step": 125560 }, { "entropy": 1.8438422948122024, "epoch": 0.38925601324902603, "grad_norm": 7.086329460144043, "learning_rate": 4.054868211829815e-06, "loss": 0.3796, "mean_token_accuracy": 0.8686971604824066, "num_tokens": 150994866.0, "step": 125570 }, { "entropy": 1.858051958680153, "epoch": 0.38928701237407576, "grad_norm": 8.350471496582031, "learning_rate": 4.054706761709233e-06, "loss": 0.4018, "mean_token_accuracy": 0.8600917294621467, "num_tokens": 151007774.0, "step": 125580 }, { "entropy": 1.8484682857990264, "epoch": 0.3893180114991254, "grad_norm": 8.876520156860352, "learning_rate": 4.054545330872188e-06, "loss": 0.4343, "mean_token_accuracy": 0.853037391602993, "num_tokens": 151020015.0, "step": 125590 }, { "entropy": 1.8667544916272163, "epoch": 0.38934901062417515, "grad_norm": 8.567416191101074, "learning_rate": 4.054383919314841e-06, "loss": 0.4612, "mean_token_accuracy": 0.8439093992114067, "num_tokens": 151033160.0, "step": 125600 }, { "entropy": 1.825586286187172, "epoch": 0.3893800097492248, "grad_norm": 6.282723426818848, "learning_rate": 4.054222527033354e-06, "loss": 0.4506, "mean_token_accuracy": 0.8462097451090813, "num_tokens": 151046530.0, "step": 125610 }, { "entropy": 1.9695552736520767, "epoch": 0.38941100887427454, "grad_norm": 11.352642059326172, "learning_rate": 4.054061154023891e-06, "loss": 0.5256, "mean_token_accuracy": 0.8349577113986015, "num_tokens": 151057681.0, "step": 125620 }, { "entropy": 1.8230530142784118, "epoch": 0.3894420079993242, "grad_norm": 3.0121681690216064, "learning_rate": 4.053899800282617e-06, "loss": 0.3696, "mean_token_accuracy": 0.8639827251434327, "num_tokens": 151070490.0, "step": 125630 }, { "entropy": 1.9242575734853744, "epoch": 0.38947300712437394, "grad_norm": 7.926018714904785, "learning_rate": 4.053738465805698e-06, "loss": 0.4306, "mean_token_accuracy": 0.8579060733318329, "num_tokens": 151083191.0, "step": 125640 }, { "entropy": 1.8758929014205932, "epoch": 0.3895040062494236, "grad_norm": 4.752248287200928, "learning_rate": 4.0535771505893e-06, "loss": 0.4575, "mean_token_accuracy": 0.8485000059008598, "num_tokens": 151096082.0, "step": 125650 }, { "entropy": 1.9501382157206535, "epoch": 0.38953500537447333, "grad_norm": 4.228458881378174, "learning_rate": 4.053415854629593e-06, "loss": 0.4896, "mean_token_accuracy": 0.8403441533446312, "num_tokens": 151107333.0, "step": 125660 }, { "entropy": 1.985108458995819, "epoch": 0.389566004499523, "grad_norm": 9.141436576843262, "learning_rate": 4.053254577922745e-06, "loss": 0.5549, "mean_token_accuracy": 0.8309772089123726, "num_tokens": 151118366.0, "step": 125670 }, { "entropy": 1.9115714818239211, "epoch": 0.3895970036245727, "grad_norm": 3.911010980606079, "learning_rate": 4.053093320464925e-06, "loss": 0.4499, "mean_token_accuracy": 0.8501477718353272, "num_tokens": 151130118.0, "step": 125680 }, { "entropy": 1.976603901386261, "epoch": 0.3896280027496224, "grad_norm": 8.365656852722168, "learning_rate": 4.0529320822523064e-06, "loss": 0.4979, "mean_token_accuracy": 0.8426365479826927, "num_tokens": 151140975.0, "step": 125690 }, { "entropy": 1.900614893436432, "epoch": 0.38965900187467206, "grad_norm": 8.18988037109375, "learning_rate": 4.05277086328106e-06, "loss": 0.4591, "mean_token_accuracy": 0.8563957184553146, "num_tokens": 151152889.0, "step": 125700 }, { "entropy": 1.9433655887842178, "epoch": 0.3896900009997218, "grad_norm": 7.366755962371826, "learning_rate": 4.05260966354736e-06, "loss": 0.4645, "mean_token_accuracy": 0.855003735423088, "num_tokens": 151164778.0, "step": 125710 }, { "entropy": 1.882521539926529, "epoch": 0.38972100012477146, "grad_norm": 7.148138523101807, "learning_rate": 4.05244848304738e-06, "loss": 0.4767, "mean_token_accuracy": 0.8558038398623466, "num_tokens": 151176942.0, "step": 125720 }, { "entropy": 1.8969984114170075, "epoch": 0.3897519992498212, "grad_norm": 6.991118907928467, "learning_rate": 4.052287321777295e-06, "loss": 0.4465, "mean_token_accuracy": 0.8517007067799568, "num_tokens": 151188748.0, "step": 125730 }, { "entropy": 1.8719248160719872, "epoch": 0.38978299837487085, "grad_norm": 8.159533500671387, "learning_rate": 4.052126179733283e-06, "loss": 0.4746, "mean_token_accuracy": 0.8400168597698212, "num_tokens": 151201409.0, "step": 125740 }, { "entropy": 1.8552237376570702, "epoch": 0.3898139974999206, "grad_norm": 4.326776504516602, "learning_rate": 4.051965056911522e-06, "loss": 0.4472, "mean_token_accuracy": 0.8551932379603386, "num_tokens": 151213645.0, "step": 125750 }, { "entropy": 1.88699069917202, "epoch": 0.38984499662497024, "grad_norm": 7.64647102355957, "learning_rate": 4.051803953308188e-06, "loss": 0.4361, "mean_token_accuracy": 0.8565610900521279, "num_tokens": 151225668.0, "step": 125760 }, { "entropy": 1.8817365534603596, "epoch": 0.38987599575001997, "grad_norm": 9.160236358642578, "learning_rate": 4.051642868919464e-06, "loss": 0.4781, "mean_token_accuracy": 0.8513475954532623, "num_tokens": 151238004.0, "step": 125770 }, { "entropy": 1.8975436851382255, "epoch": 0.38990699487506963, "grad_norm": 6.79841423034668, "learning_rate": 4.0514818037415285e-06, "loss": 0.4854, "mean_token_accuracy": 0.8526737794280053, "num_tokens": 151250091.0, "step": 125780 }, { "entropy": 1.9469924926757813, "epoch": 0.38993799400011936, "grad_norm": 7.432028770446777, "learning_rate": 4.051320757770564e-06, "loss": 0.4792, "mean_token_accuracy": 0.8524128124117851, "num_tokens": 151261967.0, "step": 125790 }, { "entropy": 1.9410534769296646, "epoch": 0.38996899312516903, "grad_norm": 3.4470081329345703, "learning_rate": 4.051159731002754e-06, "loss": 0.483, "mean_token_accuracy": 0.844953115284443, "num_tokens": 151273163.0, "step": 125800 }, { "entropy": 1.9425782278180121, "epoch": 0.38999999225021875, "grad_norm": 7.442099094390869, "learning_rate": 4.050998723434281e-06, "loss": 0.461, "mean_token_accuracy": 0.8562641754746437, "num_tokens": 151284496.0, "step": 125810 }, { "entropy": 1.8365588143467904, "epoch": 0.3900309913752684, "grad_norm": 8.772604942321777, "learning_rate": 4.050837735061332e-06, "loss": 0.454, "mean_token_accuracy": 0.8577765494585037, "num_tokens": 151297315.0, "step": 125820 }, { "entropy": 1.8861859440803528, "epoch": 0.39006199050031815, "grad_norm": 7.54157829284668, "learning_rate": 4.050676765880091e-06, "loss": 0.4461, "mean_token_accuracy": 0.8518304273486137, "num_tokens": 151309596.0, "step": 125830 }, { "entropy": 1.8863596022129059, "epoch": 0.3900929896253678, "grad_norm": 9.19207763671875, "learning_rate": 4.0505158158867485e-06, "loss": 0.4917, "mean_token_accuracy": 0.8433287471532822, "num_tokens": 151320769.0, "step": 125840 }, { "entropy": 1.9262786269187928, "epoch": 0.39012398875041754, "grad_norm": 5.372649192810059, "learning_rate": 4.050354885077489e-06, "loss": 0.5296, "mean_token_accuracy": 0.8311650440096855, "num_tokens": 151332882.0, "step": 125850 }, { "entropy": 1.913481391966343, "epoch": 0.3901549878754672, "grad_norm": 8.025369644165039, "learning_rate": 4.050193973448504e-06, "loss": 0.4681, "mean_token_accuracy": 0.8461648374795914, "num_tokens": 151344485.0, "step": 125860 }, { "entropy": 1.9113824039697647, "epoch": 0.39018598700051693, "grad_norm": 2.482351779937744, "learning_rate": 4.050033080995983e-06, "loss": 0.4437, "mean_token_accuracy": 0.8611611127853394, "num_tokens": 151356413.0, "step": 125870 }, { "entropy": 1.9198512472212315, "epoch": 0.3902169861255666, "grad_norm": 9.45190143585205, "learning_rate": 4.049872207716118e-06, "loss": 0.453, "mean_token_accuracy": 0.8463770866394043, "num_tokens": 151369155.0, "step": 125880 }, { "entropy": 1.9116858139634132, "epoch": 0.3902479852506163, "grad_norm": 7.803056716918945, "learning_rate": 4.0497113536051e-06, "loss": 0.4642, "mean_token_accuracy": 0.8526840418577194, "num_tokens": 151381458.0, "step": 125890 }, { "entropy": 1.8976224765181542, "epoch": 0.390278984375666, "grad_norm": 9.358246803283691, "learning_rate": 4.049550518659125e-06, "loss": 0.4582, "mean_token_accuracy": 0.8442497715353966, "num_tokens": 151393539.0, "step": 125900 }, { "entropy": 1.8757586926221848, "epoch": 0.3903099835007157, "grad_norm": 7.887486457824707, "learning_rate": 4.049389702874385e-06, "loss": 0.5401, "mean_token_accuracy": 0.8386203497648239, "num_tokens": 151406150.0, "step": 125910 }, { "entropy": 1.851169802248478, "epoch": 0.3903409826257654, "grad_norm": 9.607558250427246, "learning_rate": 4.049228906247078e-06, "loss": 0.4247, "mean_token_accuracy": 0.8545200616121292, "num_tokens": 151419052.0, "step": 125920 }, { "entropy": 1.8620378583669663, "epoch": 0.39037198175081506, "grad_norm": 4.9543986320495605, "learning_rate": 4.049068128773399e-06, "loss": 0.4283, "mean_token_accuracy": 0.8555585101246834, "num_tokens": 151431366.0, "step": 125930 }, { "entropy": 1.9239520370960235, "epoch": 0.3904029808758648, "grad_norm": 7.769740104675293, "learning_rate": 4.048907370449547e-06, "loss": 0.4579, "mean_token_accuracy": 0.8550285518169403, "num_tokens": 151442721.0, "step": 125940 }, { "entropy": 1.9248721554875374, "epoch": 0.39043398000091445, "grad_norm": 7.393733978271484, "learning_rate": 4.048746631271719e-06, "loss": 0.4521, "mean_token_accuracy": 0.8553394272923469, "num_tokens": 151454600.0, "step": 125950 }, { "entropy": 1.8826577708125114, "epoch": 0.3904649791259642, "grad_norm": 8.594537734985352, "learning_rate": 4.048585911236118e-06, "loss": 0.4206, "mean_token_accuracy": 0.8587727382779121, "num_tokens": 151466942.0, "step": 125960 }, { "entropy": 1.8130075439810753, "epoch": 0.39049597825101384, "grad_norm": 9.427589416503906, "learning_rate": 4.048425210338942e-06, "loss": 0.4433, "mean_token_accuracy": 0.8479527696967125, "num_tokens": 151480312.0, "step": 125970 }, { "entropy": 1.9348285168409347, "epoch": 0.39052697737606357, "grad_norm": 8.240906715393066, "learning_rate": 4.048264528576393e-06, "loss": 0.5186, "mean_token_accuracy": 0.8450803935527802, "num_tokens": 151491045.0, "step": 125980 }, { "entropy": 1.8613590627908707, "epoch": 0.39055797650111324, "grad_norm": 4.4133734703063965, "learning_rate": 4.048103865944676e-06, "loss": 0.4125, "mean_token_accuracy": 0.8640340849757194, "num_tokens": 151502174.0, "step": 125990 }, { "entropy": 1.905706176161766, "epoch": 0.39058897562616296, "grad_norm": 7.541008949279785, "learning_rate": 4.047943222439993e-06, "loss": 0.4654, "mean_token_accuracy": 0.8492152616381645, "num_tokens": 151513749.0, "step": 126000 }, { "entropy": 1.88614112585783, "epoch": 0.39061997475121263, "grad_norm": 8.197542190551758, "learning_rate": 4.047782598058551e-06, "loss": 0.4462, "mean_token_accuracy": 0.8495625197887421, "num_tokens": 151525450.0, "step": 126010 }, { "entropy": 1.8722525179386138, "epoch": 0.39065097387626235, "grad_norm": 7.363889217376709, "learning_rate": 4.047621992796555e-06, "loss": 0.4151, "mean_token_accuracy": 0.8617817178368569, "num_tokens": 151538034.0, "step": 126020 }, { "entropy": 1.9224463373422622, "epoch": 0.390681973001312, "grad_norm": 9.672123908996582, "learning_rate": 4.047461406650214e-06, "loss": 0.4682, "mean_token_accuracy": 0.8521727249026299, "num_tokens": 151549821.0, "step": 126030 }, { "entropy": 1.9509896606206893, "epoch": 0.39071297212636175, "grad_norm": 7.809655666351318, "learning_rate": 4.047300839615734e-06, "loss": 0.5067, "mean_token_accuracy": 0.8400021120905876, "num_tokens": 151561202.0, "step": 126040 }, { "entropy": 1.8582612618803978, "epoch": 0.3907439712514114, "grad_norm": 4.203096866607666, "learning_rate": 4.047140291689325e-06, "loss": 0.4059, "mean_token_accuracy": 0.864464844763279, "num_tokens": 151574760.0, "step": 126050 }, { "entropy": 1.9256421521306037, "epoch": 0.39077497037646114, "grad_norm": 7.765346050262451, "learning_rate": 4.0469797628671996e-06, "loss": 0.4424, "mean_token_accuracy": 0.8578554153442383, "num_tokens": 151586713.0, "step": 126060 }, { "entropy": 1.9531617119908333, "epoch": 0.3908059695015108, "grad_norm": 7.535767078399658, "learning_rate": 4.046819253145565e-06, "loss": 0.4852, "mean_token_accuracy": 0.8484017476439476, "num_tokens": 151598618.0, "step": 126070 }, { "entropy": 1.8639555156230927, "epoch": 0.39083696862656053, "grad_norm": 8.342696189880371, "learning_rate": 4.046658762520638e-06, "loss": 0.4913, "mean_token_accuracy": 0.8571187525987625, "num_tokens": 151610588.0, "step": 126080 }, { "entropy": 1.8678423702716827, "epoch": 0.3908679677516102, "grad_norm": 3.639899730682373, "learning_rate": 4.046498290988629e-06, "loss": 0.3858, "mean_token_accuracy": 0.8643327265977859, "num_tokens": 151622625.0, "step": 126090 }, { "entropy": 1.8767968013882637, "epoch": 0.3908989668766599, "grad_norm": 7.925841808319092, "learning_rate": 4.046337838545756e-06, "loss": 0.4044, "mean_token_accuracy": 0.8563542291522026, "num_tokens": 151636178.0, "step": 126100 }, { "entropy": 1.8535062327980996, "epoch": 0.3909299660017096, "grad_norm": 6.453927516937256, "learning_rate": 4.046177405188231e-06, "loss": 0.4687, "mean_token_accuracy": 0.8552742913365364, "num_tokens": 151649219.0, "step": 126110 }, { "entropy": 1.9538506001234055, "epoch": 0.3909609651267593, "grad_norm": 8.822364807128906, "learning_rate": 4.046016990912272e-06, "loss": 0.4634, "mean_token_accuracy": 0.8550143092870712, "num_tokens": 151660943.0, "step": 126120 }, { "entropy": 1.8665287435054778, "epoch": 0.390991964251809, "grad_norm": 8.066899299621582, "learning_rate": 4.045856595714099e-06, "loss": 0.4545, "mean_token_accuracy": 0.8569189369678497, "num_tokens": 151673423.0, "step": 126130 }, { "entropy": 1.9262668624520303, "epoch": 0.3910229633768587, "grad_norm": 7.15204381942749, "learning_rate": 4.045696219589927e-06, "loss": 0.5058, "mean_token_accuracy": 0.847204127907753, "num_tokens": 151684997.0, "step": 126140 }, { "entropy": 1.9528811901807785, "epoch": 0.3910539625019084, "grad_norm": 8.068526268005371, "learning_rate": 4.045535862535979e-06, "loss": 0.5037, "mean_token_accuracy": 0.8466339007019996, "num_tokens": 151696214.0, "step": 126150 }, { "entropy": 1.8696447387337685, "epoch": 0.3910849616269581, "grad_norm": 8.606704711914062, "learning_rate": 4.045375524548474e-06, "loss": 0.4442, "mean_token_accuracy": 0.852445213496685, "num_tokens": 151708632.0, "step": 126160 }, { "entropy": 1.899414698779583, "epoch": 0.3911159607520078, "grad_norm": 7.6843342781066895, "learning_rate": 4.0452152056236355e-06, "loss": 0.4273, "mean_token_accuracy": 0.8635346978902817, "num_tokens": 151720657.0, "step": 126170 }, { "entropy": 1.8455950021743774, "epoch": 0.39114695987705744, "grad_norm": 8.676742553710938, "learning_rate": 4.045054905757685e-06, "loss": 0.4219, "mean_token_accuracy": 0.8539397269487381, "num_tokens": 151732951.0, "step": 126180 }, { "entropy": 1.9040052101016045, "epoch": 0.39117795900210717, "grad_norm": 3.939152956008911, "learning_rate": 4.044894624946848e-06, "loss": 0.5054, "mean_token_accuracy": 0.8455827251076699, "num_tokens": 151744428.0, "step": 126190 }, { "entropy": 1.840182974934578, "epoch": 0.39120895812715684, "grad_norm": 7.910426616668701, "learning_rate": 4.044734363187349e-06, "loss": 0.4163, "mean_token_accuracy": 0.8497889280319214, "num_tokens": 151757200.0, "step": 126200 }, { "entropy": 1.9612287282943726, "epoch": 0.39123995725220656, "grad_norm": 8.55207347869873, "learning_rate": 4.044574120475414e-06, "loss": 0.5217, "mean_token_accuracy": 0.8453647062182427, "num_tokens": 151768054.0, "step": 126210 }, { "entropy": 1.925032651424408, "epoch": 0.39127095637725623, "grad_norm": 10.712833404541016, "learning_rate": 4.044413896807269e-06, "loss": 0.5053, "mean_token_accuracy": 0.845679797232151, "num_tokens": 151778718.0, "step": 126220 }, { "entropy": 1.9098077476024629, "epoch": 0.39130195550230595, "grad_norm": 10.31223201751709, "learning_rate": 4.044253692179145e-06, "loss": 0.471, "mean_token_accuracy": 0.8485254973173142, "num_tokens": 151789936.0, "step": 126230 }, { "entropy": 1.926163762807846, "epoch": 0.3913329546273556, "grad_norm": 7.671618938446045, "learning_rate": 4.044093506587268e-06, "loss": 0.4995, "mean_token_accuracy": 0.8456138476729393, "num_tokens": 151800690.0, "step": 126240 }, { "entropy": 1.9659644454717635, "epoch": 0.39136395375240535, "grad_norm": 10.632147789001465, "learning_rate": 4.043933340027872e-06, "loss": 0.5037, "mean_token_accuracy": 0.8462962001562119, "num_tokens": 151811555.0, "step": 126250 }, { "entropy": 1.9211021691560746, "epoch": 0.391394952877455, "grad_norm": 8.475778579711914, "learning_rate": 4.043773192497186e-06, "loss": 0.455, "mean_token_accuracy": 0.8569527640938759, "num_tokens": 151823099.0, "step": 126260 }, { "entropy": 1.8699581772089005, "epoch": 0.39142595200250474, "grad_norm": 7.7866530418396, "learning_rate": 4.043613063991444e-06, "loss": 0.4393, "mean_token_accuracy": 0.8597669988870621, "num_tokens": 151834750.0, "step": 126270 }, { "entropy": 1.9547498047351837, "epoch": 0.3914569511275544, "grad_norm": 8.300743103027344, "learning_rate": 4.043452954506876e-06, "loss": 0.5292, "mean_token_accuracy": 0.8422427833080292, "num_tokens": 151846263.0, "step": 126280 }, { "entropy": 1.8979168817400933, "epoch": 0.39148795025260413, "grad_norm": 9.36401653289795, "learning_rate": 4.04329286403972e-06, "loss": 0.4558, "mean_token_accuracy": 0.8518133416771889, "num_tokens": 151857975.0, "step": 126290 }, { "entropy": 1.8685284316539765, "epoch": 0.3915189493776538, "grad_norm": 7.125948429107666, "learning_rate": 4.043132792586211e-06, "loss": 0.4297, "mean_token_accuracy": 0.8564907148480415, "num_tokens": 151869148.0, "step": 126300 }, { "entropy": 1.932071290910244, "epoch": 0.3915499485027035, "grad_norm": 8.5045804977417, "learning_rate": 4.042972740142585e-06, "loss": 0.4501, "mean_token_accuracy": 0.85165533721447, "num_tokens": 151880669.0, "step": 126310 }, { "entropy": 1.898838460445404, "epoch": 0.3915809476277532, "grad_norm": 8.528064727783203, "learning_rate": 4.04281270670508e-06, "loss": 0.4681, "mean_token_accuracy": 0.8440765485167503, "num_tokens": 151893003.0, "step": 126320 }, { "entropy": 1.8209847897291183, "epoch": 0.3916119467528029, "grad_norm": 9.340559959411621, "learning_rate": 4.042652692269934e-06, "loss": 0.4549, "mean_token_accuracy": 0.859174670279026, "num_tokens": 151905874.0, "step": 126330 }, { "entropy": 1.95557002723217, "epoch": 0.3916429458778526, "grad_norm": 8.08029842376709, "learning_rate": 4.042492696833388e-06, "loss": 0.5362, "mean_token_accuracy": 0.8356934756040573, "num_tokens": 151917270.0, "step": 126340 }, { "entropy": 1.8023017689585685, "epoch": 0.3916739450029023, "grad_norm": 8.126782417297363, "learning_rate": 4.042332720391681e-06, "loss": 0.421, "mean_token_accuracy": 0.8570797294378281, "num_tokens": 151931119.0, "step": 126350 }, { "entropy": 1.881139089167118, "epoch": 0.391704944127952, "grad_norm": 8.571240425109863, "learning_rate": 4.042172762941057e-06, "loss": 0.4845, "mean_token_accuracy": 0.8508111834526062, "num_tokens": 151942409.0, "step": 126360 }, { "entropy": 1.8666364967823028, "epoch": 0.3917359432530017, "grad_norm": 8.6209077835083, "learning_rate": 4.042012824477757e-06, "loss": 0.4607, "mean_token_accuracy": 0.8415684968233108, "num_tokens": 151954323.0, "step": 126370 }, { "entropy": 1.8725038409233092, "epoch": 0.3917669423780514, "grad_norm": 7.019345760345459, "learning_rate": 4.041852904998025e-06, "loss": 0.4529, "mean_token_accuracy": 0.850992114841938, "num_tokens": 151967314.0, "step": 126380 }, { "entropy": 1.8260181456804276, "epoch": 0.3917979415031011, "grad_norm": 8.401409149169922, "learning_rate": 4.0416930044981064e-06, "loss": 0.4613, "mean_token_accuracy": 0.84242385327816, "num_tokens": 151980943.0, "step": 126390 }, { "entropy": 1.9128731414675713, "epoch": 0.39182894062815077, "grad_norm": 3.6636550426483154, "learning_rate": 4.041533122974248e-06, "loss": 0.4671, "mean_token_accuracy": 0.8557316899299622, "num_tokens": 151992446.0, "step": 126400 }, { "entropy": 1.8801569387316703, "epoch": 0.3918599397532005, "grad_norm": 4.1143107414245605, "learning_rate": 4.041373260422696e-06, "loss": 0.4701, "mean_token_accuracy": 0.8517079591751099, "num_tokens": 152004499.0, "step": 126410 }, { "entropy": 1.9254588589072228, "epoch": 0.39189093887825016, "grad_norm": 7.596971035003662, "learning_rate": 4.041213416839698e-06, "loss": 0.4795, "mean_token_accuracy": 0.8518746435642243, "num_tokens": 152016295.0, "step": 126420 }, { "entropy": 1.9142033576965332, "epoch": 0.39192193800329983, "grad_norm": 7.351873397827148, "learning_rate": 4.0410535922215046e-06, "loss": 0.4874, "mean_token_accuracy": 0.8495833858847618, "num_tokens": 152027558.0, "step": 126430 }, { "entropy": 1.892107328772545, "epoch": 0.39195293712834955, "grad_norm": 9.62601089477539, "learning_rate": 4.040893786564364e-06, "loss": 0.4711, "mean_token_accuracy": 0.8505282923579216, "num_tokens": 152039402.0, "step": 126440 }, { "entropy": 1.8616481512784957, "epoch": 0.3919839362533992, "grad_norm": 7.576359748840332, "learning_rate": 4.040733999864529e-06, "loss": 0.4808, "mean_token_accuracy": 0.8497416257858277, "num_tokens": 152051794.0, "step": 126450 }, { "entropy": 1.7416065499186515, "epoch": 0.39201493537844895, "grad_norm": 4.21736478805542, "learning_rate": 4.0405742321182505e-06, "loss": 0.3715, "mean_token_accuracy": 0.857705582678318, "num_tokens": 152065713.0, "step": 126460 }, { "entropy": 1.8679188892245293, "epoch": 0.3920459345034986, "grad_norm": 7.051849365234375, "learning_rate": 4.040414483321783e-06, "loss": 0.4499, "mean_token_accuracy": 0.8559218123555183, "num_tokens": 152078414.0, "step": 126470 }, { "entropy": 1.9041266560554504, "epoch": 0.39207693362854834, "grad_norm": 8.104182243347168, "learning_rate": 4.0402547534713795e-06, "loss": 0.4943, "mean_token_accuracy": 0.8375504925847054, "num_tokens": 152090227.0, "step": 126480 }, { "entropy": 1.9575203120708466, "epoch": 0.392107932753598, "grad_norm": 7.981137752532959, "learning_rate": 4.0400950425632965e-06, "loss": 0.5094, "mean_token_accuracy": 0.8419002950191498, "num_tokens": 152101040.0, "step": 126490 }, { "entropy": 1.907607588171959, "epoch": 0.39213893187864773, "grad_norm": 8.032891273498535, "learning_rate": 4.03993535059379e-06, "loss": 0.4519, "mean_token_accuracy": 0.8457751244306564, "num_tokens": 152113002.0, "step": 126500 }, { "entropy": 1.8676989868283271, "epoch": 0.3921699310036974, "grad_norm": 8.83267879486084, "learning_rate": 4.039775677559117e-06, "loss": 0.4944, "mean_token_accuracy": 0.839282539486885, "num_tokens": 152125801.0, "step": 126510 }, { "entropy": 1.943473118543625, "epoch": 0.3922009301287471, "grad_norm": 9.639033317565918, "learning_rate": 4.0396160234555365e-06, "loss": 0.5114, "mean_token_accuracy": 0.8399049505591393, "num_tokens": 152137319.0, "step": 126520 }, { "entropy": 1.885578916966915, "epoch": 0.3922319292537968, "grad_norm": 4.101627349853516, "learning_rate": 4.039456388279307e-06, "loss": 0.4814, "mean_token_accuracy": 0.8486464694142342, "num_tokens": 152148855.0, "step": 126530 }, { "entropy": 1.8974051237106324, "epoch": 0.3922629283788465, "grad_norm": 4.128299713134766, "learning_rate": 4.03929677202669e-06, "loss": 0.4754, "mean_token_accuracy": 0.8376821115612983, "num_tokens": 152161206.0, "step": 126540 }, { "entropy": 1.8535172313451767, "epoch": 0.3922939275038962, "grad_norm": 8.3539400100708, "learning_rate": 4.0391371746939465e-06, "loss": 0.3962, "mean_token_accuracy": 0.8649252921342849, "num_tokens": 152172699.0, "step": 126550 }, { "entropy": 1.954978096485138, "epoch": 0.3923249266289459, "grad_norm": 3.532893180847168, "learning_rate": 4.03897759627734e-06, "loss": 0.4886, "mean_token_accuracy": 0.8465725928544998, "num_tokens": 152184753.0, "step": 126560 }, { "entropy": 1.9255331978201866, "epoch": 0.3923559257539956, "grad_norm": 10.25304126739502, "learning_rate": 4.038818036773132e-06, "loss": 0.4848, "mean_token_accuracy": 0.8339841410517692, "num_tokens": 152196175.0, "step": 126570 }, { "entropy": 1.893292284011841, "epoch": 0.3923869248790453, "grad_norm": 4.092190742492676, "learning_rate": 4.038658496177589e-06, "loss": 0.4712, "mean_token_accuracy": 0.8404751896858216, "num_tokens": 152208309.0, "step": 126580 }, { "entropy": 1.892499789595604, "epoch": 0.392417924004095, "grad_norm": 7.2047438621521, "learning_rate": 4.0384989744869764e-06, "loss": 0.4035, "mean_token_accuracy": 0.8644706353545188, "num_tokens": 152220743.0, "step": 126590 }, { "entropy": 1.8051872000098228, "epoch": 0.3924489231291447, "grad_norm": 8.140718460083008, "learning_rate": 4.03833947169756e-06, "loss": 0.4284, "mean_token_accuracy": 0.857630018889904, "num_tokens": 152234128.0, "step": 126600 }, { "entropy": 1.9284190341830254, "epoch": 0.39247992225419437, "grad_norm": 8.732551574707031, "learning_rate": 4.038179987805609e-06, "loss": 0.4658, "mean_token_accuracy": 0.8481160834431648, "num_tokens": 152245832.0, "step": 126610 }, { "entropy": 1.8666399344801903, "epoch": 0.3925109213792441, "grad_norm": 5.154988765716553, "learning_rate": 4.038020522807391e-06, "loss": 0.4379, "mean_token_accuracy": 0.8664086773991585, "num_tokens": 152258281.0, "step": 126620 }, { "entropy": 1.9185226619243623, "epoch": 0.39254192050429376, "grad_norm": 8.254905700683594, "learning_rate": 4.037861076699175e-06, "loss": 0.4265, "mean_token_accuracy": 0.8595322445034981, "num_tokens": 152270283.0, "step": 126630 }, { "entropy": 1.881806494295597, "epoch": 0.3925729196293435, "grad_norm": 7.021206378936768, "learning_rate": 4.037701649477234e-06, "loss": 0.4585, "mean_token_accuracy": 0.8548443630337715, "num_tokens": 152282446.0, "step": 126640 }, { "entropy": 1.9258292108774184, "epoch": 0.39260391875439316, "grad_norm": 8.01991081237793, "learning_rate": 4.037542241137839e-06, "loss": 0.489, "mean_token_accuracy": 0.8480772167444229, "num_tokens": 152294011.0, "step": 126650 }, { "entropy": 1.976582020521164, "epoch": 0.3926349178794429, "grad_norm": 8.293559074401855, "learning_rate": 4.0373828516772615e-06, "loss": 0.4921, "mean_token_accuracy": 0.8434167116880417, "num_tokens": 152304686.0, "step": 126660 }, { "entropy": 1.8964557886123656, "epoch": 0.39266591700449255, "grad_norm": 8.829238891601562, "learning_rate": 4.037223481091777e-06, "loss": 0.4369, "mean_token_accuracy": 0.8579582333564758, "num_tokens": 152316928.0, "step": 126670 }, { "entropy": 1.889539574831724, "epoch": 0.3926969161295422, "grad_norm": 9.728771209716797, "learning_rate": 4.037064129377661e-06, "loss": 0.4205, "mean_token_accuracy": 0.8489784002304077, "num_tokens": 152329516.0, "step": 126680 }, { "entropy": 1.9084494978189468, "epoch": 0.39272791525459194, "grad_norm": 8.081507682800293, "learning_rate": 4.036904796531187e-06, "loss": 0.4672, "mean_token_accuracy": 0.8519722208380699, "num_tokens": 152340705.0, "step": 126690 }, { "entropy": 1.9536496102809906, "epoch": 0.3927589143796416, "grad_norm": 7.832414150238037, "learning_rate": 4.036745482548634e-06, "loss": 0.4672, "mean_token_accuracy": 0.8512647017836571, "num_tokens": 152351867.0, "step": 126700 }, { "entropy": 1.8545057475566864, "epoch": 0.39278991350469133, "grad_norm": 3.9451944828033447, "learning_rate": 4.03658618742628e-06, "loss": 0.4381, "mean_token_accuracy": 0.8511400848627091, "num_tokens": 152364222.0, "step": 126710 }, { "entropy": 1.8836131229996682, "epoch": 0.392820912629741, "grad_norm": 7.37644624710083, "learning_rate": 4.036426911160403e-06, "loss": 0.4651, "mean_token_accuracy": 0.8465021848678589, "num_tokens": 152376071.0, "step": 126720 }, { "entropy": 1.908482810854912, "epoch": 0.39285191175479073, "grad_norm": 8.756072998046875, "learning_rate": 4.036267653747283e-06, "loss": 0.4601, "mean_token_accuracy": 0.8456471160054206, "num_tokens": 152387712.0, "step": 126730 }, { "entropy": 1.8821537524461747, "epoch": 0.3928829108798404, "grad_norm": 6.746029853820801, "learning_rate": 4.036108415183202e-06, "loss": 0.4408, "mean_token_accuracy": 0.853573402762413, "num_tokens": 152399464.0, "step": 126740 }, { "entropy": 1.7690861240029334, "epoch": 0.3929139100048901, "grad_norm": 8.50271224975586, "learning_rate": 4.035949195464444e-06, "loss": 0.3848, "mean_token_accuracy": 0.8643131896853447, "num_tokens": 152412703.0, "step": 126750 }, { "entropy": 1.9057681828737258, "epoch": 0.3929449091299398, "grad_norm": 7.735318183898926, "learning_rate": 4.035789994587288e-06, "loss": 0.4516, "mean_token_accuracy": 0.8501883447170258, "num_tokens": 152423734.0, "step": 126760 }, { "entropy": 1.980392262339592, "epoch": 0.3929759082549895, "grad_norm": 9.195259094238281, "learning_rate": 4.0356308125480205e-06, "loss": 0.57, "mean_token_accuracy": 0.830004945397377, "num_tokens": 152435210.0, "step": 126770 }, { "entropy": 1.9139831706881523, "epoch": 0.3930069073800392, "grad_norm": 3.758761167526245, "learning_rate": 4.035471649342926e-06, "loss": 0.4639, "mean_token_accuracy": 0.8497570350766182, "num_tokens": 152447602.0, "step": 126780 }, { "entropy": 1.9336750611662865, "epoch": 0.3930379065050889, "grad_norm": 7.929681777954102, "learning_rate": 4.035312504968292e-06, "loss": 0.461, "mean_token_accuracy": 0.852272717654705, "num_tokens": 152458841.0, "step": 126790 }, { "entropy": 1.922458928823471, "epoch": 0.3930689056301386, "grad_norm": 7.431619644165039, "learning_rate": 4.035153379420406e-06, "loss": 0.4604, "mean_token_accuracy": 0.8492286145687103, "num_tokens": 152469765.0, "step": 126800 }, { "entropy": 1.9320431604981423, "epoch": 0.3930999047551883, "grad_norm": 7.810932636260986, "learning_rate": 4.034994272695553e-06, "loss": 0.5431, "mean_token_accuracy": 0.8349621132016182, "num_tokens": 152481239.0, "step": 126810 }, { "entropy": 1.911438837647438, "epoch": 0.39313090388023797, "grad_norm": 9.275655746459961, "learning_rate": 4.0348351847900256e-06, "loss": 0.4774, "mean_token_accuracy": 0.8501843497157097, "num_tokens": 152492347.0, "step": 126820 }, { "entropy": 1.8184763744473458, "epoch": 0.3931619030052877, "grad_norm": 8.230948448181152, "learning_rate": 4.034676115700114e-06, "loss": 0.4501, "mean_token_accuracy": 0.8377979636192322, "num_tokens": 152505653.0, "step": 126830 }, { "entropy": 1.9131136581301689, "epoch": 0.39319290213033736, "grad_norm": 7.071003437042236, "learning_rate": 4.0345170654221075e-06, "loss": 0.4916, "mean_token_accuracy": 0.8539568200707436, "num_tokens": 152517329.0, "step": 126840 }, { "entropy": 1.9276820585131644, "epoch": 0.3932239012553871, "grad_norm": 7.994453430175781, "learning_rate": 4.0343580339523e-06, "loss": 0.4784, "mean_token_accuracy": 0.8405784666538239, "num_tokens": 152528760.0, "step": 126850 }, { "entropy": 1.8666841223835946, "epoch": 0.39325490038043676, "grad_norm": 9.046262741088867, "learning_rate": 4.034199021286984e-06, "loss": 0.4388, "mean_token_accuracy": 0.8560965120792389, "num_tokens": 152540919.0, "step": 126860 }, { "entropy": 1.8318147659301758, "epoch": 0.3932858995054865, "grad_norm": 9.050399780273438, "learning_rate": 4.0340400274224554e-06, "loss": 0.4038, "mean_token_accuracy": 0.8550884276628494, "num_tokens": 152553211.0, "step": 126870 }, { "entropy": 1.8964627265930176, "epoch": 0.39331689863053615, "grad_norm": 7.297853946685791, "learning_rate": 4.033881052355008e-06, "loss": 0.467, "mean_token_accuracy": 0.840998500585556, "num_tokens": 152565550.0, "step": 126880 }, { "entropy": 1.8291507720947267, "epoch": 0.3933478977555859, "grad_norm": 4.372984886169434, "learning_rate": 4.03372209608094e-06, "loss": 0.4242, "mean_token_accuracy": 0.8626144364476204, "num_tokens": 152578617.0, "step": 126890 }, { "entropy": 1.9425703272223473, "epoch": 0.39337889688063554, "grad_norm": 9.826508522033691, "learning_rate": 4.033563158596547e-06, "loss": 0.5442, "mean_token_accuracy": 0.8346266880631447, "num_tokens": 152589941.0, "step": 126900 }, { "entropy": 1.8560305163264275, "epoch": 0.39340989600568527, "grad_norm": 9.724489212036133, "learning_rate": 4.0334042398981285e-06, "loss": 0.4451, "mean_token_accuracy": 0.8553323075175285, "num_tokens": 152602105.0, "step": 126910 }, { "entropy": 1.921321277320385, "epoch": 0.39344089513073494, "grad_norm": 8.991716384887695, "learning_rate": 4.033245339981984e-06, "loss": 0.4953, "mean_token_accuracy": 0.8386275783181191, "num_tokens": 152613975.0, "step": 126920 }, { "entropy": 1.9884714633226395, "epoch": 0.3934718942557846, "grad_norm": 7.993074893951416, "learning_rate": 4.033086458844415e-06, "loss": 0.495, "mean_token_accuracy": 0.8479173853993416, "num_tokens": 152624593.0, "step": 126930 }, { "entropy": 1.8521944627165794, "epoch": 0.39350289338083433, "grad_norm": 8.29841136932373, "learning_rate": 4.032927596481721e-06, "loss": 0.4397, "mean_token_accuracy": 0.8486550331115723, "num_tokens": 152636893.0, "step": 126940 }, { "entropy": 1.853082077205181, "epoch": 0.393533892505884, "grad_norm": 8.913239479064941, "learning_rate": 4.032768752890206e-06, "loss": 0.4259, "mean_token_accuracy": 0.8620326921343804, "num_tokens": 152648688.0, "step": 126950 }, { "entropy": 1.9352153725922108, "epoch": 0.3935648916309337, "grad_norm": 7.439796447753906, "learning_rate": 4.032609928066174e-06, "loss": 0.4972, "mean_token_accuracy": 0.8431309580802917, "num_tokens": 152660197.0, "step": 126960 }, { "entropy": 1.9533848583698272, "epoch": 0.3935958907559834, "grad_norm": 12.858529090881348, "learning_rate": 4.032451122005929e-06, "loss": 0.4903, "mean_token_accuracy": 0.8557668402791023, "num_tokens": 152671469.0, "step": 126970 }, { "entropy": 1.872800037264824, "epoch": 0.3936268898810331, "grad_norm": 7.507105350494385, "learning_rate": 4.032292334705776e-06, "loss": 0.4325, "mean_token_accuracy": 0.8509954452514649, "num_tokens": 152684255.0, "step": 126980 }, { "entropy": 1.8584757506847382, "epoch": 0.3936578890060828, "grad_norm": 3.439596176147461, "learning_rate": 4.032133566162022e-06, "loss": 0.4717, "mean_token_accuracy": 0.853032597899437, "num_tokens": 152696156.0, "step": 126990 }, { "entropy": 1.841444942355156, "epoch": 0.3936888881311325, "grad_norm": 8.641845703125, "learning_rate": 4.031974816370976e-06, "loss": 0.4546, "mean_token_accuracy": 0.8564640492200851, "num_tokens": 152708857.0, "step": 127000 }, { "entropy": 1.9723876774311067, "epoch": 0.3937198872561822, "grad_norm": 8.145781517028809, "learning_rate": 4.031816085328946e-06, "loss": 0.4604, "mean_token_accuracy": 0.8606492772698402, "num_tokens": 152719903.0, "step": 127010 }, { "entropy": 1.8345705628395081, "epoch": 0.3937508863812319, "grad_norm": 7.235647201538086, "learning_rate": 4.03165737303224e-06, "loss": 0.3913, "mean_token_accuracy": 0.8646158143877983, "num_tokens": 152732756.0, "step": 127020 }, { "entropy": 1.9376291260123253, "epoch": 0.39378188550628157, "grad_norm": 3.9151713848114014, "learning_rate": 4.031498679477171e-06, "loss": 0.4656, "mean_token_accuracy": 0.8439146399497985, "num_tokens": 152744727.0, "step": 127030 }, { "entropy": 1.903841333091259, "epoch": 0.3938128846313313, "grad_norm": 8.165589332580566, "learning_rate": 4.03134000466005e-06, "loss": 0.4755, "mean_token_accuracy": 0.840572564303875, "num_tokens": 152756881.0, "step": 127040 }, { "entropy": 1.8646742686629296, "epoch": 0.39384388375638096, "grad_norm": 5.022664546966553, "learning_rate": 4.0311813485771896e-06, "loss": 0.4021, "mean_token_accuracy": 0.8504794493317605, "num_tokens": 152769508.0, "step": 127050 }, { "entropy": 1.9327761620283126, "epoch": 0.3938748828814307, "grad_norm": 7.481938362121582, "learning_rate": 4.031022711224904e-06, "loss": 0.4982, "mean_token_accuracy": 0.8473081424832344, "num_tokens": 152781394.0, "step": 127060 }, { "entropy": 1.9920929014682769, "epoch": 0.39390588200648036, "grad_norm": 7.522092819213867, "learning_rate": 4.030864092599508e-06, "loss": 0.5138, "mean_token_accuracy": 0.8395040348172188, "num_tokens": 152792361.0, "step": 127070 }, { "entropy": 1.8420628361403941, "epoch": 0.3939368811315301, "grad_norm": 7.69534158706665, "learning_rate": 4.030705492697317e-06, "loss": 0.4029, "mean_token_accuracy": 0.8510744541883468, "num_tokens": 152805416.0, "step": 127080 }, { "entropy": 1.9465318858623504, "epoch": 0.39396788025657975, "grad_norm": 7.537118911743164, "learning_rate": 4.030546911514647e-06, "loss": 0.4564, "mean_token_accuracy": 0.851013197004795, "num_tokens": 152817242.0, "step": 127090 }, { "entropy": 1.861098751425743, "epoch": 0.3939988793816295, "grad_norm": 8.616077423095703, "learning_rate": 4.030388349047818e-06, "loss": 0.4309, "mean_token_accuracy": 0.8491953507065773, "num_tokens": 152830104.0, "step": 127100 }, { "entropy": 1.9238109394907952, "epoch": 0.39402987850667914, "grad_norm": 8.807307243347168, "learning_rate": 4.0302298052931475e-06, "loss": 0.4784, "mean_token_accuracy": 0.8525387272238731, "num_tokens": 152841452.0, "step": 127110 }, { "entropy": 1.9179314807057382, "epoch": 0.39406087763172887, "grad_norm": 10.16199779510498, "learning_rate": 4.030071280246956e-06, "loss": 0.4914, "mean_token_accuracy": 0.8477940663695336, "num_tokens": 152852810.0, "step": 127120 }, { "entropy": 1.913467188179493, "epoch": 0.39409187675677854, "grad_norm": 3.260228395462036, "learning_rate": 4.029912773905563e-06, "loss": 0.4572, "mean_token_accuracy": 0.848815667629242, "num_tokens": 152864554.0, "step": 127130 }, { "entropy": 1.8686377540230752, "epoch": 0.39412287588182826, "grad_norm": 9.483077049255371, "learning_rate": 4.029754286265293e-06, "loss": 0.4847, "mean_token_accuracy": 0.8370977133512497, "num_tokens": 152877426.0, "step": 127140 }, { "entropy": 1.929710279405117, "epoch": 0.39415387500687793, "grad_norm": 2.918980360031128, "learning_rate": 4.029595817322465e-06, "loss": 0.453, "mean_token_accuracy": 0.8524357438087463, "num_tokens": 152889339.0, "step": 127150 }, { "entropy": 1.879020507633686, "epoch": 0.39418487413192765, "grad_norm": 7.318241596221924, "learning_rate": 4.029437367073407e-06, "loss": 0.3923, "mean_token_accuracy": 0.868370558321476, "num_tokens": 152902178.0, "step": 127160 }, { "entropy": 1.8702649146318435, "epoch": 0.3942158732569773, "grad_norm": 8.086983680725098, "learning_rate": 4.029278935514442e-06, "loss": 0.4811, "mean_token_accuracy": 0.847602641582489, "num_tokens": 152914178.0, "step": 127170 }, { "entropy": 1.8651431202888489, "epoch": 0.394246872382027, "grad_norm": 8.561652183532715, "learning_rate": 4.029120522641896e-06, "loss": 0.4401, "mean_token_accuracy": 0.8519322633743286, "num_tokens": 152926886.0, "step": 127180 }, { "entropy": 1.8370242148637772, "epoch": 0.3942778715070767, "grad_norm": 4.489937782287598, "learning_rate": 4.028962128452095e-06, "loss": 0.4949, "mean_token_accuracy": 0.8447151646018028, "num_tokens": 152940001.0, "step": 127190 }, { "entropy": 1.806211344152689, "epoch": 0.3943088706321264, "grad_norm": 7.989223957061768, "learning_rate": 4.02880375294137e-06, "loss": 0.3879, "mean_token_accuracy": 0.8558181077241898, "num_tokens": 152953714.0, "step": 127200 }, { "entropy": 1.8846919029951095, "epoch": 0.3943398697571761, "grad_norm": 7.853222846984863, "learning_rate": 4.0286453961060475e-06, "loss": 0.4158, "mean_token_accuracy": 0.8626481354236603, "num_tokens": 152965285.0, "step": 127210 }, { "entropy": 1.8866841107606889, "epoch": 0.3943708688822258, "grad_norm": 3.7979695796966553, "learning_rate": 4.0284870579424576e-06, "loss": 0.4481, "mean_token_accuracy": 0.8536775171756744, "num_tokens": 152978547.0, "step": 127220 }, { "entropy": 1.9279194965958595, "epoch": 0.3944018680072755, "grad_norm": 8.925294876098633, "learning_rate": 4.028328738446932e-06, "loss": 0.4702, "mean_token_accuracy": 0.8454255104064942, "num_tokens": 152990780.0, "step": 127230 }, { "entropy": 1.8884064748883247, "epoch": 0.39443286713232517, "grad_norm": 4.231663227081299, "learning_rate": 4.028170437615802e-06, "loss": 0.4264, "mean_token_accuracy": 0.8594095915555954, "num_tokens": 153003179.0, "step": 127240 }, { "entropy": 1.9022858142852783, "epoch": 0.3944638662573749, "grad_norm": 4.5120038986206055, "learning_rate": 4.028012155445402e-06, "loss": 0.468, "mean_token_accuracy": 0.8501111850142479, "num_tokens": 153015801.0, "step": 127250 }, { "entropy": 1.9620060488581657, "epoch": 0.39449486538242456, "grad_norm": 7.684929370880127, "learning_rate": 4.027853891932065e-06, "loss": 0.4612, "mean_token_accuracy": 0.854415363073349, "num_tokens": 153027019.0, "step": 127260 }, { "entropy": 1.9046915173530579, "epoch": 0.3945258645074743, "grad_norm": 4.640239238739014, "learning_rate": 4.027695647072125e-06, "loss": 0.4763, "mean_token_accuracy": 0.8494089663028717, "num_tokens": 153039194.0, "step": 127270 }, { "entropy": 1.9410059854388237, "epoch": 0.39455686363252396, "grad_norm": 9.024331092834473, "learning_rate": 4.027537420861921e-06, "loss": 0.4837, "mean_token_accuracy": 0.846622771024704, "num_tokens": 153050529.0, "step": 127280 }, { "entropy": 1.84144167304039, "epoch": 0.3945878627575737, "grad_norm": 10.240126609802246, "learning_rate": 4.027379213297787e-06, "loss": 0.4221, "mean_token_accuracy": 0.8550687476992607, "num_tokens": 153063542.0, "step": 127290 }, { "entropy": 1.8818937510251998, "epoch": 0.39461886188262335, "grad_norm": 4.128033638000488, "learning_rate": 4.027221024376063e-06, "loss": 0.4108, "mean_token_accuracy": 0.8581587508320808, "num_tokens": 153076371.0, "step": 127300 }, { "entropy": 1.9620690792798996, "epoch": 0.3946498610076731, "grad_norm": 8.45438289642334, "learning_rate": 4.027062854093087e-06, "loss": 0.5293, "mean_token_accuracy": 0.8367072865366936, "num_tokens": 153086758.0, "step": 127310 }, { "entropy": 1.879226452112198, "epoch": 0.39468086013272274, "grad_norm": 8.631587982177734, "learning_rate": 4.0269047024452e-06, "loss": 0.4485, "mean_token_accuracy": 0.8429103165864944, "num_tokens": 153099124.0, "step": 127320 }, { "entropy": 1.911029715836048, "epoch": 0.39471185925777247, "grad_norm": 6.511907577514648, "learning_rate": 4.0267465694287424e-06, "loss": 0.4207, "mean_token_accuracy": 0.8632428720593452, "num_tokens": 153110564.0, "step": 127330 }, { "entropy": 1.8814439699053764, "epoch": 0.39474285838282214, "grad_norm": 8.013952255249023, "learning_rate": 4.026588455040057e-06, "loss": 0.4122, "mean_token_accuracy": 0.8586022913455963, "num_tokens": 153123090.0, "step": 127340 }, { "entropy": 1.9142462641000748, "epoch": 0.39477385750787186, "grad_norm": 6.910181045532227, "learning_rate": 4.0264303592754846e-06, "loss": 0.4592, "mean_token_accuracy": 0.8546132728457451, "num_tokens": 153134532.0, "step": 127350 }, { "entropy": 1.8884952366352081, "epoch": 0.39480485663292153, "grad_norm": 11.307275772094727, "learning_rate": 4.026272282131373e-06, "loss": 0.4446, "mean_token_accuracy": 0.8564356371760369, "num_tokens": 153146234.0, "step": 127360 }, { "entropy": 1.845706556737423, "epoch": 0.39483585575797125, "grad_norm": 6.627220630645752, "learning_rate": 4.026114223604065e-06, "loss": 0.3955, "mean_token_accuracy": 0.8640777125954628, "num_tokens": 153159242.0, "step": 127370 }, { "entropy": 1.8887208893895149, "epoch": 0.3948668548830209, "grad_norm": 10.363401412963867, "learning_rate": 4.025956183689907e-06, "loss": 0.4095, "mean_token_accuracy": 0.8451639533042907, "num_tokens": 153172510.0, "step": 127380 }, { "entropy": 1.9080938518047332, "epoch": 0.39489785400807065, "grad_norm": 7.9390482902526855, "learning_rate": 4.025798162385246e-06, "loss": 0.4962, "mean_token_accuracy": 0.8457752048969269, "num_tokens": 153184696.0, "step": 127390 }, { "entropy": 1.9023298189043998, "epoch": 0.3949288531331203, "grad_norm": 8.671669006347656, "learning_rate": 4.02564015968643e-06, "loss": 0.4492, "mean_token_accuracy": 0.851709508895874, "num_tokens": 153196489.0, "step": 127400 }, { "entropy": 1.8737532109022141, "epoch": 0.39495985225817, "grad_norm": 6.4061198234558105, "learning_rate": 4.025482175589809e-06, "loss": 0.4051, "mean_token_accuracy": 0.8673104450106621, "num_tokens": 153207793.0, "step": 127410 }, { "entropy": 1.8804204553365707, "epoch": 0.3949908513832197, "grad_norm": 7.620257377624512, "learning_rate": 4.025324210091733e-06, "loss": 0.4195, "mean_token_accuracy": 0.8681916758418083, "num_tokens": 153219491.0, "step": 127420 }, { "entropy": 1.803827053308487, "epoch": 0.3950218505082694, "grad_norm": 4.123118877410889, "learning_rate": 4.0251662631885505e-06, "loss": 0.4314, "mean_token_accuracy": 0.8462093636393547, "num_tokens": 153232842.0, "step": 127430 }, { "entropy": 1.870395976305008, "epoch": 0.3950528496333191, "grad_norm": 8.609538078308105, "learning_rate": 4.025008334876618e-06, "loss": 0.453, "mean_token_accuracy": 0.8614588662981987, "num_tokens": 153244929.0, "step": 127440 }, { "entropy": 1.83759603202343, "epoch": 0.39508384875836877, "grad_norm": 8.134893417358398, "learning_rate": 4.024850425152286e-06, "loss": 0.3945, "mean_token_accuracy": 0.8604315787553787, "num_tokens": 153258122.0, "step": 127450 }, { "entropy": 1.919068345427513, "epoch": 0.3951148478834185, "grad_norm": 7.001434326171875, "learning_rate": 4.024692534011908e-06, "loss": 0.4842, "mean_token_accuracy": 0.8476987138390542, "num_tokens": 153269204.0, "step": 127460 }, { "entropy": 1.839623036980629, "epoch": 0.39514584700846817, "grad_norm": 8.048707962036133, "learning_rate": 4.024534661451841e-06, "loss": 0.4184, "mean_token_accuracy": 0.8495053514838219, "num_tokens": 153282189.0, "step": 127470 }, { "entropy": 1.9202652007341385, "epoch": 0.3951768461335179, "grad_norm": 8.451288223266602, "learning_rate": 4.02437680746844e-06, "loss": 0.4258, "mean_token_accuracy": 0.857101121544838, "num_tokens": 153293971.0, "step": 127480 }, { "entropy": 1.8971919193863869, "epoch": 0.39520784525856756, "grad_norm": 7.40570592880249, "learning_rate": 4.024218972058062e-06, "loss": 0.4246, "mean_token_accuracy": 0.8695945486426353, "num_tokens": 153305994.0, "step": 127490 }, { "entropy": 1.8754101365804672, "epoch": 0.3952388443836173, "grad_norm": 7.888429641723633, "learning_rate": 4.0240611552170665e-06, "loss": 0.4087, "mean_token_accuracy": 0.8595315098762513, "num_tokens": 153319098.0, "step": 127500 }, { "entropy": 1.86890929043293, "epoch": 0.39526984350866695, "grad_norm": 8.941306114196777, "learning_rate": 4.023903356941811e-06, "loss": 0.4383, "mean_token_accuracy": 0.8576035022735595, "num_tokens": 153331242.0, "step": 127510 }, { "entropy": 1.8997353851795196, "epoch": 0.3953008426337167, "grad_norm": 7.998995780944824, "learning_rate": 4.023745577228658e-06, "loss": 0.4046, "mean_token_accuracy": 0.8570963263511657, "num_tokens": 153344304.0, "step": 127520 }, { "entropy": 1.820690706372261, "epoch": 0.39533184175876634, "grad_norm": 7.785648822784424, "learning_rate": 4.023587816073965e-06, "loss": 0.4075, "mean_token_accuracy": 0.8614529862999916, "num_tokens": 153358390.0, "step": 127530 }, { "entropy": 1.8702727839350701, "epoch": 0.39536284088381607, "grad_norm": 4.13304328918457, "learning_rate": 4.023430073474097e-06, "loss": 0.4442, "mean_token_accuracy": 0.8575648859143257, "num_tokens": 153370126.0, "step": 127540 }, { "entropy": 1.81645250543952, "epoch": 0.39539384000886574, "grad_norm": 2.846233606338501, "learning_rate": 4.023272349425415e-06, "loss": 0.3549, "mean_token_accuracy": 0.8668262645602226, "num_tokens": 153383650.0, "step": 127550 }, { "entropy": 1.899166676402092, "epoch": 0.39542483913391546, "grad_norm": 8.356348037719727, "learning_rate": 4.023114643924286e-06, "loss": 0.4868, "mean_token_accuracy": 0.8431504383683205, "num_tokens": 153395416.0, "step": 127560 }, { "entropy": 1.8529985576868058, "epoch": 0.39545583825896513, "grad_norm": 2.3607380390167236, "learning_rate": 4.022956956967072e-06, "loss": 0.4343, "mean_token_accuracy": 0.8648950770497322, "num_tokens": 153407938.0, "step": 127570 }, { "entropy": 1.8317057311534881, "epoch": 0.39548683738401486, "grad_norm": 8.973584175109863, "learning_rate": 4.022799288550142e-06, "loss": 0.4097, "mean_token_accuracy": 0.8503635957837105, "num_tokens": 153420425.0, "step": 127580 }, { "entropy": 1.967195200920105, "epoch": 0.3955178365090645, "grad_norm": 8.250533103942871, "learning_rate": 4.022641638669861e-06, "loss": 0.4677, "mean_token_accuracy": 0.8539433136582375, "num_tokens": 153431948.0, "step": 127590 }, { "entropy": 1.8443527117371559, "epoch": 0.39554883563411425, "grad_norm": 8.231595039367676, "learning_rate": 4.022484007322598e-06, "loss": 0.4609, "mean_token_accuracy": 0.8479686647653579, "num_tokens": 153445161.0, "step": 127600 }, { "entropy": 1.9092737153172492, "epoch": 0.3955798347591639, "grad_norm": 8.519765853881836, "learning_rate": 4.022326394504722e-06, "loss": 0.4604, "mean_token_accuracy": 0.8497352659702301, "num_tokens": 153456782.0, "step": 127610 }, { "entropy": 1.9170891061425208, "epoch": 0.39561083388421364, "grad_norm": 7.752858638763428, "learning_rate": 4.0221688002126016e-06, "loss": 0.462, "mean_token_accuracy": 0.8510538384318351, "num_tokens": 153468381.0, "step": 127620 }, { "entropy": 1.8833612218499183, "epoch": 0.3956418330092633, "grad_norm": 4.76165246963501, "learning_rate": 4.022011224442611e-06, "loss": 0.4545, "mean_token_accuracy": 0.8437580853700638, "num_tokens": 153481424.0, "step": 127630 }, { "entropy": 1.9320109218358994, "epoch": 0.39567283213431304, "grad_norm": 4.280218601226807, "learning_rate": 4.02185366719112e-06, "loss": 0.4584, "mean_token_accuracy": 0.848100745677948, "num_tokens": 153493443.0, "step": 127640 }, { "entropy": 1.960513624548912, "epoch": 0.3957038312593627, "grad_norm": 8.801411628723145, "learning_rate": 4.021696128454502e-06, "loss": 0.503, "mean_token_accuracy": 0.8425059333443642, "num_tokens": 153504656.0, "step": 127650 }, { "entropy": 1.9096018448472023, "epoch": 0.3957348303844124, "grad_norm": 3.8925299644470215, "learning_rate": 4.0215386082291315e-06, "loss": 0.4661, "mean_token_accuracy": 0.8456997722387314, "num_tokens": 153516555.0, "step": 127660 }, { "entropy": 1.9139437288045884, "epoch": 0.3957658295094621, "grad_norm": 7.8598127365112305, "learning_rate": 4.021381106511384e-06, "loss": 0.5097, "mean_token_accuracy": 0.8359185487031937, "num_tokens": 153528681.0, "step": 127670 }, { "entropy": 1.925663386285305, "epoch": 0.39579682863451177, "grad_norm": 7.7925944328308105, "learning_rate": 4.021223623297635e-06, "loss": 0.4377, "mean_token_accuracy": 0.8525381490588189, "num_tokens": 153540256.0, "step": 127680 }, { "entropy": 1.9570807069540024, "epoch": 0.3958278277595615, "grad_norm": 7.370650291442871, "learning_rate": 4.021066158584261e-06, "loss": 0.5033, "mean_token_accuracy": 0.848370935022831, "num_tokens": 153551273.0, "step": 127690 }, { "entropy": 1.8661099091172217, "epoch": 0.39585882688461116, "grad_norm": 4.928553581237793, "learning_rate": 4.020908712367641e-06, "loss": 0.4293, "mean_token_accuracy": 0.8484266191720963, "num_tokens": 153563180.0, "step": 127700 }, { "entropy": 1.9060293585062027, "epoch": 0.3958898260096609, "grad_norm": 9.447659492492676, "learning_rate": 4.020751284644153e-06, "loss": 0.4944, "mean_token_accuracy": 0.8432995095849037, "num_tokens": 153575032.0, "step": 127710 }, { "entropy": 1.953291055560112, "epoch": 0.39592082513471055, "grad_norm": 7.042132377624512, "learning_rate": 4.020593875410178e-06, "loss": 0.4451, "mean_token_accuracy": 0.8566155821084976, "num_tokens": 153586028.0, "step": 127720 }, { "entropy": 1.9117076322436333, "epoch": 0.3959518242597603, "grad_norm": 8.872218132019043, "learning_rate": 4.020436484662098e-06, "loss": 0.4485, "mean_token_accuracy": 0.8476521626114846, "num_tokens": 153599040.0, "step": 127730 }, { "entropy": 1.8983178213238716, "epoch": 0.39598282338480995, "grad_norm": 8.394247055053711, "learning_rate": 4.020279112396293e-06, "loss": 0.4207, "mean_token_accuracy": 0.8589667662978172, "num_tokens": 153610777.0, "step": 127740 }, { "entropy": 1.9231909066438675, "epoch": 0.39601382250985967, "grad_norm": 10.013471603393555, "learning_rate": 4.0201217586091465e-06, "loss": 0.4891, "mean_token_accuracy": 0.8497250765562058, "num_tokens": 153621814.0, "step": 127750 }, { "entropy": 1.900232744216919, "epoch": 0.39604482163490934, "grad_norm": 9.529150009155273, "learning_rate": 4.019964423297043e-06, "loss": 0.4934, "mean_token_accuracy": 0.842905105650425, "num_tokens": 153634211.0, "step": 127760 }, { "entropy": 1.8112396225333214, "epoch": 0.39607582075995906, "grad_norm": 9.09139347076416, "learning_rate": 4.019807106456367e-06, "loss": 0.3551, "mean_token_accuracy": 0.8626585349440574, "num_tokens": 153647738.0, "step": 127770 }, { "entropy": 1.8331888422369957, "epoch": 0.39610681988500873, "grad_norm": 6.125622272491455, "learning_rate": 4.0196498080835054e-06, "loss": 0.3867, "mean_token_accuracy": 0.8652116924524307, "num_tokens": 153661042.0, "step": 127780 }, { "entropy": 1.9142784118652343, "epoch": 0.39613781901005846, "grad_norm": 4.094144821166992, "learning_rate": 4.019492528174845e-06, "loss": 0.4589, "mean_token_accuracy": 0.8501773819327354, "num_tokens": 153673112.0, "step": 127790 }, { "entropy": 1.8980029091238975, "epoch": 0.3961688181351081, "grad_norm": 3.3827157020568848, "learning_rate": 4.0193352667267725e-06, "loss": 0.4441, "mean_token_accuracy": 0.8512454509735108, "num_tokens": 153685077.0, "step": 127800 }, { "entropy": 1.716671919822693, "epoch": 0.39619981726015785, "grad_norm": 3.326303482055664, "learning_rate": 4.019178023735678e-06, "loss": 0.2917, "mean_token_accuracy": 0.8814573958516121, "num_tokens": 153699159.0, "step": 127810 }, { "entropy": 1.7483276754617691, "epoch": 0.3962308163852075, "grad_norm": 8.083634376525879, "learning_rate": 4.019020799197951e-06, "loss": 0.4023, "mean_token_accuracy": 0.8582610175013542, "num_tokens": 153713542.0, "step": 127820 }, { "entropy": 1.915240579843521, "epoch": 0.39626181551025724, "grad_norm": 10.262462615966797, "learning_rate": 4.018863593109982e-06, "loss": 0.4624, "mean_token_accuracy": 0.8524910137057304, "num_tokens": 153725218.0, "step": 127830 }, { "entropy": 1.8652495056390763, "epoch": 0.3962928146353069, "grad_norm": 9.742287635803223, "learning_rate": 4.018706405468165e-06, "loss": 0.4266, "mean_token_accuracy": 0.8553525045514107, "num_tokens": 153737029.0, "step": 127840 }, { "entropy": 1.9075027361512185, "epoch": 0.39632381376035664, "grad_norm": 9.754807472229004, "learning_rate": 4.018549236268891e-06, "loss": 0.4476, "mean_token_accuracy": 0.8644733369350434, "num_tokens": 153748555.0, "step": 127850 }, { "entropy": 1.8555178031325341, "epoch": 0.3963548128854063, "grad_norm": 10.997365951538086, "learning_rate": 4.018392085508554e-06, "loss": 0.4603, "mean_token_accuracy": 0.8505488753318786, "num_tokens": 153761293.0, "step": 127860 }, { "entropy": 1.8854364529252052, "epoch": 0.39638581201045603, "grad_norm": 10.64345932006836, "learning_rate": 4.018234953183549e-06, "loss": 0.4653, "mean_token_accuracy": 0.8452537760138512, "num_tokens": 153773673.0, "step": 127870 }, { "entropy": 1.9308582305908204, "epoch": 0.3964168111355057, "grad_norm": 8.477107048034668, "learning_rate": 4.018077839290272e-06, "loss": 0.4758, "mean_token_accuracy": 0.8420649126172066, "num_tokens": 153785000.0, "step": 127880 }, { "entropy": 1.8462091460824013, "epoch": 0.3964478102605554, "grad_norm": 4.125033855438232, "learning_rate": 4.01792074382512e-06, "loss": 0.4301, "mean_token_accuracy": 0.8481978312134743, "num_tokens": 153797926.0, "step": 127890 }, { "entropy": 1.9186622604727746, "epoch": 0.3964788093856051, "grad_norm": 8.226593971252441, "learning_rate": 4.0177636667844914e-06, "loss": 0.4926, "mean_token_accuracy": 0.8365380227565765, "num_tokens": 153809721.0, "step": 127900 }, { "entropy": 1.8989548355340957, "epoch": 0.39650980851065476, "grad_norm": 7.229928970336914, "learning_rate": 4.017606608164784e-06, "loss": 0.4022, "mean_token_accuracy": 0.8661728858947754, "num_tokens": 153821479.0, "step": 127910 }, { "entropy": 1.875709056854248, "epoch": 0.3965408076357045, "grad_norm": 5.093418598175049, "learning_rate": 4.017449567962398e-06, "loss": 0.4293, "mean_token_accuracy": 0.8482154250144959, "num_tokens": 153833724.0, "step": 127920 }, { "entropy": 1.9880733832716941, "epoch": 0.39657180676075415, "grad_norm": 8.915475845336914, "learning_rate": 4.0172925461737336e-06, "loss": 0.4746, "mean_token_accuracy": 0.8514209687709808, "num_tokens": 153844594.0, "step": 127930 }, { "entropy": 1.8570735067129136, "epoch": 0.3966028058858039, "grad_norm": 4.229099273681641, "learning_rate": 4.017135542795195e-06, "loss": 0.4222, "mean_token_accuracy": 0.8634174600243568, "num_tokens": 153856932.0, "step": 127940 }, { "entropy": 1.9012201085686684, "epoch": 0.39663380501085355, "grad_norm": 11.16518783569336, "learning_rate": 4.016978557823181e-06, "loss": 0.4162, "mean_token_accuracy": 0.8594008833169937, "num_tokens": 153868678.0, "step": 127950 }, { "entropy": 1.7977351397275925, "epoch": 0.39666480413590327, "grad_norm": 7.758661270141602, "learning_rate": 4.016821591254099e-06, "loss": 0.3605, "mean_token_accuracy": 0.8647834539413453, "num_tokens": 153882380.0, "step": 127960 }, { "entropy": 1.86228808760643, "epoch": 0.39669580326095294, "grad_norm": 4.080726623535156, "learning_rate": 4.016664643084352e-06, "loss": 0.4335, "mean_token_accuracy": 0.8644084423780442, "num_tokens": 153894569.0, "step": 127970 }, { "entropy": 1.9319358631968497, "epoch": 0.39672680238600266, "grad_norm": 7.336208820343018, "learning_rate": 4.016507713310346e-06, "loss": 0.474, "mean_token_accuracy": 0.8546565786004067, "num_tokens": 153905410.0, "step": 127980 }, { "entropy": 1.8738821625709534, "epoch": 0.39675780151105233, "grad_norm": 8.000831604003906, "learning_rate": 4.016350801928489e-06, "loss": 0.4445, "mean_token_accuracy": 0.8442562118172645, "num_tokens": 153918966.0, "step": 127990 }, { "entropy": 1.8228215545415878, "epoch": 0.39678880063610206, "grad_norm": 6.075873374938965, "learning_rate": 4.0161939089351855e-06, "loss": 0.4462, "mean_token_accuracy": 0.852639339864254, "num_tokens": 153931674.0, "step": 128000 }, { "entropy": 1.8811346724629403, "epoch": 0.3968197997611517, "grad_norm": 8.297804832458496, "learning_rate": 4.016037034326846e-06, "loss": 0.4313, "mean_token_accuracy": 0.8590738877654076, "num_tokens": 153943910.0, "step": 128010 }, { "entropy": 1.9253056347370148, "epoch": 0.39685079888620145, "grad_norm": 9.22485637664795, "learning_rate": 4.015880178099881e-06, "loss": 0.4444, "mean_token_accuracy": 0.8511346384882927, "num_tokens": 153955547.0, "step": 128020 }, { "entropy": 1.814525055885315, "epoch": 0.3968817980112511, "grad_norm": 7.030633449554443, "learning_rate": 4.0157233402507e-06, "loss": 0.3723, "mean_token_accuracy": 0.8703080207109452, "num_tokens": 153968920.0, "step": 128030 }, { "entropy": 1.9247617200016975, "epoch": 0.39691279713630084, "grad_norm": 9.265406608581543, "learning_rate": 4.015566520775715e-06, "loss": 0.4815, "mean_token_accuracy": 0.849250017106533, "num_tokens": 153980346.0, "step": 128040 }, { "entropy": 1.9104125529527665, "epoch": 0.3969437962613505, "grad_norm": 3.5930426120758057, "learning_rate": 4.015409719671339e-06, "loss": 0.4676, "mean_token_accuracy": 0.8543394804000854, "num_tokens": 153992553.0, "step": 128050 }, { "entropy": 1.866149678826332, "epoch": 0.39697479538640024, "grad_norm": 6.8628926277160645, "learning_rate": 4.015252936933985e-06, "loss": 0.4338, "mean_token_accuracy": 0.8579761996865273, "num_tokens": 154004883.0, "step": 128060 }, { "entropy": 1.9460046872496606, "epoch": 0.3970057945114499, "grad_norm": 9.495827674865723, "learning_rate": 4.015096172560067e-06, "loss": 0.4824, "mean_token_accuracy": 0.8454651072621345, "num_tokens": 154016816.0, "step": 128070 }, { "entropy": 1.8353903412818908, "epoch": 0.39703679363649963, "grad_norm": 8.298847198486328, "learning_rate": 4.014939426546002e-06, "loss": 0.4392, "mean_token_accuracy": 0.8565979763865471, "num_tokens": 154029332.0, "step": 128080 }, { "entropy": 1.8864996194839478, "epoch": 0.3970677927615493, "grad_norm": 8.607029914855957, "learning_rate": 4.014782698888205e-06, "loss": 0.4839, "mean_token_accuracy": 0.842142577469349, "num_tokens": 154040881.0, "step": 128090 }, { "entropy": 1.8832897543907166, "epoch": 0.397098791886599, "grad_norm": 3.863781452178955, "learning_rate": 4.014625989583094e-06, "loss": 0.4027, "mean_token_accuracy": 0.8512258976697922, "num_tokens": 154052885.0, "step": 128100 }, { "entropy": 1.8848641395568848, "epoch": 0.3971297910116487, "grad_norm": 7.123293876647949, "learning_rate": 4.014469298627088e-06, "loss": 0.4602, "mean_token_accuracy": 0.856060591340065, "num_tokens": 154064134.0, "step": 128110 }, { "entropy": 1.8236127287149428, "epoch": 0.3971607901366984, "grad_norm": 5.063018321990967, "learning_rate": 4.0143126260166075e-06, "loss": 0.4309, "mean_token_accuracy": 0.8463837578892708, "num_tokens": 154077307.0, "step": 128120 }, { "entropy": 1.9382410168647766, "epoch": 0.3971917892617481, "grad_norm": 8.761957168579102, "learning_rate": 4.014155971748069e-06, "loss": 0.5266, "mean_token_accuracy": 0.8408651709556579, "num_tokens": 154088700.0, "step": 128130 }, { "entropy": 1.888756561279297, "epoch": 0.3972227883867978, "grad_norm": 9.47224235534668, "learning_rate": 4.013999335817898e-06, "loss": 0.4318, "mean_token_accuracy": 0.8587385430932045, "num_tokens": 154100614.0, "step": 128140 }, { "entropy": 1.9622124269604684, "epoch": 0.3972537875118475, "grad_norm": 8.621251106262207, "learning_rate": 4.013842718222516e-06, "loss": 0.5208, "mean_token_accuracy": 0.8376434296369553, "num_tokens": 154112125.0, "step": 128150 }, { "entropy": 1.9215806141495704, "epoch": 0.39728478663689715, "grad_norm": 8.473328590393066, "learning_rate": 4.013686118958345e-06, "loss": 0.4631, "mean_token_accuracy": 0.852269695699215, "num_tokens": 154123987.0, "step": 128160 }, { "entropy": 1.8634486734867095, "epoch": 0.39731578576194687, "grad_norm": 8.564555168151855, "learning_rate": 4.013529538021809e-06, "loss": 0.4144, "mean_token_accuracy": 0.8536643877625465, "num_tokens": 154137001.0, "step": 128170 }, { "entropy": 1.9504847824573517, "epoch": 0.39734678488699654, "grad_norm": 8.242449760437012, "learning_rate": 4.013372975409336e-06, "loss": 0.4803, "mean_token_accuracy": 0.8405697852373123, "num_tokens": 154148533.0, "step": 128180 }, { "entropy": 1.8996037617325783, "epoch": 0.39737778401204626, "grad_norm": 10.100019454956055, "learning_rate": 4.01321643111735e-06, "loss": 0.4573, "mean_token_accuracy": 0.8533552616834641, "num_tokens": 154160553.0, "step": 128190 }, { "entropy": 1.8098711207509042, "epoch": 0.39740878313709593, "grad_norm": 9.241110801696777, "learning_rate": 4.013059905142279e-06, "loss": 0.3669, "mean_token_accuracy": 0.871603924036026, "num_tokens": 154174245.0, "step": 128200 }, { "entropy": 1.9533662751317025, "epoch": 0.39743978226214566, "grad_norm": 9.006794929504395, "learning_rate": 4.012903397480549e-06, "loss": 0.4952, "mean_token_accuracy": 0.8515314489603043, "num_tokens": 154185508.0, "step": 128210 }, { "entropy": 1.847785583138466, "epoch": 0.3974707813871953, "grad_norm": 4.733590126037598, "learning_rate": 4.012746908128595e-06, "loss": 0.4237, "mean_token_accuracy": 0.8513537049293518, "num_tokens": 154198114.0, "step": 128220 }, { "entropy": 1.908326794207096, "epoch": 0.39750178051224505, "grad_norm": 7.348505973815918, "learning_rate": 4.012590437082841e-06, "loss": 0.4096, "mean_token_accuracy": 0.8644715324044228, "num_tokens": 154210418.0, "step": 128230 }, { "entropy": 1.9537509649991989, "epoch": 0.3975327796372947, "grad_norm": 7.655646800994873, "learning_rate": 4.0124339843397216e-06, "loss": 0.4951, "mean_token_accuracy": 0.8567704558372498, "num_tokens": 154221854.0, "step": 128240 }, { "entropy": 1.8390052139759063, "epoch": 0.39756377876234444, "grad_norm": 8.591985702514648, "learning_rate": 4.012277549895668e-06, "loss": 0.4074, "mean_token_accuracy": 0.859342285990715, "num_tokens": 154234898.0, "step": 128250 }, { "entropy": 1.8981177046895028, "epoch": 0.3975947778873941, "grad_norm": 3.5473618507385254, "learning_rate": 4.012121133747113e-06, "loss": 0.438, "mean_token_accuracy": 0.8572326749563217, "num_tokens": 154247303.0, "step": 128260 }, { "entropy": 1.9318851605057716, "epoch": 0.39762577701244384, "grad_norm": 6.904049873352051, "learning_rate": 4.011964735890492e-06, "loss": 0.4364, "mean_token_accuracy": 0.8485365882515907, "num_tokens": 154259127.0, "step": 128270 }, { "entropy": 1.856573697924614, "epoch": 0.3976567761374935, "grad_norm": 7.510590553283691, "learning_rate": 4.011808356322238e-06, "loss": 0.4312, "mean_token_accuracy": 0.8440538689494133, "num_tokens": 154271893.0, "step": 128280 }, { "entropy": 1.9165368214249612, "epoch": 0.39768777526254323, "grad_norm": 3.662731885910034, "learning_rate": 4.011651995038789e-06, "loss": 0.4618, "mean_token_accuracy": 0.8502262338995934, "num_tokens": 154283424.0, "step": 128290 }, { "entropy": 1.9400472521781922, "epoch": 0.3977187743875929, "grad_norm": 8.449536323547363, "learning_rate": 4.011495652036581e-06, "loss": 0.4438, "mean_token_accuracy": 0.864606074988842, "num_tokens": 154294507.0, "step": 128300 }, { "entropy": 1.9318364679813385, "epoch": 0.3977497735126426, "grad_norm": 7.544651031494141, "learning_rate": 4.011339327312052e-06, "loss": 0.4951, "mean_token_accuracy": 0.851364016532898, "num_tokens": 154306187.0, "step": 128310 }, { "entropy": 1.859896543622017, "epoch": 0.3977807726376923, "grad_norm": 7.740284442901611, "learning_rate": 4.0111830208616405e-06, "loss": 0.4208, "mean_token_accuracy": 0.8656293138861656, "num_tokens": 154318477.0, "step": 128320 }, { "entropy": 1.9752064496278763, "epoch": 0.397811771762742, "grad_norm": 7.771737575531006, "learning_rate": 4.011026732681787e-06, "loss": 0.47, "mean_token_accuracy": 0.8587437257170677, "num_tokens": 154329265.0, "step": 128330 }, { "entropy": 1.826411110162735, "epoch": 0.3978427708877917, "grad_norm": 9.697999000549316, "learning_rate": 4.010870462768933e-06, "loss": 0.4771, "mean_token_accuracy": 0.8450366973876953, "num_tokens": 154343157.0, "step": 128340 }, { "entropy": 1.8539129607379436, "epoch": 0.3978737700128414, "grad_norm": 9.17566967010498, "learning_rate": 4.010714211119519e-06, "loss": 0.4242, "mean_token_accuracy": 0.8553266122937202, "num_tokens": 154354274.0, "step": 128350 }, { "entropy": 1.8710831806063652, "epoch": 0.3979047691378911, "grad_norm": 6.939380168914795, "learning_rate": 4.010557977729989e-06, "loss": 0.402, "mean_token_accuracy": 0.8506567060947419, "num_tokens": 154367463.0, "step": 128360 }, { "entropy": 1.9934969753026963, "epoch": 0.3979357682629408, "grad_norm": 9.315759658813477, "learning_rate": 4.010401762596786e-06, "loss": 0.5463, "mean_token_accuracy": 0.8378830254077911, "num_tokens": 154378868.0, "step": 128370 }, { "entropy": 1.9435987308621407, "epoch": 0.39796676738799047, "grad_norm": 7.836657524108887, "learning_rate": 4.010245565716356e-06, "loss": 0.443, "mean_token_accuracy": 0.857276414334774, "num_tokens": 154390849.0, "step": 128380 }, { "entropy": 1.8966143906116486, "epoch": 0.3979977665130402, "grad_norm": 8.332956314086914, "learning_rate": 4.010089387085143e-06, "loss": 0.4451, "mean_token_accuracy": 0.8508517384529114, "num_tokens": 154402741.0, "step": 128390 }, { "entropy": 1.9793959528207778, "epoch": 0.39802876563808987, "grad_norm": 8.801210403442383, "learning_rate": 4.009933226699596e-06, "loss": 0.5463, "mean_token_accuracy": 0.8385202527046204, "num_tokens": 154413301.0, "step": 128400 }, { "entropy": 1.9293900340795518, "epoch": 0.39805976476313953, "grad_norm": 10.354657173156738, "learning_rate": 4.00977708455616e-06, "loss": 0.5402, "mean_token_accuracy": 0.8320423439145088, "num_tokens": 154424899.0, "step": 128410 }, { "entropy": 1.7377336770296097, "epoch": 0.39809076388818926, "grad_norm": 7.612285137176514, "learning_rate": 4.009620960651285e-06, "loss": 0.3372, "mean_token_accuracy": 0.8693819224834443, "num_tokens": 154439450.0, "step": 128420 }, { "entropy": 1.9407954409718513, "epoch": 0.3981217630132389, "grad_norm": 7.916438579559326, "learning_rate": 4.00946485498142e-06, "loss": 0.4653, "mean_token_accuracy": 0.8480086639523506, "num_tokens": 154451790.0, "step": 128430 }, { "entropy": 1.9476406499743462, "epoch": 0.39815276213828865, "grad_norm": 4.425556659698486, "learning_rate": 4.009308767543017e-06, "loss": 0.4649, "mean_token_accuracy": 0.8459201797842979, "num_tokens": 154463897.0, "step": 128440 }, { "entropy": 2.0008150786161423, "epoch": 0.3981837612633383, "grad_norm": 9.704833030700684, "learning_rate": 4.009152698332526e-06, "loss": 0.4657, "mean_token_accuracy": 0.8437413021922111, "num_tokens": 154475527.0, "step": 128450 }, { "entropy": 1.7799909293651581, "epoch": 0.39821476038838804, "grad_norm": 3.9981141090393066, "learning_rate": 4.0089966473464e-06, "loss": 0.3358, "mean_token_accuracy": 0.8721995532512665, "num_tokens": 154489848.0, "step": 128460 }, { "entropy": 1.9085913822054863, "epoch": 0.3982457595134377, "grad_norm": 5.100532054901123, "learning_rate": 4.008840614581093e-06, "loss": 0.436, "mean_token_accuracy": 0.856728957593441, "num_tokens": 154502460.0, "step": 128470 }, { "entropy": 1.9310724124312402, "epoch": 0.39827675863848744, "grad_norm": 7.409860610961914, "learning_rate": 4.008684600033059e-06, "loss": 0.4814, "mean_token_accuracy": 0.8475088179111481, "num_tokens": 154515158.0, "step": 128480 }, { "entropy": 1.9532846316695214, "epoch": 0.3983077577635371, "grad_norm": 6.9244561195373535, "learning_rate": 4.008528603698753e-06, "loss": 0.4868, "mean_token_accuracy": 0.8392842382192611, "num_tokens": 154526053.0, "step": 128490 }, { "entropy": 1.864257997274399, "epoch": 0.39833875688858683, "grad_norm": 8.621221542358398, "learning_rate": 4.008372625574633e-06, "loss": 0.4103, "mean_token_accuracy": 0.8733592018485069, "num_tokens": 154537962.0, "step": 128500 }, { "entropy": 1.7741185277700424, "epoch": 0.3983697560136365, "grad_norm": 12.221007347106934, "learning_rate": 4.008216665657155e-06, "loss": 0.36, "mean_token_accuracy": 0.8721621558070183, "num_tokens": 154551264.0, "step": 128510 }, { "entropy": 1.8366203412413598, "epoch": 0.3984007551386862, "grad_norm": 8.838532447814941, "learning_rate": 4.008060723942776e-06, "loss": 0.3882, "mean_token_accuracy": 0.8588550105690956, "num_tokens": 154563900.0, "step": 128520 }, { "entropy": 1.9761778950691222, "epoch": 0.3984317542637359, "grad_norm": 8.271575927734375, "learning_rate": 4.007904800427958e-06, "loss": 0.5278, "mean_token_accuracy": 0.8369174391031265, "num_tokens": 154574439.0, "step": 128530 }, { "entropy": 1.871373300254345, "epoch": 0.3984627533887856, "grad_norm": 3.379486322402954, "learning_rate": 4.0077488951091595e-06, "loss": 0.4258, "mean_token_accuracy": 0.852562639117241, "num_tokens": 154586791.0, "step": 128540 }, { "entropy": 1.8863118886947632, "epoch": 0.3984937525138353, "grad_norm": 8.740182876586914, "learning_rate": 4.007593007982842e-06, "loss": 0.4544, "mean_token_accuracy": 0.8477497771382332, "num_tokens": 154599491.0, "step": 128550 }, { "entropy": 1.8409544050693512, "epoch": 0.398524751638885, "grad_norm": 4.045310020446777, "learning_rate": 4.007437139045469e-06, "loss": 0.3935, "mean_token_accuracy": 0.8576883286237716, "num_tokens": 154612812.0, "step": 128560 }, { "entropy": 1.8716674730181695, "epoch": 0.3985557507639347, "grad_norm": 4.682071208953857, "learning_rate": 4.007281288293502e-06, "loss": 0.4365, "mean_token_accuracy": 0.8520221546292305, "num_tokens": 154625583.0, "step": 128570 }, { "entropy": 1.9001308396458625, "epoch": 0.3985867498889844, "grad_norm": 7.292641639709473, "learning_rate": 4.007125455723405e-06, "loss": 0.4524, "mean_token_accuracy": 0.8661713659763336, "num_tokens": 154637640.0, "step": 128580 }, { "entropy": 1.9417346104979516, "epoch": 0.3986177490140341, "grad_norm": 8.141678810119629, "learning_rate": 4.006969641331644e-06, "loss": 0.4585, "mean_token_accuracy": 0.8521144181489945, "num_tokens": 154649418.0, "step": 128590 }, { "entropy": 1.9820822283625603, "epoch": 0.3986487481390838, "grad_norm": 10.370221138000488, "learning_rate": 4.0068138451146845e-06, "loss": 0.5383, "mean_token_accuracy": 0.8394621968269348, "num_tokens": 154660964.0, "step": 128600 }, { "entropy": 1.836016722023487, "epoch": 0.39867974726413347, "grad_norm": 3.998152017593384, "learning_rate": 4.006658067068994e-06, "loss": 0.4016, "mean_token_accuracy": 0.8594549700617791, "num_tokens": 154674134.0, "step": 128610 }, { "entropy": 1.9683793038129807, "epoch": 0.3987107463891832, "grad_norm": 8.68581771850586, "learning_rate": 4.0065023071910395e-06, "loss": 0.4343, "mean_token_accuracy": 0.866059435904026, "num_tokens": 154685620.0, "step": 128620 }, { "entropy": 1.9310366541147232, "epoch": 0.39874174551423286, "grad_norm": 8.650096893310547, "learning_rate": 4.006346565477289e-06, "loss": 0.4354, "mean_token_accuracy": 0.8481193721294403, "num_tokens": 154697293.0, "step": 128630 }, { "entropy": 1.9691759124398232, "epoch": 0.3987727446392826, "grad_norm": 7.362710952758789, "learning_rate": 4.006190841924217e-06, "loss": 0.4832, "mean_token_accuracy": 0.8400867089629174, "num_tokens": 154708876.0, "step": 128640 }, { "entropy": 1.8157849997282027, "epoch": 0.39880374376433225, "grad_norm": 3.911681652069092, "learning_rate": 4.006035136528288e-06, "loss": 0.4236, "mean_token_accuracy": 0.8522502645850182, "num_tokens": 154721940.0, "step": 128650 }, { "entropy": 1.9943493276834487, "epoch": 0.3988347428893819, "grad_norm": 9.267487525939941, "learning_rate": 4.005879449285979e-06, "loss": 0.5359, "mean_token_accuracy": 0.8361817836761475, "num_tokens": 154732230.0, "step": 128660 }, { "entropy": 1.9468561723828315, "epoch": 0.39886574201443165, "grad_norm": 8.407017707824707, "learning_rate": 4.00572378019376e-06, "loss": 0.4778, "mean_token_accuracy": 0.8474451348185539, "num_tokens": 154743873.0, "step": 128670 }, { "entropy": 1.883972604572773, "epoch": 0.3988967411394813, "grad_norm": 10.778926849365234, "learning_rate": 4.005568129248105e-06, "loss": 0.4577, "mean_token_accuracy": 0.8490250527858734, "num_tokens": 154756322.0, "step": 128680 }, { "entropy": 1.9082299306988717, "epoch": 0.39892774026453104, "grad_norm": 8.559072494506836, "learning_rate": 4.0054124964454904e-06, "loss": 0.4325, "mean_token_accuracy": 0.8479592993855476, "num_tokens": 154768822.0, "step": 128690 }, { "entropy": 1.915241825580597, "epoch": 0.3989587393895807, "grad_norm": 8.535916328430176, "learning_rate": 4.005256881782389e-06, "loss": 0.4834, "mean_token_accuracy": 0.848818339407444, "num_tokens": 154780787.0, "step": 128700 }, { "entropy": 1.85912893563509, "epoch": 0.39898973851463043, "grad_norm": 8.055673599243164, "learning_rate": 4.005101285255279e-06, "loss": 0.4004, "mean_token_accuracy": 0.8605345159769058, "num_tokens": 154793208.0, "step": 128710 }, { "entropy": 1.9649493724107743, "epoch": 0.3990207376396801, "grad_norm": 8.454475402832031, "learning_rate": 4.004945706860638e-06, "loss": 0.4744, "mean_token_accuracy": 0.8544456735253334, "num_tokens": 154804930.0, "step": 128720 }, { "entropy": 1.923764231801033, "epoch": 0.3990517367647298, "grad_norm": 7.50667142868042, "learning_rate": 4.004790146594944e-06, "loss": 0.5058, "mean_token_accuracy": 0.8489472419023514, "num_tokens": 154817425.0, "step": 128730 }, { "entropy": 1.8540160700678825, "epoch": 0.3990827358897795, "grad_norm": 4.772271633148193, "learning_rate": 4.004634604454677e-06, "loss": 0.4086, "mean_token_accuracy": 0.8635622560977936, "num_tokens": 154829558.0, "step": 128740 }, { "entropy": 1.8815680295228958, "epoch": 0.3991137350148292, "grad_norm": 8.889459609985352, "learning_rate": 4.004479080436317e-06, "loss": 0.4128, "mean_token_accuracy": 0.8565109595656395, "num_tokens": 154841865.0, "step": 128750 }, { "entropy": 1.8703197434544563, "epoch": 0.3991447341398789, "grad_norm": 3.177785634994507, "learning_rate": 4.004323574536345e-06, "loss": 0.4666, "mean_token_accuracy": 0.851426774263382, "num_tokens": 154854216.0, "step": 128760 }, { "entropy": 1.8539531543850898, "epoch": 0.3991757332649286, "grad_norm": 7.972646236419678, "learning_rate": 4.004168086751243e-06, "loss": 0.3709, "mean_token_accuracy": 0.8601129725575447, "num_tokens": 154867350.0, "step": 128770 }, { "entropy": 1.9226906821131706, "epoch": 0.3992067323899783, "grad_norm": 3.4731192588806152, "learning_rate": 4.0040126170774955e-06, "loss": 0.4739, "mean_token_accuracy": 0.8491318255662919, "num_tokens": 154879654.0, "step": 128780 }, { "entropy": 1.8054320514202118, "epoch": 0.399237731515028, "grad_norm": 9.286853790283203, "learning_rate": 4.003857165511587e-06, "loss": 0.3863, "mean_token_accuracy": 0.8595024034380913, "num_tokens": 154893043.0, "step": 128790 }, { "entropy": 1.8495088413357734, "epoch": 0.3992687306400777, "grad_norm": 8.713004112243652, "learning_rate": 4.003701732050002e-06, "loss": 0.423, "mean_token_accuracy": 0.8574372127652168, "num_tokens": 154905390.0, "step": 128800 }, { "entropy": 1.87299737483263, "epoch": 0.3992997297651274, "grad_norm": 9.513683319091797, "learning_rate": 4.003546316689225e-06, "loss": 0.4342, "mean_token_accuracy": 0.8459521770477295, "num_tokens": 154918334.0, "step": 128810 }, { "entropy": 1.9595311015844346, "epoch": 0.39933072889017707, "grad_norm": 9.396753311157227, "learning_rate": 4.003390919425746e-06, "loss": 0.4926, "mean_token_accuracy": 0.8480179443955421, "num_tokens": 154929275.0, "step": 128820 }, { "entropy": 1.8720010727643968, "epoch": 0.3993617280152268, "grad_norm": 8.757028579711914, "learning_rate": 4.003235540256052e-06, "loss": 0.4176, "mean_token_accuracy": 0.8585854455828666, "num_tokens": 154941320.0, "step": 128830 }, { "entropy": 1.9553624257445335, "epoch": 0.39939272714027646, "grad_norm": 10.621780395507812, "learning_rate": 4.003080179176631e-06, "loss": 0.4905, "mean_token_accuracy": 0.8457315772771835, "num_tokens": 154952903.0, "step": 128840 }, { "entropy": 1.7949346914887427, "epoch": 0.3994237262653262, "grad_norm": 2.807647466659546, "learning_rate": 4.002924836183973e-06, "loss": 0.3984, "mean_token_accuracy": 0.8518297106027604, "num_tokens": 154967282.0, "step": 128850 }, { "entropy": 1.8987135276198388, "epoch": 0.39945472539037585, "grad_norm": 8.776113510131836, "learning_rate": 4.002769511274571e-06, "loss": 0.4808, "mean_token_accuracy": 0.8481128692626954, "num_tokens": 154979746.0, "step": 128860 }, { "entropy": 1.9235922917723656, "epoch": 0.3994857245154256, "grad_norm": 9.092692375183105, "learning_rate": 4.002614204444915e-06, "loss": 0.4885, "mean_token_accuracy": 0.8417194813489914, "num_tokens": 154991799.0, "step": 128870 }, { "entropy": 1.8608113884925843, "epoch": 0.39951672364047525, "grad_norm": 7.770845413208008, "learning_rate": 4.002458915691497e-06, "loss": 0.4511, "mean_token_accuracy": 0.8441184401512146, "num_tokens": 155005153.0, "step": 128880 }, { "entropy": 1.9966261342167855, "epoch": 0.39954772276552497, "grad_norm": 7.349844932556152, "learning_rate": 4.002303645010813e-06, "loss": 0.496, "mean_token_accuracy": 0.8491346299648285, "num_tokens": 155016749.0, "step": 128890 }, { "entropy": 1.8176558762788773, "epoch": 0.39957872189057464, "grad_norm": 7.31537389755249, "learning_rate": 4.002148392399357e-06, "loss": 0.386, "mean_token_accuracy": 0.8675624996423721, "num_tokens": 155030413.0, "step": 128900 }, { "entropy": 1.7823689445853232, "epoch": 0.3996097210156243, "grad_norm": 8.566244125366211, "learning_rate": 4.001993157853624e-06, "loss": 0.4131, "mean_token_accuracy": 0.8506886452436447, "num_tokens": 155044038.0, "step": 128910 }, { "entropy": 1.8545478105545044, "epoch": 0.39964072014067403, "grad_norm": 7.566455364227295, "learning_rate": 4.001837941370112e-06, "loss": 0.3984, "mean_token_accuracy": 0.8633383393287659, "num_tokens": 155056896.0, "step": 128920 }, { "entropy": 1.94671870470047, "epoch": 0.3996717192657237, "grad_norm": 9.15937328338623, "learning_rate": 4.0016827429453155e-06, "loss": 0.4891, "mean_token_accuracy": 0.8502067849040031, "num_tokens": 155067894.0, "step": 128930 }, { "entropy": 1.8951860249042511, "epoch": 0.3997027183907734, "grad_norm": 9.120491027832031, "learning_rate": 4.001527562575736e-06, "loss": 0.4282, "mean_token_accuracy": 0.8581095486879349, "num_tokens": 155079974.0, "step": 128940 }, { "entropy": 1.973567920923233, "epoch": 0.3997337175158231, "grad_norm": 8.294892311096191, "learning_rate": 4.001372400257873e-06, "loss": 0.5001, "mean_token_accuracy": 0.8433159068226814, "num_tokens": 155091036.0, "step": 128950 }, { "entropy": 1.85403670668602, "epoch": 0.3997647166408728, "grad_norm": 4.179879188537598, "learning_rate": 4.001217255988225e-06, "loss": 0.4339, "mean_token_accuracy": 0.8468100443482399, "num_tokens": 155103812.0, "step": 128960 }, { "entropy": 1.8100439786911011, "epoch": 0.3997957157659225, "grad_norm": 3.8184187412261963, "learning_rate": 4.001062129763296e-06, "loss": 0.4437, "mean_token_accuracy": 0.870311813056469, "num_tokens": 155117031.0, "step": 128970 }, { "entropy": 1.9028596684336663, "epoch": 0.3998267148909722, "grad_norm": 7.593201637268066, "learning_rate": 4.000907021579585e-06, "loss": 0.4921, "mean_token_accuracy": 0.8530150771141052, "num_tokens": 155128674.0, "step": 128980 }, { "entropy": 1.9456078946590423, "epoch": 0.3998577140160219, "grad_norm": 8.108650207519531, "learning_rate": 4.000751931433597e-06, "loss": 0.4632, "mean_token_accuracy": 0.8464592576026917, "num_tokens": 155139476.0, "step": 128990 }, { "entropy": 1.9551035940647126, "epoch": 0.3998887131410716, "grad_norm": 9.936694145202637, "learning_rate": 4.000596859321837e-06, "loss": 0.5173, "mean_token_accuracy": 0.8364872843027115, "num_tokens": 155151556.0, "step": 129000 }, { "entropy": 1.7921033650636673, "epoch": 0.3999197122661213, "grad_norm": 6.708187103271484, "learning_rate": 4.000441805240809e-06, "loss": 0.4141, "mean_token_accuracy": 0.8557077631354332, "num_tokens": 155165232.0, "step": 129010 }, { "entropy": 1.9106485813856124, "epoch": 0.399950711391171, "grad_norm": 3.6080920696258545, "learning_rate": 4.00028676918702e-06, "loss": 0.4454, "mean_token_accuracy": 0.8508982062339783, "num_tokens": 155176886.0, "step": 129020 }, { "entropy": 1.9070033580064774, "epoch": 0.39998171051622067, "grad_norm": 7.818281650543213, "learning_rate": 4.000131751156976e-06, "loss": 0.4894, "mean_token_accuracy": 0.8354123339056969, "num_tokens": 155189493.0, "step": 129030 }, { "entropy": 1.9348053887486458, "epoch": 0.4000127096412704, "grad_norm": 9.461575508117676, "learning_rate": 3.999976751147185e-06, "loss": 0.5011, "mean_token_accuracy": 0.8445191279053688, "num_tokens": 155200851.0, "step": 129040 }, { "entropy": 1.958994448184967, "epoch": 0.40004370876632006, "grad_norm": 10.124030113220215, "learning_rate": 3.999821769154158e-06, "loss": 0.4937, "mean_token_accuracy": 0.8466064438223839, "num_tokens": 155212316.0, "step": 129050 }, { "entropy": 1.9629956305027008, "epoch": 0.4000747078913698, "grad_norm": 7.477722644805908, "learning_rate": 3.999666805174402e-06, "loss": 0.5206, "mean_token_accuracy": 0.8446162462234497, "num_tokens": 155223329.0, "step": 129060 }, { "entropy": 1.8508127138018609, "epoch": 0.40010570701641945, "grad_norm": 9.45833969116211, "learning_rate": 3.999511859204431e-06, "loss": 0.4495, "mean_token_accuracy": 0.8511440798640251, "num_tokens": 155235927.0, "step": 129070 }, { "entropy": 1.8051378756761551, "epoch": 0.4001367061414692, "grad_norm": 8.635483741760254, "learning_rate": 3.999356931240755e-06, "loss": 0.4505, "mean_token_accuracy": 0.846815425157547, "num_tokens": 155248596.0, "step": 129080 }, { "entropy": 1.7926154106855392, "epoch": 0.40016770526651885, "grad_norm": 8.050457954406738, "learning_rate": 3.999202021279885e-06, "loss": 0.3689, "mean_token_accuracy": 0.8725738286972046, "num_tokens": 155261547.0, "step": 129090 }, { "entropy": 1.9420858532190324, "epoch": 0.40019870439156857, "grad_norm": 9.84321403503418, "learning_rate": 3.999047129318338e-06, "loss": 0.4839, "mean_token_accuracy": 0.8433678567409515, "num_tokens": 155272699.0, "step": 129100 }, { "entropy": 1.7918237030506134, "epoch": 0.40022970351661824, "grad_norm": 4.470568656921387, "learning_rate": 3.998892255352627e-06, "loss": 0.4126, "mean_token_accuracy": 0.8606519907712936, "num_tokens": 155286705.0, "step": 129110 }, { "entropy": 1.8863151326775551, "epoch": 0.40026070264166796, "grad_norm": 7.721844673156738, "learning_rate": 3.998737399379268e-06, "loss": 0.4217, "mean_token_accuracy": 0.8649403393268585, "num_tokens": 155298688.0, "step": 129120 }, { "entropy": 1.9350427404046058, "epoch": 0.40029170176671763, "grad_norm": 7.882344722747803, "learning_rate": 3.998582561394776e-06, "loss": 0.4691, "mean_token_accuracy": 0.8500554978847503, "num_tokens": 155310013.0, "step": 129130 }, { "entropy": 1.8537417277693748, "epoch": 0.4003227008917673, "grad_norm": 8.492751121520996, "learning_rate": 3.998427741395671e-06, "loss": 0.4096, "mean_token_accuracy": 0.8571956619620323, "num_tokens": 155322433.0, "step": 129140 }, { "entropy": 1.9245151728391647, "epoch": 0.400353700016817, "grad_norm": 2.7976245880126953, "learning_rate": 3.99827293937847e-06, "loss": 0.4817, "mean_token_accuracy": 0.8548303097486496, "num_tokens": 155334298.0, "step": 129150 }, { "entropy": 1.9198249489068986, "epoch": 0.4003846991418667, "grad_norm": 3.582043170928955, "learning_rate": 3.998118155339692e-06, "loss": 0.5238, "mean_token_accuracy": 0.8364661797881127, "num_tokens": 155345756.0, "step": 129160 }, { "entropy": 1.9318518668413163, "epoch": 0.4004156982669164, "grad_norm": 8.037443161010742, "learning_rate": 3.997963389275859e-06, "loss": 0.4618, "mean_token_accuracy": 0.8507610768079757, "num_tokens": 155356700.0, "step": 129170 }, { "entropy": 1.9149801477789878, "epoch": 0.4004466973919661, "grad_norm": 8.608445167541504, "learning_rate": 3.99780864118349e-06, "loss": 0.4695, "mean_token_accuracy": 0.8519381731748581, "num_tokens": 155367938.0, "step": 129180 }, { "entropy": 1.8103131994605064, "epoch": 0.4004776965170158, "grad_norm": 5.219323635101318, "learning_rate": 3.9976539110591086e-06, "loss": 0.4518, "mean_token_accuracy": 0.8530828103423118, "num_tokens": 155381084.0, "step": 129190 }, { "entropy": 1.7915343165397644, "epoch": 0.4005086956420655, "grad_norm": 8.658716201782227, "learning_rate": 3.997499198899237e-06, "loss": 0.3531, "mean_token_accuracy": 0.8671585947275162, "num_tokens": 155394355.0, "step": 129200 }, { "entropy": 1.791802540421486, "epoch": 0.4005396947671152, "grad_norm": 7.633795261383057, "learning_rate": 3.997344504700401e-06, "loss": 0.3853, "mean_token_accuracy": 0.8690174669027328, "num_tokens": 155407374.0, "step": 129210 }, { "entropy": 1.8313867419958114, "epoch": 0.4005706938921649, "grad_norm": 7.055886745452881, "learning_rate": 3.997189828459124e-06, "loss": 0.3852, "mean_token_accuracy": 0.8657983660697937, "num_tokens": 155420166.0, "step": 129220 }, { "entropy": 1.8343457579612732, "epoch": 0.4006016930172146, "grad_norm": 8.553956031799316, "learning_rate": 3.997035170171932e-06, "loss": 0.4769, "mean_token_accuracy": 0.8479964166879654, "num_tokens": 155432912.0, "step": 129230 }, { "entropy": 1.8380364283919335, "epoch": 0.40063269214226427, "grad_norm": 4.127384185791016, "learning_rate": 3.996880529835352e-06, "loss": 0.4349, "mean_token_accuracy": 0.8608715251088143, "num_tokens": 155445327.0, "step": 129240 }, { "entropy": 1.8258098438382149, "epoch": 0.400663691267314, "grad_norm": 3.8394930362701416, "learning_rate": 3.996725907445914e-06, "loss": 0.4834, "mean_token_accuracy": 0.8447212398052215, "num_tokens": 155458107.0, "step": 129250 }, { "entropy": 1.9279157117009162, "epoch": 0.40069469039236366, "grad_norm": 8.191624641418457, "learning_rate": 3.996571303000143e-06, "loss": 0.5147, "mean_token_accuracy": 0.8399505227804184, "num_tokens": 155469474.0, "step": 129260 }, { "entropy": 1.8246611893177032, "epoch": 0.4007256895174134, "grad_norm": 4.23314905166626, "learning_rate": 3.996416716494572e-06, "loss": 0.418, "mean_token_accuracy": 0.8508252546191215, "num_tokens": 155482390.0, "step": 129270 }, { "entropy": 1.796310657262802, "epoch": 0.40075668864246305, "grad_norm": 3.9624927043914795, "learning_rate": 3.996262147925729e-06, "loss": 0.3754, "mean_token_accuracy": 0.857945391535759, "num_tokens": 155494865.0, "step": 129280 }, { "entropy": 1.8756929695606233, "epoch": 0.4007876877675128, "grad_norm": 10.10248851776123, "learning_rate": 3.996107597290148e-06, "loss": 0.4743, "mean_token_accuracy": 0.8458318576216698, "num_tokens": 155507263.0, "step": 129290 }, { "entropy": 1.853571632504463, "epoch": 0.40081868689256245, "grad_norm": 8.234441757202148, "learning_rate": 3.99595306458436e-06, "loss": 0.4667, "mean_token_accuracy": 0.8422810658812523, "num_tokens": 155519150.0, "step": 129300 }, { "entropy": 1.8298091277480126, "epoch": 0.4008496860176122, "grad_norm": 4.043398380279541, "learning_rate": 3.995798549804898e-06, "loss": 0.4288, "mean_token_accuracy": 0.8495631635189056, "num_tokens": 155531954.0, "step": 129310 }, { "entropy": 1.8689694941043853, "epoch": 0.40088068514266184, "grad_norm": 8.660704612731934, "learning_rate": 3.995644052948298e-06, "loss": 0.4544, "mean_token_accuracy": 0.8553383216261864, "num_tokens": 155543288.0, "step": 129320 }, { "entropy": 1.8053039491176606, "epoch": 0.40091168426771157, "grad_norm": 8.172130584716797, "learning_rate": 3.995489574011096e-06, "loss": 0.4413, "mean_token_accuracy": 0.8528603136539459, "num_tokens": 155555748.0, "step": 129330 }, { "entropy": 1.9306787729263306, "epoch": 0.40094268339276123, "grad_norm": 9.331960678100586, "learning_rate": 3.995335112989825e-06, "loss": 0.4986, "mean_token_accuracy": 0.8444873213768005, "num_tokens": 155566893.0, "step": 129340 }, { "entropy": 1.8586668640375137, "epoch": 0.40097368251781096, "grad_norm": 9.09153938293457, "learning_rate": 3.995180669881025e-06, "loss": 0.4446, "mean_token_accuracy": 0.8564663842320442, "num_tokens": 155578818.0, "step": 129350 }, { "entropy": 1.8796383678913116, "epoch": 0.4010046816428606, "grad_norm": 8.167010307312012, "learning_rate": 3.995026244681234e-06, "loss": 0.4534, "mean_token_accuracy": 0.842990031838417, "num_tokens": 155590442.0, "step": 129360 }, { "entropy": 1.9271098881959916, "epoch": 0.40103568076791035, "grad_norm": 9.545135498046875, "learning_rate": 3.994871837386989e-06, "loss": 0.515, "mean_token_accuracy": 0.8358207404613495, "num_tokens": 155601889.0, "step": 129370 }, { "entropy": 1.9072769358754158, "epoch": 0.40106667989296, "grad_norm": 7.5232038497924805, "learning_rate": 3.994717447994832e-06, "loss": 0.4578, "mean_token_accuracy": 0.8516602620482445, "num_tokens": 155614028.0, "step": 129380 }, { "entropy": 1.82378898113966, "epoch": 0.4010976790180097, "grad_norm": 5.845879554748535, "learning_rate": 3.994563076501303e-06, "loss": 0.4415, "mean_token_accuracy": 0.8528670743107796, "num_tokens": 155626507.0, "step": 129390 }, { "entropy": 1.8010734841227531, "epoch": 0.4011286781430594, "grad_norm": 3.6198856830596924, "learning_rate": 3.994408722902945e-06, "loss": 0.4038, "mean_token_accuracy": 0.8558384582400322, "num_tokens": 155639677.0, "step": 129400 }, { "entropy": 1.8200062766671181, "epoch": 0.4011596772681091, "grad_norm": 3.8507888317108154, "learning_rate": 3.9942543871963006e-06, "loss": 0.4381, "mean_token_accuracy": 0.8555910781025886, "num_tokens": 155652366.0, "step": 129410 }, { "entropy": 1.866781549155712, "epoch": 0.4011906763931588, "grad_norm": 6.747231483459473, "learning_rate": 3.994100069377912e-06, "loss": 0.44, "mean_token_accuracy": 0.8502749800682068, "num_tokens": 155664907.0, "step": 129420 }, { "entropy": 1.8888089403510093, "epoch": 0.4012216755182085, "grad_norm": 8.488142967224121, "learning_rate": 3.993945769444325e-06, "loss": 0.4885, "mean_token_accuracy": 0.8422931343317032, "num_tokens": 155676728.0, "step": 129430 }, { "entropy": 1.827822931110859, "epoch": 0.4012526746432582, "grad_norm": 4.2824883460998535, "learning_rate": 3.9937914873920855e-06, "loss": 0.4095, "mean_token_accuracy": 0.855113011598587, "num_tokens": 155689256.0, "step": 129440 }, { "entropy": 1.82640281021595, "epoch": 0.40128367376830787, "grad_norm": 9.845888137817383, "learning_rate": 3.99363722321774e-06, "loss": 0.3959, "mean_token_accuracy": 0.8682078808546067, "num_tokens": 155701978.0, "step": 129450 }, { "entropy": 1.8602233231067657, "epoch": 0.4013146728933576, "grad_norm": 9.442498207092285, "learning_rate": 3.9934829769178365e-06, "loss": 0.5007, "mean_token_accuracy": 0.8478624925017357, "num_tokens": 155714775.0, "step": 129460 }, { "entropy": 1.9146529287099838, "epoch": 0.40134567201840726, "grad_norm": 7.782778263092041, "learning_rate": 3.993328748488922e-06, "loss": 0.4922, "mean_token_accuracy": 0.8408509835600853, "num_tokens": 155726524.0, "step": 129470 }, { "entropy": 1.8323335126042366, "epoch": 0.401376671143457, "grad_norm": 4.129303455352783, "learning_rate": 3.993174537927546e-06, "loss": 0.3861, "mean_token_accuracy": 0.8619013637304306, "num_tokens": 155739316.0, "step": 129480 }, { "entropy": 1.8000301405787469, "epoch": 0.40140767026850666, "grad_norm": 4.743396282196045, "learning_rate": 3.993020345230262e-06, "loss": 0.4435, "mean_token_accuracy": 0.8503546267747879, "num_tokens": 155752513.0, "step": 129490 }, { "entropy": 1.846390649676323, "epoch": 0.4014386693935564, "grad_norm": 3.5405542850494385, "learning_rate": 3.9928661703936164e-06, "loss": 0.5197, "mean_token_accuracy": 0.8494727715849877, "num_tokens": 155764952.0, "step": 129500 }, { "entropy": 1.8727680340409278, "epoch": 0.40146966851860605, "grad_norm": 7.60482931137085, "learning_rate": 3.992712013414165e-06, "loss": 0.4306, "mean_token_accuracy": 0.8475085526704789, "num_tokens": 155777679.0, "step": 129510 }, { "entropy": 1.9042856127023697, "epoch": 0.4015006676436558, "grad_norm": 7.735726833343506, "learning_rate": 3.992557874288459e-06, "loss": 0.49, "mean_token_accuracy": 0.8530709683895111, "num_tokens": 155789778.0, "step": 129520 }, { "entropy": 1.8619258537888528, "epoch": 0.40153166676870544, "grad_norm": 8.040087699890137, "learning_rate": 3.992403753013052e-06, "loss": 0.4286, "mean_token_accuracy": 0.8604286208748817, "num_tokens": 155801838.0, "step": 129530 }, { "entropy": 1.9772648215293884, "epoch": 0.40156266589375517, "grad_norm": 10.059983253479004, "learning_rate": 3.9922496495845015e-06, "loss": 0.5051, "mean_token_accuracy": 0.8421792671084404, "num_tokens": 155812867.0, "step": 129540 }, { "entropy": 1.9216817393898964, "epoch": 0.40159366501880484, "grad_norm": 7.800021648406982, "learning_rate": 3.992095563999361e-06, "loss": 0.4909, "mean_token_accuracy": 0.8464254021644593, "num_tokens": 155823811.0, "step": 129550 }, { "entropy": 1.828902542591095, "epoch": 0.40162466414385456, "grad_norm": 10.652735710144043, "learning_rate": 3.991941496254188e-06, "loss": 0.3814, "mean_token_accuracy": 0.8681714370846748, "num_tokens": 155836495.0, "step": 129560 }, { "entropy": 1.7818224623799324, "epoch": 0.40165566326890423, "grad_norm": 8.231464385986328, "learning_rate": 3.991787446345542e-06, "loss": 0.431, "mean_token_accuracy": 0.8553294375538826, "num_tokens": 155849733.0, "step": 129570 }, { "entropy": 1.933050660789013, "epoch": 0.40168666239395395, "grad_norm": 9.135824203491211, "learning_rate": 3.991633414269979e-06, "loss": 0.4789, "mean_token_accuracy": 0.8501686662435531, "num_tokens": 155861168.0, "step": 129580 }, { "entropy": 1.9456676498055459, "epoch": 0.4017176615190036, "grad_norm": 7.7797393798828125, "learning_rate": 3.9914794000240604e-06, "loss": 0.4655, "mean_token_accuracy": 0.8487606376409531, "num_tokens": 155872569.0, "step": 129590 }, { "entropy": 1.9395177766680718, "epoch": 0.40174866064405335, "grad_norm": 4.4937334060668945, "learning_rate": 3.9913254036043455e-06, "loss": 0.5013, "mean_token_accuracy": 0.8409850835800171, "num_tokens": 155884343.0, "step": 129600 }, { "entropy": 1.96222285926342, "epoch": 0.401779659769103, "grad_norm": 8.145906448364258, "learning_rate": 3.991171425007396e-06, "loss": 0.5238, "mean_token_accuracy": 0.832419815659523, "num_tokens": 155895310.0, "step": 129610 }, { "entropy": 1.9075540453195572, "epoch": 0.40181065889415274, "grad_norm": 6.813295364379883, "learning_rate": 3.991017464229776e-06, "loss": 0.4826, "mean_token_accuracy": 0.8529313713312149, "num_tokens": 155907503.0, "step": 129620 }, { "entropy": 1.8687444925308228, "epoch": 0.4018416580192024, "grad_norm": 8.512529373168945, "learning_rate": 3.990863521268047e-06, "loss": 0.4664, "mean_token_accuracy": 0.8458928510546684, "num_tokens": 155919609.0, "step": 129630 }, { "entropy": 1.937071642279625, "epoch": 0.4018726571442521, "grad_norm": 8.758728981018066, "learning_rate": 3.990709596118774e-06, "loss": 0.486, "mean_token_accuracy": 0.8510117352008819, "num_tokens": 155931150.0, "step": 129640 }, { "entropy": 1.9854602545499802, "epoch": 0.4019036562693018, "grad_norm": 7.997261047363281, "learning_rate": 3.990555688778521e-06, "loss": 0.4844, "mean_token_accuracy": 0.8390512794256211, "num_tokens": 155943070.0, "step": 129650 }, { "entropy": 1.8987844973802566, "epoch": 0.40193465539435147, "grad_norm": 3.581861734390259, "learning_rate": 3.990401799243856e-06, "loss": 0.4422, "mean_token_accuracy": 0.8575407922267914, "num_tokens": 155955435.0, "step": 129660 }, { "entropy": 1.9078197583556176, "epoch": 0.4019656545194012, "grad_norm": 8.173437118530273, "learning_rate": 3.990247927511345e-06, "loss": 0.4263, "mean_token_accuracy": 0.8537064164876937, "num_tokens": 155967188.0, "step": 129670 }, { "entropy": 1.9190190985798836, "epoch": 0.40199665364445086, "grad_norm": 9.123544692993164, "learning_rate": 3.990094073577556e-06, "loss": 0.4679, "mean_token_accuracy": 0.8527463868260383, "num_tokens": 155978679.0, "step": 129680 }, { "entropy": 1.9677730560302735, "epoch": 0.4020276527695006, "grad_norm": 8.531991004943848, "learning_rate": 3.9899402374390585e-06, "loss": 0.5025, "mean_token_accuracy": 0.8477371469140053, "num_tokens": 155989818.0, "step": 129690 }, { "entropy": 1.8901991337537765, "epoch": 0.40205865189455026, "grad_norm": 4.090048789978027, "learning_rate": 3.989786419092422e-06, "loss": 0.4215, "mean_token_accuracy": 0.8558493494987488, "num_tokens": 156002187.0, "step": 129700 }, { "entropy": 1.90907621383667, "epoch": 0.4020896510196, "grad_norm": 5.129952907562256, "learning_rate": 3.989632618534216e-06, "loss": 0.4733, "mean_token_accuracy": 0.8454459354281425, "num_tokens": 156014603.0, "step": 129710 }, { "entropy": 1.821659305691719, "epoch": 0.40212065014464965, "grad_norm": 7.665596961975098, "learning_rate": 3.9894788357610134e-06, "loss": 0.3828, "mean_token_accuracy": 0.8629834577441216, "num_tokens": 156028439.0, "step": 129720 }, { "entropy": 1.8766173496842384, "epoch": 0.4021516492696994, "grad_norm": 3.5944769382476807, "learning_rate": 3.989325070769388e-06, "loss": 0.4096, "mean_token_accuracy": 0.8606648370623589, "num_tokens": 156040404.0, "step": 129730 }, { "entropy": 1.8934729993343353, "epoch": 0.40218264839474904, "grad_norm": 3.758676052093506, "learning_rate": 3.98917132355591e-06, "loss": 0.4176, "mean_token_accuracy": 0.8599832877516747, "num_tokens": 156052297.0, "step": 129740 }, { "entropy": 1.9285873532295228, "epoch": 0.40221364751979877, "grad_norm": 7.010026454925537, "learning_rate": 3.989017594117158e-06, "loss": 0.4784, "mean_token_accuracy": 0.8508692249655724, "num_tokens": 156063027.0, "step": 129750 }, { "entropy": 1.7936880350112916, "epoch": 0.40224464664484844, "grad_norm": 9.10521125793457, "learning_rate": 3.9888638824497034e-06, "loss": 0.4211, "mean_token_accuracy": 0.8516425848007202, "num_tokens": 156075651.0, "step": 129760 }, { "entropy": 1.8739636823534966, "epoch": 0.40227564576989816, "grad_norm": 8.865586280822754, "learning_rate": 3.988710188550125e-06, "loss": 0.4647, "mean_token_accuracy": 0.8452788576483726, "num_tokens": 156087306.0, "step": 129770 }, { "entropy": 1.9241146951913835, "epoch": 0.40230664489494783, "grad_norm": 8.022590637207031, "learning_rate": 3.988556512415e-06, "loss": 0.5015, "mean_token_accuracy": 0.8422558963298797, "num_tokens": 156098765.0, "step": 129780 }, { "entropy": 1.8548906803131104, "epoch": 0.40233764401999755, "grad_norm": 6.8973493576049805, "learning_rate": 3.988402854040903e-06, "loss": 0.414, "mean_token_accuracy": 0.8576776921749115, "num_tokens": 156110825.0, "step": 129790 }, { "entropy": 1.7588834911584854, "epoch": 0.4023686431450472, "grad_norm": 8.585075378417969, "learning_rate": 3.988249213424419e-06, "loss": 0.3764, "mean_token_accuracy": 0.8594303146004677, "num_tokens": 156124449.0, "step": 129800 }, { "entropy": 1.8105765163898468, "epoch": 0.40239964227009695, "grad_norm": 8.616347312927246, "learning_rate": 3.9880955905621235e-06, "loss": 0.4874, "mean_token_accuracy": 0.850447241961956, "num_tokens": 156137506.0, "step": 129810 }, { "entropy": 1.8526711270213128, "epoch": 0.4024306413951466, "grad_norm": 8.065713882446289, "learning_rate": 3.9879419854506e-06, "loss": 0.4683, "mean_token_accuracy": 0.8551761761307717, "num_tokens": 156149785.0, "step": 129820 }, { "entropy": 1.8890022948384284, "epoch": 0.40246164052019634, "grad_norm": 7.917675495147705, "learning_rate": 3.987788398086428e-06, "loss": 0.4387, "mean_token_accuracy": 0.8588599741458893, "num_tokens": 156160850.0, "step": 129830 }, { "entropy": 1.784026588499546, "epoch": 0.402492639645246, "grad_norm": 9.626664161682129, "learning_rate": 3.987634828466191e-06, "loss": 0.3686, "mean_token_accuracy": 0.8714671805500984, "num_tokens": 156173409.0, "step": 129840 }, { "entropy": 1.8477544769644738, "epoch": 0.40252363877029573, "grad_norm": 8.537195205688477, "learning_rate": 3.987481276586474e-06, "loss": 0.4918, "mean_token_accuracy": 0.8426776066422462, "num_tokens": 156186477.0, "step": 129850 }, { "entropy": 1.736336489021778, "epoch": 0.4025546378953454, "grad_norm": 2.7146239280700684, "learning_rate": 3.98732774244386e-06, "loss": 0.4036, "mean_token_accuracy": 0.862640056014061, "num_tokens": 156200279.0, "step": 129860 }, { "entropy": 1.8483411461114883, "epoch": 0.4025856370203951, "grad_norm": 3.8119378089904785, "learning_rate": 3.987174226034936e-06, "loss": 0.4548, "mean_token_accuracy": 0.8419306710362434, "num_tokens": 156212592.0, "step": 129870 }, { "entropy": 1.9207856342196465, "epoch": 0.4026166361454448, "grad_norm": 3.5462663173675537, "learning_rate": 3.987020727356287e-06, "loss": 0.4749, "mean_token_accuracy": 0.8504367902874946, "num_tokens": 156223684.0, "step": 129880 }, { "entropy": 1.8283285796642303, "epoch": 0.40264763527049446, "grad_norm": 8.442768096923828, "learning_rate": 3.9868672464045005e-06, "loss": 0.4547, "mean_token_accuracy": 0.8510240688920021, "num_tokens": 156236409.0, "step": 129890 }, { "entropy": 1.9546339631080627, "epoch": 0.4026786343955442, "grad_norm": 7.817316055297852, "learning_rate": 3.986713783176166e-06, "loss": 0.47, "mean_token_accuracy": 0.8490804046392441, "num_tokens": 156247811.0, "step": 129900 }, { "entropy": 1.9404812157154083, "epoch": 0.40270963352059386, "grad_norm": 8.437518119812012, "learning_rate": 3.986560337667872e-06, "loss": 0.5045, "mean_token_accuracy": 0.8467909589409828, "num_tokens": 156259779.0, "step": 129910 }, { "entropy": 1.9066607609391213, "epoch": 0.4027406326456436, "grad_norm": 7.6331305503845215, "learning_rate": 3.986406909876207e-06, "loss": 0.4798, "mean_token_accuracy": 0.8462503507733345, "num_tokens": 156271585.0, "step": 129920 }, { "entropy": 1.8766513273119927, "epoch": 0.40277163177069325, "grad_norm": 3.2706258296966553, "learning_rate": 3.986253499797765e-06, "loss": 0.4622, "mean_token_accuracy": 0.8481630995869637, "num_tokens": 156284035.0, "step": 129930 }, { "entropy": 1.8868813574314118, "epoch": 0.402802630895743, "grad_norm": 8.877392768859863, "learning_rate": 3.986100107429135e-06, "loss": 0.484, "mean_token_accuracy": 0.8435383632779121, "num_tokens": 156296009.0, "step": 129940 }, { "entropy": 1.9167783632874489, "epoch": 0.40283363002079264, "grad_norm": 8.852008819580078, "learning_rate": 3.985946732766913e-06, "loss": 0.4591, "mean_token_accuracy": 0.8485684707760811, "num_tokens": 156307643.0, "step": 129950 }, { "entropy": 1.9228922203183174, "epoch": 0.40286462914584237, "grad_norm": 7.26152229309082, "learning_rate": 3.98579337580769e-06, "loss": 0.4331, "mean_token_accuracy": 0.8554493680596351, "num_tokens": 156319030.0, "step": 129960 }, { "entropy": 1.874145193397999, "epoch": 0.40289562827089204, "grad_norm": 8.481700897216797, "learning_rate": 3.985640036548062e-06, "loss": 0.4785, "mean_token_accuracy": 0.8435692340135574, "num_tokens": 156331237.0, "step": 129970 }, { "entropy": 1.8589432962238788, "epoch": 0.40292662739594176, "grad_norm": 7.1247687339782715, "learning_rate": 3.985486714984625e-06, "loss": 0.4164, "mean_token_accuracy": 0.8517856851220131, "num_tokens": 156344722.0, "step": 129980 }, { "entropy": 1.9143261551856994, "epoch": 0.40295762652099143, "grad_norm": 9.23549747467041, "learning_rate": 3.985333411113975e-06, "loss": 0.509, "mean_token_accuracy": 0.8484362363815308, "num_tokens": 156356528.0, "step": 129990 }, { "entropy": 1.9343707114458084, "epoch": 0.40298862564604115, "grad_norm": 8.890726089477539, "learning_rate": 3.985180124932709e-06, "loss": 0.4702, "mean_token_accuracy": 0.8550547063350677, "num_tokens": 156367827.0, "step": 130000 }, { "entropy": 1.8906396552920341, "epoch": 0.4030196247710908, "grad_norm": 3.963322639465332, "learning_rate": 3.9850268564374256e-06, "loss": 0.4382, "mean_token_accuracy": 0.8531676039099694, "num_tokens": 156379328.0, "step": 130010 }, { "entropy": 1.76583993434906, "epoch": 0.40305062389614055, "grad_norm": 9.098811149597168, "learning_rate": 3.9848736056247245e-06, "loss": 0.3939, "mean_token_accuracy": 0.8658165216445923, "num_tokens": 156392630.0, "step": 130020 }, { "entropy": 1.838332974910736, "epoch": 0.4030816230211902, "grad_norm": 8.506731033325195, "learning_rate": 3.984720372491206e-06, "loss": 0.435, "mean_token_accuracy": 0.8585373058915138, "num_tokens": 156405536.0, "step": 130030 }, { "entropy": 1.806590475142002, "epoch": 0.40311262214623994, "grad_norm": 5.69447660446167, "learning_rate": 3.984567157033471e-06, "loss": 0.449, "mean_token_accuracy": 0.8464230895042419, "num_tokens": 156418437.0, "step": 130040 }, { "entropy": 1.9229248493909836, "epoch": 0.4031436212712896, "grad_norm": 8.785764694213867, "learning_rate": 3.9844139592481195e-06, "loss": 0.5001, "mean_token_accuracy": 0.8359622538089753, "num_tokens": 156430560.0, "step": 130050 }, { "entropy": 1.9032806217670442, "epoch": 0.40317462039633933, "grad_norm": 8.77900218963623, "learning_rate": 3.984260779131759e-06, "loss": 0.4543, "mean_token_accuracy": 0.8543096274137497, "num_tokens": 156442730.0, "step": 130060 }, { "entropy": 1.9101463302969932, "epoch": 0.403205619521389, "grad_norm": 7.89210319519043, "learning_rate": 3.984107616680989e-06, "loss": 0.4598, "mean_token_accuracy": 0.8459172546863556, "num_tokens": 156454070.0, "step": 130070 }, { "entropy": 1.9074820682406426, "epoch": 0.4032366186464387, "grad_norm": 9.048776626586914, "learning_rate": 3.983954471892417e-06, "loss": 0.4616, "mean_token_accuracy": 0.8488796040415764, "num_tokens": 156465623.0, "step": 130080 }, { "entropy": 1.8907459661364556, "epoch": 0.4032676177714884, "grad_norm": 9.203822135925293, "learning_rate": 3.983801344762646e-06, "loss": 0.439, "mean_token_accuracy": 0.8531972080469131, "num_tokens": 156477633.0, "step": 130090 }, { "entropy": 1.9408058658242227, "epoch": 0.4032986168965381, "grad_norm": 8.398517608642578, "learning_rate": 3.983648235288285e-06, "loss": 0.4529, "mean_token_accuracy": 0.8516205415129662, "num_tokens": 156488438.0, "step": 130100 }, { "entropy": 1.8947420373558999, "epoch": 0.4033296160215878, "grad_norm": 7.668551445007324, "learning_rate": 3.983495143465942e-06, "loss": 0.4659, "mean_token_accuracy": 0.8498603880405426, "num_tokens": 156499812.0, "step": 130110 }, { "entropy": 1.7990976139903068, "epoch": 0.4033606151466375, "grad_norm": 3.560288190841675, "learning_rate": 3.983342069292223e-06, "loss": 0.3632, "mean_token_accuracy": 0.8639690637588501, "num_tokens": 156512790.0, "step": 130120 }, { "entropy": 1.8366239294409752, "epoch": 0.4033916142716872, "grad_norm": 9.26343059539795, "learning_rate": 3.983189012763739e-06, "loss": 0.3931, "mean_token_accuracy": 0.8648645102977752, "num_tokens": 156525094.0, "step": 130130 }, { "entropy": 1.8781366422772408, "epoch": 0.40342261339673685, "grad_norm": 7.926459789276123, "learning_rate": 3.983035973877099e-06, "loss": 0.4221, "mean_token_accuracy": 0.8611466780304908, "num_tokens": 156537483.0, "step": 130140 }, { "entropy": 1.8221728757023812, "epoch": 0.4034536125217866, "grad_norm": 3.698610782623291, "learning_rate": 3.982882952628916e-06, "loss": 0.3992, "mean_token_accuracy": 0.8600765854120255, "num_tokens": 156549954.0, "step": 130150 }, { "entropy": 1.9444189876317979, "epoch": 0.40348461164683624, "grad_norm": 8.673882484436035, "learning_rate": 3.9827299490158e-06, "loss": 0.4982, "mean_token_accuracy": 0.8447524651885032, "num_tokens": 156561547.0, "step": 130160 }, { "entropy": 1.836028276383877, "epoch": 0.40351561077188597, "grad_norm": 9.08564567565918, "learning_rate": 3.982576963034364e-06, "loss": 0.4743, "mean_token_accuracy": 0.8561201602220535, "num_tokens": 156574641.0, "step": 130170 }, { "entropy": 1.918661078810692, "epoch": 0.40354660989693564, "grad_norm": 7.405217170715332, "learning_rate": 3.982423994681225e-06, "loss": 0.4877, "mean_token_accuracy": 0.8417747050523758, "num_tokens": 156586063.0, "step": 130180 }, { "entropy": 1.8970381796360016, "epoch": 0.40357760902198536, "grad_norm": 8.172847747802734, "learning_rate": 3.982271043952995e-06, "loss": 0.4547, "mean_token_accuracy": 0.8463637545704842, "num_tokens": 156598100.0, "step": 130190 }, { "entropy": 1.896133790910244, "epoch": 0.40360860814703503, "grad_norm": 2.4702107906341553, "learning_rate": 3.9821181108462895e-06, "loss": 0.4897, "mean_token_accuracy": 0.8457187637686729, "num_tokens": 156609713.0, "step": 130200 }, { "entropy": 1.8511537820100785, "epoch": 0.40363960727208475, "grad_norm": 3.5449514389038086, "learning_rate": 3.981965195357727e-06, "loss": 0.4154, "mean_token_accuracy": 0.8625165119767189, "num_tokens": 156622527.0, "step": 130210 }, { "entropy": 1.8973491430282592, "epoch": 0.4036706063971344, "grad_norm": 7.712673664093018, "learning_rate": 3.981812297483923e-06, "loss": 0.4554, "mean_token_accuracy": 0.8516885727643967, "num_tokens": 156634254.0, "step": 130220 }, { "entropy": 1.8966005221009254, "epoch": 0.40370160552218415, "grad_norm": 8.126500129699707, "learning_rate": 3.981659417221498e-06, "loss": 0.4368, "mean_token_accuracy": 0.8538167625665665, "num_tokens": 156646618.0, "step": 130230 }, { "entropy": 1.85729219019413, "epoch": 0.4037326046472338, "grad_norm": 9.697176933288574, "learning_rate": 3.98150655456707e-06, "loss": 0.4314, "mean_token_accuracy": 0.8491919696331024, "num_tokens": 156659338.0, "step": 130240 }, { "entropy": 1.8698885142803192, "epoch": 0.40376360377228354, "grad_norm": 7.321453094482422, "learning_rate": 3.981353709517259e-06, "loss": 0.4241, "mean_token_accuracy": 0.850645835697651, "num_tokens": 156671170.0, "step": 130250 }, { "entropy": 1.8850924864411354, "epoch": 0.4037946028973332, "grad_norm": 8.737915992736816, "learning_rate": 3.9812008820686864e-06, "loss": 0.4605, "mean_token_accuracy": 0.8499901428818702, "num_tokens": 156683391.0, "step": 130260 }, { "entropy": 1.9188417434692382, "epoch": 0.40382560202238293, "grad_norm": 8.441498756408691, "learning_rate": 3.981048072217976e-06, "loss": 0.4621, "mean_token_accuracy": 0.8469459965825081, "num_tokens": 156695474.0, "step": 130270 }, { "entropy": 1.832290355861187, "epoch": 0.4038566011474326, "grad_norm": 5.164080619812012, "learning_rate": 3.980895279961748e-06, "loss": 0.4268, "mean_token_accuracy": 0.8454208686947823, "num_tokens": 156708620.0, "step": 130280 }, { "entropy": 1.9142166703939438, "epoch": 0.40388760027248233, "grad_norm": 7.258204936981201, "learning_rate": 3.980742505296629e-06, "loss": 0.4338, "mean_token_accuracy": 0.8631983742117881, "num_tokens": 156720342.0, "step": 130290 }, { "entropy": 1.9650160878896714, "epoch": 0.403918599397532, "grad_norm": 7.455199241638184, "learning_rate": 3.980589748219241e-06, "loss": 0.5014, "mean_token_accuracy": 0.8472874984145164, "num_tokens": 156732490.0, "step": 130300 }, { "entropy": 1.816853478550911, "epoch": 0.4039495985225817, "grad_norm": 3.841028928756714, "learning_rate": 3.980437008726212e-06, "loss": 0.3849, "mean_token_accuracy": 0.8634612441062928, "num_tokens": 156745943.0, "step": 130310 }, { "entropy": 1.9299751341342926, "epoch": 0.4039805976476314, "grad_norm": 4.5318474769592285, "learning_rate": 3.980284286814167e-06, "loss": 0.4939, "mean_token_accuracy": 0.8458565920591354, "num_tokens": 156757351.0, "step": 130320 }, { "entropy": 1.8615489616990089, "epoch": 0.4040115967726811, "grad_norm": 8.26394271850586, "learning_rate": 3.980131582479735e-06, "loss": 0.4583, "mean_token_accuracy": 0.8518011167645454, "num_tokens": 156768452.0, "step": 130330 }, { "entropy": 1.9241826206445694, "epoch": 0.4040425958977308, "grad_norm": 9.016687393188477, "learning_rate": 3.979978895719543e-06, "loss": 0.4847, "mean_token_accuracy": 0.8521223112940788, "num_tokens": 156779453.0, "step": 130340 }, { "entropy": 2.0048281461000443, "epoch": 0.4040735950227805, "grad_norm": 8.775766372680664, "learning_rate": 3.979826226530221e-06, "loss": 0.5079, "mean_token_accuracy": 0.8402179971337318, "num_tokens": 156789976.0, "step": 130350 }, { "entropy": 1.8103154242038726, "epoch": 0.4041045941478302, "grad_norm": 3.5552315711975098, "learning_rate": 3.9796735749084e-06, "loss": 0.3948, "mean_token_accuracy": 0.8577532470226288, "num_tokens": 156803981.0, "step": 130360 }, { "entropy": 1.9318371683359146, "epoch": 0.4041355932728799, "grad_norm": 8.001313209533691, "learning_rate": 3.97952094085071e-06, "loss": 0.4546, "mean_token_accuracy": 0.8499713644385338, "num_tokens": 156816659.0, "step": 130370 }, { "entropy": 1.9495342776179314, "epoch": 0.40416659239792957, "grad_norm": 4.196635723114014, "learning_rate": 3.979368324353783e-06, "loss": 0.4924, "mean_token_accuracy": 0.8435593828558922, "num_tokens": 156828530.0, "step": 130380 }, { "entropy": 1.8581201523542403, "epoch": 0.40419759152297924, "grad_norm": 7.378039836883545, "learning_rate": 3.979215725414253e-06, "loss": 0.4111, "mean_token_accuracy": 0.8562660500407219, "num_tokens": 156841203.0, "step": 130390 }, { "entropy": 1.8645180001854897, "epoch": 0.40422859064802896, "grad_norm": 7.564363956451416, "learning_rate": 3.9790631440287516e-06, "loss": 0.3761, "mean_token_accuracy": 0.8670828878879547, "num_tokens": 156853244.0, "step": 130400 }, { "entropy": 1.9289032772183419, "epoch": 0.40425958977307863, "grad_norm": 4.7226481437683105, "learning_rate": 3.978910580193916e-06, "loss": 0.4384, "mean_token_accuracy": 0.8462576329708099, "num_tokens": 156864953.0, "step": 130410 }, { "entropy": 1.9896679311990737, "epoch": 0.40429058889812836, "grad_norm": 9.488637924194336, "learning_rate": 3.978758033906382e-06, "loss": 0.5219, "mean_token_accuracy": 0.8406891971826553, "num_tokens": 156875482.0, "step": 130420 }, { "entropy": 1.859781025350094, "epoch": 0.404321588023178, "grad_norm": 3.904017210006714, "learning_rate": 3.978605505162784e-06, "loss": 0.3909, "mean_token_accuracy": 0.8608370333909988, "num_tokens": 156887299.0, "step": 130430 }, { "entropy": 1.8310018733143807, "epoch": 0.40435258714822775, "grad_norm": 8.23388671875, "learning_rate": 3.978452993959761e-06, "loss": 0.4589, "mean_token_accuracy": 0.8504728376865387, "num_tokens": 156900461.0, "step": 130440 }, { "entropy": 1.877791903913021, "epoch": 0.4043835862732774, "grad_norm": 7.384108066558838, "learning_rate": 3.978300500293951e-06, "loss": 0.4635, "mean_token_accuracy": 0.8471254542469978, "num_tokens": 156913284.0, "step": 130450 }, { "entropy": 1.6766640424728394, "epoch": 0.40441458539832714, "grad_norm": 3.489778995513916, "learning_rate": 3.978148024161993e-06, "loss": 0.3129, "mean_token_accuracy": 0.873348993062973, "num_tokens": 156928424.0, "step": 130460 }, { "entropy": 1.8710533007979393, "epoch": 0.4044455845233768, "grad_norm": 6.751518726348877, "learning_rate": 3.977995565560528e-06, "loss": 0.4647, "mean_token_accuracy": 0.8581561148166656, "num_tokens": 156940418.0, "step": 130470 }, { "entropy": 1.8982125908136367, "epoch": 0.40447658364842654, "grad_norm": 7.623547554016113, "learning_rate": 3.977843124486196e-06, "loss": 0.4227, "mean_token_accuracy": 0.859375411272049, "num_tokens": 156952679.0, "step": 130480 }, { "entropy": 1.9297777190804482, "epoch": 0.4045075827734762, "grad_norm": 8.541840553283691, "learning_rate": 3.977690700935639e-06, "loss": 0.4418, "mean_token_accuracy": 0.8506446555256844, "num_tokens": 156964447.0, "step": 130490 }, { "entropy": 1.8197549387812615, "epoch": 0.40453858189852593, "grad_norm": 4.050614833831787, "learning_rate": 3.9775382949055e-06, "loss": 0.3907, "mean_token_accuracy": 0.8605873674154282, "num_tokens": 156977447.0, "step": 130500 }, { "entropy": 1.7729148700833322, "epoch": 0.4045695810235756, "grad_norm": 4.528966903686523, "learning_rate": 3.977385906392423e-06, "loss": 0.3785, "mean_token_accuracy": 0.8583493396639824, "num_tokens": 156991476.0, "step": 130510 }, { "entropy": 1.9727448597550392, "epoch": 0.4046005801486253, "grad_norm": 9.29497241973877, "learning_rate": 3.977233535393054e-06, "loss": 0.5015, "mean_token_accuracy": 0.843626768887043, "num_tokens": 157002963.0, "step": 130520 }, { "entropy": 1.9293553322553634, "epoch": 0.404631579273675, "grad_norm": 9.896842956542969, "learning_rate": 3.977081181904036e-06, "loss": 0.4716, "mean_token_accuracy": 0.8478871420025825, "num_tokens": 157014701.0, "step": 130530 }, { "entropy": 1.977966983616352, "epoch": 0.4046625783987247, "grad_norm": 8.197452545166016, "learning_rate": 3.976928845922018e-06, "loss": 0.5224, "mean_token_accuracy": 0.8346485048532486, "num_tokens": 157026114.0, "step": 130540 }, { "entropy": 1.9490982070565224, "epoch": 0.4046935775237744, "grad_norm": 3.955064058303833, "learning_rate": 3.976776527443644e-06, "loss": 0.4488, "mean_token_accuracy": 0.8499052435159683, "num_tokens": 157038105.0, "step": 130550 }, { "entropy": 1.921256160736084, "epoch": 0.4047245766488241, "grad_norm": 7.925784111022949, "learning_rate": 3.9766242264655655e-06, "loss": 0.5104, "mean_token_accuracy": 0.8429896846413613, "num_tokens": 157050666.0, "step": 130560 }, { "entropy": 1.9208467230200768, "epoch": 0.4047555757738738, "grad_norm": 4.279351711273193, "learning_rate": 3.976471942984431e-06, "loss": 0.4727, "mean_token_accuracy": 0.8471018105745316, "num_tokens": 157062556.0, "step": 130570 }, { "entropy": 1.8374285951256752, "epoch": 0.4047865748989235, "grad_norm": 8.640376091003418, "learning_rate": 3.976319676996889e-06, "loss": 0.4165, "mean_token_accuracy": 0.8620464310050011, "num_tokens": 157075344.0, "step": 130580 }, { "entropy": 1.9202581122517586, "epoch": 0.40481757402397317, "grad_norm": 7.99689245223999, "learning_rate": 3.976167428499592e-06, "loss": 0.4589, "mean_token_accuracy": 0.8536014124751091, "num_tokens": 157086341.0, "step": 130590 }, { "entropy": 1.9324933484196662, "epoch": 0.4048485731490229, "grad_norm": 6.284863471984863, "learning_rate": 3.976015197489192e-06, "loss": 0.5003, "mean_token_accuracy": 0.8359458863735199, "num_tokens": 157098510.0, "step": 130600 }, { "entropy": 1.811049999296665, "epoch": 0.40487957227407256, "grad_norm": 8.513614654541016, "learning_rate": 3.97586298396234e-06, "loss": 0.4118, "mean_token_accuracy": 0.8566889658570289, "num_tokens": 157111635.0, "step": 130610 }, { "entropy": 1.895810031890869, "epoch": 0.40491057139912223, "grad_norm": 8.150436401367188, "learning_rate": 3.975710787915691e-06, "loss": 0.4337, "mean_token_accuracy": 0.8592909917235374, "num_tokens": 157123381.0, "step": 130620 }, { "entropy": 1.8806810095906257, "epoch": 0.40494157052417196, "grad_norm": 9.090899467468262, "learning_rate": 3.9755586093459e-06, "loss": 0.4232, "mean_token_accuracy": 0.8572476267814636, "num_tokens": 157135740.0, "step": 130630 }, { "entropy": 1.7848129168152809, "epoch": 0.4049725696492216, "grad_norm": 4.283092975616455, "learning_rate": 3.97540644824962e-06, "loss": 0.3555, "mean_token_accuracy": 0.8615420237183571, "num_tokens": 157148676.0, "step": 130640 }, { "entropy": 1.8715970084071158, "epoch": 0.40500356877427135, "grad_norm": 9.602254867553711, "learning_rate": 3.975254304623512e-06, "loss": 0.4169, "mean_token_accuracy": 0.8585036590695381, "num_tokens": 157161564.0, "step": 130650 }, { "entropy": 1.8966206587851047, "epoch": 0.405034567899321, "grad_norm": 6.511444091796875, "learning_rate": 3.975102178464229e-06, "loss": 0.3862, "mean_token_accuracy": 0.8608822152018547, "num_tokens": 157174560.0, "step": 130660 }, { "entropy": 1.9423101127147675, "epoch": 0.40506556702437074, "grad_norm": 8.167618751525879, "learning_rate": 3.974950069768429e-06, "loss": 0.4911, "mean_token_accuracy": 0.8432308033108711, "num_tokens": 157186728.0, "step": 130670 }, { "entropy": 1.9459773391485213, "epoch": 0.4050965661494204, "grad_norm": 8.282353401184082, "learning_rate": 3.974797978532774e-06, "loss": 0.478, "mean_token_accuracy": 0.8470408290624618, "num_tokens": 157198743.0, "step": 130680 }, { "entropy": 1.8234005078673363, "epoch": 0.40512756527447014, "grad_norm": 3.401383399963379, "learning_rate": 3.974645904753922e-06, "loss": 0.3927, "mean_token_accuracy": 0.8553168892860412, "num_tokens": 157211958.0, "step": 130690 }, { "entropy": 1.9370823100209236, "epoch": 0.4051585643995198, "grad_norm": 6.692420959472656, "learning_rate": 3.974493848428535e-06, "loss": 0.4796, "mean_token_accuracy": 0.8524964049458503, "num_tokens": 157223127.0, "step": 130700 }, { "entropy": 1.9691085010766982, "epoch": 0.40518956352456953, "grad_norm": 8.417125701904297, "learning_rate": 3.974341809553272e-06, "loss": 0.4557, "mean_token_accuracy": 0.8546971321105957, "num_tokens": 157234327.0, "step": 130710 }, { "entropy": 1.8974324196577073, "epoch": 0.4052205626496192, "grad_norm": 9.326712608337402, "learning_rate": 3.974189788124799e-06, "loss": 0.4357, "mean_token_accuracy": 0.8539463758468628, "num_tokens": 157246787.0, "step": 130720 }, { "entropy": 1.9472648695111274, "epoch": 0.4052515617746689, "grad_norm": 8.63284683227539, "learning_rate": 3.974037784139778e-06, "loss": 0.4983, "mean_token_accuracy": 0.8510000556707382, "num_tokens": 157258105.0, "step": 130730 }, { "entropy": 1.9133790254592895, "epoch": 0.4052825608997186, "grad_norm": 7.009464263916016, "learning_rate": 3.973885797594873e-06, "loss": 0.4128, "mean_token_accuracy": 0.8659856930375099, "num_tokens": 157269330.0, "step": 130740 }, { "entropy": 1.9145527601242065, "epoch": 0.4053135600247683, "grad_norm": 6.98771333694458, "learning_rate": 3.973733828486749e-06, "loss": 0.4235, "mean_token_accuracy": 0.8578495368361473, "num_tokens": 157282000.0, "step": 130750 }, { "entropy": 1.9058576181530953, "epoch": 0.405344559149818, "grad_norm": 9.95231819152832, "learning_rate": 3.9735818768120745e-06, "loss": 0.4964, "mean_token_accuracy": 0.8475971177220345, "num_tokens": 157293637.0, "step": 130760 }, { "entropy": 1.789780667424202, "epoch": 0.4053755582748677, "grad_norm": 7.046390533447266, "learning_rate": 3.973429942567513e-06, "loss": 0.3527, "mean_token_accuracy": 0.8669392168521881, "num_tokens": 157306793.0, "step": 130770 }, { "entropy": 1.988757422566414, "epoch": 0.4054065573999174, "grad_norm": 9.767016410827637, "learning_rate": 3.973278025749736e-06, "loss": 0.4872, "mean_token_accuracy": 0.8483888059854507, "num_tokens": 157318126.0, "step": 130780 }, { "entropy": 1.9013223245739936, "epoch": 0.4054375565249671, "grad_norm": 4.095386028289795, "learning_rate": 3.97312612635541e-06, "loss": 0.4834, "mean_token_accuracy": 0.8486203476786613, "num_tokens": 157329583.0, "step": 130790 }, { "entropy": 1.9669806063175201, "epoch": 0.40546855565001677, "grad_norm": 7.4656596183776855, "learning_rate": 3.9729742443812056e-06, "loss": 0.5092, "mean_token_accuracy": 0.8445022612810135, "num_tokens": 157340387.0, "step": 130800 }, { "entropy": 1.8516760841012, "epoch": 0.4054995547750665, "grad_norm": Infinity, "learning_rate": 3.972822379823793e-06, "loss": 0.388, "mean_token_accuracy": 0.8624450042843819, "num_tokens": 157353043.0, "step": 130810 }, { "entropy": 1.915142248570919, "epoch": 0.40553055390011616, "grad_norm": 7.543905258178711, "learning_rate": 3.972670532679844e-06, "loss": 0.4563, "mean_token_accuracy": 0.8492273300886154, "num_tokens": 157365310.0, "step": 130820 }, { "entropy": 1.8361800596117974, "epoch": 0.4055615530251659, "grad_norm": 4.767913341522217, "learning_rate": 3.9725187029460316e-06, "loss": 0.4211, "mean_token_accuracy": 0.8620738372206688, "num_tokens": 157377616.0, "step": 130830 }, { "entropy": 1.9048438847064972, "epoch": 0.40559255215021556, "grad_norm": 8.4035005569458, "learning_rate": 3.972366890619029e-06, "loss": 0.4554, "mean_token_accuracy": 0.8537156358361244, "num_tokens": 157389121.0, "step": 130840 }, { "entropy": 1.9738389521837234, "epoch": 0.4056235512752653, "grad_norm": 8.513267517089844, "learning_rate": 3.972215095695508e-06, "loss": 0.4703, "mean_token_accuracy": 0.8514445766806602, "num_tokens": 157400361.0, "step": 130850 }, { "entropy": 1.9617274329066277, "epoch": 0.40565455040031495, "grad_norm": 4.258702278137207, "learning_rate": 3.972063318172147e-06, "loss": 0.4877, "mean_token_accuracy": 0.847972746193409, "num_tokens": 157411570.0, "step": 130860 }, { "entropy": 1.9130688726902008, "epoch": 0.4056855495253646, "grad_norm": 7.041831970214844, "learning_rate": 3.97191155804562e-06, "loss": 0.4609, "mean_token_accuracy": 0.8498567163944244, "num_tokens": 157423298.0, "step": 130870 }, { "entropy": 1.8860970079898833, "epoch": 0.40571654865041434, "grad_norm": 7.607229709625244, "learning_rate": 3.971759815312605e-06, "loss": 0.4647, "mean_token_accuracy": 0.8536626756191253, "num_tokens": 157434605.0, "step": 130880 }, { "entropy": 1.9061809971928596, "epoch": 0.405747547775464, "grad_norm": 7.09719705581665, "learning_rate": 3.971608089969779e-06, "loss": 0.4901, "mean_token_accuracy": 0.848254905641079, "num_tokens": 157446268.0, "step": 130890 }, { "entropy": 1.7818841926753521, "epoch": 0.40577854690051374, "grad_norm": 4.486595630645752, "learning_rate": 3.971456382013821e-06, "loss": 0.3597, "mean_token_accuracy": 0.8559565782546997, "num_tokens": 157460438.0, "step": 130900 }, { "entropy": 1.763669066131115, "epoch": 0.4058095460255634, "grad_norm": 3.9831771850585938, "learning_rate": 3.97130469144141e-06, "loss": 0.3303, "mean_token_accuracy": 0.8683875605463982, "num_tokens": 157475220.0, "step": 130910 }, { "entropy": 1.8374385461211205, "epoch": 0.40584054515061313, "grad_norm": 8.03730583190918, "learning_rate": 3.9711530182492266e-06, "loss": 0.4052, "mean_token_accuracy": 0.858574740588665, "num_tokens": 157487520.0, "step": 130920 }, { "entropy": 1.9221992582082748, "epoch": 0.4058715442756628, "grad_norm": 9.426704406738281, "learning_rate": 3.971001362433953e-06, "loss": 0.484, "mean_token_accuracy": 0.8404927179217339, "num_tokens": 157499623.0, "step": 130930 }, { "entropy": 1.8548153042793274, "epoch": 0.4059025434007125, "grad_norm": 4.058870792388916, "learning_rate": 3.97084972399227e-06, "loss": 0.4315, "mean_token_accuracy": 0.8509564101696014, "num_tokens": 157512269.0, "step": 130940 }, { "entropy": 1.898500706255436, "epoch": 0.4059335425257622, "grad_norm": 7.100831985473633, "learning_rate": 3.970698102920861e-06, "loss": 0.4517, "mean_token_accuracy": 0.8517476499080658, "num_tokens": 157524534.0, "step": 130950 }, { "entropy": 1.938453209400177, "epoch": 0.4059645416508119, "grad_norm": 7.448749542236328, "learning_rate": 3.970546499216411e-06, "loss": 0.4693, "mean_token_accuracy": 0.8483160063624382, "num_tokens": 157535978.0, "step": 130960 }, { "entropy": 1.8814398035407067, "epoch": 0.4059955407758616, "grad_norm": 8.682611465454102, "learning_rate": 3.970394912875604e-06, "loss": 0.4537, "mean_token_accuracy": 0.8522828727960586, "num_tokens": 157548563.0, "step": 130970 }, { "entropy": 1.9689362928271295, "epoch": 0.4060265399009113, "grad_norm": 8.23831558227539, "learning_rate": 3.970243343895126e-06, "loss": 0.4459, "mean_token_accuracy": 0.8545093983411789, "num_tokens": 157559741.0, "step": 130980 }, { "entropy": 1.8692365869879723, "epoch": 0.406057539025961, "grad_norm": 8.109051704406738, "learning_rate": 3.970091792271663e-06, "loss": 0.4326, "mean_token_accuracy": 0.8523834586143494, "num_tokens": 157571939.0, "step": 130990 }, { "entropy": 1.9243786290287972, "epoch": 0.4060885381510107, "grad_norm": 10.623024940490723, "learning_rate": 3.969940258001903e-06, "loss": 0.4833, "mean_token_accuracy": 0.8421697750687599, "num_tokens": 157583545.0, "step": 131000 }, { "entropy": 1.817786581814289, "epoch": 0.40611953727606037, "grad_norm": 9.550358772277832, "learning_rate": 3.969788741082535e-06, "loss": 0.3799, "mean_token_accuracy": 0.8594713494181633, "num_tokens": 157596216.0, "step": 131010 }, { "entropy": 1.9418366000056266, "epoch": 0.4061505364011101, "grad_norm": 7.840337753295898, "learning_rate": 3.969637241510247e-06, "loss": 0.4571, "mean_token_accuracy": 0.8537692606449128, "num_tokens": 157607196.0, "step": 131020 }, { "entropy": 1.8996048077940941, "epoch": 0.40618153552615976, "grad_norm": 8.591992378234863, "learning_rate": 3.9694857592817295e-06, "loss": 0.4538, "mean_token_accuracy": 0.8468469053506851, "num_tokens": 157619202.0, "step": 131030 }, { "entropy": 1.8347910821437836, "epoch": 0.4062125346512095, "grad_norm": 3.285649299621582, "learning_rate": 3.969334294393675e-06, "loss": 0.448, "mean_token_accuracy": 0.8462332352995873, "num_tokens": 157632132.0, "step": 131040 }, { "entropy": 1.9242204323410987, "epoch": 0.40624353377625916, "grad_norm": 4.810262680053711, "learning_rate": 3.969182846842773e-06, "loss": 0.5069, "mean_token_accuracy": 0.8468155965209008, "num_tokens": 157643575.0, "step": 131050 }, { "entropy": 1.9612080991268157, "epoch": 0.4062745329013089, "grad_norm": 9.51435375213623, "learning_rate": 3.9690314166257186e-06, "loss": 0.5294, "mean_token_accuracy": 0.8334100112318993, "num_tokens": 157654699.0, "step": 131060 }, { "entropy": 1.9367137864232062, "epoch": 0.40630553202635855, "grad_norm": 9.918806076049805, "learning_rate": 3.968880003739205e-06, "loss": 0.4602, "mean_token_accuracy": 0.851118703186512, "num_tokens": 157665786.0, "step": 131070 }, { "entropy": 1.8767028152942657, "epoch": 0.4063365311514083, "grad_norm": 10.18786334991455, "learning_rate": 3.9687286081799244e-06, "loss": 0.4366, "mean_token_accuracy": 0.8559330344200134, "num_tokens": 157678031.0, "step": 131080 }, { "entropy": 1.9314553529024123, "epoch": 0.40636753027645794, "grad_norm": 6.053582191467285, "learning_rate": 3.9685772299445754e-06, "loss": 0.4421, "mean_token_accuracy": 0.8622308388352394, "num_tokens": 157689998.0, "step": 131090 }, { "entropy": 1.6999683201313018, "epoch": 0.40639852940150767, "grad_norm": 5.942327499389648, "learning_rate": 3.9684258690298525e-06, "loss": 0.3754, "mean_token_accuracy": 0.8629785016179085, "num_tokens": 157704433.0, "step": 131100 }, { "entropy": 1.864445061981678, "epoch": 0.40642952852655734, "grad_norm": 7.0887346267700195, "learning_rate": 3.968274525432454e-06, "loss": 0.4122, "mean_token_accuracy": 0.8601906552910805, "num_tokens": 157717013.0, "step": 131110 }, { "entropy": 1.8925062775611878, "epoch": 0.406460527651607, "grad_norm": 10.952113151550293, "learning_rate": 3.968123199149077e-06, "loss": 0.4281, "mean_token_accuracy": 0.8586850896477699, "num_tokens": 157729145.0, "step": 131120 }, { "entropy": 1.9502949953079223, "epoch": 0.40649152677665673, "grad_norm": 10.193359375, "learning_rate": 3.967971890176421e-06, "loss": 0.4922, "mean_token_accuracy": 0.8508687302470207, "num_tokens": 157740267.0, "step": 131130 }, { "entropy": 1.9118008241057396, "epoch": 0.4065225259017064, "grad_norm": 3.9366962909698486, "learning_rate": 3.967820598511186e-06, "loss": 0.4496, "mean_token_accuracy": 0.8517689287662507, "num_tokens": 157751681.0, "step": 131140 }, { "entropy": 1.9390445798635483, "epoch": 0.4065535250267561, "grad_norm": 9.202500343322754, "learning_rate": 3.9676693241500725e-06, "loss": 0.4959, "mean_token_accuracy": 0.8437588036060333, "num_tokens": 157762806.0, "step": 131150 }, { "entropy": 1.873625774681568, "epoch": 0.4065845241518058, "grad_norm": 9.123173713684082, "learning_rate": 3.967518067089782e-06, "loss": 0.4701, "mean_token_accuracy": 0.8525558322668075, "num_tokens": 157775344.0, "step": 131160 }, { "entropy": 1.8773087307810783, "epoch": 0.4066155232768555, "grad_norm": 7.2432684898376465, "learning_rate": 3.967366827327019e-06, "loss": 0.421, "mean_token_accuracy": 0.8548357993364334, "num_tokens": 157788421.0, "step": 131170 }, { "entropy": 1.9489660397171975, "epoch": 0.4066465224019052, "grad_norm": 8.408989906311035, "learning_rate": 3.9672156048584825e-06, "loss": 0.5063, "mean_token_accuracy": 0.8430541291832924, "num_tokens": 157799818.0, "step": 131180 }, { "entropy": 1.8156598702073097, "epoch": 0.4066775215269549, "grad_norm": 3.8361010551452637, "learning_rate": 3.9670643996808805e-06, "loss": 0.3904, "mean_token_accuracy": 0.8656172275543212, "num_tokens": 157812552.0, "step": 131190 }, { "entropy": 1.876960425078869, "epoch": 0.4067085206520046, "grad_norm": 4.139932632446289, "learning_rate": 3.966913211790917e-06, "loss": 0.4741, "mean_token_accuracy": 0.8435041248798371, "num_tokens": 157824792.0, "step": 131200 }, { "entropy": 1.8027958139777183, "epoch": 0.4067395197770543, "grad_norm": 8.876256942749023, "learning_rate": 3.966762041185298e-06, "loss": 0.4064, "mean_token_accuracy": 0.8562049359083176, "num_tokens": 157837760.0, "step": 131210 }, { "entropy": 1.845954880863428, "epoch": 0.406770518902104, "grad_norm": 5.402946949005127, "learning_rate": 3.966610887860731e-06, "loss": 0.4745, "mean_token_accuracy": 0.8410774037241936, "num_tokens": 157850821.0, "step": 131220 }, { "entropy": 1.9590726420283318, "epoch": 0.4068015180271537, "grad_norm": 9.471705436706543, "learning_rate": 3.966459751813921e-06, "loss": 0.4975, "mean_token_accuracy": 0.8433647140860557, "num_tokens": 157862272.0, "step": 131230 }, { "entropy": 1.8670239843428136, "epoch": 0.40683251715220337, "grad_norm": 8.589696884155273, "learning_rate": 3.966308633041582e-06, "loss": 0.432, "mean_token_accuracy": 0.8549284100532532, "num_tokens": 157874516.0, "step": 131240 }, { "entropy": 1.8836659103631974, "epoch": 0.4068635162772531, "grad_norm": 7.1528449058532715, "learning_rate": 3.9661575315404185e-06, "loss": 0.536, "mean_token_accuracy": 0.8425373405218124, "num_tokens": 157887098.0, "step": 131250 }, { "entropy": 1.9439944818615913, "epoch": 0.40689451540230276, "grad_norm": 4.756436824798584, "learning_rate": 3.966006447307143e-06, "loss": 0.5211, "mean_token_accuracy": 0.8354384452104568, "num_tokens": 157898487.0, "step": 131260 }, { "entropy": 1.9490654364228248, "epoch": 0.4069255145273525, "grad_norm": 8.603204727172852, "learning_rate": 3.965855380338467e-06, "loss": 0.4939, "mean_token_accuracy": 0.841651976108551, "num_tokens": 157909630.0, "step": 131270 }, { "entropy": 1.831677147746086, "epoch": 0.40695651365240215, "grad_norm": 4.0721611976623535, "learning_rate": 3.965704330631102e-06, "loss": 0.4343, "mean_token_accuracy": 0.8579563215374947, "num_tokens": 157922831.0, "step": 131280 }, { "entropy": 1.8643750965595245, "epoch": 0.4069875127774519, "grad_norm": 9.7418851852417, "learning_rate": 3.965553298181761e-06, "loss": 0.4613, "mean_token_accuracy": 0.848840269446373, "num_tokens": 157934557.0, "step": 131290 }, { "entropy": 1.9244591280817986, "epoch": 0.40701851190250155, "grad_norm": 7.745121955871582, "learning_rate": 3.965402282987159e-06, "loss": 0.4905, "mean_token_accuracy": 0.8503209039568901, "num_tokens": 157945931.0, "step": 131300 }, { "entropy": 1.866335666179657, "epoch": 0.40704951102755127, "grad_norm": 8.668558120727539, "learning_rate": 3.96525128504401e-06, "loss": 0.464, "mean_token_accuracy": 0.849652573466301, "num_tokens": 157958476.0, "step": 131310 }, { "entropy": 1.8344866186380386, "epoch": 0.40708051015260094, "grad_norm": 7.905651569366455, "learning_rate": 3.965100304349029e-06, "loss": 0.4137, "mean_token_accuracy": 0.859797616302967, "num_tokens": 157971189.0, "step": 131320 }, { "entropy": 1.9577922523021698, "epoch": 0.40711150927765066, "grad_norm": 7.413038730621338, "learning_rate": 3.964949340898934e-06, "loss": 0.4654, "mean_token_accuracy": 0.8549491494894028, "num_tokens": 157981983.0, "step": 131330 }, { "entropy": 1.9203011736273765, "epoch": 0.40714250840270033, "grad_norm": 9.896458625793457, "learning_rate": 3.9647983946904416e-06, "loss": 0.462, "mean_token_accuracy": 0.8488733306527138, "num_tokens": 157993609.0, "step": 131340 }, { "entropy": 1.9292757645249368, "epoch": 0.40717350752775006, "grad_norm": 9.621637344360352, "learning_rate": 3.964647465720271e-06, "loss": 0.5658, "mean_token_accuracy": 0.8377192333340645, "num_tokens": 158005340.0, "step": 131350 }, { "entropy": 1.9438342347741127, "epoch": 0.4072045066527997, "grad_norm": 7.706085681915283, "learning_rate": 3.964496553985139e-06, "loss": 0.4935, "mean_token_accuracy": 0.8447881907224655, "num_tokens": 158016548.0, "step": 131360 }, { "entropy": 1.825490552186966, "epoch": 0.4072355057778494, "grad_norm": 4.330419063568115, "learning_rate": 3.964345659481768e-06, "loss": 0.4223, "mean_token_accuracy": 0.8532146289944649, "num_tokens": 158029148.0, "step": 131370 }, { "entropy": 1.8673056378960609, "epoch": 0.4072665049028991, "grad_norm": 7.6978230476379395, "learning_rate": 3.964194782206878e-06, "loss": 0.4265, "mean_token_accuracy": 0.8550721898674964, "num_tokens": 158041404.0, "step": 131380 }, { "entropy": 1.8040344282984733, "epoch": 0.4072975040279488, "grad_norm": 5.431396007537842, "learning_rate": 3.964043922157191e-06, "loss": 0.3902, "mean_token_accuracy": 0.8702194571495057, "num_tokens": 158054518.0, "step": 131390 }, { "entropy": 1.8987352877855301, "epoch": 0.4073285031529985, "grad_norm": 7.753914833068848, "learning_rate": 3.963893079329429e-06, "loss": 0.4926, "mean_token_accuracy": 0.8489699825644493, "num_tokens": 158065523.0, "step": 131400 }, { "entropy": 1.918555736541748, "epoch": 0.4073595022780482, "grad_norm": 8.709218978881836, "learning_rate": 3.9637422537203165e-06, "loss": 0.4729, "mean_token_accuracy": 0.8495317488908768, "num_tokens": 158077808.0, "step": 131410 }, { "entropy": 1.9698147997260094, "epoch": 0.4073905014030979, "grad_norm": 8.470252990722656, "learning_rate": 3.963591445326578e-06, "loss": 0.5221, "mean_token_accuracy": 0.8412455469369888, "num_tokens": 158089356.0, "step": 131420 }, { "entropy": 1.7146087184548378, "epoch": 0.4074215005281476, "grad_norm": 3.0368010997772217, "learning_rate": 3.963440654144938e-06, "loss": 0.3414, "mean_token_accuracy": 0.8623756617307663, "num_tokens": 158104069.0, "step": 131430 }, { "entropy": 1.9334975138306618, "epoch": 0.4074524996531973, "grad_norm": 7.6624884605407715, "learning_rate": 3.963289880172123e-06, "loss": 0.4892, "mean_token_accuracy": 0.8561296299099922, "num_tokens": 158115199.0, "step": 131440 }, { "entropy": 1.9267579466104507, "epoch": 0.40748349877824697, "grad_norm": 7.29675817489624, "learning_rate": 3.96313912340486e-06, "loss": 0.4165, "mean_token_accuracy": 0.8549336135387421, "num_tokens": 158127014.0, "step": 131450 }, { "entropy": 1.8517323270440103, "epoch": 0.4075144979032967, "grad_norm": Infinity, "learning_rate": 3.962988383839877e-06, "loss": 0.4007, "mean_token_accuracy": 0.8570884570479393, "num_tokens": 158139840.0, "step": 131460 }, { "entropy": 1.9198008626699448, "epoch": 0.40754549702834636, "grad_norm": 9.076776504516602, "learning_rate": 3.962837661473903e-06, "loss": 0.4474, "mean_token_accuracy": 0.8622155413031578, "num_tokens": 158150997.0, "step": 131470 }, { "entropy": 1.9445372179150582, "epoch": 0.4075764961533961, "grad_norm": 9.01667308807373, "learning_rate": 3.962686956303667e-06, "loss": 0.5047, "mean_token_accuracy": 0.8443640530109405, "num_tokens": 158162446.0, "step": 131480 }, { "entropy": 1.8989515826106071, "epoch": 0.40760749527844575, "grad_norm": 4.06826639175415, "learning_rate": 3.9625362683259e-06, "loss": 0.4691, "mean_token_accuracy": 0.8507105484604836, "num_tokens": 158174917.0, "step": 131490 }, { "entropy": 1.8795487105846405, "epoch": 0.4076384944034955, "grad_norm": 4.378306865692139, "learning_rate": 3.962385597537333e-06, "loss": 0.4443, "mean_token_accuracy": 0.8498224586248397, "num_tokens": 158187150.0, "step": 131500 }, { "entropy": 1.709404329955578, "epoch": 0.40766949352854515, "grad_norm": 7.759102821350098, "learning_rate": 3.9622349439346985e-06, "loss": 0.348, "mean_token_accuracy": 0.8657671257853508, "num_tokens": 158201223.0, "step": 131510 }, { "entropy": 1.9634048074483872, "epoch": 0.40770049265359487, "grad_norm": 9.048603057861328, "learning_rate": 3.962084307514729e-06, "loss": 0.5274, "mean_token_accuracy": 0.8406730964779854, "num_tokens": 158212564.0, "step": 131520 }, { "entropy": 1.9156591862440109, "epoch": 0.40773149177864454, "grad_norm": 8.055933952331543, "learning_rate": 3.9619336882741595e-06, "loss": 0.4562, "mean_token_accuracy": 0.8517060205340385, "num_tokens": 158224398.0, "step": 131530 }, { "entropy": 1.851639135181904, "epoch": 0.40776249090369426, "grad_norm": 4.349390029907227, "learning_rate": 3.961783086209726e-06, "loss": 0.4029, "mean_token_accuracy": 0.8623872861266136, "num_tokens": 158237555.0, "step": 131540 }, { "entropy": 1.8899920910596848, "epoch": 0.40779349002874393, "grad_norm": 6.555295467376709, "learning_rate": 3.96163250131816e-06, "loss": 0.4161, "mean_token_accuracy": 0.8546891719102859, "num_tokens": 158249575.0, "step": 131550 }, { "entropy": 1.8582070901989938, "epoch": 0.40782448915379366, "grad_norm": 8.720921516418457, "learning_rate": 3.961481933596203e-06, "loss": 0.4457, "mean_token_accuracy": 0.8513376504182816, "num_tokens": 158262561.0, "step": 131560 }, { "entropy": 1.9029300913214684, "epoch": 0.4078554882788433, "grad_norm": 7.594869136810303, "learning_rate": 3.9613313830405895e-06, "loss": 0.4742, "mean_token_accuracy": 0.8554488077759743, "num_tokens": 158274086.0, "step": 131570 }, { "entropy": 1.8785912677645684, "epoch": 0.40788648740389305, "grad_norm": 4.259986400604248, "learning_rate": 3.961180849648059e-06, "loss": 0.4547, "mean_token_accuracy": 0.8466733306646347, "num_tokens": 158286621.0, "step": 131580 }, { "entropy": 1.8832870185375215, "epoch": 0.4079174865289427, "grad_norm": 3.7143633365631104, "learning_rate": 3.961030333415349e-06, "loss": 0.4362, "mean_token_accuracy": 0.8520120859146119, "num_tokens": 158298950.0, "step": 131590 }, { "entropy": 1.8503879860043526, "epoch": 0.40794848565399244, "grad_norm": 6.5647478103637695, "learning_rate": 3.960879834339202e-06, "loss": 0.3874, "mean_token_accuracy": 0.8644073367118835, "num_tokens": 158311634.0, "step": 131600 }, { "entropy": 1.785201308131218, "epoch": 0.4079794847790421, "grad_norm": 8.026589393615723, "learning_rate": 3.960729352416358e-06, "loss": 0.4128, "mean_token_accuracy": 0.849951197206974, "num_tokens": 158325117.0, "step": 131610 }, { "entropy": 1.9552603572607041, "epoch": 0.4080104839040918, "grad_norm": 8.554306983947754, "learning_rate": 3.960578887643557e-06, "loss": 0.4965, "mean_token_accuracy": 0.846280574798584, "num_tokens": 158336274.0, "step": 131620 }, { "entropy": 1.8943400636315346, "epoch": 0.4080414830291415, "grad_norm": 9.436079978942871, "learning_rate": 3.960428440017544e-06, "loss": 0.4796, "mean_token_accuracy": 0.8490863978862763, "num_tokens": 158347481.0, "step": 131630 }, { "entropy": 1.8362084731459618, "epoch": 0.4080724821541912, "grad_norm": 4.264366626739502, "learning_rate": 3.960278009535063e-06, "loss": 0.4281, "mean_token_accuracy": 0.8494585782289505, "num_tokens": 158360076.0, "step": 131640 }, { "entropy": 1.9207329094409942, "epoch": 0.4081034812792409, "grad_norm": 8.190292358398438, "learning_rate": 3.960127596192855e-06, "loss": 0.4646, "mean_token_accuracy": 0.8595984593033791, "num_tokens": 158371437.0, "step": 131650 }, { "entropy": 1.9442640289664268, "epoch": 0.40813448040429057, "grad_norm": 8.884259223937988, "learning_rate": 3.959977199987669e-06, "loss": 0.4553, "mean_token_accuracy": 0.8574367508292198, "num_tokens": 158382706.0, "step": 131660 }, { "entropy": 1.8724360197782517, "epoch": 0.4081654795293403, "grad_norm": 4.065852642059326, "learning_rate": 3.959826820916251e-06, "loss": 0.4309, "mean_token_accuracy": 0.8485496342182159, "num_tokens": 158394833.0, "step": 131670 }, { "entropy": 1.9313586950302124, "epoch": 0.40819647865438996, "grad_norm": 4.444271087646484, "learning_rate": 3.9596764589753435e-06, "loss": 0.4781, "mean_token_accuracy": 0.8469718486070633, "num_tokens": 158406591.0, "step": 131680 }, { "entropy": 1.9100508004426957, "epoch": 0.4082274777794397, "grad_norm": 4.082368850708008, "learning_rate": 3.959526114161699e-06, "loss": 0.4211, "mean_token_accuracy": 0.8586499407887459, "num_tokens": 158418411.0, "step": 131690 }, { "entropy": 1.8683311700820924, "epoch": 0.40825847690448935, "grad_norm": 3.7615411281585693, "learning_rate": 3.959375786472065e-06, "loss": 0.3954, "mean_token_accuracy": 0.8578609451651573, "num_tokens": 158430806.0, "step": 131700 }, { "entropy": 1.902983796596527, "epoch": 0.4082894760295391, "grad_norm": 6.945703983306885, "learning_rate": 3.95922547590319e-06, "loss": 0.4371, "mean_token_accuracy": 0.8584466874599457, "num_tokens": 158442686.0, "step": 131710 }, { "entropy": 1.8431276768445968, "epoch": 0.40832047515458875, "grad_norm": 8.36220932006836, "learning_rate": 3.959075182451826e-06, "loss": 0.4039, "mean_token_accuracy": 0.8638790413737297, "num_tokens": 158455043.0, "step": 131720 }, { "entropy": 1.8704046532511711, "epoch": 0.40835147427963847, "grad_norm": 8.115221977233887, "learning_rate": 3.958924906114722e-06, "loss": 0.4362, "mean_token_accuracy": 0.8541171133518219, "num_tokens": 158467164.0, "step": 131730 }, { "entropy": 1.89015693962574, "epoch": 0.40838247340468814, "grad_norm": 8.641586303710938, "learning_rate": 3.958774646888633e-06, "loss": 0.5154, "mean_token_accuracy": 0.8427566275000572, "num_tokens": 158479113.0, "step": 131740 }, { "entropy": 1.917370368540287, "epoch": 0.40841347252973786, "grad_norm": 8.092529296875, "learning_rate": 3.958624404770311e-06, "loss": 0.4753, "mean_token_accuracy": 0.8460395202040673, "num_tokens": 158490501.0, "step": 131750 }, { "entropy": 1.8727830082178116, "epoch": 0.40844447165478753, "grad_norm": 4.2208404541015625, "learning_rate": 3.95847417975651e-06, "loss": 0.4159, "mean_token_accuracy": 0.854328741133213, "num_tokens": 158502344.0, "step": 131760 }, { "entropy": 1.8874020084738732, "epoch": 0.40847547077983726, "grad_norm": 8.900503158569336, "learning_rate": 3.958323971843983e-06, "loss": 0.4533, "mean_token_accuracy": 0.8399735167622566, "num_tokens": 158514713.0, "step": 131770 }, { "entropy": 1.9528047412633895, "epoch": 0.4085064699048869, "grad_norm": 6.548738956451416, "learning_rate": 3.958173781029487e-06, "loss": 0.476, "mean_token_accuracy": 0.8562960177659988, "num_tokens": 158526164.0, "step": 131780 }, { "entropy": 1.8597876712679864, "epoch": 0.40853746902993665, "grad_norm": 4.265164375305176, "learning_rate": 3.95802360730978e-06, "loss": 0.3839, "mean_token_accuracy": 0.861919678747654, "num_tokens": 158538663.0, "step": 131790 }, { "entropy": 1.874061642587185, "epoch": 0.4085684681549863, "grad_norm": 3.2929224967956543, "learning_rate": 3.957873450681617e-06, "loss": 0.4689, "mean_token_accuracy": 0.8565281003713607, "num_tokens": 158550773.0, "step": 131800 }, { "entropy": 1.943674847483635, "epoch": 0.40859946728003604, "grad_norm": 9.038628578186035, "learning_rate": 3.9577233111417575e-06, "loss": 0.4782, "mean_token_accuracy": 0.8586441352963448, "num_tokens": 158562087.0, "step": 131810 }, { "entropy": 1.88618975430727, "epoch": 0.4086304664050857, "grad_norm": 8.164410591125488, "learning_rate": 3.95757318868696e-06, "loss": 0.5036, "mean_token_accuracy": 0.8438228338956832, "num_tokens": 158573774.0, "step": 131820 }, { "entropy": 1.9306119337677956, "epoch": 0.40866146553013544, "grad_norm": 9.125570297241211, "learning_rate": 3.957423083313984e-06, "loss": 0.4908, "mean_token_accuracy": 0.839359101653099, "num_tokens": 158585857.0, "step": 131830 }, { "entropy": 1.9744738072156907, "epoch": 0.4086924646551851, "grad_norm": 8.555429458618164, "learning_rate": 3.957272995019592e-06, "loss": 0.5472, "mean_token_accuracy": 0.8404243916273118, "num_tokens": 158597937.0, "step": 131840 }, { "entropy": 1.9038583174347878, "epoch": 0.40872346378023483, "grad_norm": 2.9242498874664307, "learning_rate": 3.9571229238005436e-06, "loss": 0.4247, "mean_token_accuracy": 0.8589347749948502, "num_tokens": 158610210.0, "step": 131850 }, { "entropy": 1.9494069427251817, "epoch": 0.4087544629052845, "grad_norm": 3.915905237197876, "learning_rate": 3.956972869653602e-06, "loss": 0.4873, "mean_token_accuracy": 0.8493471398949624, "num_tokens": 158621519.0, "step": 131860 }, { "entropy": 1.8807632595300674, "epoch": 0.40878546203033417, "grad_norm": 7.95693302154541, "learning_rate": 3.956822832575532e-06, "loss": 0.4319, "mean_token_accuracy": 0.8504085868597031, "num_tokens": 158633877.0, "step": 131870 }, { "entropy": 1.971236227452755, "epoch": 0.4088164611553839, "grad_norm": 7.221638202667236, "learning_rate": 3.956672812563096e-06, "loss": 0.5112, "mean_token_accuracy": 0.8400038599967956, "num_tokens": 158645327.0, "step": 131880 }, { "entropy": 1.9444902956485748, "epoch": 0.40884746028043356, "grad_norm": 8.52728271484375, "learning_rate": 3.956522809613061e-06, "loss": 0.4633, "mean_token_accuracy": 0.8471096143126488, "num_tokens": 158657499.0, "step": 131890 }, { "entropy": 1.8855510473251342, "epoch": 0.4088784594054833, "grad_norm": 6.880878448486328, "learning_rate": 3.95637282372219e-06, "loss": 0.4621, "mean_token_accuracy": 0.856865806877613, "num_tokens": 158669407.0, "step": 131900 }, { "entropy": 1.9185251086950301, "epoch": 0.40890945853053295, "grad_norm": 8.515263557434082, "learning_rate": 3.956222854887252e-06, "loss": 0.4217, "mean_token_accuracy": 0.8487884759902954, "num_tokens": 158681430.0, "step": 131910 }, { "entropy": 1.8058866739273072, "epoch": 0.4089404576555827, "grad_norm": 3.930187463760376, "learning_rate": 3.956072903105014e-06, "loss": 0.3995, "mean_token_accuracy": 0.8495476976037025, "num_tokens": 158695298.0, "step": 131920 }, { "entropy": 1.9411882251501082, "epoch": 0.40897145678063235, "grad_norm": 6.896012306213379, "learning_rate": 3.955922968372246e-06, "loss": 0.4603, "mean_token_accuracy": 0.8471732020378113, "num_tokens": 158706166.0, "step": 131930 }, { "entropy": 1.9487490490078927, "epoch": 0.40900245590568207, "grad_norm": 7.6823015213012695, "learning_rate": 3.955773050685715e-06, "loss": 0.4411, "mean_token_accuracy": 0.8591760620474815, "num_tokens": 158717557.0, "step": 131940 }, { "entropy": 1.855629739165306, "epoch": 0.40903345503073174, "grad_norm": 7.680805683135986, "learning_rate": 3.955623150042193e-06, "loss": 0.4827, "mean_token_accuracy": 0.8519311249256134, "num_tokens": 158730328.0, "step": 131950 }, { "entropy": 1.9119280502200127, "epoch": 0.40906445415578147, "grad_norm": 7.97050666809082, "learning_rate": 3.9554732664384495e-06, "loss": 0.4739, "mean_token_accuracy": 0.8517009600996971, "num_tokens": 158742681.0, "step": 131960 }, { "entropy": 1.8887418761849404, "epoch": 0.40909545328083113, "grad_norm": 4.2701802253723145, "learning_rate": 3.955323399871258e-06, "loss": 0.4534, "mean_token_accuracy": 0.8548057928681374, "num_tokens": 158754792.0, "step": 131970 }, { "entropy": 1.955525654554367, "epoch": 0.40912645240588086, "grad_norm": 8.67664623260498, "learning_rate": 3.955173550337391e-06, "loss": 0.5038, "mean_token_accuracy": 0.8389379262924195, "num_tokens": 158766595.0, "step": 131980 }, { "entropy": 1.9655676484107971, "epoch": 0.4091574515309305, "grad_norm": 7.483652591705322, "learning_rate": 3.955023717833621e-06, "loss": 0.4962, "mean_token_accuracy": 0.8467064067721367, "num_tokens": 158777624.0, "step": 131990 }, { "entropy": 1.8954958975315095, "epoch": 0.40918845065598025, "grad_norm": 8.60129165649414, "learning_rate": 3.954873902356724e-06, "loss": 0.4291, "mean_token_accuracy": 0.8588397979736329, "num_tokens": 158789808.0, "step": 132000 }, { "entropy": 1.9237147703766824, "epoch": 0.4092194497810299, "grad_norm": 7.778072357177734, "learning_rate": 3.9547241039034745e-06, "loss": 0.4504, "mean_token_accuracy": 0.8559079930186272, "num_tokens": 158801021.0, "step": 132010 }, { "entropy": 1.9441052988171577, "epoch": 0.40925044890607964, "grad_norm": 8.119583129882812, "learning_rate": 3.954574322470649e-06, "loss": 0.4965, "mean_token_accuracy": 0.8437977254390716, "num_tokens": 158812670.0, "step": 132020 }, { "entropy": 1.9320714622735977, "epoch": 0.4092814480311293, "grad_norm": 7.461652755737305, "learning_rate": 3.954424558055025e-06, "loss": 0.568, "mean_token_accuracy": 0.842699658870697, "num_tokens": 158823864.0, "step": 132030 }, { "entropy": 1.9182468011975289, "epoch": 0.40931244715617904, "grad_norm": 7.672018051147461, "learning_rate": 3.954274810653379e-06, "loss": 0.4464, "mean_token_accuracy": 0.8570101261138916, "num_tokens": 158835728.0, "step": 132040 }, { "entropy": 1.9373611778020858, "epoch": 0.4093434462812287, "grad_norm": 8.900008201599121, "learning_rate": 3.954125080262492e-06, "loss": 0.4879, "mean_token_accuracy": 0.8497121125459671, "num_tokens": 158846813.0, "step": 132050 }, { "entropy": 1.8692992970347404, "epoch": 0.40937444540627843, "grad_norm": 7.517188549041748, "learning_rate": 3.953975366879141e-06, "loss": 0.4331, "mean_token_accuracy": 0.8590711817145348, "num_tokens": 158858240.0, "step": 132060 }, { "entropy": 1.8589076712727546, "epoch": 0.4094054445313281, "grad_norm": 9.378162384033203, "learning_rate": 3.953825670500109e-06, "loss": 0.4061, "mean_token_accuracy": 0.861855249106884, "num_tokens": 158870488.0, "step": 132070 }, { "entropy": 1.9007334470748902, "epoch": 0.4094364436563778, "grad_norm": 8.594630241394043, "learning_rate": 3.953675991122176e-06, "loss": 0.5, "mean_token_accuracy": 0.8480967715382576, "num_tokens": 158882023.0, "step": 132080 }, { "entropy": 1.9315433949232101, "epoch": 0.4094674427814275, "grad_norm": 3.873143196105957, "learning_rate": 3.953526328742123e-06, "loss": 0.4676, "mean_token_accuracy": 0.8507410883903503, "num_tokens": 158893185.0, "step": 132090 }, { "entropy": 1.9150833919644357, "epoch": 0.4094984419064772, "grad_norm": 3.768460273742676, "learning_rate": 3.953376683356738e-06, "loss": 0.4955, "mean_token_accuracy": 0.8455236718058586, "num_tokens": 158904520.0, "step": 132100 }, { "entropy": 1.8527635991573335, "epoch": 0.4095294410315269, "grad_norm": 8.320259094238281, "learning_rate": 3.953227054962798e-06, "loss": 0.3745, "mean_token_accuracy": 0.8663365572690964, "num_tokens": 158917450.0, "step": 132110 }, { "entropy": 1.8949743598699569, "epoch": 0.40956044015657656, "grad_norm": 3.7683377265930176, "learning_rate": 3.953077443557093e-06, "loss": 0.5338, "mean_token_accuracy": 0.8330464109778404, "num_tokens": 158929795.0, "step": 132120 }, { "entropy": 1.9238438218832017, "epoch": 0.4095914392816263, "grad_norm": 8.981724739074707, "learning_rate": 3.952927849136406e-06, "loss": 0.5059, "mean_token_accuracy": 0.8446515381336213, "num_tokens": 158941331.0, "step": 132130 }, { "entropy": 1.9227786138653755, "epoch": 0.40962243840667595, "grad_norm": 7.715068817138672, "learning_rate": 3.952778271697524e-06, "loss": 0.5024, "mean_token_accuracy": 0.8444668635725975, "num_tokens": 158953028.0, "step": 132140 }, { "entropy": 1.9023156434297561, "epoch": 0.4096534375317257, "grad_norm": 8.244248390197754, "learning_rate": 3.952628711237235e-06, "loss": 0.4696, "mean_token_accuracy": 0.850898091495037, "num_tokens": 158964269.0, "step": 132150 }, { "entropy": 1.8805926099419594, "epoch": 0.40968443665677534, "grad_norm": 7.6930999755859375, "learning_rate": 3.952479167752328e-06, "loss": 0.4305, "mean_token_accuracy": 0.863546060025692, "num_tokens": 158976310.0, "step": 132160 }, { "entropy": 1.7959831669926642, "epoch": 0.40971543578182507, "grad_norm": 5.07602596282959, "learning_rate": 3.952329641239589e-06, "loss": 0.4041, "mean_token_accuracy": 0.8648563221096992, "num_tokens": 158989932.0, "step": 132170 }, { "entropy": 1.9297270089387895, "epoch": 0.40974643490687473, "grad_norm": 7.6279096603393555, "learning_rate": 3.9521801316958105e-06, "loss": 0.4611, "mean_token_accuracy": 0.8553159907460213, "num_tokens": 159001128.0, "step": 132180 }, { "entropy": 1.8945249140262603, "epoch": 0.40977743403192446, "grad_norm": 7.788156032562256, "learning_rate": 3.952030639117782e-06, "loss": 0.4566, "mean_token_accuracy": 0.8605994209647179, "num_tokens": 159012619.0, "step": 132190 }, { "entropy": 1.9229301661252975, "epoch": 0.40980843315697413, "grad_norm": 9.636686325073242, "learning_rate": 3.951881163502295e-06, "loss": 0.4814, "mean_token_accuracy": 0.8494541838765144, "num_tokens": 159023540.0, "step": 132200 }, { "entropy": 1.8150003015995027, "epoch": 0.40983943228202385, "grad_norm": 8.98440933227539, "learning_rate": 3.951731704846143e-06, "loss": 0.3751, "mean_token_accuracy": 0.8665079712867737, "num_tokens": 159036264.0, "step": 132210 }, { "entropy": 1.9748639404773711, "epoch": 0.4098704314070735, "grad_norm": 10.900146484375, "learning_rate": 3.951582263146119e-06, "loss": 0.5727, "mean_token_accuracy": 0.8254036799073219, "num_tokens": 159046762.0, "step": 132220 }, { "entropy": 1.8873142629861832, "epoch": 0.40990143053212325, "grad_norm": 9.89717960357666, "learning_rate": 3.951432838399017e-06, "loss": 0.5366, "mean_token_accuracy": 0.8460609570145607, "num_tokens": 159058574.0, "step": 132230 }, { "entropy": 1.8014112293720246, "epoch": 0.4099324296571729, "grad_norm": 8.128996849060059, "learning_rate": 3.95128343060163e-06, "loss": 0.3828, "mean_token_accuracy": 0.8637454330921173, "num_tokens": 159072392.0, "step": 132240 }, { "entropy": 1.8816646337509155, "epoch": 0.40996342878222264, "grad_norm": 2.441240072250366, "learning_rate": 3.9511340397507555e-06, "loss": 0.4778, "mean_token_accuracy": 0.8551323056221009, "num_tokens": 159084913.0, "step": 132250 }, { "entropy": 1.8489706605672835, "epoch": 0.4099944279072723, "grad_norm": 4.065392971038818, "learning_rate": 3.950984665843191e-06, "loss": 0.4157, "mean_token_accuracy": 0.8606104895472526, "num_tokens": 159097664.0, "step": 132260 }, { "entropy": 1.8708397269248962, "epoch": 0.41002542703232203, "grad_norm": 4.442748069763184, "learning_rate": 3.950835308875733e-06, "loss": 0.4184, "mean_token_accuracy": 0.8600472211837769, "num_tokens": 159109240.0, "step": 132270 }, { "entropy": 1.8376126080751418, "epoch": 0.4100564261573717, "grad_norm": 7.40627908706665, "learning_rate": 3.9506859688451805e-06, "loss": 0.4016, "mean_token_accuracy": 0.8633526057004929, "num_tokens": 159121773.0, "step": 132280 }, { "entropy": 1.9250367239117623, "epoch": 0.4100874252824214, "grad_norm": 7.536020755767822, "learning_rate": 3.950536645748332e-06, "loss": 0.4819, "mean_token_accuracy": 0.8445648729801178, "num_tokens": 159133238.0, "step": 132290 }, { "entropy": 1.8859627723693848, "epoch": 0.4101184244074711, "grad_norm": 10.275099754333496, "learning_rate": 3.950387339581987e-06, "loss": 0.4357, "mean_token_accuracy": 0.8571148261427879, "num_tokens": 159144881.0, "step": 132300 }, { "entropy": 1.8043443158268928, "epoch": 0.4101494235325208, "grad_norm": 6.609226703643799, "learning_rate": 3.950238050342948e-06, "loss": 0.3754, "mean_token_accuracy": 0.8644479662179947, "num_tokens": 159157919.0, "step": 132310 }, { "entropy": 1.9193567991256715, "epoch": 0.4101804226575705, "grad_norm": 8.553771018981934, "learning_rate": 3.950088778028016e-06, "loss": 0.5014, "mean_token_accuracy": 0.8534865245223046, "num_tokens": 159169473.0, "step": 132320 }, { "entropy": 1.9148662850260734, "epoch": 0.4102114217826202, "grad_norm": 5.254544258117676, "learning_rate": 3.949939522633992e-06, "loss": 0.49, "mean_token_accuracy": 0.8425288826227189, "num_tokens": 159181088.0, "step": 132330 }, { "entropy": 1.8751920118927956, "epoch": 0.4102424209076699, "grad_norm": 8.47884750366211, "learning_rate": 3.9497902841576826e-06, "loss": 0.4212, "mean_token_accuracy": 0.8611443415284157, "num_tokens": 159192624.0, "step": 132340 }, { "entropy": 1.9114878982305528, "epoch": 0.41027342003271955, "grad_norm": 8.459266662597656, "learning_rate": 3.949641062595889e-06, "loss": 0.4893, "mean_token_accuracy": 0.8466789424419403, "num_tokens": 159203363.0, "step": 132350 }, { "entropy": 1.9088921546936035, "epoch": 0.4103044191577693, "grad_norm": 7.341428756713867, "learning_rate": 3.949491857945419e-06, "loss": 0.4507, "mean_token_accuracy": 0.8548379436135292, "num_tokens": 159215263.0, "step": 132360 }, { "entropy": 1.8097696974873543, "epoch": 0.41033541828281894, "grad_norm": 8.788670539855957, "learning_rate": 3.949342670203077e-06, "loss": 0.4085, "mean_token_accuracy": 0.8639630541205406, "num_tokens": 159227890.0, "step": 132370 }, { "entropy": 1.921763353049755, "epoch": 0.41036641740786867, "grad_norm": 8.037771224975586, "learning_rate": 3.94919349936567e-06, "loss": 0.5278, "mean_token_accuracy": 0.8411315530538559, "num_tokens": 159239371.0, "step": 132380 }, { "entropy": 1.9325927823781968, "epoch": 0.41039741653291834, "grad_norm": 9.67178726196289, "learning_rate": 3.949044345430004e-06, "loss": 0.4994, "mean_token_accuracy": 0.8497194886207581, "num_tokens": 159249796.0, "step": 132390 }, { "entropy": 1.8051844477653503, "epoch": 0.41042841565796806, "grad_norm": 5.3989691734313965, "learning_rate": 3.94889520839289e-06, "loss": 0.4265, "mean_token_accuracy": 0.8578594639897347, "num_tokens": 159262693.0, "step": 132400 }, { "entropy": 1.9161362245678901, "epoch": 0.41045941478301773, "grad_norm": 8.300895690917969, "learning_rate": 3.948746088251138e-06, "loss": 0.4547, "mean_token_accuracy": 0.853403514623642, "num_tokens": 159274712.0, "step": 132410 }, { "entropy": 1.8416450381278993, "epoch": 0.41049041390806745, "grad_norm": 6.615303039550781, "learning_rate": 3.948596985001556e-06, "loss": 0.3615, "mean_token_accuracy": 0.8720286816358567, "num_tokens": 159287211.0, "step": 132420 }, { "entropy": 1.935446311533451, "epoch": 0.4105214130331171, "grad_norm": 8.131443977355957, "learning_rate": 3.948447898640955e-06, "loss": 0.4783, "mean_token_accuracy": 0.8410095199942589, "num_tokens": 159298375.0, "step": 132430 }, { "entropy": 1.8042462676763535, "epoch": 0.41055241215816685, "grad_norm": 6.6756911277771, "learning_rate": 3.948298829166149e-06, "loss": 0.4102, "mean_token_accuracy": 0.8577197238802909, "num_tokens": 159311942.0, "step": 132440 }, { "entropy": 1.8535395577549933, "epoch": 0.4105834112832165, "grad_norm": 8.319269180297852, "learning_rate": 3.948149776573948e-06, "loss": 0.5231, "mean_token_accuracy": 0.841600701212883, "num_tokens": 159325565.0, "step": 132450 }, { "entropy": 1.853706520795822, "epoch": 0.41061441040826624, "grad_norm": 7.631629467010498, "learning_rate": 3.948000740861168e-06, "loss": 0.4186, "mean_token_accuracy": 0.8647748947143554, "num_tokens": 159337661.0, "step": 132460 }, { "entropy": 1.8631751976907254, "epoch": 0.4106454095333159, "grad_norm": 3.5813517570495605, "learning_rate": 3.947851722024622e-06, "loss": 0.4183, "mean_token_accuracy": 0.8632732674479484, "num_tokens": 159349920.0, "step": 132470 }, { "entropy": 1.9363053843379021, "epoch": 0.41067640865836563, "grad_norm": 7.310238361358643, "learning_rate": 3.947702720061125e-06, "loss": 0.4557, "mean_token_accuracy": 0.8465033307671547, "num_tokens": 159361594.0, "step": 132480 }, { "entropy": 1.8684279143810272, "epoch": 0.4107074077834153, "grad_norm": 9.23641300201416, "learning_rate": 3.947553734967494e-06, "loss": 0.4332, "mean_token_accuracy": 0.8531864553689956, "num_tokens": 159373909.0, "step": 132490 }, { "entropy": 1.9086994245648383, "epoch": 0.410738406908465, "grad_norm": 9.247127532958984, "learning_rate": 3.947404766740546e-06, "loss": 0.4944, "mean_token_accuracy": 0.8483163312077522, "num_tokens": 159386422.0, "step": 132500 }, { "entropy": 1.9499953478574752, "epoch": 0.4107694060335147, "grad_norm": 7.84163236618042, "learning_rate": 3.947255815377098e-06, "loss": 0.4753, "mean_token_accuracy": 0.8528471052646637, "num_tokens": 159397202.0, "step": 132510 }, { "entropy": 1.8840179577469827, "epoch": 0.4108004051585644, "grad_norm": 4.282788276672363, "learning_rate": 3.947106880873969e-06, "loss": 0.4659, "mean_token_accuracy": 0.8478341817855835, "num_tokens": 159410046.0, "step": 132520 }, { "entropy": 1.7982640989124774, "epoch": 0.4108314042836141, "grad_norm": 4.207269191741943, "learning_rate": 3.946957963227978e-06, "loss": 0.3782, "mean_token_accuracy": 0.8666106954216957, "num_tokens": 159422809.0, "step": 132530 }, { "entropy": 1.8728843182325363, "epoch": 0.4108624034086638, "grad_norm": 4.20776891708374, "learning_rate": 3.946809062435946e-06, "loss": 0.4235, "mean_token_accuracy": 0.8539015829563141, "num_tokens": 159435471.0, "step": 132540 }, { "entropy": 1.8800769940018653, "epoch": 0.4108934025337135, "grad_norm": 8.50875473022461, "learning_rate": 3.9466601784946935e-06, "loss": 0.4762, "mean_token_accuracy": 0.8496467992663383, "num_tokens": 159448258.0, "step": 132550 }, { "entropy": 1.824411703646183, "epoch": 0.4109244016587632, "grad_norm": 7.829589366912842, "learning_rate": 3.946511311401043e-06, "loss": 0.3826, "mean_token_accuracy": 0.8696093156933784, "num_tokens": 159460857.0, "step": 132560 }, { "entropy": 1.8826001703739166, "epoch": 0.4109554007838129, "grad_norm": 8.163921356201172, "learning_rate": 3.946362461151816e-06, "loss": 0.4364, "mean_token_accuracy": 0.8533704668283463, "num_tokens": 159473481.0, "step": 132570 }, { "entropy": 1.841157278418541, "epoch": 0.4109863999088626, "grad_norm": 8.525252342224121, "learning_rate": 3.946213627743839e-06, "loss": 0.3758, "mean_token_accuracy": 0.8639968425035477, "num_tokens": 159485282.0, "step": 132580 }, { "entropy": 1.903964453935623, "epoch": 0.41101739903391227, "grad_norm": 8.637951850891113, "learning_rate": 3.9460648111739346e-06, "loss": 0.4897, "mean_token_accuracy": 0.8393364146351814, "num_tokens": 159496784.0, "step": 132590 }, { "entropy": 1.8756361320614814, "epoch": 0.41104839815896194, "grad_norm": 9.585840225219727, "learning_rate": 3.945916011438926e-06, "loss": 0.463, "mean_token_accuracy": 0.8499387681484223, "num_tokens": 159508357.0, "step": 132600 }, { "entropy": 1.8971466958522796, "epoch": 0.41107939728401166, "grad_norm": 7.960050106048584, "learning_rate": 3.945767228535644e-06, "loss": 0.4611, "mean_token_accuracy": 0.8593509659171105, "num_tokens": 159519968.0, "step": 132610 }, { "entropy": 1.928882573544979, "epoch": 0.41111039640906133, "grad_norm": 7.798891544342041, "learning_rate": 3.9456184624609115e-06, "loss": 0.4672, "mean_token_accuracy": 0.8493491113185883, "num_tokens": 159531286.0, "step": 132620 }, { "entropy": 1.9064328223466873, "epoch": 0.41114139553411105, "grad_norm": 8.213712692260742, "learning_rate": 3.945469713211559e-06, "loss": 0.4889, "mean_token_accuracy": 0.844736622273922, "num_tokens": 159543308.0, "step": 132630 }, { "entropy": 1.898288056254387, "epoch": 0.4111723946591607, "grad_norm": 7.8646039962768555, "learning_rate": 3.945320980784413e-06, "loss": 0.5054, "mean_token_accuracy": 0.8415489286184311, "num_tokens": 159555394.0, "step": 132640 }, { "entropy": 1.8438436210155487, "epoch": 0.41120339378421045, "grad_norm": 8.93295955657959, "learning_rate": 3.945172265176305e-06, "loss": 0.4203, "mean_token_accuracy": 0.8616420149803161, "num_tokens": 159568585.0, "step": 132650 }, { "entropy": 1.909870770573616, "epoch": 0.4112343929092601, "grad_norm": 3.4213240146636963, "learning_rate": 3.945023566384064e-06, "loss": 0.4484, "mean_token_accuracy": 0.856622938811779, "num_tokens": 159580433.0, "step": 132660 }, { "entropy": 1.9142403885722161, "epoch": 0.41126539203430984, "grad_norm": 4.372912883758545, "learning_rate": 3.944874884404522e-06, "loss": 0.4383, "mean_token_accuracy": 0.8533108174800873, "num_tokens": 159592231.0, "step": 132670 }, { "entropy": 1.9566717997193337, "epoch": 0.4112963911593595, "grad_norm": 7.756010055541992, "learning_rate": 3.944726219234511e-06, "loss": 0.4849, "mean_token_accuracy": 0.8435320198535919, "num_tokens": 159603617.0, "step": 132680 }, { "entropy": 1.7626475676894189, "epoch": 0.41132739028440923, "grad_norm": 9.682427406311035, "learning_rate": 3.944577570870863e-06, "loss": 0.3736, "mean_token_accuracy": 0.8564101874828338, "num_tokens": 159617560.0, "step": 132690 }, { "entropy": 1.9389828056097032, "epoch": 0.4113583894094589, "grad_norm": 7.8840179443359375, "learning_rate": 3.944428939310412e-06, "loss": 0.4442, "mean_token_accuracy": 0.8546505197882652, "num_tokens": 159628901.0, "step": 132700 }, { "entropy": 1.7881031468510629, "epoch": 0.4113893885345086, "grad_norm": 3.9923946857452393, "learning_rate": 3.944280324549993e-06, "loss": 0.3477, "mean_token_accuracy": 0.8698825523257255, "num_tokens": 159642790.0, "step": 132710 }, { "entropy": 1.9210022300481797, "epoch": 0.4114203876595583, "grad_norm": 6.981124401092529, "learning_rate": 3.944131726586441e-06, "loss": 0.4662, "mean_token_accuracy": 0.842510312795639, "num_tokens": 159654565.0, "step": 132720 }, { "entropy": 1.7577592477202415, "epoch": 0.411451386784608, "grad_norm": 7.697649002075195, "learning_rate": 3.943983145416592e-06, "loss": 0.3523, "mean_token_accuracy": 0.8635148763656616, "num_tokens": 159668041.0, "step": 132730 }, { "entropy": 1.9331092268228531, "epoch": 0.4114823859096577, "grad_norm": 7.486753463745117, "learning_rate": 3.943834581037284e-06, "loss": 0.4667, "mean_token_accuracy": 0.8566341996192932, "num_tokens": 159679025.0, "step": 132740 }, { "entropy": 1.8456541523337364, "epoch": 0.4115133850347074, "grad_norm": 8.025050163269043, "learning_rate": 3.9436860334453535e-06, "loss": 0.3865, "mean_token_accuracy": 0.8562669888138771, "num_tokens": 159691580.0, "step": 132750 }, { "entropy": 1.8954646542668343, "epoch": 0.4115443841597571, "grad_norm": 8.532896995544434, "learning_rate": 3.94353750263764e-06, "loss": 0.4245, "mean_token_accuracy": 0.8576274499297142, "num_tokens": 159702748.0, "step": 132760 }, { "entropy": 1.8293071150779725, "epoch": 0.4115753832848068, "grad_norm": 3.7488701343536377, "learning_rate": 3.943388988610982e-06, "loss": 0.3922, "mean_token_accuracy": 0.8620394229888916, "num_tokens": 159715325.0, "step": 132770 }, { "entropy": 1.8163642302155494, "epoch": 0.4116063824098565, "grad_norm": 11.977995872497559, "learning_rate": 3.943240491362222e-06, "loss": 0.4271, "mean_token_accuracy": 0.8522105798125267, "num_tokens": 159728453.0, "step": 132780 }, { "entropy": 1.8184501871466636, "epoch": 0.4116373815349062, "grad_norm": 7.925955772399902, "learning_rate": 3.9430920108882e-06, "loss": 0.464, "mean_token_accuracy": 0.8594678580760956, "num_tokens": 159741671.0, "step": 132790 }, { "entropy": 1.8316495031118394, "epoch": 0.41166838065995587, "grad_norm": 8.928302764892578, "learning_rate": 3.942943547185757e-06, "loss": 0.4176, "mean_token_accuracy": 0.86448425501585, "num_tokens": 159754290.0, "step": 132800 }, { "entropy": 1.8046483889222145, "epoch": 0.4116993797850056, "grad_norm": 3.8450210094451904, "learning_rate": 3.942795100251737e-06, "loss": 0.4365, "mean_token_accuracy": 0.8553688034415246, "num_tokens": 159766765.0, "step": 132810 }, { "entropy": 1.888925837725401, "epoch": 0.41173037891005526, "grad_norm": 9.118193626403809, "learning_rate": 3.942646670082983e-06, "loss": 0.4684, "mean_token_accuracy": 0.8464773207902908, "num_tokens": 159779056.0, "step": 132820 }, { "entropy": 1.900699371099472, "epoch": 0.411761378035105, "grad_norm": 7.870625972747803, "learning_rate": 3.942498256676342e-06, "loss": 0.4509, "mean_token_accuracy": 0.8564991891384125, "num_tokens": 159790237.0, "step": 132830 }, { "entropy": 1.9068083673715592, "epoch": 0.41179237716015465, "grad_norm": 9.596370697021484, "learning_rate": 3.942349860028655e-06, "loss": 0.4538, "mean_token_accuracy": 0.8504379868507386, "num_tokens": 159802324.0, "step": 132840 }, { "entropy": 1.95052922219038, "epoch": 0.4118233762852043, "grad_norm": 7.9511494636535645, "learning_rate": 3.942201480136772e-06, "loss": 0.4751, "mean_token_accuracy": 0.8542814716696739, "num_tokens": 159813411.0, "step": 132850 }, { "entropy": 1.9478548653423786, "epoch": 0.41185437541025405, "grad_norm": 7.628299713134766, "learning_rate": 3.942053116997537e-06, "loss": 0.4533, "mean_token_accuracy": 0.8497453570365906, "num_tokens": 159824950.0, "step": 132860 }, { "entropy": 1.9837032228708267, "epoch": 0.4118853745353037, "grad_norm": 8.014751434326172, "learning_rate": 3.941904770607801e-06, "loss": 0.5192, "mean_token_accuracy": 0.8386216104030609, "num_tokens": 159836448.0, "step": 132870 }, { "entropy": 1.881002813577652, "epoch": 0.41191637366035344, "grad_norm": 7.482250213623047, "learning_rate": 3.94175644096441e-06, "loss": 0.4778, "mean_token_accuracy": 0.8465662628412247, "num_tokens": 159847845.0, "step": 132880 }, { "entropy": 1.8887635886669158, "epoch": 0.4119473727854031, "grad_norm": 3.9271512031555176, "learning_rate": 3.941608128064214e-06, "loss": 0.4312, "mean_token_accuracy": 0.8607404991984368, "num_tokens": 159860022.0, "step": 132890 }, { "entropy": 1.9076422840356826, "epoch": 0.41197837191045283, "grad_norm": 4.137758255004883, "learning_rate": 3.9414598319040645e-06, "loss": 0.4207, "mean_token_accuracy": 0.8553347662091255, "num_tokens": 159872269.0, "step": 132900 }, { "entropy": 1.8967055141925813, "epoch": 0.4120093710355025, "grad_norm": 6.800586700439453, "learning_rate": 3.941311552480813e-06, "loss": 0.4575, "mean_token_accuracy": 0.8462317153811455, "num_tokens": 159884036.0, "step": 132910 }, { "entropy": 1.9517858356237412, "epoch": 0.4120403701605522, "grad_norm": 8.87108039855957, "learning_rate": 3.941163289791309e-06, "loss": 0.4335, "mean_token_accuracy": 0.8600587204098702, "num_tokens": 159896130.0, "step": 132920 }, { "entropy": 1.9215604767203331, "epoch": 0.4120713692856019, "grad_norm": 8.849936485290527, "learning_rate": 3.941015043832408e-06, "loss": 0.4597, "mean_token_accuracy": 0.8451809346675873, "num_tokens": 159908040.0, "step": 132930 }, { "entropy": 1.9565720595419407, "epoch": 0.4121023684106516, "grad_norm": 7.92255973815918, "learning_rate": 3.940866814600961e-06, "loss": 0.4594, "mean_token_accuracy": 0.8439367547631264, "num_tokens": 159919326.0, "step": 132940 }, { "entropy": 1.9593399614095688, "epoch": 0.4121333675357013, "grad_norm": 7.588318824768066, "learning_rate": 3.940718602093826e-06, "loss": 0.4888, "mean_token_accuracy": 0.850217179954052, "num_tokens": 159930913.0, "step": 132950 }, { "entropy": 1.9544413655996322, "epoch": 0.412164366660751, "grad_norm": 8.658806800842285, "learning_rate": 3.9405704063078546e-06, "loss": 0.5432, "mean_token_accuracy": 0.841646321117878, "num_tokens": 159942302.0, "step": 132960 }, { "entropy": 1.9482817143201827, "epoch": 0.4121953657858007, "grad_norm": 9.174203872680664, "learning_rate": 3.940422227239905e-06, "loss": 0.5234, "mean_token_accuracy": 0.8407966732978821, "num_tokens": 159953706.0, "step": 132970 }, { "entropy": 1.8826787516474723, "epoch": 0.4122263649108504, "grad_norm": 10.95289421081543, "learning_rate": 3.9402740648868335e-06, "loss": 0.4539, "mean_token_accuracy": 0.84808798879385, "num_tokens": 159965894.0, "step": 132980 }, { "entropy": 1.8592276245355606, "epoch": 0.4122573640359001, "grad_norm": 9.670906066894531, "learning_rate": 3.940125919245498e-06, "loss": 0.4063, "mean_token_accuracy": 0.8619933322072029, "num_tokens": 159978522.0, "step": 132990 }, { "entropy": 1.9317376971244813, "epoch": 0.4122883631609498, "grad_norm": 7.894320011138916, "learning_rate": 3.939977790312759e-06, "loss": 0.4869, "mean_token_accuracy": 0.8445814654231072, "num_tokens": 159989620.0, "step": 133000 }, { "entropy": 2.002483421564102, "epoch": 0.41231936228599947, "grad_norm": 9.11700439453125, "learning_rate": 3.939829678085473e-06, "loss": 0.5269, "mean_token_accuracy": 0.8350201919674873, "num_tokens": 160001293.0, "step": 133010 }, { "entropy": 1.914002077281475, "epoch": 0.4123503614110492, "grad_norm": 8.952300071716309, "learning_rate": 3.939681582560501e-06, "loss": 0.4696, "mean_token_accuracy": 0.8454202443361283, "num_tokens": 160013020.0, "step": 133020 }, { "entropy": 1.9585654482245445, "epoch": 0.41238136053609886, "grad_norm": 7.388771057128906, "learning_rate": 3.939533503734705e-06, "loss": 0.4946, "mean_token_accuracy": 0.8444287464022636, "num_tokens": 160024299.0, "step": 133030 }, { "entropy": 1.880546286702156, "epoch": 0.4124123596611486, "grad_norm": 3.908165454864502, "learning_rate": 3.939385441604947e-06, "loss": 0.4181, "mean_token_accuracy": 0.859189136326313, "num_tokens": 160036416.0, "step": 133040 }, { "entropy": 1.8767894744873046, "epoch": 0.41244335878619826, "grad_norm": 7.4038472175598145, "learning_rate": 3.939237396168088e-06, "loss": 0.3913, "mean_token_accuracy": 0.8633315816521645, "num_tokens": 160048717.0, "step": 133050 }, { "entropy": 1.9026233911514283, "epoch": 0.412474357911248, "grad_norm": 8.312832832336426, "learning_rate": 3.939089367420993e-06, "loss": 0.4479, "mean_token_accuracy": 0.852534967660904, "num_tokens": 160060661.0, "step": 133060 }, { "entropy": 1.8906838029623032, "epoch": 0.41250535703629765, "grad_norm": 7.901620864868164, "learning_rate": 3.938941355360527e-06, "loss": 0.4, "mean_token_accuracy": 0.8649371221661568, "num_tokens": 160073055.0, "step": 133070 }, { "entropy": 1.9295494481921196, "epoch": 0.4125363561613474, "grad_norm": 7.450974941253662, "learning_rate": 3.938793359983554e-06, "loss": 0.4751, "mean_token_accuracy": 0.8515502735972404, "num_tokens": 160084739.0, "step": 133080 }, { "entropy": 1.8649974435567855, "epoch": 0.41256735528639704, "grad_norm": 3.9373440742492676, "learning_rate": 3.93864538128694e-06, "loss": 0.4441, "mean_token_accuracy": 0.8602189511060715, "num_tokens": 160097401.0, "step": 133090 }, { "entropy": 1.88185980245471, "epoch": 0.4125983544114467, "grad_norm": 8.116096496582031, "learning_rate": 3.938497419267553e-06, "loss": 0.369, "mean_token_accuracy": 0.8672322928905487, "num_tokens": 160109518.0, "step": 133100 }, { "entropy": 1.9526149734854699, "epoch": 0.41262935353649643, "grad_norm": 10.256512641906738, "learning_rate": 3.938349473922259e-06, "loss": 0.4795, "mean_token_accuracy": 0.8465583026409149, "num_tokens": 160120718.0, "step": 133110 }, { "entropy": 1.8862829342484475, "epoch": 0.4126603526615461, "grad_norm": 7.902219295501709, "learning_rate": 3.938201545247929e-06, "loss": 0.4305, "mean_token_accuracy": 0.8557077690958976, "num_tokens": 160133234.0, "step": 133120 }, { "entropy": 1.9045727148652076, "epoch": 0.41269135178659583, "grad_norm": 8.710145950317383, "learning_rate": 3.93805363324143e-06, "loss": 0.4381, "mean_token_accuracy": 0.8461684197187423, "num_tokens": 160145332.0, "step": 133130 }, { "entropy": 1.8291308224201202, "epoch": 0.4127223509116455, "grad_norm": 9.70020866394043, "learning_rate": 3.937905737899633e-06, "loss": 0.4424, "mean_token_accuracy": 0.8510018140077591, "num_tokens": 160158147.0, "step": 133140 }, { "entropy": 1.8897850081324576, "epoch": 0.4127533500366952, "grad_norm": 8.668664932250977, "learning_rate": 3.937757859219409e-06, "loss": 0.4277, "mean_token_accuracy": 0.8535225465893745, "num_tokens": 160169771.0, "step": 133150 }, { "entropy": 1.9326857790350913, "epoch": 0.4127843491617449, "grad_norm": 10.146557807922363, "learning_rate": 3.93760999719763e-06, "loss": 0.5089, "mean_token_accuracy": 0.8501642525196076, "num_tokens": 160180380.0, "step": 133160 }, { "entropy": 1.8924714922904968, "epoch": 0.4128153482867946, "grad_norm": 8.729730606079102, "learning_rate": 3.937462151831168e-06, "loss": 0.4288, "mean_token_accuracy": 0.8599536821246148, "num_tokens": 160192793.0, "step": 133170 }, { "entropy": 1.8484364092350005, "epoch": 0.4128463474118443, "grad_norm": 7.995754718780518, "learning_rate": 3.937314323116897e-06, "loss": 0.3844, "mean_token_accuracy": 0.8631056323647499, "num_tokens": 160204883.0, "step": 133180 }, { "entropy": 1.8777808651328087, "epoch": 0.412877346536894, "grad_norm": 9.567119598388672, "learning_rate": 3.937166511051691e-06, "loss": 0.4602, "mean_token_accuracy": 0.8523745030164719, "num_tokens": 160217041.0, "step": 133190 }, { "entropy": 1.8994059637188911, "epoch": 0.4129083456619437, "grad_norm": 7.305065155029297, "learning_rate": 3.937018715632426e-06, "loss": 0.4384, "mean_token_accuracy": 0.8568759083747863, "num_tokens": 160229204.0, "step": 133200 }, { "entropy": 1.8381987407803535, "epoch": 0.4129393447869934, "grad_norm": 7.50701379776001, "learning_rate": 3.936870936855977e-06, "loss": 0.4089, "mean_token_accuracy": 0.8586773455142975, "num_tokens": 160242271.0, "step": 133210 }, { "entropy": 1.9254462212324142, "epoch": 0.41297034391204307, "grad_norm": 9.528829574584961, "learning_rate": 3.93672317471922e-06, "loss": 0.4837, "mean_token_accuracy": 0.8490356847643852, "num_tokens": 160253561.0, "step": 133220 }, { "entropy": 1.9055677652359009, "epoch": 0.4130013430370928, "grad_norm": 7.700775146484375, "learning_rate": 3.9365754292190345e-06, "loss": 0.4233, "mean_token_accuracy": 0.8636551991105079, "num_tokens": 160264851.0, "step": 133230 }, { "entropy": 1.9295484229922295, "epoch": 0.41303234216214246, "grad_norm": 8.420736312866211, "learning_rate": 3.936427700352297e-06, "loss": 0.438, "mean_token_accuracy": 0.8591367319226265, "num_tokens": 160276017.0, "step": 133240 }, { "entropy": 1.9808582752943038, "epoch": 0.4130633412871922, "grad_norm": 6.451159954071045, "learning_rate": 3.936279988115888e-06, "loss": 0.4928, "mean_token_accuracy": 0.8498171672224999, "num_tokens": 160286922.0, "step": 133250 }, { "entropy": 1.8775882482528687, "epoch": 0.41309434041224186, "grad_norm": 8.08848762512207, "learning_rate": 3.936132292506687e-06, "loss": 0.4848, "mean_token_accuracy": 0.8411854594945908, "num_tokens": 160299889.0, "step": 133260 }, { "entropy": 1.9185838371515274, "epoch": 0.4131253395372916, "grad_norm": 8.169017791748047, "learning_rate": 3.935984613521574e-06, "loss": 0.4792, "mean_token_accuracy": 0.8455637603998184, "num_tokens": 160311587.0, "step": 133270 }, { "entropy": 1.927768488228321, "epoch": 0.41315633866234125, "grad_norm": 8.6692476272583, "learning_rate": 3.9358369511574325e-06, "loss": 0.4657, "mean_token_accuracy": 0.8541040673851967, "num_tokens": 160322859.0, "step": 133280 }, { "entropy": 1.980207970738411, "epoch": 0.413187337787391, "grad_norm": 9.063236236572266, "learning_rate": 3.935689305411144e-06, "loss": 0.5183, "mean_token_accuracy": 0.8438897222280503, "num_tokens": 160333949.0, "step": 133290 }, { "entropy": 1.8348941326141357, "epoch": 0.41321833691244064, "grad_norm": 11.424354553222656, "learning_rate": 3.935541676279592e-06, "loss": 0.4391, "mean_token_accuracy": 0.857143734395504, "num_tokens": 160346373.0, "step": 133300 }, { "entropy": 1.9375634253025056, "epoch": 0.41324933603749037, "grad_norm": 7.8653435707092285, "learning_rate": 3.93539406375966e-06, "loss": 0.4542, "mean_token_accuracy": 0.8492831483483314, "num_tokens": 160357451.0, "step": 133310 }, { "entropy": 1.8438090533018112, "epoch": 0.41328033516254004, "grad_norm": 8.466445922851562, "learning_rate": 3.9352464678482325e-06, "loss": 0.4196, "mean_token_accuracy": 0.8501735106110573, "num_tokens": 160370838.0, "step": 133320 }, { "entropy": 1.9386936947703362, "epoch": 0.41331133428758976, "grad_norm": 8.103391647338867, "learning_rate": 3.935098888542198e-06, "loss": 0.4446, "mean_token_accuracy": 0.8613961502909661, "num_tokens": 160382175.0, "step": 133330 }, { "entropy": 1.8318694144487382, "epoch": 0.41334233341263943, "grad_norm": 8.630756378173828, "learning_rate": 3.934951325838439e-06, "loss": 0.4651, "mean_token_accuracy": 0.8484767213463783, "num_tokens": 160395178.0, "step": 133340 }, { "entropy": 1.9260124281048774, "epoch": 0.4133733325376891, "grad_norm": 7.492244720458984, "learning_rate": 3.934803779733846e-06, "loss": 0.4451, "mean_token_accuracy": 0.8476378932595253, "num_tokens": 160407014.0, "step": 133350 }, { "entropy": 1.8849865958094596, "epoch": 0.4134043316627388, "grad_norm": 3.519080400466919, "learning_rate": 3.934656250225307e-06, "loss": 0.4288, "mean_token_accuracy": 0.8558978796005249, "num_tokens": 160419398.0, "step": 133360 }, { "entropy": 1.8688034132122993, "epoch": 0.4134353307877885, "grad_norm": 6.8715033531188965, "learning_rate": 3.934508737309709e-06, "loss": 0.4437, "mean_token_accuracy": 0.8502852901816368, "num_tokens": 160431982.0, "step": 133370 }, { "entropy": 1.8360135570168494, "epoch": 0.4134663299128382, "grad_norm": 6.848766326904297, "learning_rate": 3.934361240983944e-06, "loss": 0.4321, "mean_token_accuracy": 0.8653775498270988, "num_tokens": 160444385.0, "step": 133380 }, { "entropy": 1.9299522519111634, "epoch": 0.4134973290378879, "grad_norm": 4.2813496589660645, "learning_rate": 3.934213761244901e-06, "loss": 0.5009, "mean_token_accuracy": 0.8370076134800911, "num_tokens": 160456540.0, "step": 133390 }, { "entropy": 1.8286241069436073, "epoch": 0.4135283281629376, "grad_norm": 8.64360237121582, "learning_rate": 3.934066298089472e-06, "loss": 0.4333, "mean_token_accuracy": 0.8586912482976914, "num_tokens": 160469565.0, "step": 133400 }, { "entropy": 1.9575915843248368, "epoch": 0.4135593272879873, "grad_norm": 5.0459370613098145, "learning_rate": 3.93391885151455e-06, "loss": 0.4756, "mean_token_accuracy": 0.8582988694310189, "num_tokens": 160480566.0, "step": 133410 }, { "entropy": 1.9278161972761154, "epoch": 0.413590326413037, "grad_norm": 7.705987930297852, "learning_rate": 3.933771421517027e-06, "loss": 0.4942, "mean_token_accuracy": 0.8494553253054619, "num_tokens": 160492405.0, "step": 133420 }, { "entropy": 1.9024669751524925, "epoch": 0.41362132553808667, "grad_norm": 7.893537998199463, "learning_rate": 3.9336240080937985e-06, "loss": 0.4601, "mean_token_accuracy": 0.8514198035001754, "num_tokens": 160504029.0, "step": 133430 }, { "entropy": 1.830211453139782, "epoch": 0.4136523246631364, "grad_norm": 5.1870222091674805, "learning_rate": 3.933476611241757e-06, "loss": 0.4069, "mean_token_accuracy": 0.85890783816576, "num_tokens": 160516627.0, "step": 133440 }, { "entropy": 1.8257996559143066, "epoch": 0.41368332378818606, "grad_norm": 3.705263376235962, "learning_rate": 3.9333292309578e-06, "loss": 0.4155, "mean_token_accuracy": 0.8598002254962921, "num_tokens": 160530211.0, "step": 133450 }, { "entropy": 1.9126117467880248, "epoch": 0.4137143229132358, "grad_norm": 4.153369903564453, "learning_rate": 3.933181867238822e-06, "loss": 0.4548, "mean_token_accuracy": 0.8488003075122833, "num_tokens": 160541810.0, "step": 133460 }, { "entropy": 1.9465106219053268, "epoch": 0.41374532203828546, "grad_norm": 8.59892463684082, "learning_rate": 3.933034520081723e-06, "loss": 0.46, "mean_token_accuracy": 0.8590655192732811, "num_tokens": 160552736.0, "step": 133470 }, { "entropy": 1.9508576810359954, "epoch": 0.4137763211633352, "grad_norm": 8.499300003051758, "learning_rate": 3.9328871894833975e-06, "loss": 0.4967, "mean_token_accuracy": 0.8416604056954384, "num_tokens": 160564088.0, "step": 133480 }, { "entropy": 1.9252055808901787, "epoch": 0.41380732028838485, "grad_norm": 8.029823303222656, "learning_rate": 3.932739875440747e-06, "loss": 0.4801, "mean_token_accuracy": 0.8455364406108856, "num_tokens": 160575490.0, "step": 133490 }, { "entropy": 1.8071866802871228, "epoch": 0.4138383194134346, "grad_norm": 8.960620880126953, "learning_rate": 3.93259257795067e-06, "loss": 0.4126, "mean_token_accuracy": 0.8656040906906128, "num_tokens": 160588680.0, "step": 133500 }, { "entropy": 1.877330508828163, "epoch": 0.41386931853848424, "grad_norm": 8.935325622558594, "learning_rate": 3.932445297010065e-06, "loss": 0.4528, "mean_token_accuracy": 0.8558456063270569, "num_tokens": 160600946.0, "step": 133510 }, { "entropy": 1.9224560901522636, "epoch": 0.41390031766353397, "grad_norm": 7.983498573303223, "learning_rate": 3.932298032615838e-06, "loss": 0.4876, "mean_token_accuracy": 0.8463954851031303, "num_tokens": 160612788.0, "step": 133520 }, { "entropy": 1.8346376448869706, "epoch": 0.41393131678858364, "grad_norm": 8.071921348571777, "learning_rate": 3.932150784764887e-06, "loss": 0.4211, "mean_token_accuracy": 0.8672112032771111, "num_tokens": 160625526.0, "step": 133530 }, { "entropy": 1.931402738392353, "epoch": 0.41396231591363336, "grad_norm": 3.571261405944824, "learning_rate": 3.932003553454117e-06, "loss": 0.4441, "mean_token_accuracy": 0.8496660724282264, "num_tokens": 160637476.0, "step": 133540 }, { "entropy": 1.9502887561917306, "epoch": 0.41399331503868303, "grad_norm": 7.512312412261963, "learning_rate": 3.93185633868043e-06, "loss": 0.4495, "mean_token_accuracy": 0.8570262104272842, "num_tokens": 160648694.0, "step": 133550 }, { "entropy": 1.9070443853735923, "epoch": 0.41402431416373275, "grad_norm": 8.191246032714844, "learning_rate": 3.931709140440732e-06, "loss": 0.4601, "mean_token_accuracy": 0.8478936731815339, "num_tokens": 160660627.0, "step": 133560 }, { "entropy": 1.9022893160581589, "epoch": 0.4140553132887824, "grad_norm": 7.333972454071045, "learning_rate": 3.931561958731927e-06, "loss": 0.4469, "mean_token_accuracy": 0.8530973836779594, "num_tokens": 160672648.0, "step": 133570 }, { "entropy": 1.8672500133514405, "epoch": 0.41408631241383215, "grad_norm": 7.397851467132568, "learning_rate": 3.931414793550921e-06, "loss": 0.4088, "mean_token_accuracy": 0.8604272216558456, "num_tokens": 160685095.0, "step": 133580 }, { "entropy": 1.911836712062359, "epoch": 0.4141173115388818, "grad_norm": 9.264906883239746, "learning_rate": 3.9312676448946225e-06, "loss": 0.4732, "mean_token_accuracy": 0.8507508277893067, "num_tokens": 160697135.0, "step": 133590 }, { "entropy": 1.892936834692955, "epoch": 0.4141483106639315, "grad_norm": 8.400348663330078, "learning_rate": 3.931120512759939e-06, "loss": 0.4422, "mean_token_accuracy": 0.8533075451850891, "num_tokens": 160708171.0, "step": 133600 }, { "entropy": 1.9460464030504228, "epoch": 0.4141793097889812, "grad_norm": 7.992114067077637, "learning_rate": 3.930973397143777e-06, "loss": 0.4671, "mean_token_accuracy": 0.8554316237568855, "num_tokens": 160719263.0, "step": 133610 }, { "entropy": 1.8571830540895462, "epoch": 0.4142103089140309, "grad_norm": 9.004242897033691, "learning_rate": 3.930826298043048e-06, "loss": 0.3898, "mean_token_accuracy": 0.868439619243145, "num_tokens": 160731482.0, "step": 133620 }, { "entropy": 1.9615523904561996, "epoch": 0.4142413080390806, "grad_norm": 7.82743501663208, "learning_rate": 3.930679215454661e-06, "loss": 0.5313, "mean_token_accuracy": 0.8472507506608963, "num_tokens": 160741961.0, "step": 133630 }, { "entropy": 1.8198735401034356, "epoch": 0.41427230716413027, "grad_norm": 5.923557281494141, "learning_rate": 3.9305321493755265e-06, "loss": 0.4134, "mean_token_accuracy": 0.8585488334298134, "num_tokens": 160755417.0, "step": 133640 }, { "entropy": 1.8547986596822739, "epoch": 0.41430330628918, "grad_norm": 9.227656364440918, "learning_rate": 3.9303850998025586e-06, "loss": 0.4319, "mean_token_accuracy": 0.8587556973099708, "num_tokens": 160767718.0, "step": 133650 }, { "entropy": 1.8604234397411346, "epoch": 0.41433430541422966, "grad_norm": 7.426607131958008, "learning_rate": 3.930238066732667e-06, "loss": 0.473, "mean_token_accuracy": 0.8567753449082375, "num_tokens": 160780130.0, "step": 133660 }, { "entropy": 1.9464622527360915, "epoch": 0.4143653045392794, "grad_norm": 8.832720756530762, "learning_rate": 3.930091050162768e-06, "loss": 0.4752, "mean_token_accuracy": 0.8490612745285034, "num_tokens": 160790549.0, "step": 133670 }, { "entropy": 1.738626065850258, "epoch": 0.41439630366432906, "grad_norm": 8.568602561950684, "learning_rate": 3.9299440500897725e-06, "loss": 0.3268, "mean_token_accuracy": 0.8719424903392792, "num_tokens": 160804288.0, "step": 133680 }, { "entropy": 1.8175345674157142, "epoch": 0.4144273027893788, "grad_norm": 3.655850410461426, "learning_rate": 3.929797066510598e-06, "loss": 0.403, "mean_token_accuracy": 0.8615160465240479, "num_tokens": 160816494.0, "step": 133690 }, { "entropy": 1.9714103803038596, "epoch": 0.41445830191442845, "grad_norm": 3.3211865425109863, "learning_rate": 3.92965009942216e-06, "loss": 0.4984, "mean_token_accuracy": 0.8397148564457894, "num_tokens": 160827569.0, "step": 133700 }, { "entropy": 1.8605849146842957, "epoch": 0.4144893010394782, "grad_norm": 4.2350921630859375, "learning_rate": 3.929503148821374e-06, "loss": 0.461, "mean_token_accuracy": 0.8473623186349869, "num_tokens": 160840031.0, "step": 133710 }, { "entropy": 1.8704538196325302, "epoch": 0.41452030016452784, "grad_norm": 4.739353656768799, "learning_rate": 3.929356214705158e-06, "loss": 0.4535, "mean_token_accuracy": 0.8485003530979156, "num_tokens": 160851772.0, "step": 133720 }, { "entropy": 1.8266652062535287, "epoch": 0.41455129928957757, "grad_norm": 8.387866020202637, "learning_rate": 3.92920929707043e-06, "loss": 0.4312, "mean_token_accuracy": 0.8575938984751701, "num_tokens": 160864401.0, "step": 133730 }, { "entropy": 1.8190097853541374, "epoch": 0.41458229841462724, "grad_norm": 2.620790719985962, "learning_rate": 3.92906239591411e-06, "loss": 0.3936, "mean_token_accuracy": 0.8680200964212418, "num_tokens": 160877175.0, "step": 133740 }, { "entropy": 1.9274157121777535, "epoch": 0.41461329753967696, "grad_norm": 8.17121696472168, "learning_rate": 3.928915511233117e-06, "loss": 0.481, "mean_token_accuracy": 0.8486859872937202, "num_tokens": 160889187.0, "step": 133750 }, { "entropy": 1.921906155347824, "epoch": 0.41464429666472663, "grad_norm": 8.626646041870117, "learning_rate": 3.928768643024372e-06, "loss": 0.5145, "mean_token_accuracy": 0.8409358784556389, "num_tokens": 160899766.0, "step": 133760 }, { "entropy": 1.8961732387542725, "epoch": 0.41467529578977635, "grad_norm": 7.050107955932617, "learning_rate": 3.928621791284796e-06, "loss": 0.4861, "mean_token_accuracy": 0.8544517681002617, "num_tokens": 160910866.0, "step": 133770 }, { "entropy": 1.9284267202019691, "epoch": 0.414706294914826, "grad_norm": 4.148862361907959, "learning_rate": 3.928474956011312e-06, "loss": 0.4777, "mean_token_accuracy": 0.8424947530031204, "num_tokens": 160922592.0, "step": 133780 }, { "entropy": 1.881701409816742, "epoch": 0.41473729403987575, "grad_norm": 6.7715229988098145, "learning_rate": 3.928328137200842e-06, "loss": 0.475, "mean_token_accuracy": 0.8563426598906517, "num_tokens": 160934482.0, "step": 133790 }, { "entropy": 1.7748121902346612, "epoch": 0.4147682931649254, "grad_norm": 9.0849609375, "learning_rate": 3.92818133485031e-06, "loss": 0.3697, "mean_token_accuracy": 0.870860530436039, "num_tokens": 160947645.0, "step": 133800 }, { "entropy": 1.9011081397533416, "epoch": 0.41479929228997514, "grad_norm": 8.376455307006836, "learning_rate": 3.928034548956642e-06, "loss": 0.4385, "mean_token_accuracy": 0.8554087609052659, "num_tokens": 160959336.0, "step": 133810 }, { "entropy": 1.937306745350361, "epoch": 0.4148302914150248, "grad_norm": 9.280498504638672, "learning_rate": 3.927887779516763e-06, "loss": 0.4628, "mean_token_accuracy": 0.8521208077669143, "num_tokens": 160970927.0, "step": 133820 }, { "entropy": 1.8931998923420905, "epoch": 0.4148612905400745, "grad_norm": 4.04230260848999, "learning_rate": 3.927741026527598e-06, "loss": 0.4633, "mean_token_accuracy": 0.8493711724877357, "num_tokens": 160982582.0, "step": 133830 }, { "entropy": 1.862531739473343, "epoch": 0.4148922896651242, "grad_norm": 8.157683372497559, "learning_rate": 3.927594289986076e-06, "loss": 0.4603, "mean_token_accuracy": 0.849953505396843, "num_tokens": 160994545.0, "step": 133840 }, { "entropy": 1.8835390016436577, "epoch": 0.41492328879017387, "grad_norm": 8.619124412536621, "learning_rate": 3.927447569889122e-06, "loss": 0.4962, "mean_token_accuracy": 0.8389230579137802, "num_tokens": 161007106.0, "step": 133850 }, { "entropy": 1.8973518058657646, "epoch": 0.4149542879152236, "grad_norm": 7.793274402618408, "learning_rate": 3.9273008662336685e-06, "loss": 0.5054, "mean_token_accuracy": 0.8410894647240639, "num_tokens": 161018629.0, "step": 133860 }, { "entropy": 1.9573092699050902, "epoch": 0.41498528704027327, "grad_norm": 7.558224678039551, "learning_rate": 3.927154179016643e-06, "loss": 0.5093, "mean_token_accuracy": 0.8450547724962234, "num_tokens": 161029814.0, "step": 133870 }, { "entropy": 1.8571864694356919, "epoch": 0.415016286165323, "grad_norm": 8.728129386901855, "learning_rate": 3.927007508234975e-06, "loss": 0.4406, "mean_token_accuracy": 0.8491732507944107, "num_tokens": 161041934.0, "step": 133880 }, { "entropy": 1.877596764266491, "epoch": 0.41504728529037266, "grad_norm": 4.276613235473633, "learning_rate": 3.9268608538855965e-06, "loss": 0.4656, "mean_token_accuracy": 0.8503337040543556, "num_tokens": 161053951.0, "step": 133890 }, { "entropy": 1.8341388761997224, "epoch": 0.4150782844154224, "grad_norm": 4.777545928955078, "learning_rate": 3.92671421596544e-06, "loss": 0.4701, "mean_token_accuracy": 0.8406883478164673, "num_tokens": 161067201.0, "step": 133900 }, { "entropy": 1.8835119605064392, "epoch": 0.41510928354047205, "grad_norm": 7.586372375488281, "learning_rate": 3.926567594471437e-06, "loss": 0.4865, "mean_token_accuracy": 0.8480971708893776, "num_tokens": 161079821.0, "step": 133910 }, { "entropy": 1.817841324210167, "epoch": 0.4151402826655218, "grad_norm": 8.434521675109863, "learning_rate": 3.926420989400522e-06, "loss": 0.4007, "mean_token_accuracy": 0.8662325829267502, "num_tokens": 161092583.0, "step": 133920 }, { "entropy": 1.9291996493935586, "epoch": 0.41517128179057144, "grad_norm": 8.39786434173584, "learning_rate": 3.926274400749629e-06, "loss": 0.4703, "mean_token_accuracy": 0.8544291496276856, "num_tokens": 161103847.0, "step": 133930 }, { "entropy": 1.8668764278292656, "epoch": 0.41520228091562117, "grad_norm": 3.983625650405884, "learning_rate": 3.926127828515693e-06, "loss": 0.417, "mean_token_accuracy": 0.8636351123452186, "num_tokens": 161115696.0, "step": 133940 }, { "entropy": 1.8500849679112434, "epoch": 0.41523328004067084, "grad_norm": 11.573062896728516, "learning_rate": 3.9259812726956495e-06, "loss": 0.4025, "mean_token_accuracy": 0.8647431343793869, "num_tokens": 161127966.0, "step": 133950 }, { "entropy": 1.8055422961711884, "epoch": 0.41526427916572056, "grad_norm": 2.1831018924713135, "learning_rate": 3.925834733286436e-06, "loss": 0.4989, "mean_token_accuracy": 0.8400851160287857, "num_tokens": 161141745.0, "step": 133960 }, { "entropy": 1.8568698644638062, "epoch": 0.41529527829077023, "grad_norm": 7.404766082763672, "learning_rate": 3.92568821028499e-06, "loss": 0.4149, "mean_token_accuracy": 0.8579725489020348, "num_tokens": 161153901.0, "step": 133970 }, { "entropy": 1.9167630136013032, "epoch": 0.41532627741581996, "grad_norm": 7.15609884262085, "learning_rate": 3.925541703688249e-06, "loss": 0.4677, "mean_token_accuracy": 0.8486376807093621, "num_tokens": 161165538.0, "step": 133980 }, { "entropy": 1.8915724635124207, "epoch": 0.4153572765408696, "grad_norm": 7.963951587677002, "learning_rate": 3.925395213493153e-06, "loss": 0.4811, "mean_token_accuracy": 0.8474412530660629, "num_tokens": 161178272.0, "step": 133990 }, { "entropy": 1.9221931904554368, "epoch": 0.41538827566591935, "grad_norm": 7.879276752471924, "learning_rate": 3.9252487396966406e-06, "loss": 0.4792, "mean_token_accuracy": 0.8496794193983078, "num_tokens": 161189021.0, "step": 134000 }, { "entropy": 1.8921538457274436, "epoch": 0.415419274790969, "grad_norm": 8.086331367492676, "learning_rate": 3.925102282295654e-06, "loss": 0.4773, "mean_token_accuracy": 0.8499061942100525, "num_tokens": 161201131.0, "step": 134010 }, { "entropy": 1.825336940586567, "epoch": 0.41545027391601874, "grad_norm": 3.680633068084717, "learning_rate": 3.924955841287134e-06, "loss": 0.4024, "mean_token_accuracy": 0.8611365631222725, "num_tokens": 161213630.0, "step": 134020 }, { "entropy": 1.8338982105255126, "epoch": 0.4154812730410684, "grad_norm": 9.535050392150879, "learning_rate": 3.924809416668023e-06, "loss": 0.4559, "mean_token_accuracy": 0.8613386616110802, "num_tokens": 161226206.0, "step": 134030 }, { "entropy": 1.8270452991127968, "epoch": 0.41551227216611814, "grad_norm": 4.551232814788818, "learning_rate": 3.924663008435264e-06, "loss": 0.424, "mean_token_accuracy": 0.8666022554039955, "num_tokens": 161238486.0, "step": 134040 }, { "entropy": 1.9449387162923812, "epoch": 0.4155432712911678, "grad_norm": 8.532154083251953, "learning_rate": 3.924516616585802e-06, "loss": 0.4774, "mean_token_accuracy": 0.8555340111255646, "num_tokens": 161249333.0, "step": 134050 }, { "entropy": 1.9312722623348235, "epoch": 0.41557427041621753, "grad_norm": 8.082392692565918, "learning_rate": 3.9243702411165805e-06, "loss": 0.5061, "mean_token_accuracy": 0.8388245224952697, "num_tokens": 161260572.0, "step": 134060 }, { "entropy": 1.7703733541071416, "epoch": 0.4156052695412672, "grad_norm": 8.54536247253418, "learning_rate": 3.924223882024544e-06, "loss": 0.3865, "mean_token_accuracy": 0.8662910461425781, "num_tokens": 161274273.0, "step": 134070 }, { "entropy": 1.8904980316758155, "epoch": 0.41563626866631687, "grad_norm": 9.833793640136719, "learning_rate": 3.9240775393066405e-06, "loss": 0.4482, "mean_token_accuracy": 0.8553065106272697, "num_tokens": 161285970.0, "step": 134080 }, { "entropy": 1.887739597260952, "epoch": 0.4156672677913666, "grad_norm": 6.797054767608643, "learning_rate": 3.923931212959817e-06, "loss": 0.4456, "mean_token_accuracy": 0.8516652911901474, "num_tokens": 161297693.0, "step": 134090 }, { "entropy": 1.9516427367925644, "epoch": 0.41569826691641626, "grad_norm": 7.91145658493042, "learning_rate": 3.923784902981022e-06, "loss": 0.4695, "mean_token_accuracy": 0.855071696639061, "num_tokens": 161308121.0, "step": 134100 }, { "entropy": 1.9453662782907486, "epoch": 0.415729266041466, "grad_norm": 7.931567668914795, "learning_rate": 3.923638609367202e-06, "loss": 0.5049, "mean_token_accuracy": 0.8384775072336197, "num_tokens": 161319934.0, "step": 134110 }, { "entropy": 1.9178491979837418, "epoch": 0.41576026516651565, "grad_norm": 8.443523406982422, "learning_rate": 3.923492332115307e-06, "loss": 0.5374, "mean_token_accuracy": 0.8489273399114609, "num_tokens": 161330638.0, "step": 134120 }, { "entropy": 1.9393232375383378, "epoch": 0.4157912642915654, "grad_norm": 6.289927005767822, "learning_rate": 3.9233460712222895e-06, "loss": 0.5225, "mean_token_accuracy": 0.8360320463776588, "num_tokens": 161342902.0, "step": 134130 }, { "entropy": 1.9341855004429818, "epoch": 0.41582226341661505, "grad_norm": 8.161822319030762, "learning_rate": 3.923199826685099e-06, "loss": 0.4703, "mean_token_accuracy": 0.850852632522583, "num_tokens": 161354270.0, "step": 134140 }, { "entropy": 1.8818897396326064, "epoch": 0.41585326254166477, "grad_norm": 9.941109657287598, "learning_rate": 3.9230535985006865e-06, "loss": 0.4419, "mean_token_accuracy": 0.8523253738880158, "num_tokens": 161366256.0, "step": 134150 }, { "entropy": 1.878204520046711, "epoch": 0.41588426166671444, "grad_norm": 9.152496337890625, "learning_rate": 3.9229073866660056e-06, "loss": 0.4654, "mean_token_accuracy": 0.8530999973416329, "num_tokens": 161378072.0, "step": 134160 }, { "entropy": 1.9943836033344269, "epoch": 0.41591526079176416, "grad_norm": 8.155722618103027, "learning_rate": 3.92276119117801e-06, "loss": 0.5221, "mean_token_accuracy": 0.8379778906702995, "num_tokens": 161389288.0, "step": 134170 }, { "entropy": 1.8447602733969688, "epoch": 0.41594625991681383, "grad_norm": 8.624749183654785, "learning_rate": 3.922615012033654e-06, "loss": 0.4482, "mean_token_accuracy": 0.8538846731185913, "num_tokens": 161401938.0, "step": 134180 }, { "entropy": 1.8731971263885498, "epoch": 0.41597725904186356, "grad_norm": 8.428763389587402, "learning_rate": 3.922468849229893e-06, "loss": 0.4268, "mean_token_accuracy": 0.8539308935403824, "num_tokens": 161413429.0, "step": 134190 }, { "entropy": 1.9091638535261155, "epoch": 0.4160082581669132, "grad_norm": 8.18406867980957, "learning_rate": 3.922322702763682e-06, "loss": 0.4879, "mean_token_accuracy": 0.8525436207652092, "num_tokens": 161424862.0, "step": 134200 }, { "entropy": 1.8178348287940025, "epoch": 0.41603925729196295, "grad_norm": 5.807666778564453, "learning_rate": 3.922176572631976e-06, "loss": 0.3925, "mean_token_accuracy": 0.8637724488973617, "num_tokens": 161437571.0, "step": 134210 }, { "entropy": 1.8581533119082452, "epoch": 0.4160702564170126, "grad_norm": 5.807939052581787, "learning_rate": 3.922030458831736e-06, "loss": 0.4243, "mean_token_accuracy": 0.8546082183718682, "num_tokens": 161450359.0, "step": 134220 }, { "entropy": 1.9059478402137757, "epoch": 0.41610125554206234, "grad_norm": 8.28518009185791, "learning_rate": 3.921884361359918e-06, "loss": 0.4905, "mean_token_accuracy": 0.8530061066150665, "num_tokens": 161461493.0, "step": 134230 }, { "entropy": 1.8699321061372758, "epoch": 0.416132254667112, "grad_norm": 7.9905195236206055, "learning_rate": 3.921738280213482e-06, "loss": 0.4416, "mean_token_accuracy": 0.8566524386405945, "num_tokens": 161473498.0, "step": 134240 }, { "entropy": 1.88876011967659, "epoch": 0.41616325379216174, "grad_norm": 8.742656707763672, "learning_rate": 3.921592215389386e-06, "loss": 0.4385, "mean_token_accuracy": 0.8500191062688828, "num_tokens": 161485605.0, "step": 134250 }, { "entropy": 1.9045383304357528, "epoch": 0.4161942529172114, "grad_norm": 9.43719482421875, "learning_rate": 3.921446166884594e-06, "loss": 0.465, "mean_token_accuracy": 0.8534532591700554, "num_tokens": 161496981.0, "step": 134260 }, { "entropy": 1.9324941888451577, "epoch": 0.41622525204226113, "grad_norm": 7.1453680992126465, "learning_rate": 3.921300134696064e-06, "loss": 0.488, "mean_token_accuracy": 0.8457109674811363, "num_tokens": 161508264.0, "step": 134270 }, { "entropy": 1.8439267560839654, "epoch": 0.4162562511673108, "grad_norm": 3.715588331222534, "learning_rate": 3.921154118820759e-06, "loss": 0.43, "mean_token_accuracy": 0.8561937183141708, "num_tokens": 161521748.0, "step": 134280 }, { "entropy": 1.995403453707695, "epoch": 0.4162872502923605, "grad_norm": 8.698075294494629, "learning_rate": 3.921008119255644e-06, "loss": 0.4773, "mean_token_accuracy": 0.8444655641913414, "num_tokens": 161532336.0, "step": 134290 }, { "entropy": 1.8547970443964004, "epoch": 0.4163182494174102, "grad_norm": 7.284455299377441, "learning_rate": 3.9208621359976816e-06, "loss": 0.4427, "mean_token_accuracy": 0.8494389861822128, "num_tokens": 161545319.0, "step": 134300 }, { "entropy": 1.9544886738061904, "epoch": 0.4163492485424599, "grad_norm": 8.434039115905762, "learning_rate": 3.920716169043834e-06, "loss": 0.5019, "mean_token_accuracy": 0.8425055891275406, "num_tokens": 161556597.0, "step": 134310 }, { "entropy": 1.8419823780655862, "epoch": 0.4163802476675096, "grad_norm": 8.092065811157227, "learning_rate": 3.9205702183910696e-06, "loss": 0.3975, "mean_token_accuracy": 0.8516080200672149, "num_tokens": 161570185.0, "step": 134320 }, { "entropy": 1.8836676687002183, "epoch": 0.41641124679255925, "grad_norm": 7.677006721496582, "learning_rate": 3.920424284036354e-06, "loss": 0.4231, "mean_token_accuracy": 0.8599972873926163, "num_tokens": 161582993.0, "step": 134330 }, { "entropy": 1.910847471654415, "epoch": 0.416442245917609, "grad_norm": 10.631965637207031, "learning_rate": 3.920278365976653e-06, "loss": 0.5067, "mean_token_accuracy": 0.8386124089360237, "num_tokens": 161594430.0, "step": 134340 }, { "entropy": 1.867373377084732, "epoch": 0.41647324504265865, "grad_norm": 4.6483259201049805, "learning_rate": 3.920132464208936e-06, "loss": 0.4387, "mean_token_accuracy": 0.8458052203059196, "num_tokens": 161607226.0, "step": 134350 }, { "entropy": 1.9246284693479538, "epoch": 0.41650424416770837, "grad_norm": 7.288817405700684, "learning_rate": 3.91998657873017e-06, "loss": 0.4895, "mean_token_accuracy": 0.8457208350300789, "num_tokens": 161618272.0, "step": 134360 }, { "entropy": 1.8862150296568871, "epoch": 0.41653524329275804, "grad_norm": 5.131035804748535, "learning_rate": 3.919840709537325e-06, "loss": 0.4388, "mean_token_accuracy": 0.8593840152025223, "num_tokens": 161630204.0, "step": 134370 }, { "entropy": 1.876247802376747, "epoch": 0.41656624241780776, "grad_norm": 10.8263578414917, "learning_rate": 3.919694856627371e-06, "loss": 0.3749, "mean_token_accuracy": 0.869242000579834, "num_tokens": 161641771.0, "step": 134380 }, { "entropy": 1.9201571598649025, "epoch": 0.41659724154285743, "grad_norm": 8.346663475036621, "learning_rate": 3.919549019997278e-06, "loss": 0.4356, "mean_token_accuracy": 0.8538030579686164, "num_tokens": 161653401.0, "step": 134390 }, { "entropy": 1.8522045940160752, "epoch": 0.41662824066790716, "grad_norm": 3.982300281524658, "learning_rate": 3.919403199644019e-06, "loss": 0.4508, "mean_token_accuracy": 0.8470421150326729, "num_tokens": 161665679.0, "step": 134400 }, { "entropy": 1.8318135902285575, "epoch": 0.4166592397929568, "grad_norm": 4.39650297164917, "learning_rate": 3.919257395564566e-06, "loss": 0.4511, "mean_token_accuracy": 0.8538282006978989, "num_tokens": 161678834.0, "step": 134410 }, { "entropy": 1.8755167797207832, "epoch": 0.41669023891800655, "grad_norm": 9.182366371154785, "learning_rate": 3.919111607755892e-06, "loss": 0.4426, "mean_token_accuracy": 0.847786869108677, "num_tokens": 161691040.0, "step": 134420 }, { "entropy": 1.921954096853733, "epoch": 0.4167212380430562, "grad_norm": 8.545275688171387, "learning_rate": 3.918965836214972e-06, "loss": 0.4937, "mean_token_accuracy": 0.8338611707091331, "num_tokens": 161703955.0, "step": 134430 }, { "entropy": 1.8536706238985061, "epoch": 0.41675223716810594, "grad_norm": 7.954626560211182, "learning_rate": 3.918820080938779e-06, "loss": 0.4029, "mean_token_accuracy": 0.8524566516280174, "num_tokens": 161716705.0, "step": 134440 }, { "entropy": 1.907846173644066, "epoch": 0.4167832362931556, "grad_norm": 6.290383338928223, "learning_rate": 3.91867434192429e-06, "loss": 0.48, "mean_token_accuracy": 0.8462353229522706, "num_tokens": 161728755.0, "step": 134450 }, { "entropy": 1.9126424625515939, "epoch": 0.41681423541820534, "grad_norm": 7.351926803588867, "learning_rate": 3.918528619168481e-06, "loss": 0.4377, "mean_token_accuracy": 0.8547870278358459, "num_tokens": 161740140.0, "step": 134460 }, { "entropy": 1.7981932133436203, "epoch": 0.416845234543255, "grad_norm": 10.335866928100586, "learning_rate": 3.9183829126683305e-06, "loss": 0.3812, "mean_token_accuracy": 0.8719327822327614, "num_tokens": 161753546.0, "step": 134470 }, { "entropy": 1.9288191676139832, "epoch": 0.41687623366830473, "grad_norm": 8.642077445983887, "learning_rate": 3.918237222420813e-06, "loss": 0.4738, "mean_token_accuracy": 0.8515138506889344, "num_tokens": 161764200.0, "step": 134480 }, { "entropy": 1.8795452415943146, "epoch": 0.4169072327933544, "grad_norm": 7.725874900817871, "learning_rate": 3.918091548422912e-06, "loss": 0.4932, "mean_token_accuracy": 0.8545924738049507, "num_tokens": 161776635.0, "step": 134490 }, { "entropy": 1.9160576537251472, "epoch": 0.4169382319184041, "grad_norm": 4.713511943817139, "learning_rate": 3.9179458906716036e-06, "loss": 0.4434, "mean_token_accuracy": 0.854782110452652, "num_tokens": 161789062.0, "step": 134500 }, { "entropy": 1.891368383169174, "epoch": 0.4169692310434538, "grad_norm": 7.320746898651123, "learning_rate": 3.917800249163869e-06, "loss": 0.4709, "mean_token_accuracy": 0.8455578476190567, "num_tokens": 161801067.0, "step": 134510 }, { "entropy": 1.9198919370770455, "epoch": 0.4170002301685035, "grad_norm": 4.034561634063721, "learning_rate": 3.917654623896689e-06, "loss": 0.4307, "mean_token_accuracy": 0.8582830354571342, "num_tokens": 161813219.0, "step": 134520 }, { "entropy": 1.8274074256420136, "epoch": 0.4170312292935532, "grad_norm": 3.6742751598358154, "learning_rate": 3.917509014867046e-06, "loss": 0.3795, "mean_token_accuracy": 0.8572375267744065, "num_tokens": 161826737.0, "step": 134530 }, { "entropy": 1.96164468228817, "epoch": 0.4170622284186029, "grad_norm": 7.9983367919921875, "learning_rate": 3.917363422071923e-06, "loss": 0.4904, "mean_token_accuracy": 0.8470546841621399, "num_tokens": 161837950.0, "step": 134540 }, { "entropy": 1.840873746573925, "epoch": 0.4170932275436526, "grad_norm": 8.481100082397461, "learning_rate": 3.917217845508302e-06, "loss": 0.4535, "mean_token_accuracy": 0.8508628994226456, "num_tokens": 161851286.0, "step": 134550 }, { "entropy": 1.8318819746375083, "epoch": 0.4171242266687023, "grad_norm": 6.554941654205322, "learning_rate": 3.917072285173169e-06, "loss": 0.4343, "mean_token_accuracy": 0.8574759662151337, "num_tokens": 161864090.0, "step": 134560 }, { "entropy": 1.8796166345477103, "epoch": 0.41715522579375197, "grad_norm": 7.1165266036987305, "learning_rate": 3.916926741063509e-06, "loss": 0.4252, "mean_token_accuracy": 0.8539829269051552, "num_tokens": 161876656.0, "step": 134570 }, { "entropy": 1.8293213650584221, "epoch": 0.41718622491880164, "grad_norm": 8.498753547668457, "learning_rate": 3.916781213176306e-06, "loss": 0.3757, "mean_token_accuracy": 0.8652872815728188, "num_tokens": 161889042.0, "step": 134580 }, { "entropy": 1.9550884038209915, "epoch": 0.41721722404385136, "grad_norm": 9.21761417388916, "learning_rate": 3.916635701508549e-06, "loss": 0.5468, "mean_token_accuracy": 0.8403035715222359, "num_tokens": 161900992.0, "step": 134590 }, { "entropy": 1.9226959884166717, "epoch": 0.41724822316890103, "grad_norm": 7.0631842613220215, "learning_rate": 3.916490206057224e-06, "loss": 0.4587, "mean_token_accuracy": 0.8605365335941315, "num_tokens": 161912060.0, "step": 134600 }, { "entropy": 1.7682531923055649, "epoch": 0.41727922229395076, "grad_norm": 8.758580207824707, "learning_rate": 3.91634472681932e-06, "loss": 0.4329, "mean_token_accuracy": 0.8549866124987602, "num_tokens": 161926308.0, "step": 134610 }, { "entropy": 1.805106556415558, "epoch": 0.4173102214190004, "grad_norm": 4.0905070304870605, "learning_rate": 3.916199263791824e-06, "loss": 0.3913, "mean_token_accuracy": 0.8517909824848175, "num_tokens": 161939441.0, "step": 134620 }, { "entropy": 1.7959918841719626, "epoch": 0.41734122054405015, "grad_norm": 9.999496459960938, "learning_rate": 3.916053816971728e-06, "loss": 0.3566, "mean_token_accuracy": 0.863648708164692, "num_tokens": 161952525.0, "step": 134630 }, { "entropy": 1.8762843772768973, "epoch": 0.4173722196690998, "grad_norm": 3.626180648803711, "learning_rate": 3.9159083863560204e-06, "loss": 0.4183, "mean_token_accuracy": 0.8647943899035454, "num_tokens": 161964249.0, "step": 134640 }, { "entropy": 1.9006209999322892, "epoch": 0.41740321879414954, "grad_norm": 8.766674041748047, "learning_rate": 3.915762971941694e-06, "loss": 0.4963, "mean_token_accuracy": 0.8434425577521324, "num_tokens": 161975450.0, "step": 134650 }, { "entropy": 1.959898342192173, "epoch": 0.4174342179191992, "grad_norm": 7.614290714263916, "learning_rate": 3.915617573725742e-06, "loss": 0.5172, "mean_token_accuracy": 0.847202044725418, "num_tokens": 161986492.0, "step": 134660 }, { "entropy": 1.9258347421884536, "epoch": 0.41746521704424894, "grad_norm": 7.911265850067139, "learning_rate": 3.9154721917051546e-06, "loss": 0.495, "mean_token_accuracy": 0.8465585187077522, "num_tokens": 161997495.0, "step": 134670 }, { "entropy": 1.8868989422917366, "epoch": 0.4174962161692986, "grad_norm": 3.8985397815704346, "learning_rate": 3.915326825876927e-06, "loss": 0.4437, "mean_token_accuracy": 0.8470399528741837, "num_tokens": 162009972.0, "step": 134680 }, { "entropy": 1.961172890663147, "epoch": 0.41752721529434833, "grad_norm": 9.122645378112793, "learning_rate": 3.915181476238054e-06, "loss": 0.5025, "mean_token_accuracy": 0.8451171442866325, "num_tokens": 162021139.0, "step": 134690 }, { "entropy": 1.867299085855484, "epoch": 0.417558214419398, "grad_norm": 7.1753034591674805, "learning_rate": 3.91503614278553e-06, "loss": 0.4129, "mean_token_accuracy": 0.8628281652927399, "num_tokens": 162033229.0, "step": 134700 }, { "entropy": 1.8105697840452195, "epoch": 0.4175892135444477, "grad_norm": 3.7438528537750244, "learning_rate": 3.91489082551635e-06, "loss": 0.4439, "mean_token_accuracy": 0.8544130697846413, "num_tokens": 162046064.0, "step": 134710 }, { "entropy": 1.916566787660122, "epoch": 0.4176202126694974, "grad_norm": 8.405454635620117, "learning_rate": 3.914745524427513e-06, "loss": 0.478, "mean_token_accuracy": 0.8509135708212853, "num_tokens": 162056998.0, "step": 134720 }, { "entropy": 1.883594536781311, "epoch": 0.4176512117945471, "grad_norm": 9.41535472869873, "learning_rate": 3.914600239516016e-06, "loss": 0.4559, "mean_token_accuracy": 0.8488730236887931, "num_tokens": 162068897.0, "step": 134730 }, { "entropy": 1.8739972546696664, "epoch": 0.4176822109195968, "grad_norm": 4.211245059967041, "learning_rate": 3.9144549707788556e-06, "loss": 0.4419, "mean_token_accuracy": 0.8516407638788224, "num_tokens": 162081030.0, "step": 134740 }, { "entropy": 1.9788037657737731, "epoch": 0.4177132100446465, "grad_norm": 8.905488967895508, "learning_rate": 3.914309718213034e-06, "loss": 0.5444, "mean_token_accuracy": 0.8368604257702827, "num_tokens": 162091716.0, "step": 134750 }, { "entropy": 1.8498676016926765, "epoch": 0.4177442091696962, "grad_norm": 2.3638017177581787, "learning_rate": 3.914164481815549e-06, "loss": 0.4331, "mean_token_accuracy": 0.8562711879611016, "num_tokens": 162103721.0, "step": 134760 }, { "entropy": 1.9256371967494488, "epoch": 0.4177752082947459, "grad_norm": 8.89646053314209, "learning_rate": 3.9140192615833995e-06, "loss": 0.5043, "mean_token_accuracy": 0.8388048008084297, "num_tokens": 162115431.0, "step": 134770 }, { "entropy": 1.8755139395594598, "epoch": 0.41780620741979557, "grad_norm": 7.007622718811035, "learning_rate": 3.91387405751359e-06, "loss": 0.4111, "mean_token_accuracy": 0.8632483765482902, "num_tokens": 162127975.0, "step": 134780 }, { "entropy": 1.865597426891327, "epoch": 0.4178372065448453, "grad_norm": 4.80448579788208, "learning_rate": 3.913728869603122e-06, "loss": 0.4438, "mean_token_accuracy": 0.8586938053369522, "num_tokens": 162139430.0, "step": 134790 }, { "entropy": 1.9837687581777572, "epoch": 0.41786820566989497, "grad_norm": 7.113491058349609, "learning_rate": 3.913583697848999e-06, "loss": 0.5099, "mean_token_accuracy": 0.8430691197514534, "num_tokens": 162150117.0, "step": 134800 }, { "entropy": 1.8740553870797156, "epoch": 0.4178992047949447, "grad_norm": 8.047861099243164, "learning_rate": 3.913438542248223e-06, "loss": 0.4269, "mean_token_accuracy": 0.8576863974332809, "num_tokens": 162161930.0, "step": 134810 }, { "entropy": 1.955976441502571, "epoch": 0.41793020391999436, "grad_norm": 8.8277006149292, "learning_rate": 3.913293402797799e-06, "loss": 0.4547, "mean_token_accuracy": 0.8523634567856788, "num_tokens": 162173265.0, "step": 134820 }, { "entropy": 1.9804795682430267, "epoch": 0.417961203045044, "grad_norm": 7.893826007843018, "learning_rate": 3.913148279494734e-06, "loss": 0.5503, "mean_token_accuracy": 0.8313609048724174, "num_tokens": 162184039.0, "step": 134830 }, { "entropy": 1.8900707513093948, "epoch": 0.41799220217009375, "grad_norm": 7.8856329917907715, "learning_rate": 3.913003172336033e-06, "loss": 0.4777, "mean_token_accuracy": 0.8358192846179009, "num_tokens": 162196357.0, "step": 134840 }, { "entropy": 1.93702944368124, "epoch": 0.4180232012951434, "grad_norm": 7.011838912963867, "learning_rate": 3.912858081318703e-06, "loss": 0.4432, "mean_token_accuracy": 0.854140405356884, "num_tokens": 162207437.0, "step": 134850 }, { "entropy": 1.9064744263887405, "epoch": 0.41805420042019314, "grad_norm": 3.6196608543395996, "learning_rate": 3.912713006439751e-06, "loss": 0.5159, "mean_token_accuracy": 0.8377542212605477, "num_tokens": 162218620.0, "step": 134860 }, { "entropy": 1.8770883545279502, "epoch": 0.4180851995452428, "grad_norm": 8.111139297485352, "learning_rate": 3.912567947696187e-06, "loss": 0.4365, "mean_token_accuracy": 0.858944533765316, "num_tokens": 162230501.0, "step": 134870 }, { "entropy": 1.9460731491446495, "epoch": 0.41811619867029254, "grad_norm": 10.321117401123047, "learning_rate": 3.912422905085019e-06, "loss": 0.5021, "mean_token_accuracy": 0.8436010986566543, "num_tokens": 162241864.0, "step": 134880 }, { "entropy": 1.9751074135303497, "epoch": 0.4181471977953422, "grad_norm": 7.4099273681640625, "learning_rate": 3.912277878603257e-06, "loss": 0.5029, "mean_token_accuracy": 0.8479305684566498, "num_tokens": 162252709.0, "step": 134890 }, { "entropy": 1.8740838050842286, "epoch": 0.41817819692039193, "grad_norm": 3.3563060760498047, "learning_rate": 3.9121328682479126e-06, "loss": 0.4222, "mean_token_accuracy": 0.8611055985093117, "num_tokens": 162264722.0, "step": 134900 }, { "entropy": 1.8472968205809592, "epoch": 0.4182091960454416, "grad_norm": 4.131076812744141, "learning_rate": 3.911987874015997e-06, "loss": 0.4296, "mean_token_accuracy": 0.8558681800961494, "num_tokens": 162277616.0, "step": 134910 }, { "entropy": 1.9546231135725975, "epoch": 0.4182401951704913, "grad_norm": 3.539958953857422, "learning_rate": 3.911842895904522e-06, "loss": 0.5274, "mean_token_accuracy": 0.8383591964840889, "num_tokens": 162288986.0, "step": 134920 }, { "entropy": 1.8494961842894555, "epoch": 0.418271194295541, "grad_norm": 7.521032810211182, "learning_rate": 3.911697933910501e-06, "loss": 0.4387, "mean_token_accuracy": 0.8557368025183678, "num_tokens": 162301754.0, "step": 134930 }, { "entropy": 1.8642336532473565, "epoch": 0.4183021934205907, "grad_norm": 4.32247257232666, "learning_rate": 3.911552988030949e-06, "loss": 0.4259, "mean_token_accuracy": 0.8557508587837219, "num_tokens": 162313972.0, "step": 134940 }, { "entropy": 1.94805389046669, "epoch": 0.4183331925456404, "grad_norm": 9.484874725341797, "learning_rate": 3.911408058262879e-06, "loss": 0.4782, "mean_token_accuracy": 0.8523627325892449, "num_tokens": 162324752.0, "step": 134950 }, { "entropy": 1.8355311632156373, "epoch": 0.4183641916706901, "grad_norm": 16.279993057250977, "learning_rate": 3.911263144603306e-06, "loss": 0.4276, "mean_token_accuracy": 0.8622775912284851, "num_tokens": 162337826.0, "step": 134960 }, { "entropy": 1.8673854805529118, "epoch": 0.4183951907957398, "grad_norm": 7.603078365325928, "learning_rate": 3.911118247049249e-06, "loss": 0.408, "mean_token_accuracy": 0.8558634921908379, "num_tokens": 162350433.0, "step": 134970 }, { "entropy": 1.9007538348436355, "epoch": 0.4184261899207895, "grad_norm": 7.9676337242126465, "learning_rate": 3.9109733655977225e-06, "loss": 0.5029, "mean_token_accuracy": 0.8374831721186637, "num_tokens": 162362674.0, "step": 134980 }, { "entropy": 1.8434744045138358, "epoch": 0.4184571890458392, "grad_norm": 7.775720119476318, "learning_rate": 3.910828500245745e-06, "loss": 0.4214, "mean_token_accuracy": 0.8564092546701432, "num_tokens": 162375250.0, "step": 134990 }, { "entropy": 1.918935863673687, "epoch": 0.4184881881708889, "grad_norm": 7.319066524505615, "learning_rate": 3.910683650990335e-06, "loss": 0.4322, "mean_token_accuracy": 0.861502180993557, "num_tokens": 162386475.0, "step": 135000 }, { "entropy": 1.8640062302350997, "epoch": 0.41851918729593857, "grad_norm": 8.106133460998535, "learning_rate": 3.910538817828512e-06, "loss": 0.4262, "mean_token_accuracy": 0.854218167066574, "num_tokens": 162398652.0, "step": 135010 }, { "entropy": 1.7948240421712398, "epoch": 0.4185501864209883, "grad_norm": 3.9371562004089355, "learning_rate": 3.910394000757297e-06, "loss": 0.4051, "mean_token_accuracy": 0.8555122867226601, "num_tokens": 162411080.0, "step": 135020 }, { "entropy": 1.8870450854301453, "epoch": 0.41858118554603796, "grad_norm": 7.394895076751709, "learning_rate": 3.910249199773708e-06, "loss": 0.4217, "mean_token_accuracy": 0.8639406561851501, "num_tokens": 162422198.0, "step": 135030 }, { "entropy": 1.928053417801857, "epoch": 0.4186121846710877, "grad_norm": 7.574821472167969, "learning_rate": 3.910104414874769e-06, "loss": 0.5109, "mean_token_accuracy": 0.84600929915905, "num_tokens": 162432961.0, "step": 135040 }, { "entropy": 1.9174908056855202, "epoch": 0.41864318379613735, "grad_norm": 8.21877384185791, "learning_rate": 3.909959646057503e-06, "loss": 0.4699, "mean_token_accuracy": 0.8488267034292221, "num_tokens": 162443874.0, "step": 135050 }, { "entropy": 1.8227450162172318, "epoch": 0.4186741829211871, "grad_norm": 9.875720024108887, "learning_rate": 3.90981489331893e-06, "loss": 0.4328, "mean_token_accuracy": 0.8520313039422035, "num_tokens": 162457203.0, "step": 135060 }, { "entropy": 1.8200043380260467, "epoch": 0.41870518204623675, "grad_norm": 10.07118034362793, "learning_rate": 3.909670156656076e-06, "loss": 0.4238, "mean_token_accuracy": 0.8482141956686974, "num_tokens": 162469740.0, "step": 135070 }, { "entropy": 1.9454358339309692, "epoch": 0.4187361811712864, "grad_norm": 9.135685920715332, "learning_rate": 3.909525436065966e-06, "loss": 0.4849, "mean_token_accuracy": 0.8446780741214752, "num_tokens": 162480950.0, "step": 135080 }, { "entropy": 1.802914521098137, "epoch": 0.41876718029633614, "grad_norm": 7.378044605255127, "learning_rate": 3.909380731545625e-06, "loss": 0.3828, "mean_token_accuracy": 0.8730568781495094, "num_tokens": 162493924.0, "step": 135090 }, { "entropy": 1.762102383375168, "epoch": 0.4187981794213858, "grad_norm": 8.075687408447266, "learning_rate": 3.90923604309208e-06, "loss": 0.3709, "mean_token_accuracy": 0.8703906610608101, "num_tokens": 162507010.0, "step": 135100 }, { "entropy": 1.890143983066082, "epoch": 0.41882917854643553, "grad_norm": 8.960722923278809, "learning_rate": 3.9090913707023566e-06, "loss": 0.4922, "mean_token_accuracy": 0.8449656277894974, "num_tokens": 162518570.0, "step": 135110 }, { "entropy": 1.8910117477178574, "epoch": 0.4188601776714852, "grad_norm": 3.260122776031494, "learning_rate": 3.908946714373483e-06, "loss": 0.4742, "mean_token_accuracy": 0.8450547441840172, "num_tokens": 162530768.0, "step": 135120 }, { "entropy": 1.8741901010274886, "epoch": 0.4188911767965349, "grad_norm": 3.713733196258545, "learning_rate": 3.908802074102489e-06, "loss": 0.4205, "mean_token_accuracy": 0.8599935069680213, "num_tokens": 162542560.0, "step": 135130 }, { "entropy": 1.8225472882390021, "epoch": 0.4189221759215846, "grad_norm": 2.8528435230255127, "learning_rate": 3.908657449886402e-06, "loss": 0.4187, "mean_token_accuracy": 0.8527352377772331, "num_tokens": 162555416.0, "step": 135140 }, { "entropy": 1.9285516187548637, "epoch": 0.4189531750466343, "grad_norm": 9.028520584106445, "learning_rate": 3.908512841722253e-06, "loss": 0.4887, "mean_token_accuracy": 0.8442423656582833, "num_tokens": 162566844.0, "step": 135150 }, { "entropy": 1.9082002013921737, "epoch": 0.418984174171684, "grad_norm": 8.118371963500977, "learning_rate": 3.908368249607073e-06, "loss": 0.5018, "mean_token_accuracy": 0.8459517538547516, "num_tokens": 162578692.0, "step": 135160 }, { "entropy": 1.9068205058574677, "epoch": 0.4190151732967337, "grad_norm": 8.231268882751465, "learning_rate": 3.908223673537895e-06, "loss": 0.451, "mean_token_accuracy": 0.8569748342037201, "num_tokens": 162590372.0, "step": 135170 }, { "entropy": 1.7947299778461456, "epoch": 0.4190461724217834, "grad_norm": 7.387755393981934, "learning_rate": 3.908079113511748e-06, "loss": 0.4068, "mean_token_accuracy": 0.8611501380801201, "num_tokens": 162603085.0, "step": 135180 }, { "entropy": 1.830442163348198, "epoch": 0.4190771715468331, "grad_norm": 3.5146117210388184, "learning_rate": 3.907934569525668e-06, "loss": 0.4604, "mean_token_accuracy": 0.8458912119269371, "num_tokens": 162616037.0, "step": 135190 }, { "entropy": 1.8631732419133187, "epoch": 0.4191081706718828, "grad_norm": 8.939651489257812, "learning_rate": 3.907790041576687e-06, "loss": 0.4309, "mean_token_accuracy": 0.8587908774614335, "num_tokens": 162627961.0, "step": 135200 }, { "entropy": 1.891756534576416, "epoch": 0.4191391697969325, "grad_norm": 7.691137313842773, "learning_rate": 3.907645529661842e-06, "loss": 0.4621, "mean_token_accuracy": 0.8453534230589866, "num_tokens": 162640527.0, "step": 135210 }, { "entropy": 1.880522905290127, "epoch": 0.41917016892198217, "grad_norm": 8.83800983428955, "learning_rate": 3.907501033778167e-06, "loss": 0.4631, "mean_token_accuracy": 0.8536536455154419, "num_tokens": 162652160.0, "step": 135220 }, { "entropy": 1.9340814992785453, "epoch": 0.4192011680470319, "grad_norm": 9.312424659729004, "learning_rate": 3.907356553922698e-06, "loss": 0.4707, "mean_token_accuracy": 0.8431622445583343, "num_tokens": 162663724.0, "step": 135230 }, { "entropy": 1.7783160164952279, "epoch": 0.41923216717208156, "grad_norm": 8.399624824523926, "learning_rate": 3.907212090092472e-06, "loss": 0.3552, "mean_token_accuracy": 0.8627290308475495, "num_tokens": 162677173.0, "step": 135240 }, { "entropy": 1.9121920481324195, "epoch": 0.4192631662971313, "grad_norm": 4.327278137207031, "learning_rate": 3.907067642284528e-06, "loss": 0.4775, "mean_token_accuracy": 0.8431808009743691, "num_tokens": 162688069.0, "step": 135250 }, { "entropy": 1.90309626609087, "epoch": 0.41929416542218095, "grad_norm": 7.39229679107666, "learning_rate": 3.906923210495903e-06, "loss": 0.4782, "mean_token_accuracy": 0.8486498475074769, "num_tokens": 162699438.0, "step": 135260 }, { "entropy": 1.7719358682632447, "epoch": 0.4193251645472307, "grad_norm": 2.3753769397735596, "learning_rate": 3.9067787947236376e-06, "loss": 0.3807, "mean_token_accuracy": 0.8691380977630615, "num_tokens": 162711906.0, "step": 135270 }, { "entropy": 1.7657958284020423, "epoch": 0.41935616367228035, "grad_norm": 8.348288536071777, "learning_rate": 3.906634394964771e-06, "loss": 0.399, "mean_token_accuracy": 0.8593086928129197, "num_tokens": 162725490.0, "step": 135280 }, { "entropy": 1.8791421324014663, "epoch": 0.41938716279733007, "grad_norm": 5.080158233642578, "learning_rate": 3.906490011216344e-06, "loss": 0.4721, "mean_token_accuracy": 0.8483076050877572, "num_tokens": 162737225.0, "step": 135290 }, { "entropy": 1.77224540412426, "epoch": 0.41941816192237974, "grad_norm": 9.42541790008545, "learning_rate": 3.9063456434754005e-06, "loss": 0.3504, "mean_token_accuracy": 0.8711794734001159, "num_tokens": 162750989.0, "step": 135300 }, { "entropy": 1.909542678296566, "epoch": 0.41944916104742946, "grad_norm": 8.59212589263916, "learning_rate": 3.906201291738979e-06, "loss": 0.4915, "mean_token_accuracy": 0.8415026575326919, "num_tokens": 162762817.0, "step": 135310 }, { "entropy": 1.8708134442567825, "epoch": 0.41948016017247913, "grad_norm": 7.089857578277588, "learning_rate": 3.906056956004125e-06, "loss": 0.4525, "mean_token_accuracy": 0.8642296716570854, "num_tokens": 162774634.0, "step": 135320 }, { "entropy": 1.8708177879452705, "epoch": 0.4195111592975288, "grad_norm": 7.2748823165893555, "learning_rate": 3.905912636267882e-06, "loss": 0.3918, "mean_token_accuracy": 0.8618823811411858, "num_tokens": 162786925.0, "step": 135330 }, { "entropy": 1.9178523600101471, "epoch": 0.4195421584225785, "grad_norm": 7.701486110687256, "learning_rate": 3.905768332527295e-06, "loss": 0.5139, "mean_token_accuracy": 0.8496042251586914, "num_tokens": 162799600.0, "step": 135340 }, { "entropy": 1.8545253962278365, "epoch": 0.4195731575476282, "grad_norm": 8.929347038269043, "learning_rate": 3.905624044779408e-06, "loss": 0.4689, "mean_token_accuracy": 0.8579249680042267, "num_tokens": 162811881.0, "step": 135350 }, { "entropy": 1.8590249836444854, "epoch": 0.4196041566726779, "grad_norm": 7.88853645324707, "learning_rate": 3.905479773021269e-06, "loss": 0.4949, "mean_token_accuracy": 0.843219818174839, "num_tokens": 162824189.0, "step": 135360 }, { "entropy": 1.8993327513337135, "epoch": 0.4196351557977276, "grad_norm": 7.894437789916992, "learning_rate": 3.905335517249924e-06, "loss": 0.4708, "mean_token_accuracy": 0.8462705582380294, "num_tokens": 162836157.0, "step": 135370 }, { "entropy": 1.924989990890026, "epoch": 0.4196661549227773, "grad_norm": 3.602142333984375, "learning_rate": 3.9051912774624215e-06, "loss": 0.4607, "mean_token_accuracy": 0.8598493561148643, "num_tokens": 162847849.0, "step": 135380 }, { "entropy": 1.8464587591588497, "epoch": 0.419697154047827, "grad_norm": 9.821682929992676, "learning_rate": 3.905047053655809e-06, "loss": 0.3962, "mean_token_accuracy": 0.8622051507234574, "num_tokens": 162860582.0, "step": 135390 }, { "entropy": 1.8456949055194856, "epoch": 0.4197281531728767, "grad_norm": 3.930516242980957, "learning_rate": 3.904902845827135e-06, "loss": 0.401, "mean_token_accuracy": 0.8577687725424766, "num_tokens": 162873376.0, "step": 135400 }, { "entropy": 1.805974441766739, "epoch": 0.4197591522979264, "grad_norm": 7.337070941925049, "learning_rate": 3.904758653973452e-06, "loss": 0.3964, "mean_token_accuracy": 0.8626691713929177, "num_tokens": 162886100.0, "step": 135410 }, { "entropy": 1.9164967209100723, "epoch": 0.4197901514229761, "grad_norm": 9.788459777832031, "learning_rate": 3.904614478091809e-06, "loss": 0.4771, "mean_token_accuracy": 0.8476973131299019, "num_tokens": 162897812.0, "step": 135420 }, { "entropy": 1.9670021116733551, "epoch": 0.41982115054802577, "grad_norm": 9.283044815063477, "learning_rate": 3.904470318179257e-06, "loss": 0.534, "mean_token_accuracy": 0.8294244706630707, "num_tokens": 162909541.0, "step": 135430 }, { "entropy": 1.9219158321619034, "epoch": 0.4198521496730755, "grad_norm": 8.37589168548584, "learning_rate": 3.9043261742328505e-06, "loss": 0.5045, "mean_token_accuracy": 0.8363066896796226, "num_tokens": 162921516.0, "step": 135440 }, { "entropy": 1.953237357735634, "epoch": 0.41988314879812516, "grad_norm": 9.618903160095215, "learning_rate": 3.904182046249641e-06, "loss": 0.534, "mean_token_accuracy": 0.8366630434989929, "num_tokens": 162932956.0, "step": 135450 }, { "entropy": 1.9217358708381653, "epoch": 0.4199141479231749, "grad_norm": 7.9734649658203125, "learning_rate": 3.904037934226683e-06, "loss": 0.4784, "mean_token_accuracy": 0.8531175389885902, "num_tokens": 162944286.0, "step": 135460 }, { "entropy": 1.799736338853836, "epoch": 0.41994514704822455, "grad_norm": 8.599648475646973, "learning_rate": 3.903893838161029e-06, "loss": 0.3677, "mean_token_accuracy": 0.8724782764911652, "num_tokens": 162957587.0, "step": 135470 }, { "entropy": 1.8858862951397897, "epoch": 0.4199761461732743, "grad_norm": 3.428628444671631, "learning_rate": 3.903749758049738e-06, "loss": 0.424, "mean_token_accuracy": 0.8496242195367814, "num_tokens": 162969224.0, "step": 135480 }, { "entropy": 1.8270292654633522, "epoch": 0.42000714529832395, "grad_norm": 2.5544705390930176, "learning_rate": 3.903605693889863e-06, "loss": 0.3985, "mean_token_accuracy": 0.8701824352145195, "num_tokens": 162982342.0, "step": 135490 }, { "entropy": 1.8943888053297997, "epoch": 0.42003814442337367, "grad_norm": 9.184906005859375, "learning_rate": 3.90346164567846e-06, "loss": 0.4773, "mean_token_accuracy": 0.8437836706638336, "num_tokens": 162994089.0, "step": 135500 }, { "entropy": 1.8288681022822857, "epoch": 0.42006914354842334, "grad_norm": 8.831469535827637, "learning_rate": 3.903317613412592e-06, "loss": 0.394, "mean_token_accuracy": 0.867976401746273, "num_tokens": 163006600.0, "step": 135510 }, { "entropy": 1.9004840448498725, "epoch": 0.42010014267347306, "grad_norm": 8.030474662780762, "learning_rate": 3.903173597089313e-06, "loss": 0.4529, "mean_token_accuracy": 0.8580782458186149, "num_tokens": 163017668.0, "step": 135520 }, { "entropy": 1.9328984439373016, "epoch": 0.42013114179852273, "grad_norm": 8.515178680419922, "learning_rate": 3.903029596705682e-06, "loss": 0.4733, "mean_token_accuracy": 0.8430827930569649, "num_tokens": 163029533.0, "step": 135530 }, { "entropy": 1.903455564379692, "epoch": 0.42016214092357246, "grad_norm": 8.266940116882324, "learning_rate": 3.90288561225876e-06, "loss": 0.4381, "mean_token_accuracy": 0.8574344128370285, "num_tokens": 163041861.0, "step": 135540 }, { "entropy": 1.7817657575011254, "epoch": 0.4201931400486221, "grad_norm": 8.015380859375, "learning_rate": 3.902741643745609e-06, "loss": 0.3766, "mean_token_accuracy": 0.8572424992918968, "num_tokens": 163054619.0, "step": 135550 }, { "entropy": 1.8124181941151618, "epoch": 0.4202241391736718, "grad_norm": 5.867700576782227, "learning_rate": 3.902597691163288e-06, "loss": 0.4328, "mean_token_accuracy": 0.8469098672270775, "num_tokens": 163067835.0, "step": 135560 }, { "entropy": 1.9572464644908905, "epoch": 0.4202551382987215, "grad_norm": 8.415164947509766, "learning_rate": 3.902453754508861e-06, "loss": 0.5148, "mean_token_accuracy": 0.8418419510126114, "num_tokens": 163079318.0, "step": 135570 }, { "entropy": 1.9142266780138015, "epoch": 0.4202861374237712, "grad_norm": 7.7311577796936035, "learning_rate": 3.902309833779389e-06, "loss": 0.458, "mean_token_accuracy": 0.8493567198514939, "num_tokens": 163090456.0, "step": 135580 }, { "entropy": 1.9059449434280396, "epoch": 0.4203171365488209, "grad_norm": 7.837739944458008, "learning_rate": 3.902165928971938e-06, "loss": 0.4521, "mean_token_accuracy": 0.857314033806324, "num_tokens": 163101560.0, "step": 135590 }, { "entropy": 1.9108774542808533, "epoch": 0.4203481356738706, "grad_norm": 7.050215244293213, "learning_rate": 3.90202204008357e-06, "loss": 0.4735, "mean_token_accuracy": 0.8525308519601822, "num_tokens": 163113176.0, "step": 135600 }, { "entropy": 1.8393698036670685, "epoch": 0.4203791347989203, "grad_norm": 3.4507510662078857, "learning_rate": 3.901878167111353e-06, "loss": 0.4471, "mean_token_accuracy": 0.8612019449472428, "num_tokens": 163126007.0, "step": 135610 }, { "entropy": 1.9607517927885056, "epoch": 0.42041013392397, "grad_norm": 7.289004325866699, "learning_rate": 3.9017343100523505e-06, "loss": 0.5067, "mean_token_accuracy": 0.8425389587879181, "num_tokens": 163136804.0, "step": 135620 }, { "entropy": 1.8896097630262374, "epoch": 0.4204411330490197, "grad_norm": 8.581795692443848, "learning_rate": 3.90159046890363e-06, "loss": 0.4696, "mean_token_accuracy": 0.8448723956942559, "num_tokens": 163148832.0, "step": 135630 }, { "entropy": 1.922749936580658, "epoch": 0.42047213217406937, "grad_norm": 8.489492416381836, "learning_rate": 3.90144664366226e-06, "loss": 0.4862, "mean_token_accuracy": 0.8492430374026299, "num_tokens": 163160997.0, "step": 135640 }, { "entropy": 1.9693446904420853, "epoch": 0.4205031312991191, "grad_norm": 9.86866283416748, "learning_rate": 3.9013028343253065e-06, "loss": 0.4833, "mean_token_accuracy": 0.8453862652182579, "num_tokens": 163172061.0, "step": 135650 }, { "entropy": 1.9001538813114167, "epoch": 0.42053413042416876, "grad_norm": 7.654898166656494, "learning_rate": 3.901159040889842e-06, "loss": 0.4418, "mean_token_accuracy": 0.8523498460650444, "num_tokens": 163184244.0, "step": 135660 }, { "entropy": 1.945447364449501, "epoch": 0.4205651295492185, "grad_norm": 8.955438613891602, "learning_rate": 3.901015263352933e-06, "loss": 0.5053, "mean_token_accuracy": 0.8504986420273781, "num_tokens": 163195436.0, "step": 135670 }, { "entropy": 1.9072694763541223, "epoch": 0.42059612867426815, "grad_norm": 8.15773868560791, "learning_rate": 3.900871501711651e-06, "loss": 0.4505, "mean_token_accuracy": 0.8516003981232643, "num_tokens": 163206912.0, "step": 135680 }, { "entropy": 1.8923516809940337, "epoch": 0.4206271277993179, "grad_norm": 8.769623756408691, "learning_rate": 3.900727755963067e-06, "loss": 0.4825, "mean_token_accuracy": 0.848148649930954, "num_tokens": 163218689.0, "step": 135690 }, { "entropy": 1.8840610541403293, "epoch": 0.42065812692436755, "grad_norm": 7.660909652709961, "learning_rate": 3.9005840261042535e-06, "loss": 0.4562, "mean_token_accuracy": 0.8552731052041054, "num_tokens": 163231024.0, "step": 135700 }, { "entropy": 1.903901606798172, "epoch": 0.4206891260494173, "grad_norm": 3.740813732147217, "learning_rate": 3.900440312132283e-06, "loss": 0.4964, "mean_token_accuracy": 0.8453383758664131, "num_tokens": 163242740.0, "step": 135710 }, { "entropy": 1.9122828841209412, "epoch": 0.42072012517446694, "grad_norm": 4.723738670349121, "learning_rate": 3.90029661404423e-06, "loss": 0.4393, "mean_token_accuracy": 0.8527891039848328, "num_tokens": 163254707.0, "step": 135720 }, { "entropy": 1.9014658220112324, "epoch": 0.42075112429951667, "grad_norm": 3.560215711593628, "learning_rate": 3.900152931837168e-06, "loss": 0.4368, "mean_token_accuracy": 0.8482464477419853, "num_tokens": 163266816.0, "step": 135730 }, { "entropy": 1.9249530613422394, "epoch": 0.42078212342456633, "grad_norm": 8.004035949707031, "learning_rate": 3.900009265508172e-06, "loss": 0.4692, "mean_token_accuracy": 0.8406677231192589, "num_tokens": 163278761.0, "step": 135740 }, { "entropy": 1.8970685094594955, "epoch": 0.42081312254961606, "grad_norm": 8.880062103271484, "learning_rate": 3.899865615054318e-06, "loss": 0.4325, "mean_token_accuracy": 0.8497609063982964, "num_tokens": 163291090.0, "step": 135750 }, { "entropy": 1.828018756210804, "epoch": 0.4208441216746657, "grad_norm": 7.22142219543457, "learning_rate": 3.8997219804726815e-06, "loss": 0.43, "mean_token_accuracy": 0.8671692669391632, "num_tokens": 163302749.0, "step": 135760 }, { "entropy": 1.8527932271361351, "epoch": 0.42087512079971545, "grad_norm": 7.9586286544799805, "learning_rate": 3.899578361760341e-06, "loss": 0.4645, "mean_token_accuracy": 0.8436411425471306, "num_tokens": 163315230.0, "step": 135770 }, { "entropy": 1.8658724144101142, "epoch": 0.4209061199247651, "grad_norm": 4.109507083892822, "learning_rate": 3.899434758914374e-06, "loss": 0.4303, "mean_token_accuracy": 0.8581762701272965, "num_tokens": 163327266.0, "step": 135780 }, { "entropy": 1.842541829496622, "epoch": 0.42093711904981485, "grad_norm": 9.778244972229004, "learning_rate": 3.89929117193186e-06, "loss": 0.458, "mean_token_accuracy": 0.8542135626077652, "num_tokens": 163339658.0, "step": 135790 }, { "entropy": 1.8465433612465858, "epoch": 0.4209681181748645, "grad_norm": 8.6669282913208, "learning_rate": 3.899147600809877e-06, "loss": 0.4387, "mean_token_accuracy": 0.8487094223499299, "num_tokens": 163352124.0, "step": 135800 }, { "entropy": 1.90444867759943, "epoch": 0.4209991172999142, "grad_norm": 8.977686882019043, "learning_rate": 3.899004045545507e-06, "loss": 0.4835, "mean_token_accuracy": 0.8524438515305519, "num_tokens": 163363934.0, "step": 135810 }, { "entropy": 1.8843096554279328, "epoch": 0.4210301164249639, "grad_norm": 8.393902778625488, "learning_rate": 3.898860506135832e-06, "loss": 0.4474, "mean_token_accuracy": 0.8609971582889557, "num_tokens": 163375865.0, "step": 135820 }, { "entropy": 1.9457574129104613, "epoch": 0.4210611155500136, "grad_norm": 7.407272815704346, "learning_rate": 3.898716982577929e-06, "loss": 0.5116, "mean_token_accuracy": 0.8447777807712555, "num_tokens": 163386634.0, "step": 135830 }, { "entropy": 1.916204272210598, "epoch": 0.4210921146750633, "grad_norm": 7.169795513153076, "learning_rate": 3.898573474868886e-06, "loss": 0.4745, "mean_token_accuracy": 0.8451360926032067, "num_tokens": 163398136.0, "step": 135840 }, { "entropy": 1.887849335372448, "epoch": 0.42112311380011297, "grad_norm": 9.207273483276367, "learning_rate": 3.898429983005783e-06, "loss": 0.4681, "mean_token_accuracy": 0.849080765247345, "num_tokens": 163409693.0, "step": 135850 }, { "entropy": 1.8300122573971749, "epoch": 0.4211541129251627, "grad_norm": 8.275792121887207, "learning_rate": 3.898286506985706e-06, "loss": 0.3938, "mean_token_accuracy": 0.8691477358341217, "num_tokens": 163422481.0, "step": 135860 }, { "entropy": 1.9414837792515756, "epoch": 0.42118511205021236, "grad_norm": 9.043680191040039, "learning_rate": 3.898143046805739e-06, "loss": 0.4655, "mean_token_accuracy": 0.8596002608537674, "num_tokens": 163433827.0, "step": 135870 }, { "entropy": 1.8458356723189353, "epoch": 0.4212161111752621, "grad_norm": 8.126672744750977, "learning_rate": 3.897999602462968e-06, "loss": 0.3803, "mean_token_accuracy": 0.8593410521745681, "num_tokens": 163446168.0, "step": 135880 }, { "entropy": 1.925001895427704, "epoch": 0.42124711030031176, "grad_norm": 8.667831420898438, "learning_rate": 3.897856173954477e-06, "loss": 0.4655, "mean_token_accuracy": 0.8532567232847214, "num_tokens": 163457240.0, "step": 135890 }, { "entropy": 1.8693645611405372, "epoch": 0.4212781094253615, "grad_norm": 6.114764213562012, "learning_rate": 3.897712761277357e-06, "loss": 0.4568, "mean_token_accuracy": 0.8476710587739944, "num_tokens": 163470439.0, "step": 135900 }, { "entropy": 1.928558248281479, "epoch": 0.42130910855041115, "grad_norm": 9.09399127960205, "learning_rate": 3.897569364428692e-06, "loss": 0.4697, "mean_token_accuracy": 0.8500400453805923, "num_tokens": 163481986.0, "step": 135910 }, { "entropy": 1.9676057755947114, "epoch": 0.4213401076754609, "grad_norm": 7.342076301574707, "learning_rate": 3.897425983405573e-06, "loss": 0.494, "mean_token_accuracy": 0.8489060357213021, "num_tokens": 163492836.0, "step": 135920 }, { "entropy": 1.8425328716635705, "epoch": 0.42137110680051054, "grad_norm": 4.709118843078613, "learning_rate": 3.897282618205089e-06, "loss": 0.4112, "mean_token_accuracy": 0.8543885082006455, "num_tokens": 163505935.0, "step": 135930 }, { "entropy": 1.875008788704872, "epoch": 0.42140210592556027, "grad_norm": 2.8072144985198975, "learning_rate": 3.89713926882433e-06, "loss": 0.4848, "mean_token_accuracy": 0.8433866515755654, "num_tokens": 163518513.0, "step": 135940 }, { "entropy": 1.907314045727253, "epoch": 0.42143310505060994, "grad_norm": 7.362760543823242, "learning_rate": 3.896995935260386e-06, "loss": 0.4514, "mean_token_accuracy": 0.8597197920084, "num_tokens": 163531303.0, "step": 135950 }, { "entropy": 1.9355842828750611, "epoch": 0.42146410417565966, "grad_norm": 3.6717031002044678, "learning_rate": 3.896852617510349e-06, "loss": 0.4762, "mean_token_accuracy": 0.8499437302350998, "num_tokens": 163542827.0, "step": 135960 }, { "entropy": 1.9380091413855554, "epoch": 0.42149510330070933, "grad_norm": 7.6753621101379395, "learning_rate": 3.896709315571311e-06, "loss": 0.4788, "mean_token_accuracy": 0.834736093878746, "num_tokens": 163555197.0, "step": 135970 }, { "entropy": 1.8424546226859093, "epoch": 0.42152610242575905, "grad_norm": 3.7391586303710938, "learning_rate": 3.896566029440366e-06, "loss": 0.4114, "mean_token_accuracy": 0.865011602640152, "num_tokens": 163567600.0, "step": 135980 }, { "entropy": 1.830755239725113, "epoch": 0.4215571015508087, "grad_norm": 10.804282188415527, "learning_rate": 3.8964227591146075e-06, "loss": 0.4528, "mean_token_accuracy": 0.8500369563698769, "num_tokens": 163580961.0, "step": 135990 }, { "entropy": 1.8739851251244546, "epoch": 0.42158810067585845, "grad_norm": 8.843622207641602, "learning_rate": 3.89627950459113e-06, "loss": 0.4433, "mean_token_accuracy": 0.853018419444561, "num_tokens": 163593529.0, "step": 136000 }, { "entropy": 1.8963138684630394, "epoch": 0.4216190998009081, "grad_norm": 7.0039191246032715, "learning_rate": 3.8961362658670284e-06, "loss": 0.4491, "mean_token_accuracy": 0.849344827234745, "num_tokens": 163605267.0, "step": 136010 }, { "entropy": 1.8308332130312919, "epoch": 0.42165009892595784, "grad_norm": 9.128500938415527, "learning_rate": 3.895993042939398e-06, "loss": 0.4015, "mean_token_accuracy": 0.8566051840782165, "num_tokens": 163618128.0, "step": 136020 }, { "entropy": 1.9336943060159684, "epoch": 0.4216810980510075, "grad_norm": 8.359414100646973, "learning_rate": 3.895849835805338e-06, "loss": 0.4617, "mean_token_accuracy": 0.8571832865476608, "num_tokens": 163629610.0, "step": 136030 }, { "entropy": 1.81222113519907, "epoch": 0.42171209717605723, "grad_norm": 7.943345546722412, "learning_rate": 3.8957066444619444e-06, "loss": 0.3755, "mean_token_accuracy": 0.86582732796669, "num_tokens": 163642383.0, "step": 136040 }, { "entropy": 1.8568004190921783, "epoch": 0.4217430963011069, "grad_norm": 5.962125778198242, "learning_rate": 3.895563468906315e-06, "loss": 0.4627, "mean_token_accuracy": 0.8517604455351829, "num_tokens": 163654588.0, "step": 136050 }, { "entropy": 1.9352711230516433, "epoch": 0.42177409542615657, "grad_norm": 9.666674613952637, "learning_rate": 3.89542030913555e-06, "loss": 0.4991, "mean_token_accuracy": 0.8422815144062042, "num_tokens": 163665434.0, "step": 136060 }, { "entropy": 1.9193247556686401, "epoch": 0.4218050945512063, "grad_norm": 9.703880310058594, "learning_rate": 3.895277165146748e-06, "loss": 0.517, "mean_token_accuracy": 0.8462235674262046, "num_tokens": 163677012.0, "step": 136070 }, { "entropy": 1.9230276107788087, "epoch": 0.42183609367625596, "grad_norm": 9.869771003723145, "learning_rate": 3.895134036937011e-06, "loss": 0.4913, "mean_token_accuracy": 0.8501724734902382, "num_tokens": 163687569.0, "step": 136080 }, { "entropy": 1.9550448417663575, "epoch": 0.4218670928013057, "grad_norm": 8.047429084777832, "learning_rate": 3.89499092450344e-06, "loss": 0.5219, "mean_token_accuracy": 0.843185929954052, "num_tokens": 163698183.0, "step": 136090 }, { "entropy": 1.8611886352300644, "epoch": 0.42189809192635536, "grad_norm": 8.042689323425293, "learning_rate": 3.894847827843135e-06, "loss": 0.4431, "mean_token_accuracy": 0.8485627174377441, "num_tokens": 163710589.0, "step": 136100 }, { "entropy": 1.8729781568050385, "epoch": 0.4219290910514051, "grad_norm": 8.031086921691895, "learning_rate": 3.894704746953201e-06, "loss": 0.4227, "mean_token_accuracy": 0.8670966506004334, "num_tokens": 163722848.0, "step": 136110 }, { "entropy": 1.9010607331991196, "epoch": 0.42196009017645475, "grad_norm": 8.790847778320312, "learning_rate": 3.894561681830741e-06, "loss": 0.4238, "mean_token_accuracy": 0.85305155813694, "num_tokens": 163734838.0, "step": 136120 }, { "entropy": 1.869620206952095, "epoch": 0.4219910893015045, "grad_norm": 7.369499206542969, "learning_rate": 3.89441863247286e-06, "loss": 0.3984, "mean_token_accuracy": 0.8668390452861786, "num_tokens": 163747691.0, "step": 136130 }, { "entropy": 1.8736939072608947, "epoch": 0.42202208842655414, "grad_norm": 8.191452026367188, "learning_rate": 3.89427559887666e-06, "loss": 0.443, "mean_token_accuracy": 0.8507094740867615, "num_tokens": 163759715.0, "step": 136140 }, { "entropy": 1.891453741490841, "epoch": 0.42205308755160387, "grad_norm": 4.542218208312988, "learning_rate": 3.89413258103925e-06, "loss": 0.4505, "mean_token_accuracy": 0.8499180749058723, "num_tokens": 163772049.0, "step": 136150 }, { "entropy": 1.8985579848289489, "epoch": 0.42208408667665354, "grad_norm": 6.974556922912598, "learning_rate": 3.8939895789577355e-06, "loss": 0.4075, "mean_token_accuracy": 0.866258729994297, "num_tokens": 163783799.0, "step": 136160 }, { "entropy": 1.8413588017225266, "epoch": 0.42211508580170326, "grad_norm": 3.5806539058685303, "learning_rate": 3.893846592629224e-06, "loss": 0.4167, "mean_token_accuracy": 0.856754244863987, "num_tokens": 163796194.0, "step": 136170 }, { "entropy": 1.8396148562431336, "epoch": 0.42214608492675293, "grad_norm": 7.897609710693359, "learning_rate": 3.893703622050822e-06, "loss": 0.4449, "mean_token_accuracy": 0.8517332747578621, "num_tokens": 163808614.0, "step": 136180 }, { "entropy": 1.8847883358597755, "epoch": 0.42217708405180265, "grad_norm": 8.880624771118164, "learning_rate": 3.89356066721964e-06, "loss": 0.4671, "mean_token_accuracy": 0.848950457572937, "num_tokens": 163820626.0, "step": 136190 }, { "entropy": 1.9313714131712914, "epoch": 0.4222080831768523, "grad_norm": 7.545229911804199, "learning_rate": 3.893417728132786e-06, "loss": 0.4886, "mean_token_accuracy": 0.8502359837293625, "num_tokens": 163832281.0, "step": 136200 }, { "entropy": 1.942906777560711, "epoch": 0.42223908230190205, "grad_norm": 8.471320152282715, "learning_rate": 3.8932748047873715e-06, "loss": 0.4698, "mean_token_accuracy": 0.8459069982171059, "num_tokens": 163843591.0, "step": 136210 }, { "entropy": 1.8104972153902055, "epoch": 0.4222700814269517, "grad_norm": 7.06924295425415, "learning_rate": 3.893131897180506e-06, "loss": 0.3943, "mean_token_accuracy": 0.8590885251760483, "num_tokens": 163856366.0, "step": 136220 }, { "entropy": 1.8111203506588935, "epoch": 0.42230108055200144, "grad_norm": 3.74467396736145, "learning_rate": 3.892989005309303e-06, "loss": 0.3944, "mean_token_accuracy": 0.8676251381635666, "num_tokens": 163870058.0, "step": 136230 }, { "entropy": 1.8452250599861144, "epoch": 0.4223320796770511, "grad_norm": 9.069093704223633, "learning_rate": 3.892846129170875e-06, "loss": 0.4211, "mean_token_accuracy": 0.8483367681503295, "num_tokens": 163883160.0, "step": 136240 }, { "entropy": 1.9414851903915404, "epoch": 0.42236307880210083, "grad_norm": 9.163724899291992, "learning_rate": 3.892703268762333e-06, "loss": 0.5084, "mean_token_accuracy": 0.8401763945817947, "num_tokens": 163894584.0, "step": 136250 }, { "entropy": 1.8744617268443107, "epoch": 0.4223940779271505, "grad_norm": 4.958732604980469, "learning_rate": 3.892560424080792e-06, "loss": 0.4281, "mean_token_accuracy": 0.8564569279551506, "num_tokens": 163906968.0, "step": 136260 }, { "entropy": 1.9449343591928483, "epoch": 0.4224250770522002, "grad_norm": 7.974160671234131, "learning_rate": 3.892417595123367e-06, "loss": 0.4889, "mean_token_accuracy": 0.8489760622382164, "num_tokens": 163918188.0, "step": 136270 }, { "entropy": 1.8679928988218308, "epoch": 0.4224560761772499, "grad_norm": 9.126168251037598, "learning_rate": 3.892274781887172e-06, "loss": 0.4807, "mean_token_accuracy": 0.8490209579467773, "num_tokens": 163930662.0, "step": 136280 }, { "entropy": 1.8945604875683784, "epoch": 0.4224870753022996, "grad_norm": 7.6865363121032715, "learning_rate": 3.892131984369326e-06, "loss": 0.4245, "mean_token_accuracy": 0.8592929482460022, "num_tokens": 163942417.0, "step": 136290 }, { "entropy": 1.9043464064598083, "epoch": 0.4225180744273493, "grad_norm": 9.015311241149902, "learning_rate": 3.891989202566944e-06, "loss": 0.5079, "mean_token_accuracy": 0.8410688266158104, "num_tokens": 163955136.0, "step": 136300 }, { "entropy": 1.9317906364798545, "epoch": 0.42254907355239896, "grad_norm": 9.661110877990723, "learning_rate": 3.891846436477144e-06, "loss": 0.4953, "mean_token_accuracy": 0.8499211475253106, "num_tokens": 163966625.0, "step": 136310 }, { "entropy": 1.9347480967640878, "epoch": 0.4225800726774487, "grad_norm": 4.00605583190918, "learning_rate": 3.891703686097043e-06, "loss": 0.5011, "mean_token_accuracy": 0.8386645168066025, "num_tokens": 163978413.0, "step": 136320 }, { "entropy": 1.8491806223988534, "epoch": 0.42261107180249835, "grad_norm": 6.554616451263428, "learning_rate": 3.891560951423763e-06, "loss": 0.4156, "mean_token_accuracy": 0.8566528141498566, "num_tokens": 163990762.0, "step": 136330 }, { "entropy": 1.9310165911912918, "epoch": 0.4226420709275481, "grad_norm": 10.993410110473633, "learning_rate": 3.891418232454421e-06, "loss": 0.5121, "mean_token_accuracy": 0.8411554425954819, "num_tokens": 164003208.0, "step": 136340 }, { "entropy": 1.8688613146543502, "epoch": 0.42267307005259774, "grad_norm": 4.234386920928955, "learning_rate": 3.891275529186138e-06, "loss": 0.4322, "mean_token_accuracy": 0.8530001133680344, "num_tokens": 164015901.0, "step": 136350 }, { "entropy": 1.8955225050449371, "epoch": 0.42270406917764747, "grad_norm": 7.75908899307251, "learning_rate": 3.891132841616038e-06, "loss": 0.3968, "mean_token_accuracy": 0.8615424484014511, "num_tokens": 164027911.0, "step": 136360 }, { "entropy": 1.90107059776783, "epoch": 0.42273506830269714, "grad_norm": 4.065981388092041, "learning_rate": 3.890990169741239e-06, "loss": 0.4431, "mean_token_accuracy": 0.8597174286842346, "num_tokens": 164039942.0, "step": 136370 }, { "entropy": 1.971032439172268, "epoch": 0.42276606742774686, "grad_norm": 8.14873218536377, "learning_rate": 3.890847513558867e-06, "loss": 0.5376, "mean_token_accuracy": 0.8287710756063461, "num_tokens": 164051132.0, "step": 136380 }, { "entropy": 1.933982428908348, "epoch": 0.42279706655279653, "grad_norm": 9.093382835388184, "learning_rate": 3.890704873066045e-06, "loss": 0.4483, "mean_token_accuracy": 0.8538150131702423, "num_tokens": 164063375.0, "step": 136390 }, { "entropy": 1.9692260310053826, "epoch": 0.42282806567784625, "grad_norm": 7.917427062988281, "learning_rate": 3.890562248259897e-06, "loss": 0.4822, "mean_token_accuracy": 0.842354828119278, "num_tokens": 164074799.0, "step": 136400 }, { "entropy": 1.949564391374588, "epoch": 0.4228590648028959, "grad_norm": 8.609591484069824, "learning_rate": 3.890419639137547e-06, "loss": 0.5208, "mean_token_accuracy": 0.8415541231632233, "num_tokens": 164086636.0, "step": 136410 }, { "entropy": 1.8990302249789237, "epoch": 0.42289006392794565, "grad_norm": 8.789817810058594, "learning_rate": 3.890277045696122e-06, "loss": 0.4632, "mean_token_accuracy": 0.8372734472155571, "num_tokens": 164099437.0, "step": 136420 }, { "entropy": 1.9440916180610657, "epoch": 0.4229210630529953, "grad_norm": 8.302007675170898, "learning_rate": 3.890134467932746e-06, "loss": 0.4679, "mean_token_accuracy": 0.8474587753415108, "num_tokens": 164111134.0, "step": 136430 }, { "entropy": 1.9455933913588523, "epoch": 0.42295206217804504, "grad_norm": 9.274413108825684, "learning_rate": 3.889991905844551e-06, "loss": 0.4528, "mean_token_accuracy": 0.84911008477211, "num_tokens": 164122818.0, "step": 136440 }, { "entropy": 1.8401254639029503, "epoch": 0.4229830613030947, "grad_norm": 7.885955810546875, "learning_rate": 3.88984935942866e-06, "loss": 0.474, "mean_token_accuracy": 0.8547663778066635, "num_tokens": 164135328.0, "step": 136450 }, { "entropy": 1.8858144536614418, "epoch": 0.42301406042814443, "grad_norm": 7.3441338539123535, "learning_rate": 3.8897068286822046e-06, "loss": 0.4749, "mean_token_accuracy": 0.8430177375674248, "num_tokens": 164147713.0, "step": 136460 }, { "entropy": 1.9624179631471634, "epoch": 0.4230450595531941, "grad_norm": 7.776828289031982, "learning_rate": 3.8895643136023146e-06, "loss": 0.4735, "mean_token_accuracy": 0.8557393714785576, "num_tokens": 164159157.0, "step": 136470 }, { "entropy": 1.9207000091671944, "epoch": 0.4230760586782438, "grad_norm": 8.962823867797852, "learning_rate": 3.889421814186118e-06, "loss": 0.5688, "mean_token_accuracy": 0.8365709602832794, "num_tokens": 164170834.0, "step": 136480 }, { "entropy": 1.9185936331748963, "epoch": 0.4231070578032935, "grad_norm": 6.344710826873779, "learning_rate": 3.889279330430746e-06, "loss": 0.4697, "mean_token_accuracy": 0.8482104346156121, "num_tokens": 164183232.0, "step": 136490 }, { "entropy": 1.908946332335472, "epoch": 0.4231380569283432, "grad_norm": 7.712813854217529, "learning_rate": 3.889136862333333e-06, "loss": 0.4495, "mean_token_accuracy": 0.8569162204861641, "num_tokens": 164195812.0, "step": 136500 }, { "entropy": 1.9538255900144577, "epoch": 0.4231690560533929, "grad_norm": 8.661401748657227, "learning_rate": 3.888994409891007e-06, "loss": 0.5378, "mean_token_accuracy": 0.8376554921269417, "num_tokens": 164207031.0, "step": 136510 }, { "entropy": 1.9065537318587302, "epoch": 0.4232000551784426, "grad_norm": 3.5878145694732666, "learning_rate": 3.8888519731009065e-06, "loss": 0.4526, "mean_token_accuracy": 0.8541804388165474, "num_tokens": 164219113.0, "step": 136520 }, { "entropy": 1.9570256814360618, "epoch": 0.4232310543034923, "grad_norm": 11.070653915405273, "learning_rate": 3.8887095519601594e-06, "loss": 0.4652, "mean_token_accuracy": 0.8492454007267952, "num_tokens": 164230241.0, "step": 136530 }, { "entropy": 1.9088216736912726, "epoch": 0.423262053428542, "grad_norm": 5.85700798034668, "learning_rate": 3.888567146465905e-06, "loss": 0.4576, "mean_token_accuracy": 0.8500758215785027, "num_tokens": 164242548.0, "step": 136540 }, { "entropy": 1.9074143424630166, "epoch": 0.4232930525535917, "grad_norm": 7.5828938484191895, "learning_rate": 3.888424756615277e-06, "loss": 0.4828, "mean_token_accuracy": 0.8519628167152404, "num_tokens": 164254254.0, "step": 136550 }, { "entropy": 1.958798161149025, "epoch": 0.42332405167864134, "grad_norm": 7.560995101928711, "learning_rate": 3.888282382405411e-06, "loss": 0.4955, "mean_token_accuracy": 0.8531105071306229, "num_tokens": 164265550.0, "step": 136560 }, { "entropy": 1.9580877602100373, "epoch": 0.42335505080369107, "grad_norm": 7.767574787139893, "learning_rate": 3.888140023833444e-06, "loss": 0.4987, "mean_token_accuracy": 0.8387601390480995, "num_tokens": 164277156.0, "step": 136570 }, { "entropy": 1.9372137054800986, "epoch": 0.42338604992874074, "grad_norm": 7.815626621246338, "learning_rate": 3.887997680896513e-06, "loss": 0.4388, "mean_token_accuracy": 0.8588192850351334, "num_tokens": 164288771.0, "step": 136580 }, { "entropy": 1.8666774332523346, "epoch": 0.42341704905379046, "grad_norm": 11.0514554977417, "learning_rate": 3.887855353591757e-06, "loss": 0.4583, "mean_token_accuracy": 0.8574911624193191, "num_tokens": 164302139.0, "step": 136590 }, { "entropy": 1.8457847326993941, "epoch": 0.42344804817884013, "grad_norm": 3.481797695159912, "learning_rate": 3.887713041916315e-06, "loss": 0.3907, "mean_token_accuracy": 0.8640109539031983, "num_tokens": 164315100.0, "step": 136600 }, { "entropy": 1.8275635659694671, "epoch": 0.42347904730388986, "grad_norm": 7.928697109222412, "learning_rate": 3.887570745867327e-06, "loss": 0.4406, "mean_token_accuracy": 0.8522553309798241, "num_tokens": 164328806.0, "step": 136610 }, { "entropy": 1.945831647515297, "epoch": 0.4235100464289395, "grad_norm": 8.89621353149414, "learning_rate": 3.887428465441934e-06, "loss": 0.4692, "mean_token_accuracy": 0.8539063900709152, "num_tokens": 164340003.0, "step": 136620 }, { "entropy": 1.9486369535326957, "epoch": 0.42354104555398925, "grad_norm": 9.575989723205566, "learning_rate": 3.8872862006372745e-06, "loss": 0.4362, "mean_token_accuracy": 0.8562672764062882, "num_tokens": 164350643.0, "step": 136630 }, { "entropy": 1.8524845391511917, "epoch": 0.4235720446790389, "grad_norm": 6.633891582489014, "learning_rate": 3.887143951450493e-06, "loss": 0.4592, "mean_token_accuracy": 0.8535214066505432, "num_tokens": 164363583.0, "step": 136640 }, { "entropy": 1.833214531838894, "epoch": 0.42360304380408864, "grad_norm": 7.453033924102783, "learning_rate": 3.887001717878731e-06, "loss": 0.3917, "mean_token_accuracy": 0.8589364722371101, "num_tokens": 164377000.0, "step": 136650 }, { "entropy": 1.7080215141177177, "epoch": 0.4236340429291383, "grad_norm": 7.440715312957764, "learning_rate": 3.886859499919133e-06, "loss": 0.356, "mean_token_accuracy": 0.8675283506512642, "num_tokens": 164390708.0, "step": 136660 }, { "entropy": 1.8073251724243165, "epoch": 0.42366504205418803, "grad_norm": 4.673215389251709, "learning_rate": 3.886717297568841e-06, "loss": 0.4071, "mean_token_accuracy": 0.8596639856696129, "num_tokens": 164404186.0, "step": 136670 }, { "entropy": 1.7819759905338288, "epoch": 0.4236960411792377, "grad_norm": 4.5533552169799805, "learning_rate": 3.8865751108250015e-06, "loss": 0.3629, "mean_token_accuracy": 0.8706469982862473, "num_tokens": 164418745.0, "step": 136680 }, { "entropy": 1.931021724641323, "epoch": 0.42372704030428743, "grad_norm": 7.136261463165283, "learning_rate": 3.88643293968476e-06, "loss": 0.4592, "mean_token_accuracy": 0.851832240819931, "num_tokens": 164431143.0, "step": 136690 }, { "entropy": 1.9522256717085837, "epoch": 0.4237580394293371, "grad_norm": 6.650088787078857, "learning_rate": 3.886290784145263e-06, "loss": 0.5078, "mean_token_accuracy": 0.8387707456946373, "num_tokens": 164442769.0, "step": 136700 }, { "entropy": 1.9686541602015495, "epoch": 0.4237890385543868, "grad_norm": 4.377057075500488, "learning_rate": 3.886148644203657e-06, "loss": 0.4832, "mean_token_accuracy": 0.8511147424578667, "num_tokens": 164454299.0, "step": 136710 }, { "entropy": 1.907273495197296, "epoch": 0.4238200376794365, "grad_norm": 9.434427261352539, "learning_rate": 3.886006519857088e-06, "loss": 0.4182, "mean_token_accuracy": 0.8593376144766808, "num_tokens": 164466567.0, "step": 136720 }, { "entropy": 1.8965537667274475, "epoch": 0.4238510368044862, "grad_norm": 7.356637477874756, "learning_rate": 3.885864411102708e-06, "loss": 0.4372, "mean_token_accuracy": 0.8546531453728676, "num_tokens": 164478726.0, "step": 136730 }, { "entropy": 1.8856528677046298, "epoch": 0.4238820359295359, "grad_norm": 9.142583847045898, "learning_rate": 3.885722317937665e-06, "loss": 0.4395, "mean_token_accuracy": 0.8572150781750679, "num_tokens": 164491306.0, "step": 136740 }, { "entropy": 1.8360870122909545, "epoch": 0.4239130350545856, "grad_norm": 9.105968475341797, "learning_rate": 3.885580240359107e-06, "loss": 0.4475, "mean_token_accuracy": 0.851330541074276, "num_tokens": 164503683.0, "step": 136750 }, { "entropy": 1.926066693663597, "epoch": 0.4239440341796353, "grad_norm": 6.350192546844482, "learning_rate": 3.885438178364187e-06, "loss": 0.4194, "mean_token_accuracy": 0.8608698025345802, "num_tokens": 164515517.0, "step": 136760 }, { "entropy": 1.9915336534380912, "epoch": 0.423975033304685, "grad_norm": 7.947946548461914, "learning_rate": 3.885296131950055e-06, "loss": 0.4984, "mean_token_accuracy": 0.8465639933943748, "num_tokens": 164527079.0, "step": 136770 }, { "entropy": 1.9105818301439286, "epoch": 0.42400603242973467, "grad_norm": 9.564684867858887, "learning_rate": 3.885154101113865e-06, "loss": 0.4449, "mean_token_accuracy": 0.8596938267350197, "num_tokens": 164538814.0, "step": 136780 }, { "entropy": 1.9386409044265747, "epoch": 0.4240370315547844, "grad_norm": 10.250873565673828, "learning_rate": 3.885012085852768e-06, "loss": 0.4499, "mean_token_accuracy": 0.853357446193695, "num_tokens": 164550954.0, "step": 136790 }, { "entropy": 1.9958611935377122, "epoch": 0.42406803067983406, "grad_norm": 8.585638999938965, "learning_rate": 3.884870086163918e-06, "loss": 0.4789, "mean_token_accuracy": 0.852650736272335, "num_tokens": 164561994.0, "step": 136800 }, { "entropy": 1.8948075383901597, "epoch": 0.42409902980488373, "grad_norm": 8.89138412475586, "learning_rate": 3.88472810204447e-06, "loss": 0.4955, "mean_token_accuracy": 0.8488554358482361, "num_tokens": 164573952.0, "step": 136810 }, { "entropy": 1.9540094196796418, "epoch": 0.42413002892993346, "grad_norm": 7.017063140869141, "learning_rate": 3.884586133491578e-06, "loss": 0.4833, "mean_token_accuracy": 0.8478411644697189, "num_tokens": 164585607.0, "step": 136820 }, { "entropy": 1.8501681357622146, "epoch": 0.4241610280549831, "grad_norm": 8.608563423156738, "learning_rate": 3.884444180502399e-06, "loss": 0.4388, "mean_token_accuracy": 0.8567686438560486, "num_tokens": 164598434.0, "step": 136830 }, { "entropy": 1.9179615780711174, "epoch": 0.42419202718003285, "grad_norm": 7.781816482543945, "learning_rate": 3.884302243074088e-06, "loss": 0.5059, "mean_token_accuracy": 0.8461335703730584, "num_tokens": 164609861.0, "step": 136840 }, { "entropy": 1.9359031677246095, "epoch": 0.4242230263050825, "grad_norm": 9.121841430664062, "learning_rate": 3.884160321203804e-06, "loss": 0.4301, "mean_token_accuracy": 0.8523103460669518, "num_tokens": 164622151.0, "step": 136850 }, { "entropy": 1.923108348250389, "epoch": 0.42425402543013224, "grad_norm": 3.747391700744629, "learning_rate": 3.884018414888704e-06, "loss": 0.4437, "mean_token_accuracy": 0.856217896938324, "num_tokens": 164633849.0, "step": 136860 }, { "entropy": 2.0000748217105864, "epoch": 0.4242850245551819, "grad_norm": 9.04723834991455, "learning_rate": 3.883876524125946e-06, "loss": 0.5137, "mean_token_accuracy": 0.8461816415190697, "num_tokens": 164645091.0, "step": 136870 }, { "entropy": 1.9932536423206328, "epoch": 0.42431602368023164, "grad_norm": 9.16733169555664, "learning_rate": 3.883734648912692e-06, "loss": 0.5414, "mean_token_accuracy": 0.8346955180168152, "num_tokens": 164655606.0, "step": 136880 }, { "entropy": 1.936814832687378, "epoch": 0.4243470228052813, "grad_norm": 7.927580833435059, "learning_rate": 3.883592789246098e-06, "loss": 0.4945, "mean_token_accuracy": 0.848675799369812, "num_tokens": 164667161.0, "step": 136890 }, { "entropy": 1.9285539746284486, "epoch": 0.42437802193033103, "grad_norm": 3.780477523803711, "learning_rate": 3.883450945123329e-06, "loss": 0.4747, "mean_token_accuracy": 0.8493298560380935, "num_tokens": 164678581.0, "step": 136900 }, { "entropy": 1.8320277720689773, "epoch": 0.4244090210553807, "grad_norm": 9.713822364807129, "learning_rate": 3.883309116541545e-06, "loss": 0.4115, "mean_token_accuracy": 0.8653510987758637, "num_tokens": 164690881.0, "step": 136910 }, { "entropy": 1.9016039952635766, "epoch": 0.4244400201804304, "grad_norm": 8.08753776550293, "learning_rate": 3.883167303497907e-06, "loss": 0.4351, "mean_token_accuracy": 0.8574401244521141, "num_tokens": 164703031.0, "step": 136920 }, { "entropy": 1.8986100777983665, "epoch": 0.4244710193054801, "grad_norm": 3.745185375213623, "learning_rate": 3.88302550598958e-06, "loss": 0.4313, "mean_token_accuracy": 0.8573842152953148, "num_tokens": 164715311.0, "step": 136930 }, { "entropy": 1.9534535899758338, "epoch": 0.4245020184305298, "grad_norm": 10.045483589172363, "learning_rate": 3.882883724013727e-06, "loss": 0.4726, "mean_token_accuracy": 0.8531460225582123, "num_tokens": 164727139.0, "step": 136940 }, { "entropy": 1.931221941113472, "epoch": 0.4245330175555795, "grad_norm": 9.615496635437012, "learning_rate": 3.882741957567513e-06, "loss": 0.4526, "mean_token_accuracy": 0.8572171211242676, "num_tokens": 164738964.0, "step": 136950 }, { "entropy": 1.9113097980618476, "epoch": 0.4245640166806292, "grad_norm": 4.518940448760986, "learning_rate": 3.882600206648101e-06, "loss": 0.4727, "mean_token_accuracy": 0.8442388892173767, "num_tokens": 164750751.0, "step": 136960 }, { "entropy": 1.884780551493168, "epoch": 0.4245950158056789, "grad_norm": 10.810199737548828, "learning_rate": 3.88245847125266e-06, "loss": 0.4323, "mean_token_accuracy": 0.8605505630373955, "num_tokens": 164762720.0, "step": 136970 }, { "entropy": 1.8848048835992812, "epoch": 0.4246260149307286, "grad_norm": 7.640275478363037, "learning_rate": 3.882316751378355e-06, "loss": 0.4696, "mean_token_accuracy": 0.8543965950608253, "num_tokens": 164774673.0, "step": 136980 }, { "entropy": 1.9594868689775466, "epoch": 0.42465701405577827, "grad_norm": 8.8289213180542, "learning_rate": 3.882175047022354e-06, "loss": 0.4698, "mean_token_accuracy": 0.8498617529869079, "num_tokens": 164785684.0, "step": 136990 }, { "entropy": 1.9494032636284828, "epoch": 0.424688013180828, "grad_norm": 9.075950622558594, "learning_rate": 3.8820333581818245e-06, "loss": 0.487, "mean_token_accuracy": 0.8439081847667694, "num_tokens": 164797327.0, "step": 137000 }, { "entropy": 1.8996917128562927, "epoch": 0.42471901230587766, "grad_norm": 9.116379737854004, "learning_rate": 3.881891684853936e-06, "loss": 0.509, "mean_token_accuracy": 0.8461435765028, "num_tokens": 164809654.0, "step": 137010 }, { "entropy": 1.927696566283703, "epoch": 0.4247500114309274, "grad_norm": 8.284031867980957, "learning_rate": 3.881750027035857e-06, "loss": 0.4344, "mean_token_accuracy": 0.8543187946081161, "num_tokens": 164820690.0, "step": 137020 }, { "entropy": 1.773264393210411, "epoch": 0.42478101055597706, "grad_norm": 7.941432476043701, "learning_rate": 3.88160838472476e-06, "loss": 0.389, "mean_token_accuracy": 0.8707416027784347, "num_tokens": 164834760.0, "step": 137030 }, { "entropy": 1.798407319188118, "epoch": 0.4248120096810267, "grad_norm": 3.9334545135498047, "learning_rate": 3.881466757917814e-06, "loss": 0.3334, "mean_token_accuracy": 0.8698904514312744, "num_tokens": 164848105.0, "step": 137040 }, { "entropy": 1.8838206291198731, "epoch": 0.42484300880607645, "grad_norm": 7.942371845245361, "learning_rate": 3.8813251466121905e-06, "loss": 0.4114, "mean_token_accuracy": 0.8569322526454926, "num_tokens": 164860645.0, "step": 137050 }, { "entropy": 1.9465039163827895, "epoch": 0.4248740079311261, "grad_norm": 9.43044662475586, "learning_rate": 3.881183550805064e-06, "loss": 0.4727, "mean_token_accuracy": 0.8527957573533058, "num_tokens": 164871767.0, "step": 137060 }, { "entropy": 1.948851892352104, "epoch": 0.42490500705617584, "grad_norm": 7.4881110191345215, "learning_rate": 3.881041970493606e-06, "loss": 0.465, "mean_token_accuracy": 0.8495731383562088, "num_tokens": 164883389.0, "step": 137070 }, { "entropy": 1.7817671418190002, "epoch": 0.4249360061812255, "grad_norm": 3.924825429916382, "learning_rate": 3.88090040567499e-06, "loss": 0.348, "mean_token_accuracy": 0.8674560770392418, "num_tokens": 164896540.0, "step": 137080 }, { "entropy": 1.9194593280553818, "epoch": 0.42496700530627524, "grad_norm": 7.824845790863037, "learning_rate": 3.8807588563463924e-06, "loss": 0.5412, "mean_token_accuracy": 0.8526005268096923, "num_tokens": 164908834.0, "step": 137090 }, { "entropy": 1.9632477343082428, "epoch": 0.4249980044313249, "grad_norm": 7.450333595275879, "learning_rate": 3.880617322504987e-06, "loss": 0.5201, "mean_token_accuracy": 0.8478045627474785, "num_tokens": 164919727.0, "step": 137100 }, { "entropy": 1.9021240234375, "epoch": 0.42502900355637463, "grad_norm": 8.733455657958984, "learning_rate": 3.8804758041479515e-06, "loss": 0.4241, "mean_token_accuracy": 0.853900471329689, "num_tokens": 164931228.0, "step": 137110 }, { "entropy": 1.9155962690711021, "epoch": 0.4250600026814243, "grad_norm": 8.310453414916992, "learning_rate": 3.880334301272461e-06, "loss": 0.5104, "mean_token_accuracy": 0.8396357014775276, "num_tokens": 164943673.0, "step": 137120 }, { "entropy": 1.8234332576394081, "epoch": 0.425091001806474, "grad_norm": 8.392141342163086, "learning_rate": 3.880192813875693e-06, "loss": 0.4604, "mean_token_accuracy": 0.8463705331087112, "num_tokens": 164957038.0, "step": 137130 }, { "entropy": 1.8326028779149055, "epoch": 0.4251220009315237, "grad_norm": 4.345094203948975, "learning_rate": 3.880051341954828e-06, "loss": 0.3922, "mean_token_accuracy": 0.870023638010025, "num_tokens": 164970423.0, "step": 137140 }, { "entropy": 1.881044802069664, "epoch": 0.4251530000565734, "grad_norm": 8.87633991241455, "learning_rate": 3.879909885507042e-06, "loss": 0.4442, "mean_token_accuracy": 0.8461710780858993, "num_tokens": 164982236.0, "step": 137150 }, { "entropy": 1.8549317836761474, "epoch": 0.4251839991816231, "grad_norm": 9.388912200927734, "learning_rate": 3.879768444529517e-06, "loss": 0.4236, "mean_token_accuracy": 0.8555863380432129, "num_tokens": 164995126.0, "step": 137160 }, { "entropy": 1.8838744007050992, "epoch": 0.4252149983066728, "grad_norm": 7.678390979766846, "learning_rate": 3.879627019019431e-06, "loss": 0.4511, "mean_token_accuracy": 0.8485071495175361, "num_tokens": 165007465.0, "step": 137170 }, { "entropy": 1.9211634308099748, "epoch": 0.4252459974317225, "grad_norm": 8.58484172821045, "learning_rate": 3.879485608973968e-06, "loss": 0.4207, "mean_token_accuracy": 0.8675820276141166, "num_tokens": 165018031.0, "step": 137180 }, { "entropy": 1.7676224395632745, "epoch": 0.4252769965567722, "grad_norm": 8.151896476745605, "learning_rate": 3.879344214390308e-06, "loss": 0.416, "mean_token_accuracy": 0.8637456342577934, "num_tokens": 165032409.0, "step": 137190 }, { "entropy": 1.883620023727417, "epoch": 0.42530799568182187, "grad_norm": 7.578486442565918, "learning_rate": 3.879202835265633e-06, "loss": 0.4462, "mean_token_accuracy": 0.8562824308872223, "num_tokens": 165044847.0, "step": 137200 }, { "entropy": 1.8436495438218117, "epoch": 0.4253389948068716, "grad_norm": 8.562501907348633, "learning_rate": 3.879061471597127e-06, "loss": 0.4225, "mean_token_accuracy": 0.8575531423091889, "num_tokens": 165058103.0, "step": 137210 }, { "entropy": 1.8444740131497384, "epoch": 0.42536999393192126, "grad_norm": 7.836605072021484, "learning_rate": 3.878920123381976e-06, "loss": 0.4366, "mean_token_accuracy": 0.8577763974666596, "num_tokens": 165070717.0, "step": 137220 }, { "entropy": 1.747816914319992, "epoch": 0.425400993056971, "grad_norm": 8.393543243408203, "learning_rate": 3.878778790617362e-06, "loss": 0.3329, "mean_token_accuracy": 0.8659502550959587, "num_tokens": 165085002.0, "step": 137230 }, { "entropy": 1.9642114371061326, "epoch": 0.42543199218202066, "grad_norm": 8.948190689086914, "learning_rate": 3.87863747330047e-06, "loss": 0.5287, "mean_token_accuracy": 0.8416478574275971, "num_tokens": 165095906.0, "step": 137240 }, { "entropy": 1.8498766794800758, "epoch": 0.4254629913070704, "grad_norm": 4.120171070098877, "learning_rate": 3.8784961714284885e-06, "loss": 0.4044, "mean_token_accuracy": 0.8605809271335602, "num_tokens": 165108454.0, "step": 137250 }, { "entropy": 1.9248084783554078, "epoch": 0.42549399043212005, "grad_norm": 3.3840432167053223, "learning_rate": 3.878354884998603e-06, "loss": 0.4371, "mean_token_accuracy": 0.8541036680340767, "num_tokens": 165120547.0, "step": 137260 }, { "entropy": 1.8695127993822098, "epoch": 0.4255249895571698, "grad_norm": 7.736443519592285, "learning_rate": 3.878213614008001e-06, "loss": 0.4748, "mean_token_accuracy": 0.8537966668605804, "num_tokens": 165132994.0, "step": 137270 }, { "entropy": 1.889861761033535, "epoch": 0.42555598868221944, "grad_norm": 9.705860137939453, "learning_rate": 3.878072358453872e-06, "loss": 0.4257, "mean_token_accuracy": 0.8553769558668136, "num_tokens": 165144856.0, "step": 137280 }, { "entropy": 1.8385322004556657, "epoch": 0.4255869878072691, "grad_norm": 4.066993236541748, "learning_rate": 3.877931118333403e-06, "loss": 0.3621, "mean_token_accuracy": 0.8684134423732758, "num_tokens": 165157887.0, "step": 137290 }, { "entropy": 1.9168552801012992, "epoch": 0.42561798693231884, "grad_norm": 7.822579383850098, "learning_rate": 3.877789893643785e-06, "loss": 0.4402, "mean_token_accuracy": 0.8533562257885933, "num_tokens": 165170433.0, "step": 137300 }, { "entropy": 1.8463567778468133, "epoch": 0.4256489860573685, "grad_norm": 7.959437847137451, "learning_rate": 3.877648684382209e-06, "loss": 0.362, "mean_token_accuracy": 0.8728025883436203, "num_tokens": 165183224.0, "step": 137310 }, { "entropy": 1.896780176460743, "epoch": 0.42567998518241823, "grad_norm": 3.448885679244995, "learning_rate": 3.877507490545866e-06, "loss": 0.4425, "mean_token_accuracy": 0.8589633658528328, "num_tokens": 165194990.0, "step": 137320 }, { "entropy": 1.9273283243179322, "epoch": 0.4257109843074679, "grad_norm": 8.690948486328125, "learning_rate": 3.877366312131946e-06, "loss": 0.478, "mean_token_accuracy": 0.8396705284714698, "num_tokens": 165207611.0, "step": 137330 }, { "entropy": 1.9326757207512855, "epoch": 0.4257419834325176, "grad_norm": 9.278722763061523, "learning_rate": 3.877225149137642e-06, "loss": 0.4887, "mean_token_accuracy": 0.8490404456853866, "num_tokens": 165219648.0, "step": 137340 }, { "entropy": 1.7946316838264464, "epoch": 0.4257729825575673, "grad_norm": 8.48944091796875, "learning_rate": 3.877084001560149e-06, "loss": 0.3903, "mean_token_accuracy": 0.8692017003893853, "num_tokens": 165232876.0, "step": 137350 }, { "entropy": 1.9322822287678718, "epoch": 0.425803981682617, "grad_norm": 7.394003391265869, "learning_rate": 3.87694286939666e-06, "loss": 0.4537, "mean_token_accuracy": 0.85526312738657, "num_tokens": 165244530.0, "step": 137360 }, { "entropy": 1.8217709109187126, "epoch": 0.4258349808076667, "grad_norm": 8.290609359741211, "learning_rate": 3.876801752644371e-06, "loss": 0.394, "mean_token_accuracy": 0.8580706313252449, "num_tokens": 165257526.0, "step": 137370 }, { "entropy": 1.8738437041640281, "epoch": 0.4258659799327164, "grad_norm": 8.288384437561035, "learning_rate": 3.876660651300476e-06, "loss": 0.475, "mean_token_accuracy": 0.8446152061223984, "num_tokens": 165269243.0, "step": 137380 }, { "entropy": 1.8645242124795913, "epoch": 0.4258969790577661, "grad_norm": 7.856457710266113, "learning_rate": 3.876519565362171e-06, "loss": 0.4103, "mean_token_accuracy": 0.8648913115262985, "num_tokens": 165281328.0, "step": 137390 }, { "entropy": 1.8050186708569527, "epoch": 0.4259279781828158, "grad_norm": 6.527402400970459, "learning_rate": 3.876378494826653e-06, "loss": 0.3733, "mean_token_accuracy": 0.8636491522192955, "num_tokens": 165294695.0, "step": 137400 }, { "entropy": 1.88536017537117, "epoch": 0.42595897730786547, "grad_norm": 2.9641103744506836, "learning_rate": 3.8762374396911215e-06, "loss": 0.4317, "mean_token_accuracy": 0.8582195937633514, "num_tokens": 165307479.0, "step": 137410 }, { "entropy": 1.846780201792717, "epoch": 0.4259899764329152, "grad_norm": 5.190714359283447, "learning_rate": 3.876096399952772e-06, "loss": 0.3712, "mean_token_accuracy": 0.8630301207304001, "num_tokens": 165320970.0, "step": 137420 }, { "entropy": 1.9380200251936912, "epoch": 0.42602097555796486, "grad_norm": 7.597511291503906, "learning_rate": 3.875955375608804e-06, "loss": 0.5248, "mean_token_accuracy": 0.8328088164329529, "num_tokens": 165333314.0, "step": 137430 }, { "entropy": 1.9373356685042382, "epoch": 0.4260519746830146, "grad_norm": 7.61322021484375, "learning_rate": 3.875814366656419e-06, "loss": 0.4599, "mean_token_accuracy": 0.8467313498258591, "num_tokens": 165345441.0, "step": 137440 }, { "entropy": 1.849155631661415, "epoch": 0.42608297380806426, "grad_norm": 8.47956371307373, "learning_rate": 3.875673373092818e-06, "loss": 0.4481, "mean_token_accuracy": 0.8476941645145416, "num_tokens": 165358430.0, "step": 137450 }, { "entropy": 1.9534918010234832, "epoch": 0.426113972933114, "grad_norm": 7.988271236419678, "learning_rate": 3.875532394915199e-06, "loss": 0.4779, "mean_token_accuracy": 0.8520032778382302, "num_tokens": 165369269.0, "step": 137460 }, { "entropy": 1.9056456133723259, "epoch": 0.42614497205816365, "grad_norm": 9.272027969360352, "learning_rate": 3.875391432120765e-06, "loss": 0.4568, "mean_token_accuracy": 0.8481805965304374, "num_tokens": 165380985.0, "step": 137470 }, { "entropy": 1.939954537153244, "epoch": 0.4261759711832134, "grad_norm": 9.503613471984863, "learning_rate": 3.87525048470672e-06, "loss": 0.5082, "mean_token_accuracy": 0.8481825277209282, "num_tokens": 165392220.0, "step": 137480 }, { "entropy": 1.9774063229560852, "epoch": 0.42620697030826304, "grad_norm": 8.539331436157227, "learning_rate": 3.875109552670266e-06, "loss": 0.4858, "mean_token_accuracy": 0.8497935310006142, "num_tokens": 165403622.0, "step": 137490 }, { "entropy": 1.9746006906032563, "epoch": 0.42623796943331277, "grad_norm": 7.517263412475586, "learning_rate": 3.874968636008607e-06, "loss": 0.4568, "mean_token_accuracy": 0.8602147474884987, "num_tokens": 165414792.0, "step": 137500 }, { "entropy": 1.9843586146831513, "epoch": 0.42626896855836244, "grad_norm": 9.263333320617676, "learning_rate": 3.874827734718949e-06, "loss": 0.5149, "mean_token_accuracy": 0.8430792987346649, "num_tokens": 165425288.0, "step": 137510 }, { "entropy": 1.8610298484563828, "epoch": 0.42629996768341216, "grad_norm": 8.659163475036621, "learning_rate": 3.8746868487984955e-06, "loss": 0.4769, "mean_token_accuracy": 0.8577761441469193, "num_tokens": 165437939.0, "step": 137520 }, { "entropy": 1.9919404417276383, "epoch": 0.42633096680846183, "grad_norm": 7.622340202331543, "learning_rate": 3.874545978244454e-06, "loss": 0.4922, "mean_token_accuracy": 0.8462273955345154, "num_tokens": 165449331.0, "step": 137530 }, { "entropy": 1.883239607512951, "epoch": 0.4263619659335115, "grad_norm": 8.540725708007812, "learning_rate": 3.87440512305403e-06, "loss": 0.4444, "mean_token_accuracy": 0.8450183764100074, "num_tokens": 165462052.0, "step": 137540 }, { "entropy": 1.761160995066166, "epoch": 0.4263929650585612, "grad_norm": 3.7234740257263184, "learning_rate": 3.874264283224433e-06, "loss": 0.341, "mean_token_accuracy": 0.877360138297081, "num_tokens": 165476174.0, "step": 137550 }, { "entropy": 1.8949896410107612, "epoch": 0.4264239641836109, "grad_norm": 4.375694274902344, "learning_rate": 3.874123458752871e-06, "loss": 0.4789, "mean_token_accuracy": 0.8546592324972153, "num_tokens": 165488036.0, "step": 137560 }, { "entropy": 1.970250654220581, "epoch": 0.4264549633086606, "grad_norm": 7.3062357902526855, "learning_rate": 3.873982649636551e-06, "loss": 0.4985, "mean_token_accuracy": 0.8475343957543373, "num_tokens": 165498774.0, "step": 137570 }, { "entropy": 1.931829509139061, "epoch": 0.4264859624337103, "grad_norm": 8.701257705688477, "learning_rate": 3.8738418558726845e-06, "loss": 0.473, "mean_token_accuracy": 0.8460123270750046, "num_tokens": 165510856.0, "step": 137580 }, { "entropy": 1.9778954088687897, "epoch": 0.42651696155876, "grad_norm": 10.640881538391113, "learning_rate": 3.873701077458481e-06, "loss": 0.5176, "mean_token_accuracy": 0.8416032001376152, "num_tokens": 165521684.0, "step": 137590 }, { "entropy": 1.871251691877842, "epoch": 0.4265479606838097, "grad_norm": 7.633983612060547, "learning_rate": 3.8735603143911525e-06, "loss": 0.418, "mean_token_accuracy": 0.8568643853068352, "num_tokens": 165533887.0, "step": 137600 }, { "entropy": 1.9317373171448708, "epoch": 0.4265789598088594, "grad_norm": 7.2197465896606445, "learning_rate": 3.873419566667911e-06, "loss": 0.4767, "mean_token_accuracy": 0.845657354593277, "num_tokens": 165545740.0, "step": 137610 }, { "entropy": 1.8812014043331147, "epoch": 0.4266099589339091, "grad_norm": 8.381542205810547, "learning_rate": 3.873278834285967e-06, "loss": 0.4234, "mean_token_accuracy": 0.8589608117938041, "num_tokens": 165557441.0, "step": 137620 }, { "entropy": 1.9113554194569589, "epoch": 0.4266409580589588, "grad_norm": 8.038044929504395, "learning_rate": 3.8731381172425355e-06, "loss": 0.4489, "mean_token_accuracy": 0.8493774205446243, "num_tokens": 165568993.0, "step": 137630 }, { "entropy": 1.891987682878971, "epoch": 0.42667195718400847, "grad_norm": 8.052876472473145, "learning_rate": 3.87299741553483e-06, "loss": 0.4313, "mean_token_accuracy": 0.8627904951572418, "num_tokens": 165580905.0, "step": 137640 }, { "entropy": 1.9532914757728577, "epoch": 0.4267029563090582, "grad_norm": 8.188140869140625, "learning_rate": 3.872856729160065e-06, "loss": 0.4911, "mean_token_accuracy": 0.8446842655539513, "num_tokens": 165591744.0, "step": 137650 }, { "entropy": 1.9147072404623031, "epoch": 0.42673395543410786, "grad_norm": 7.08876895904541, "learning_rate": 3.872716058115456e-06, "loss": 0.4474, "mean_token_accuracy": 0.8530980065464974, "num_tokens": 165603429.0, "step": 137660 }, { "entropy": 1.950710503757, "epoch": 0.4267649545591576, "grad_norm": 9.763349533081055, "learning_rate": 3.87257540239822e-06, "loss": 0.4953, "mean_token_accuracy": 0.8458059638738632, "num_tokens": 165614886.0, "step": 137670 }, { "entropy": 1.8922182023525238, "epoch": 0.42679595368420725, "grad_norm": 7.75042724609375, "learning_rate": 3.872434762005572e-06, "loss": 0.4605, "mean_token_accuracy": 0.8595366209745408, "num_tokens": 165626421.0, "step": 137680 }, { "entropy": 1.9189585834741592, "epoch": 0.426826952809257, "grad_norm": 5.677242755889893, "learning_rate": 3.872294136934731e-06, "loss": 0.4916, "mean_token_accuracy": 0.8460530653595925, "num_tokens": 165638484.0, "step": 137690 }, { "entropy": 1.8581395775079728, "epoch": 0.42685795193430665, "grad_norm": 8.89156723022461, "learning_rate": 3.872153527182914e-06, "loss": 0.4251, "mean_token_accuracy": 0.851922070980072, "num_tokens": 165652365.0, "step": 137700 }, { "entropy": 1.9807763159275056, "epoch": 0.42688895105935637, "grad_norm": 9.592456817626953, "learning_rate": 3.87201293274734e-06, "loss": 0.5234, "mean_token_accuracy": 0.8444359451532364, "num_tokens": 165663039.0, "step": 137710 }, { "entropy": 1.9604480162262916, "epoch": 0.42691995018440604, "grad_norm": 6.907299518585205, "learning_rate": 3.87187235362523e-06, "loss": 0.5137, "mean_token_accuracy": 0.8401239201426506, "num_tokens": 165674987.0, "step": 137720 }, { "entropy": 1.8310967803001403, "epoch": 0.42695094930945576, "grad_norm": 7.537128925323486, "learning_rate": 3.871731789813803e-06, "loss": 0.3615, "mean_token_accuracy": 0.8688787788152694, "num_tokens": 165688212.0, "step": 137730 }, { "entropy": 1.8531201511621476, "epoch": 0.42698194843450543, "grad_norm": 3.874274253845215, "learning_rate": 3.87159124131028e-06, "loss": 0.3895, "mean_token_accuracy": 0.8694923147559166, "num_tokens": 165700648.0, "step": 137740 }, { "entropy": 1.8400910899043084, "epoch": 0.42701294755955516, "grad_norm": 8.50154972076416, "learning_rate": 3.871450708111883e-06, "loss": 0.4143, "mean_token_accuracy": 0.8610082730650902, "num_tokens": 165713968.0, "step": 137750 }, { "entropy": 1.9427133277058601, "epoch": 0.4270439466846048, "grad_norm": 9.070932388305664, "learning_rate": 3.8713101902158354e-06, "loss": 0.457, "mean_token_accuracy": 0.8499548882246017, "num_tokens": 165725879.0, "step": 137760 }, { "entropy": 1.913887146115303, "epoch": 0.42707494580965455, "grad_norm": 7.201402187347412, "learning_rate": 3.871169687619359e-06, "loss": 0.4384, "mean_token_accuracy": 0.8556391328573227, "num_tokens": 165737902.0, "step": 137770 }, { "entropy": 1.8545256823301315, "epoch": 0.4271059449347042, "grad_norm": 7.537273406982422, "learning_rate": 3.871029200319679e-06, "loss": 0.4282, "mean_token_accuracy": 0.8587723672389984, "num_tokens": 165750054.0, "step": 137780 }, { "entropy": 1.938425388932228, "epoch": 0.4271369440597539, "grad_norm": 8.340550422668457, "learning_rate": 3.870888728314018e-06, "loss": 0.4813, "mean_token_accuracy": 0.8487496972084045, "num_tokens": 165760805.0, "step": 137790 }, { "entropy": 1.7333957627415657, "epoch": 0.4271679431848036, "grad_norm": 8.770150184631348, "learning_rate": 3.870748271599602e-06, "loss": 0.3814, "mean_token_accuracy": 0.8617977604269982, "num_tokens": 165774883.0, "step": 137800 }, { "entropy": 1.924593235552311, "epoch": 0.4271989423098533, "grad_norm": 4.3831400871276855, "learning_rate": 3.870607830173659e-06, "loss": 0.4805, "mean_token_accuracy": 0.8520621985197068, "num_tokens": 165786764.0, "step": 137810 }, { "entropy": 1.856243497133255, "epoch": 0.427229941434903, "grad_norm": 8.254294395446777, "learning_rate": 3.8704674040334124e-06, "loss": 0.4257, "mean_token_accuracy": 0.8571957781910896, "num_tokens": 165799501.0, "step": 137820 }, { "entropy": 1.9511059790849685, "epoch": 0.4272609405599527, "grad_norm": 7.887767314910889, "learning_rate": 3.8703269931760905e-06, "loss": 0.4449, "mean_token_accuracy": 0.8668577119708061, "num_tokens": 165810071.0, "step": 137830 }, { "entropy": 1.854556131362915, "epoch": 0.4272919396850024, "grad_norm": 7.153961658477783, "learning_rate": 3.870186597598924e-06, "loss": 0.4058, "mean_token_accuracy": 0.8615784257650375, "num_tokens": 165822533.0, "step": 137840 }, { "entropy": 1.8858709022402764, "epoch": 0.42732293881005207, "grad_norm": 3.6662893295288086, "learning_rate": 3.870046217299139e-06, "loss": 0.4176, "mean_token_accuracy": 0.8626757502555847, "num_tokens": 165834055.0, "step": 137850 }, { "entropy": 1.977617959678173, "epoch": 0.4273539379351018, "grad_norm": 10.866979598999023, "learning_rate": 3.869905852273965e-06, "loss": 0.5671, "mean_token_accuracy": 0.8238250657916069, "num_tokens": 165846081.0, "step": 137860 }, { "entropy": 1.808312650024891, "epoch": 0.42738493706015146, "grad_norm": 3.0479211807250977, "learning_rate": 3.869765502520633e-06, "loss": 0.4259, "mean_token_accuracy": 0.8619560331106186, "num_tokens": 165858762.0, "step": 137870 }, { "entropy": 1.966909298300743, "epoch": 0.4274159361852012, "grad_norm": 8.616495132446289, "learning_rate": 3.869625168036374e-06, "loss": 0.5079, "mean_token_accuracy": 0.8432893455028534, "num_tokens": 165869711.0, "step": 137880 }, { "entropy": 1.8776897192001343, "epoch": 0.42744693531025085, "grad_norm": 5.014652252197266, "learning_rate": 3.8694848488184185e-06, "loss": 0.4583, "mean_token_accuracy": 0.8473319381475448, "num_tokens": 165882325.0, "step": 137890 }, { "entropy": 1.9359253600239754, "epoch": 0.4274779344353006, "grad_norm": 9.926865577697754, "learning_rate": 3.869344544864e-06, "loss": 0.4655, "mean_token_accuracy": 0.8564401522278786, "num_tokens": 165893416.0, "step": 137900 }, { "entropy": 1.9416421443223952, "epoch": 0.42750893356035025, "grad_norm": 4.446148872375488, "learning_rate": 3.869204256170351e-06, "loss": 0.5114, "mean_token_accuracy": 0.8427053913474083, "num_tokens": 165904568.0, "step": 137910 }, { "entropy": 1.8732975110411645, "epoch": 0.42753993268539997, "grad_norm": 7.7177019119262695, "learning_rate": 3.869063982734706e-06, "loss": 0.4447, "mean_token_accuracy": 0.860724626481533, "num_tokens": 165916369.0, "step": 137920 }, { "entropy": 1.8843568712472916, "epoch": 0.42757093181044964, "grad_norm": 8.525490760803223, "learning_rate": 3.868923724554298e-06, "loss": 0.4442, "mean_token_accuracy": 0.8500266253948212, "num_tokens": 165928150.0, "step": 137930 }, { "entropy": 1.9407880812883378, "epoch": 0.42760193093549936, "grad_norm": 8.2413969039917, "learning_rate": 3.868783481626363e-06, "loss": 0.4822, "mean_token_accuracy": 0.850470757484436, "num_tokens": 165938815.0, "step": 137940 }, { "entropy": 1.9635769337415696, "epoch": 0.42763293006054903, "grad_norm": 8.470434188842773, "learning_rate": 3.868643253948136e-06, "loss": 0.4737, "mean_token_accuracy": 0.853185373544693, "num_tokens": 165949479.0, "step": 137950 }, { "entropy": 1.778677401691675, "epoch": 0.42766392918559876, "grad_norm": 9.556220054626465, "learning_rate": 3.868503041516854e-06, "loss": 0.3995, "mean_token_accuracy": 0.8522502809762955, "num_tokens": 165962742.0, "step": 137960 }, { "entropy": 1.9046275839209557, "epoch": 0.4276949283106484, "grad_norm": 8.775141716003418, "learning_rate": 3.8683628443297554e-06, "loss": 0.5006, "mean_token_accuracy": 0.8469727069139481, "num_tokens": 165974044.0, "step": 137970 }, { "entropy": 1.9093778878450394, "epoch": 0.42772592743569815, "grad_norm": 4.199886798858643, "learning_rate": 3.868222662384076e-06, "loss": 0.4646, "mean_token_accuracy": 0.8507871389389038, "num_tokens": 165985295.0, "step": 137980 }, { "entropy": 1.858989103138447, "epoch": 0.4277569265607478, "grad_norm": 8.536810874938965, "learning_rate": 3.868082495677056e-06, "loss": 0.4509, "mean_token_accuracy": 0.8496241062879563, "num_tokens": 165998258.0, "step": 137990 }, { "entropy": 1.8789437100291253, "epoch": 0.42778792568579754, "grad_norm": 8.451990127563477, "learning_rate": 3.867942344205934e-06, "loss": 0.4667, "mean_token_accuracy": 0.8546844542026519, "num_tokens": 166009620.0, "step": 138000 }, { "entropy": 1.8833610072731972, "epoch": 0.4278189248108472, "grad_norm": 7.35070276260376, "learning_rate": 3.867802207967949e-06, "loss": 0.4241, "mean_token_accuracy": 0.8602321013808251, "num_tokens": 166021208.0, "step": 138010 }, { "entropy": 1.8281296089291572, "epoch": 0.42784992393589694, "grad_norm": 7.972667694091797, "learning_rate": 3.867662086960344e-06, "loss": 0.4273, "mean_token_accuracy": 0.8570877268910408, "num_tokens": 166033804.0, "step": 138020 }, { "entropy": 1.7907851234078407, "epoch": 0.4278809230609466, "grad_norm": 8.16515827178955, "learning_rate": 3.8675219811803586e-06, "loss": 0.4153, "mean_token_accuracy": 0.8523140147328376, "num_tokens": 166047092.0, "step": 138030 }, { "entropy": 1.7590288013219832, "epoch": 0.4279119221859963, "grad_norm": 3.805769443511963, "learning_rate": 3.8673818906252354e-06, "loss": 0.349, "mean_token_accuracy": 0.8651872128248215, "num_tokens": 166060694.0, "step": 138040 }, { "entropy": 1.8908022806048392, "epoch": 0.427942921311046, "grad_norm": 3.611936092376709, "learning_rate": 3.867241815292218e-06, "loss": 0.4588, "mean_token_accuracy": 0.851640222966671, "num_tokens": 166072485.0, "step": 138050 }, { "entropy": 1.9297591403126717, "epoch": 0.42797392043609567, "grad_norm": 7.440158843994141, "learning_rate": 3.867101755178548e-06, "loss": 0.4881, "mean_token_accuracy": 0.8443147346377373, "num_tokens": 166083693.0, "step": 138060 }, { "entropy": 1.9145255535840988, "epoch": 0.4280049195611454, "grad_norm": 7.726010799407959, "learning_rate": 3.866961710281471e-06, "loss": 0.4287, "mean_token_accuracy": 0.8665438875555992, "num_tokens": 166095128.0, "step": 138070 }, { "entropy": 1.8834168136119842, "epoch": 0.42803591868619506, "grad_norm": 8.636183738708496, "learning_rate": 3.866821680598232e-06, "loss": 0.4411, "mean_token_accuracy": 0.8588782355189324, "num_tokens": 166107020.0, "step": 138080 }, { "entropy": 1.890410417318344, "epoch": 0.4280669178112448, "grad_norm": 9.407487869262695, "learning_rate": 3.8666816661260766e-06, "loss": 0.4226, "mean_token_accuracy": 0.8558942392468453, "num_tokens": 166118533.0, "step": 138090 }, { "entropy": 1.895900882035494, "epoch": 0.42809791693629445, "grad_norm": 9.499615669250488, "learning_rate": 3.86654166686225e-06, "loss": 0.4494, "mean_token_accuracy": 0.8502351224422455, "num_tokens": 166130282.0, "step": 138100 }, { "entropy": 1.899457646906376, "epoch": 0.4281289160613442, "grad_norm": 8.837594032287598, "learning_rate": 3.866401682804001e-06, "loss": 0.4296, "mean_token_accuracy": 0.8482611507177353, "num_tokens": 166142276.0, "step": 138110 }, { "entropy": 1.9217953234910965, "epoch": 0.42815991518639385, "grad_norm": 8.12231731414795, "learning_rate": 3.866261713948576e-06, "loss": 0.5005, "mean_token_accuracy": 0.847544614970684, "num_tokens": 166154311.0, "step": 138120 }, { "entropy": 1.975175791978836, "epoch": 0.42819091431144357, "grad_norm": 7.083240509033203, "learning_rate": 3.866121760293223e-06, "loss": 0.5289, "mean_token_accuracy": 0.8364548414945603, "num_tokens": 166165093.0, "step": 138130 }, { "entropy": 1.9826736554503441, "epoch": 0.42822191343649324, "grad_norm": 9.184717178344727, "learning_rate": 3.865981821835192e-06, "loss": 0.4599, "mean_token_accuracy": 0.8572745114564896, "num_tokens": 166176186.0, "step": 138140 }, { "entropy": 1.8134810969233512, "epoch": 0.42825291256154296, "grad_norm": 9.861612319946289, "learning_rate": 3.8658418985717325e-06, "loss": 0.4013, "mean_token_accuracy": 0.8668722435832024, "num_tokens": 166188879.0, "step": 138150 }, { "entropy": 1.884695628285408, "epoch": 0.42828391168659263, "grad_norm": 7.422934532165527, "learning_rate": 3.865701990500095e-06, "loss": 0.4236, "mean_token_accuracy": 0.8617542907595634, "num_tokens": 166201719.0, "step": 138160 }, { "entropy": 1.9591299802064897, "epoch": 0.42831491081164236, "grad_norm": 7.2439165115356445, "learning_rate": 3.865562097617531e-06, "loss": 0.4762, "mean_token_accuracy": 0.8428146690130234, "num_tokens": 166213159.0, "step": 138170 }, { "entropy": 1.9952568769454957, "epoch": 0.428345909936692, "grad_norm": 8.97078800201416, "learning_rate": 3.865422219921291e-06, "loss": 0.51, "mean_token_accuracy": 0.8497517094016075, "num_tokens": 166224006.0, "step": 138180 }, { "entropy": 1.924196371436119, "epoch": 0.42837690906174175, "grad_norm": 7.9839186668396, "learning_rate": 3.865282357408629e-06, "loss": 0.4873, "mean_token_accuracy": 0.84932641685009, "num_tokens": 166235138.0, "step": 138190 }, { "entropy": 1.8705265656113625, "epoch": 0.4284079081867914, "grad_norm": 8.166479110717773, "learning_rate": 3.8651425100767995e-06, "loss": 0.4345, "mean_token_accuracy": 0.8603829473257065, "num_tokens": 166246952.0, "step": 138200 }, { "entropy": 1.8786292031407357, "epoch": 0.42843890731184114, "grad_norm": 3.4489612579345703, "learning_rate": 3.865002677923053e-06, "loss": 0.3917, "mean_token_accuracy": 0.8639238491654396, "num_tokens": 166259522.0, "step": 138210 }, { "entropy": 1.924619671702385, "epoch": 0.4284699064368908, "grad_norm": 7.831303596496582, "learning_rate": 3.864862860944646e-06, "loss": 0.4674, "mean_token_accuracy": 0.8498168155550957, "num_tokens": 166270568.0, "step": 138220 }, { "entropy": 1.8376672357320785, "epoch": 0.42850090556194054, "grad_norm": 8.016936302185059, "learning_rate": 3.864723059138835e-06, "loss": 0.4077, "mean_token_accuracy": 0.8555368795990944, "num_tokens": 166284006.0, "step": 138230 }, { "entropy": 1.8461035266518593, "epoch": 0.4285319046869902, "grad_norm": 7.379615783691406, "learning_rate": 3.864583272502873e-06, "loss": 0.3994, "mean_token_accuracy": 0.8619070097804069, "num_tokens": 166296817.0, "step": 138240 }, { "entropy": 1.8854956120252608, "epoch": 0.42856290381203993, "grad_norm": 3.884890079498291, "learning_rate": 3.86444350103402e-06, "loss": 0.4355, "mean_token_accuracy": 0.8470316350460052, "num_tokens": 166309348.0, "step": 138250 }, { "entropy": 1.7654275074601173, "epoch": 0.4285939029370896, "grad_norm": 9.349808692932129, "learning_rate": 3.864303744729532e-06, "loss": 0.3955, "mean_token_accuracy": 0.8671312019228935, "num_tokens": 166322832.0, "step": 138260 }, { "entropy": 1.8022211775183679, "epoch": 0.4286249020621393, "grad_norm": 9.219120979309082, "learning_rate": 3.864164003586666e-06, "loss": 0.3904, "mean_token_accuracy": 0.8593027204275131, "num_tokens": 166335905.0, "step": 138270 }, { "entropy": 1.8638395503163339, "epoch": 0.428655901187189, "grad_norm": 7.9525628089904785, "learning_rate": 3.864024277602683e-06, "loss": 0.4317, "mean_token_accuracy": 0.8568304657936097, "num_tokens": 166348632.0, "step": 138280 }, { "entropy": 1.954586958885193, "epoch": 0.42868690031223866, "grad_norm": 4.062179088592529, "learning_rate": 3.863884566774842e-06, "loss": 0.4813, "mean_token_accuracy": 0.8409469664096832, "num_tokens": 166360316.0, "step": 138290 }, { "entropy": 1.9421510837972165, "epoch": 0.4287178994372884, "grad_norm": 2.5580577850341797, "learning_rate": 3.863744871100402e-06, "loss": 0.4814, "mean_token_accuracy": 0.8459225296974182, "num_tokens": 166372255.0, "step": 138300 }, { "entropy": 1.9440967753529548, "epoch": 0.42874889856233805, "grad_norm": 8.030793190002441, "learning_rate": 3.863605190576625e-06, "loss": 0.4716, "mean_token_accuracy": 0.8478009730577469, "num_tokens": 166383633.0, "step": 138310 }, { "entropy": 1.9596870571374894, "epoch": 0.4287798976873878, "grad_norm": 8.272663116455078, "learning_rate": 3.863465525200771e-06, "loss": 0.5012, "mean_token_accuracy": 0.855411772429943, "num_tokens": 166393996.0, "step": 138320 }, { "entropy": 1.9576293110847474, "epoch": 0.42881089681243745, "grad_norm": 8.87380313873291, "learning_rate": 3.863325874970105e-06, "loss": 0.4675, "mean_token_accuracy": 0.8526660829782486, "num_tokens": 166405246.0, "step": 138330 }, { "entropy": 1.857802005112171, "epoch": 0.42884189593748717, "grad_norm": 7.251513957977295, "learning_rate": 3.863186239881888e-06, "loss": 0.4319, "mean_token_accuracy": 0.8620542094111443, "num_tokens": 166416790.0, "step": 138340 }, { "entropy": 1.888051538169384, "epoch": 0.42887289506253684, "grad_norm": 5.157383441925049, "learning_rate": 3.863046619933384e-06, "loss": 0.4401, "mean_token_accuracy": 0.8537735804915428, "num_tokens": 166429648.0, "step": 138350 }, { "entropy": 1.8693744093179703, "epoch": 0.42890389418758657, "grad_norm": 3.8320672512054443, "learning_rate": 3.862907015121856e-06, "loss": 0.4023, "mean_token_accuracy": 0.8648984044790268, "num_tokens": 166441105.0, "step": 138360 }, { "entropy": 1.993468850851059, "epoch": 0.42893489331263623, "grad_norm": 7.150916576385498, "learning_rate": 3.8627674254445724e-06, "loss": 0.5345, "mean_token_accuracy": 0.8378593668341636, "num_tokens": 166452361.0, "step": 138370 }, { "entropy": 1.9181072056293487, "epoch": 0.42896589243768596, "grad_norm": 7.089375972747803, "learning_rate": 3.862627850898797e-06, "loss": 0.5196, "mean_token_accuracy": 0.8403525799512863, "num_tokens": 166463855.0, "step": 138380 }, { "entropy": 1.967288474738598, "epoch": 0.4289968915627356, "grad_norm": 7.978488445281982, "learning_rate": 3.8624882914817964e-06, "loss": 0.5225, "mean_token_accuracy": 0.838663375377655, "num_tokens": 166475665.0, "step": 138390 }, { "entropy": 1.8494392395019532, "epoch": 0.42902789068778535, "grad_norm": 8.566108703613281, "learning_rate": 3.8623487471908375e-06, "loss": 0.4489, "mean_token_accuracy": 0.844138278067112, "num_tokens": 166488310.0, "step": 138400 }, { "entropy": 1.8647007137537002, "epoch": 0.429058889812835, "grad_norm": 8.269538879394531, "learning_rate": 3.862209218023188e-06, "loss": 0.4146, "mean_token_accuracy": 0.8606134533882142, "num_tokens": 166500577.0, "step": 138410 }, { "entropy": 1.8854542568325996, "epoch": 0.42908988893788474, "grad_norm": 8.041147232055664, "learning_rate": 3.862069703976116e-06, "loss": 0.4745, "mean_token_accuracy": 0.8451815471053123, "num_tokens": 166512317.0, "step": 138420 }, { "entropy": 1.885179491341114, "epoch": 0.4291208880629344, "grad_norm": 7.563552379608154, "learning_rate": 3.861930205046893e-06, "loss": 0.425, "mean_token_accuracy": 0.8753857642412186, "num_tokens": 166524119.0, "step": 138430 }, { "entropy": 1.8435839340090752, "epoch": 0.42915188718798414, "grad_norm": 3.8470942974090576, "learning_rate": 3.861790721232786e-06, "loss": 0.3848, "mean_token_accuracy": 0.865519005060196, "num_tokens": 166536053.0, "step": 138440 }, { "entropy": 1.943047122657299, "epoch": 0.4291828863130338, "grad_norm": 7.736998558044434, "learning_rate": 3.861651252531068e-06, "loss": 0.4898, "mean_token_accuracy": 0.8467626482248306, "num_tokens": 166547902.0, "step": 138450 }, { "entropy": 1.9400530129671096, "epoch": 0.42921388543808353, "grad_norm": 7.892675399780273, "learning_rate": 3.861511798939009e-06, "loss": 0.4613, "mean_token_accuracy": 0.8521466955542565, "num_tokens": 166559133.0, "step": 138460 }, { "entropy": 1.8419177174568175, "epoch": 0.4292448845631332, "grad_norm": 8.258288383483887, "learning_rate": 3.8613723604538814e-06, "loss": 0.4093, "mean_token_accuracy": 0.8595252349972725, "num_tokens": 166571712.0, "step": 138470 }, { "entropy": 1.918247513473034, "epoch": 0.4292758836881829, "grad_norm": 8.431722640991211, "learning_rate": 3.8612329370729566e-06, "loss": 0.4883, "mean_token_accuracy": 0.8501754224300384, "num_tokens": 166583492.0, "step": 138480 }, { "entropy": 1.922251921892166, "epoch": 0.4293068828132326, "grad_norm": 7.291534423828125, "learning_rate": 3.861093528793509e-06, "loss": 0.4675, "mean_token_accuracy": 0.8490281432867051, "num_tokens": 166595760.0, "step": 138490 }, { "entropy": 1.9234865739941598, "epoch": 0.4293378819382823, "grad_norm": 8.550054550170898, "learning_rate": 3.860954135612814e-06, "loss": 0.4715, "mean_token_accuracy": 0.8617073327302933, "num_tokens": 166607418.0, "step": 138500 }, { "entropy": 1.9898644000291825, "epoch": 0.429368881063332, "grad_norm": 7.687295913696289, "learning_rate": 3.860814757528143e-06, "loss": 0.5028, "mean_token_accuracy": 0.8465102508664131, "num_tokens": 166618579.0, "step": 138510 }, { "entropy": 1.9373308807611465, "epoch": 0.42939988018838166, "grad_norm": 8.360345840454102, "learning_rate": 3.8606753945367746e-06, "loss": 0.4598, "mean_token_accuracy": 0.849955253303051, "num_tokens": 166629584.0, "step": 138520 }, { "entropy": 1.8768581017851829, "epoch": 0.4294308793134314, "grad_norm": 8.298843383789062, "learning_rate": 3.860536046635983e-06, "loss": 0.4401, "mean_token_accuracy": 0.8533497661352157, "num_tokens": 166641699.0, "step": 138530 }, { "entropy": 1.8640162199735641, "epoch": 0.42946187843848105, "grad_norm": 8.439657211303711, "learning_rate": 3.8603967138230456e-06, "loss": 0.4794, "mean_token_accuracy": 0.8546614408493042, "num_tokens": 166654557.0, "step": 138540 }, { "entropy": 1.7630941659212112, "epoch": 0.4294928775635308, "grad_norm": 4.3905744552612305, "learning_rate": 3.8602573960952405e-06, "loss": 0.3375, "mean_token_accuracy": 0.8672227919101715, "num_tokens": 166669057.0, "step": 138550 }, { "entropy": 1.883773235976696, "epoch": 0.42952387668858044, "grad_norm": 8.244091987609863, "learning_rate": 3.860118093449845e-06, "loss": 0.4368, "mean_token_accuracy": 0.8554739341139793, "num_tokens": 166680914.0, "step": 138560 }, { "entropy": 1.9457030475139618, "epoch": 0.42955487581363017, "grad_norm": 9.337789535522461, "learning_rate": 3.8599788058841385e-06, "loss": 0.4938, "mean_token_accuracy": 0.8439801335334778, "num_tokens": 166692421.0, "step": 138570 }, { "entropy": 1.8535506889224052, "epoch": 0.42958587493867983, "grad_norm": 10.059792518615723, "learning_rate": 3.859839533395399e-06, "loss": 0.3947, "mean_token_accuracy": 0.8560854658484459, "num_tokens": 166704599.0, "step": 138580 }, { "entropy": 1.9137897729873656, "epoch": 0.42961687406372956, "grad_norm": 8.756423950195312, "learning_rate": 3.859700275980909e-06, "loss": 0.4349, "mean_token_accuracy": 0.8534789383411407, "num_tokens": 166716671.0, "step": 138590 }, { "entropy": 1.8182189181447028, "epoch": 0.42964787318877923, "grad_norm": 4.194288730621338, "learning_rate": 3.859561033637948e-06, "loss": 0.4178, "mean_token_accuracy": 0.8575180351734162, "num_tokens": 166729546.0, "step": 138600 }, { "entropy": 1.9256513655185699, "epoch": 0.42967887231382895, "grad_norm": 8.066496849060059, "learning_rate": 3.859421806363798e-06, "loss": 0.4543, "mean_token_accuracy": 0.8577398300170899, "num_tokens": 166741327.0, "step": 138610 }, { "entropy": 1.9436537489295005, "epoch": 0.4297098714388786, "grad_norm": 8.44263744354248, "learning_rate": 3.859282594155741e-06, "loss": 0.4676, "mean_token_accuracy": 0.8500909224152565, "num_tokens": 166752956.0, "step": 138620 }, { "entropy": 1.8246969357132912, "epoch": 0.42974087056392835, "grad_norm": 7.600648880004883, "learning_rate": 3.859143397011061e-06, "loss": 0.3817, "mean_token_accuracy": 0.8686912745237351, "num_tokens": 166766150.0, "step": 138630 }, { "entropy": 1.8569831773638725, "epoch": 0.429771869688978, "grad_norm": 3.839282989501953, "learning_rate": 3.85900421492704e-06, "loss": 0.4299, "mean_token_accuracy": 0.8597505047917366, "num_tokens": 166779296.0, "step": 138640 }, { "entropy": 1.9191448912024498, "epoch": 0.42980286881402774, "grad_norm": 8.564552307128906, "learning_rate": 3.858865047900964e-06, "loss": 0.5094, "mean_token_accuracy": 0.8487473547458648, "num_tokens": 166790635.0, "step": 138650 }, { "entropy": 1.9488032266497612, "epoch": 0.4298338679390774, "grad_norm": 7.2074360847473145, "learning_rate": 3.858725895930116e-06, "loss": 0.4532, "mean_token_accuracy": 0.8469724044203758, "num_tokens": 166802382.0, "step": 138660 }, { "entropy": 1.9374908640980721, "epoch": 0.42986486706412713, "grad_norm": 8.300436973571777, "learning_rate": 3.8585867590117845e-06, "loss": 0.445, "mean_token_accuracy": 0.8558620125055313, "num_tokens": 166813964.0, "step": 138670 }, { "entropy": 1.9663563668727875, "epoch": 0.4298958661891768, "grad_norm": 9.31306266784668, "learning_rate": 3.858447637143254e-06, "loss": 0.4717, "mean_token_accuracy": 0.8509336993098259, "num_tokens": 166825330.0, "step": 138680 }, { "entropy": 1.944647277891636, "epoch": 0.4299268653142265, "grad_norm": 9.174337387084961, "learning_rate": 3.858308530321811e-06, "loss": 0.4679, "mean_token_accuracy": 0.8518739119172096, "num_tokens": 166836729.0, "step": 138690 }, { "entropy": 1.9004219979047776, "epoch": 0.4299578644392762, "grad_norm": 9.097691535949707, "learning_rate": 3.858169438544745e-06, "loss": 0.4376, "mean_token_accuracy": 0.8654677256941795, "num_tokens": 166848221.0, "step": 138700 }, { "entropy": 1.905510926246643, "epoch": 0.4299888635643259, "grad_norm": 7.829078674316406, "learning_rate": 3.858030361809342e-06, "loss": 0.4285, "mean_token_accuracy": 0.8583436101675034, "num_tokens": 166860834.0, "step": 138710 }, { "entropy": 2.0032723784446715, "epoch": 0.4300198626893756, "grad_norm": 9.5745267868042, "learning_rate": 3.857891300112894e-06, "loss": 0.5005, "mean_token_accuracy": 0.845015361905098, "num_tokens": 166871860.0, "step": 138720 }, { "entropy": 1.9505044639110565, "epoch": 0.4300508618144253, "grad_norm": 8.139888763427734, "learning_rate": 3.85775225345269e-06, "loss": 0.4537, "mean_token_accuracy": 0.8441401451826096, "num_tokens": 166883775.0, "step": 138730 }, { "entropy": 1.996666456758976, "epoch": 0.430081860939475, "grad_norm": 8.10516357421875, "learning_rate": 3.857613221826021e-06, "loss": 0.4895, "mean_token_accuracy": 0.8445516094565392, "num_tokens": 166894824.0, "step": 138740 }, { "entropy": 1.9581088334321977, "epoch": 0.4301128600645247, "grad_norm": 8.987552642822266, "learning_rate": 3.8574742052301755e-06, "loss": 0.4833, "mean_token_accuracy": 0.8530200630426407, "num_tokens": 166906284.0, "step": 138750 }, { "entropy": 1.8697850778698921, "epoch": 0.4301438591895744, "grad_norm": 7.841629981994629, "learning_rate": 3.857335203662448e-06, "loss": 0.4098, "mean_token_accuracy": 0.8616936087608338, "num_tokens": 166919062.0, "step": 138760 }, { "entropy": 1.8914014741778373, "epoch": 0.43017485831462404, "grad_norm": 3.872026205062866, "learning_rate": 3.85719621712013e-06, "loss": 0.4809, "mean_token_accuracy": 0.8455238342285156, "num_tokens": 166930561.0, "step": 138770 }, { "entropy": 1.8692807108163834, "epoch": 0.43020585743967377, "grad_norm": 2.8080806732177734, "learning_rate": 3.857057245600515e-06, "loss": 0.3985, "mean_token_accuracy": 0.8588801845908165, "num_tokens": 166944394.0, "step": 138780 }, { "entropy": 1.8834291696548462, "epoch": 0.43023685656472344, "grad_norm": 8.107061386108398, "learning_rate": 3.856918289100897e-06, "loss": 0.4835, "mean_token_accuracy": 0.8372646197676659, "num_tokens": 166957123.0, "step": 138790 }, { "entropy": 1.9186320677399635, "epoch": 0.43026785568977316, "grad_norm": 6.207789421081543, "learning_rate": 3.856779347618569e-06, "loss": 0.4659, "mean_token_accuracy": 0.8523793607950211, "num_tokens": 166969149.0, "step": 138800 }, { "entropy": 1.9312793225049973, "epoch": 0.43029885481482283, "grad_norm": 9.400903701782227, "learning_rate": 3.85664042115083e-06, "loss": 0.4098, "mean_token_accuracy": 0.8667648211121559, "num_tokens": 166981732.0, "step": 138810 }, { "entropy": 1.7980728089809417, "epoch": 0.43032985393987255, "grad_norm": 5.016563415527344, "learning_rate": 3.8565015096949724e-06, "loss": 0.3996, "mean_token_accuracy": 0.8642199486494064, "num_tokens": 166995822.0, "step": 138820 }, { "entropy": 1.8924931958317757, "epoch": 0.4303608530649222, "grad_norm": 8.750584602355957, "learning_rate": 3.856362613248295e-06, "loss": 0.4858, "mean_token_accuracy": 0.8461324706673622, "num_tokens": 167008749.0, "step": 138830 }, { "entropy": 1.840745933353901, "epoch": 0.43039185218997195, "grad_norm": 10.046074867248535, "learning_rate": 3.856223731808094e-06, "loss": 0.4145, "mean_token_accuracy": 0.8469096630811691, "num_tokens": 167022329.0, "step": 138840 }, { "entropy": 1.8810099840164185, "epoch": 0.4304228513150216, "grad_norm": 8.733558654785156, "learning_rate": 3.856084865371667e-06, "loss": 0.3754, "mean_token_accuracy": 0.8611682400107383, "num_tokens": 167035334.0, "step": 138850 }, { "entropy": 1.8504212602972985, "epoch": 0.43045385044007134, "grad_norm": 2.9879043102264404, "learning_rate": 3.855946013936313e-06, "loss": 0.4424, "mean_token_accuracy": 0.8579250067472458, "num_tokens": 167048594.0, "step": 138860 }, { "entropy": 1.9671182319521905, "epoch": 0.430484849565121, "grad_norm": 8.825550079345703, "learning_rate": 3.855807177499334e-06, "loss": 0.4828, "mean_token_accuracy": 0.8505840554833413, "num_tokens": 167060152.0, "step": 138870 }, { "entropy": 1.8691889211535453, "epoch": 0.43051584869017073, "grad_norm": 7.807781219482422, "learning_rate": 3.855668356058026e-06, "loss": 0.4117, "mean_token_accuracy": 0.8700853690505028, "num_tokens": 167072813.0, "step": 138880 }, { "entropy": 1.89640132188797, "epoch": 0.4305468478152204, "grad_norm": 7.267297267913818, "learning_rate": 3.855529549609692e-06, "loss": 0.4418, "mean_token_accuracy": 0.8511320620775222, "num_tokens": 167084361.0, "step": 138890 }, { "entropy": 1.8999717831611633, "epoch": 0.4305778469402701, "grad_norm": 8.101717948913574, "learning_rate": 3.855390758151633e-06, "loss": 0.4316, "mean_token_accuracy": 0.8580278515815735, "num_tokens": 167096540.0, "step": 138900 }, { "entropy": 1.9178320989012718, "epoch": 0.4306088460653198, "grad_norm": 6.982521057128906, "learning_rate": 3.855251981681151e-06, "loss": 0.4057, "mean_token_accuracy": 0.8600656524300575, "num_tokens": 167107872.0, "step": 138910 }, { "entropy": 1.9452101722359658, "epoch": 0.4306398451903695, "grad_norm": 10.741822242736816, "learning_rate": 3.855113220195549e-06, "loss": 0.4779, "mean_token_accuracy": 0.8533175542950631, "num_tokens": 167119118.0, "step": 138920 }, { "entropy": 1.893312358856201, "epoch": 0.4306708443154192, "grad_norm": 8.916707992553711, "learning_rate": 3.8549744736921305e-06, "loss": 0.4095, "mean_token_accuracy": 0.8634658083319664, "num_tokens": 167130595.0, "step": 138930 }, { "entropy": 1.8818313643336295, "epoch": 0.4307018434404689, "grad_norm": 9.812067985534668, "learning_rate": 3.854835742168199e-06, "loss": 0.4265, "mean_token_accuracy": 0.8561764359474182, "num_tokens": 167143490.0, "step": 138940 }, { "entropy": 1.8934076741337775, "epoch": 0.4307328425655186, "grad_norm": 6.764780521392822, "learning_rate": 3.85469702562106e-06, "loss": 0.4771, "mean_token_accuracy": 0.8511760547757149, "num_tokens": 167156082.0, "step": 138950 }, { "entropy": 1.983777368068695, "epoch": 0.4307638416905683, "grad_norm": 7.675331115722656, "learning_rate": 3.854558324048018e-06, "loss": 0.5062, "mean_token_accuracy": 0.8494358032941818, "num_tokens": 167166878.0, "step": 138960 }, { "entropy": 1.8882510006427764, "epoch": 0.430794840815618, "grad_norm": 3.8300962448120117, "learning_rate": 3.85441963744638e-06, "loss": 0.4472, "mean_token_accuracy": 0.8561283707618713, "num_tokens": 167178890.0, "step": 138970 }, { "entropy": 1.9575580522418021, "epoch": 0.4308258399406677, "grad_norm": 10.387866020202637, "learning_rate": 3.854280965813453e-06, "loss": 0.4783, "mean_token_accuracy": 0.843521349132061, "num_tokens": 167190173.0, "step": 138980 }, { "entropy": 1.9261010527610778, "epoch": 0.43085683906571737, "grad_norm": 7.206926345825195, "learning_rate": 3.8541423091465445e-06, "loss": 0.461, "mean_token_accuracy": 0.856898583471775, "num_tokens": 167201201.0, "step": 138990 }, { "entropy": 1.871525762975216, "epoch": 0.4308878381907671, "grad_norm": 7.605156898498535, "learning_rate": 3.854003667442962e-06, "loss": 0.4548, "mean_token_accuracy": 0.8476424932479858, "num_tokens": 167213579.0, "step": 139000 }, { "entropy": 1.9758980587124824, "epoch": 0.43091883731581676, "grad_norm": 8.815040588378906, "learning_rate": 3.853865040700016e-06, "loss": 0.4918, "mean_token_accuracy": 0.8458743005990982, "num_tokens": 167224666.0, "step": 139010 }, { "entropy": 1.883438839018345, "epoch": 0.43094983644086643, "grad_norm": 7.926342010498047, "learning_rate": 3.853726428915014e-06, "loss": 0.4409, "mean_token_accuracy": 0.8550115719437599, "num_tokens": 167237308.0, "step": 139020 }, { "entropy": 1.984131459891796, "epoch": 0.43098083556591615, "grad_norm": 7.568789958953857, "learning_rate": 3.853587832085266e-06, "loss": 0.5039, "mean_token_accuracy": 0.8429438158869743, "num_tokens": 167248949.0, "step": 139030 }, { "entropy": 1.8531333744525909, "epoch": 0.4310118346909658, "grad_norm": 4.8112359046936035, "learning_rate": 3.8534492502080855e-06, "loss": 0.4109, "mean_token_accuracy": 0.8603510394692421, "num_tokens": 167261627.0, "step": 139040 }, { "entropy": 1.923319575190544, "epoch": 0.43104283381601555, "grad_norm": 7.872719764709473, "learning_rate": 3.853310683280783e-06, "loss": 0.4643, "mean_token_accuracy": 0.8496645122766495, "num_tokens": 167272866.0, "step": 139050 }, { "entropy": 1.9188224956393243, "epoch": 0.4310738329410652, "grad_norm": 8.1365327835083, "learning_rate": 3.853172131300669e-06, "loss": 0.4393, "mean_token_accuracy": 0.8515357092022896, "num_tokens": 167284947.0, "step": 139060 }, { "entropy": 1.8909332856535912, "epoch": 0.43110483206611494, "grad_norm": 9.189377784729004, "learning_rate": 3.8530335942650585e-06, "loss": 0.4156, "mean_token_accuracy": 0.853959108889103, "num_tokens": 167297259.0, "step": 139070 }, { "entropy": 1.8885620832443237, "epoch": 0.4311358311911646, "grad_norm": 9.13309097290039, "learning_rate": 3.852895072171265e-06, "loss": 0.4055, "mean_token_accuracy": 0.8578508168458938, "num_tokens": 167308954.0, "step": 139080 }, { "entropy": 1.8412837103009223, "epoch": 0.43116683031621433, "grad_norm": 4.5541090965271, "learning_rate": 3.8527565650166015e-06, "loss": 0.4433, "mean_token_accuracy": 0.8527804985642433, "num_tokens": 167321410.0, "step": 139090 }, { "entropy": 1.9028522804379464, "epoch": 0.431197829441264, "grad_norm": 9.414676666259766, "learning_rate": 3.852618072798383e-06, "loss": 0.4245, "mean_token_accuracy": 0.8578800424933434, "num_tokens": 167333571.0, "step": 139100 }, { "entropy": 1.8518059805035592, "epoch": 0.4312288285663137, "grad_norm": 3.911571741104126, "learning_rate": 3.852479595513928e-06, "loss": 0.4215, "mean_token_accuracy": 0.8502576932311058, "num_tokens": 167346630.0, "step": 139110 }, { "entropy": 1.9005075827240945, "epoch": 0.4312598276913634, "grad_norm": 7.447115898132324, "learning_rate": 3.852341133160549e-06, "loss": 0.458, "mean_token_accuracy": 0.8436521142721176, "num_tokens": 167359023.0, "step": 139120 }, { "entropy": 1.955150267481804, "epoch": 0.4312908268164131, "grad_norm": 8.531542778015137, "learning_rate": 3.8522026857355656e-06, "loss": 0.4963, "mean_token_accuracy": 0.8467582687735558, "num_tokens": 167370225.0, "step": 139130 }, { "entropy": 1.912473227083683, "epoch": 0.4313218259414628, "grad_norm": 7.494203567504883, "learning_rate": 3.852064253236293e-06, "loss": 0.4264, "mean_token_accuracy": 0.8552014127373695, "num_tokens": 167382204.0, "step": 139140 }, { "entropy": 1.929689645767212, "epoch": 0.4313528250665125, "grad_norm": 8.015472412109375, "learning_rate": 3.851925835660052e-06, "loss": 0.4245, "mean_token_accuracy": 0.8602582827210427, "num_tokens": 167393606.0, "step": 139150 }, { "entropy": 1.93514022231102, "epoch": 0.4313838241915622, "grad_norm": 6.807281017303467, "learning_rate": 3.851787433004161e-06, "loss": 0.456, "mean_token_accuracy": 0.855001138150692, "num_tokens": 167404837.0, "step": 139160 }, { "entropy": 1.8539175868034363, "epoch": 0.4314148233166119, "grad_norm": 10.743212699890137, "learning_rate": 3.851649045265939e-06, "loss": 0.4302, "mean_token_accuracy": 0.8622266680002213, "num_tokens": 167416637.0, "step": 139170 }, { "entropy": 1.9064085155725479, "epoch": 0.4314458224416616, "grad_norm": 8.57366943359375, "learning_rate": 3.851510672442708e-06, "loss": 0.4703, "mean_token_accuracy": 0.8500051259994507, "num_tokens": 167428580.0, "step": 139180 }, { "entropy": 1.8125137344002724, "epoch": 0.4314768215667113, "grad_norm": 8.313339233398438, "learning_rate": 3.851372314531787e-06, "loss": 0.392, "mean_token_accuracy": 0.8584845051169395, "num_tokens": 167441286.0, "step": 139190 }, { "entropy": 1.9030530989170074, "epoch": 0.43150782069176097, "grad_norm": 4.577101707458496, "learning_rate": 3.851233971530498e-06, "loss": 0.4585, "mean_token_accuracy": 0.8593366608023644, "num_tokens": 167453514.0, "step": 139200 }, { "entropy": 1.9610344976186753, "epoch": 0.4315388198168107, "grad_norm": 8.109823226928711, "learning_rate": 3.851095643436164e-06, "loss": 0.5133, "mean_token_accuracy": 0.8386234149336815, "num_tokens": 167464843.0, "step": 139210 }, { "entropy": 1.9184462070465087, "epoch": 0.43156981894186036, "grad_norm": 4.524685859680176, "learning_rate": 3.850957330246109e-06, "loss": 0.4429, "mean_token_accuracy": 0.8548565566539764, "num_tokens": 167476607.0, "step": 139220 }, { "entropy": 1.8219572558999062, "epoch": 0.4316008180669101, "grad_norm": 7.470212936401367, "learning_rate": 3.850819031957655e-06, "loss": 0.4031, "mean_token_accuracy": 0.8599913343787193, "num_tokens": 167489375.0, "step": 139230 }, { "entropy": 1.878198716044426, "epoch": 0.43163181719195975, "grad_norm": 7.494040489196777, "learning_rate": 3.850680748568127e-06, "loss": 0.4019, "mean_token_accuracy": 0.8682243391871453, "num_tokens": 167501096.0, "step": 139240 }, { "entropy": 1.882466796040535, "epoch": 0.4316628163170095, "grad_norm": 3.8041422367095947, "learning_rate": 3.85054248007485e-06, "loss": 0.4767, "mean_token_accuracy": 0.8378754198551178, "num_tokens": 167513496.0, "step": 139250 }, { "entropy": 1.9623596340417861, "epoch": 0.43169381544205915, "grad_norm": 9.18985366821289, "learning_rate": 3.850404226475151e-06, "loss": 0.5032, "mean_token_accuracy": 0.8428544372320175, "num_tokens": 167523826.0, "step": 139260 }, { "entropy": 1.8917414352297783, "epoch": 0.4317248145671088, "grad_norm": 9.102961540222168, "learning_rate": 3.850265987766354e-06, "loss": 0.437, "mean_token_accuracy": 0.8560737207531929, "num_tokens": 167535560.0, "step": 139270 }, { "entropy": 1.9341025441884994, "epoch": 0.43175581369215854, "grad_norm": 9.072357177734375, "learning_rate": 3.850127763945788e-06, "loss": 0.4758, "mean_token_accuracy": 0.8465018764138221, "num_tokens": 167546927.0, "step": 139280 }, { "entropy": 1.7552316695451737, "epoch": 0.4317868128172082, "grad_norm": 6.688268661499023, "learning_rate": 3.849989555010781e-06, "loss": 0.363, "mean_token_accuracy": 0.8682321265339852, "num_tokens": 167560862.0, "step": 139290 }, { "entropy": 1.9611527785658835, "epoch": 0.43181781194225793, "grad_norm": 3.5315842628479004, "learning_rate": 3.84985136095866e-06, "loss": 0.4377, "mean_token_accuracy": 0.8544865190982819, "num_tokens": 167572247.0, "step": 139300 }, { "entropy": 1.9224155962467193, "epoch": 0.4318488110673076, "grad_norm": 8.508109092712402, "learning_rate": 3.849713181786755e-06, "loss": 0.4655, "mean_token_accuracy": 0.8514292508363723, "num_tokens": 167583263.0, "step": 139310 }, { "entropy": 1.910231950879097, "epoch": 0.4318798101923573, "grad_norm": 9.595215797424316, "learning_rate": 3.849575017492395e-06, "loss": 0.4727, "mean_token_accuracy": 0.8502141878008842, "num_tokens": 167595159.0, "step": 139320 }, { "entropy": 1.9836928397417068, "epoch": 0.431910809317407, "grad_norm": 8.529363632202148, "learning_rate": 3.849436868072912e-06, "loss": 0.5398, "mean_token_accuracy": 0.8343594878911972, "num_tokens": 167606457.0, "step": 139330 }, { "entropy": 1.8075464725494386, "epoch": 0.4319418084424567, "grad_norm": 7.387181282043457, "learning_rate": 3.849298733525636e-06, "loss": 0.389, "mean_token_accuracy": 0.8693550243973732, "num_tokens": 167619065.0, "step": 139340 }, { "entropy": 1.8776528060436248, "epoch": 0.4319728075675064, "grad_norm": 7.418018341064453, "learning_rate": 3.849160613847898e-06, "loss": 0.4092, "mean_token_accuracy": 0.8570902019739151, "num_tokens": 167631388.0, "step": 139350 }, { "entropy": 1.8456250533461571, "epoch": 0.4320038066925561, "grad_norm": 6.901108741760254, "learning_rate": 3.849022509037032e-06, "loss": 0.4719, "mean_token_accuracy": 0.8538346603512764, "num_tokens": 167643724.0, "step": 139360 }, { "entropy": 1.8757693514227867, "epoch": 0.4320348058176058, "grad_norm": 7.9772210121154785, "learning_rate": 3.848884419090371e-06, "loss": 0.4258, "mean_token_accuracy": 0.8559473112225533, "num_tokens": 167657019.0, "step": 139370 }, { "entropy": 1.8769934855401516, "epoch": 0.4320658049426555, "grad_norm": 7.6656413078308105, "learning_rate": 3.848746344005249e-06, "loss": 0.3837, "mean_token_accuracy": 0.8610425606369972, "num_tokens": 167669569.0, "step": 139380 }, { "entropy": 1.8971701174974442, "epoch": 0.4320968040677052, "grad_norm": 8.26727294921875, "learning_rate": 3.848608283778998e-06, "loss": 0.4906, "mean_token_accuracy": 0.8484355866909027, "num_tokens": 167681405.0, "step": 139390 }, { "entropy": 1.9876887768507003, "epoch": 0.4321278031927549, "grad_norm": 7.842116832733154, "learning_rate": 3.848470238408956e-06, "loss": 0.5748, "mean_token_accuracy": 0.8318600550293922, "num_tokens": 167692558.0, "step": 139400 }, { "entropy": 1.8590367823839187, "epoch": 0.43215880231780457, "grad_norm": 8.572957038879395, "learning_rate": 3.848332207892458e-06, "loss": 0.4538, "mean_token_accuracy": 0.8467065960168838, "num_tokens": 167705077.0, "step": 139410 }, { "entropy": 1.9348689168691635, "epoch": 0.4321898014428543, "grad_norm": 7.435234069824219, "learning_rate": 3.84819419222684e-06, "loss": 0.4499, "mean_token_accuracy": 0.8602284207940102, "num_tokens": 167716206.0, "step": 139420 }, { "entropy": 1.9003080978989602, "epoch": 0.43222080056790396, "grad_norm": 3.873739242553711, "learning_rate": 3.848056191409439e-06, "loss": 0.4818, "mean_token_accuracy": 0.8429651230573654, "num_tokens": 167728103.0, "step": 139430 }, { "entropy": 1.9454911321401596, "epoch": 0.4322517996929537, "grad_norm": 7.4276018142700195, "learning_rate": 3.847918205437593e-06, "loss": 0.5043, "mean_token_accuracy": 0.844630342721939, "num_tokens": 167738632.0, "step": 139440 }, { "entropy": 1.8453918382525445, "epoch": 0.43228279881800336, "grad_norm": 8.798940658569336, "learning_rate": 3.847780234308641e-06, "loss": 0.4311, "mean_token_accuracy": 0.8564523667097091, "num_tokens": 167750964.0, "step": 139450 }, { "entropy": 1.8049393191933631, "epoch": 0.4323137979430531, "grad_norm": 9.362977027893066, "learning_rate": 3.847642278019923e-06, "loss": 0.4083, "mean_token_accuracy": 0.8607902884483337, "num_tokens": 167763833.0, "step": 139460 }, { "entropy": 1.8966955333948134, "epoch": 0.43234479706810275, "grad_norm": 7.66609001159668, "learning_rate": 3.8475043365687755e-06, "loss": 0.4731, "mean_token_accuracy": 0.8480794101953506, "num_tokens": 167776065.0, "step": 139470 }, { "entropy": 1.8854218780994416, "epoch": 0.4323757961931525, "grad_norm": 8.28154468536377, "learning_rate": 3.847366409952543e-06, "loss": 0.4477, "mean_token_accuracy": 0.8489232257008552, "num_tokens": 167788059.0, "step": 139480 }, { "entropy": 1.8077181428670883, "epoch": 0.43240679531820214, "grad_norm": 3.7770919799804688, "learning_rate": 3.847228498168565e-06, "loss": 0.3587, "mean_token_accuracy": 0.8708108901977539, "num_tokens": 167801348.0, "step": 139490 }, { "entropy": 1.8897404298186302, "epoch": 0.43243779444325187, "grad_norm": 7.708387851715088, "learning_rate": 3.8470906012141825e-06, "loss": 0.4417, "mean_token_accuracy": 0.8538424223661423, "num_tokens": 167814120.0, "step": 139500 }, { "entropy": 1.834130634367466, "epoch": 0.43246879356830153, "grad_norm": 6.732454299926758, "learning_rate": 3.846952719086739e-06, "loss": 0.3968, "mean_token_accuracy": 0.8624187022447586, "num_tokens": 167826074.0, "step": 139510 }, { "entropy": 1.8972522467374802, "epoch": 0.4324997926933512, "grad_norm": 9.73922061920166, "learning_rate": 3.8468148517835766e-06, "loss": 0.4742, "mean_token_accuracy": 0.8530626803636551, "num_tokens": 167836968.0, "step": 139520 }, { "entropy": 1.8679583430290223, "epoch": 0.43253079181840093, "grad_norm": 6.562689304351807, "learning_rate": 3.84667699930204e-06, "loss": 0.485, "mean_token_accuracy": 0.8574771344661712, "num_tokens": 167849043.0, "step": 139530 }, { "entropy": 1.8196332067251206, "epoch": 0.4325617909434506, "grad_norm": 8.335590362548828, "learning_rate": 3.846539161639474e-06, "loss": 0.3958, "mean_token_accuracy": 0.873447559773922, "num_tokens": 167861647.0, "step": 139540 }, { "entropy": 1.8070330306887628, "epoch": 0.4325927900685003, "grad_norm": 8.824051856994629, "learning_rate": 3.846401338793222e-06, "loss": 0.3973, "mean_token_accuracy": 0.8618940994143486, "num_tokens": 167874455.0, "step": 139550 }, { "entropy": 1.9073014467954637, "epoch": 0.43262378919355, "grad_norm": 8.493125915527344, "learning_rate": 3.846263530760633e-06, "loss": 0.4563, "mean_token_accuracy": 0.8542511522769928, "num_tokens": 167885644.0, "step": 139560 }, { "entropy": 1.8156562566757202, "epoch": 0.4326547883185997, "grad_norm": 9.763708114624023, "learning_rate": 3.84612573753905e-06, "loss": 0.4308, "mean_token_accuracy": 0.8623699083924293, "num_tokens": 167898195.0, "step": 139570 }, { "entropy": 1.9067368656396866, "epoch": 0.4326857874436494, "grad_norm": 8.543749809265137, "learning_rate": 3.845987959125823e-06, "loss": 0.4759, "mean_token_accuracy": 0.855238850414753, "num_tokens": 167908792.0, "step": 139580 }, { "entropy": 1.8284807071089744, "epoch": 0.4327167865686991, "grad_norm": 7.208462238311768, "learning_rate": 3.8458501955182975e-06, "loss": 0.4143, "mean_token_accuracy": 0.8515859737992286, "num_tokens": 167921383.0, "step": 139590 }, { "entropy": 1.872288428246975, "epoch": 0.4327477856937488, "grad_norm": 7.469978332519531, "learning_rate": 3.845712446713823e-06, "loss": 0.4476, "mean_token_accuracy": 0.8519970834255218, "num_tokens": 167932974.0, "step": 139600 }, { "entropy": 1.918510441482067, "epoch": 0.4327787848187985, "grad_norm": 10.325370788574219, "learning_rate": 3.845574712709749e-06, "loss": 0.4922, "mean_token_accuracy": 0.8455549374222755, "num_tokens": 167945139.0, "step": 139610 }, { "entropy": 1.7871095269918442, "epoch": 0.43280978394384817, "grad_norm": 8.072989463806152, "learning_rate": 3.845436993503426e-06, "loss": 0.3753, "mean_token_accuracy": 0.8632336914539337, "num_tokens": 167959774.0, "step": 139620 }, { "entropy": 1.6974161878228187, "epoch": 0.4328407830688979, "grad_norm": 1.9402376413345337, "learning_rate": 3.845299289092203e-06, "loss": 0.3739, "mean_token_accuracy": 0.8687549382448196, "num_tokens": 167974761.0, "step": 139630 }, { "entropy": 1.8783841490745545, "epoch": 0.43287178219394756, "grad_norm": 8.304696083068848, "learning_rate": 3.845161599473431e-06, "loss": 0.458, "mean_token_accuracy": 0.8487045183777809, "num_tokens": 167987541.0, "step": 139640 }, { "entropy": 1.8873986780643464, "epoch": 0.4329027813189973, "grad_norm": 7.96927547454834, "learning_rate": 3.845023924644462e-06, "loss": 0.4082, "mean_token_accuracy": 0.8622677832841873, "num_tokens": 167999815.0, "step": 139650 }, { "entropy": 1.8961790382862092, "epoch": 0.43293378044404696, "grad_norm": 9.11035442352295, "learning_rate": 3.84488626460265e-06, "loss": 0.4601, "mean_token_accuracy": 0.8553287327289582, "num_tokens": 168011278.0, "step": 139660 }, { "entropy": 1.8724280893802643, "epoch": 0.4329647795690967, "grad_norm": 7.960391521453857, "learning_rate": 3.8447486193453464e-06, "loss": 0.4489, "mean_token_accuracy": 0.8539118707180023, "num_tokens": 168023250.0, "step": 139670 }, { "entropy": 1.9397452518343925, "epoch": 0.43299577869414635, "grad_norm": 6.704464912414551, "learning_rate": 3.8446109888699054e-06, "loss": 0.4952, "mean_token_accuracy": 0.844795499742031, "num_tokens": 168034708.0, "step": 139680 }, { "entropy": 1.9342885583639144, "epoch": 0.4330267778191961, "grad_norm": 8.719551086425781, "learning_rate": 3.8444733731736825e-06, "loss": 0.4688, "mean_token_accuracy": 0.8504753604531288, "num_tokens": 168046323.0, "step": 139690 }, { "entropy": 1.9050405994057655, "epoch": 0.43305777694424574, "grad_norm": 7.164178371429443, "learning_rate": 3.8443357722540315e-06, "loss": 0.3677, "mean_token_accuracy": 0.87413921803236, "num_tokens": 168058199.0, "step": 139700 }, { "entropy": 1.9514371365308762, "epoch": 0.43308877606929547, "grad_norm": 8.19334602355957, "learning_rate": 3.844198186108308e-06, "loss": 0.5086, "mean_token_accuracy": 0.8353783071041108, "num_tokens": 168069154.0, "step": 139710 }, { "entropy": 1.8015297777950763, "epoch": 0.43311977519434514, "grad_norm": 8.755090713500977, "learning_rate": 3.84406061473387e-06, "loss": 0.3908, "mean_token_accuracy": 0.8574639052152634, "num_tokens": 168082496.0, "step": 139720 }, { "entropy": 1.8433510437607765, "epoch": 0.43315077431939486, "grad_norm": 7.901381492614746, "learning_rate": 3.843923058128073e-06, "loss": 0.4282, "mean_token_accuracy": 0.8601280748844147, "num_tokens": 168094337.0, "step": 139730 }, { "entropy": 1.909453672170639, "epoch": 0.43318177344444453, "grad_norm": 4.069192409515381, "learning_rate": 3.8437855162882766e-06, "loss": 0.4692, "mean_token_accuracy": 0.8465544447302819, "num_tokens": 168106420.0, "step": 139740 }, { "entropy": 1.9447955161333084, "epoch": 0.43321277256949425, "grad_norm": 9.183345794677734, "learning_rate": 3.843647989211837e-06, "loss": 0.5139, "mean_token_accuracy": 0.8410720884799957, "num_tokens": 168117779.0, "step": 139750 }, { "entropy": 1.9353636160492897, "epoch": 0.4332437716945439, "grad_norm": 6.792473793029785, "learning_rate": 3.843510476896116e-06, "loss": 0.4398, "mean_token_accuracy": 0.8577348798513412, "num_tokens": 168129340.0, "step": 139760 }, { "entropy": 1.8856726735830307, "epoch": 0.4332747708195936, "grad_norm": 8.366604804992676, "learning_rate": 3.843372979338471e-06, "loss": 0.4784, "mean_token_accuracy": 0.8540061011910438, "num_tokens": 168141683.0, "step": 139770 }, { "entropy": 1.96417216360569, "epoch": 0.4333057699446433, "grad_norm": 8.365377426147461, "learning_rate": 3.843235496536262e-06, "loss": 0.513, "mean_token_accuracy": 0.8412682712078094, "num_tokens": 168152620.0, "step": 139780 }, { "entropy": 1.9343325823545456, "epoch": 0.433336769069693, "grad_norm": 6.398437023162842, "learning_rate": 3.843098028486852e-06, "loss": 0.434, "mean_token_accuracy": 0.8587636038661003, "num_tokens": 168164184.0, "step": 139790 }, { "entropy": 1.8950095444917678, "epoch": 0.4333677681947427, "grad_norm": 8.576680183410645, "learning_rate": 3.842960575187603e-06, "loss": 0.4445, "mean_token_accuracy": 0.8526839464902878, "num_tokens": 168175843.0, "step": 139800 }, { "entropy": 1.8716197207570076, "epoch": 0.4333987673197924, "grad_norm": 7.882635593414307, "learning_rate": 3.842823136635875e-06, "loss": 0.4234, "mean_token_accuracy": 0.8643146887421608, "num_tokens": 168187498.0, "step": 139810 }, { "entropy": 1.8388502165675162, "epoch": 0.4334297664448421, "grad_norm": 7.46567964553833, "learning_rate": 3.842685712829033e-06, "loss": 0.4209, "mean_token_accuracy": 0.8461194902658462, "num_tokens": 168200282.0, "step": 139820 }, { "entropy": 1.8559609532356263, "epoch": 0.43346076556989177, "grad_norm": 5.5117387771606445, "learning_rate": 3.84254830376444e-06, "loss": 0.4908, "mean_token_accuracy": 0.8383535325527192, "num_tokens": 168213312.0, "step": 139830 }, { "entropy": 1.8750181749463082, "epoch": 0.4334917646949415, "grad_norm": 2.167787790298462, "learning_rate": 3.8424109094394605e-06, "loss": 0.4544, "mean_token_accuracy": 0.8538003221154213, "num_tokens": 168226085.0, "step": 139840 }, { "entropy": 1.887896014750004, "epoch": 0.43352276381999116, "grad_norm": 3.5215682983398438, "learning_rate": 3.842273529851461e-06, "loss": 0.4376, "mean_token_accuracy": 0.8547932624816894, "num_tokens": 168237911.0, "step": 139850 }, { "entropy": 1.847142294049263, "epoch": 0.4335537629450409, "grad_norm": 7.590489864349365, "learning_rate": 3.842136164997804e-06, "loss": 0.3865, "mean_token_accuracy": 0.8609548598527909, "num_tokens": 168250763.0, "step": 139860 }, { "entropy": 1.7902900233864785, "epoch": 0.43358476207009056, "grad_norm": 3.1853187084198, "learning_rate": 3.841998814875858e-06, "loss": 0.4003, "mean_token_accuracy": 0.8613974347710609, "num_tokens": 168263670.0, "step": 139870 }, { "entropy": 1.8125322625041007, "epoch": 0.4336157611951403, "grad_norm": 8.757499694824219, "learning_rate": 3.84186147948299e-06, "loss": 0.3847, "mean_token_accuracy": 0.8673945814371109, "num_tokens": 168277183.0, "step": 139880 }, { "entropy": 1.9186308607459068, "epoch": 0.43364676032018995, "grad_norm": 8.361207962036133, "learning_rate": 3.8417241588165675e-06, "loss": 0.4835, "mean_token_accuracy": 0.8502631932497025, "num_tokens": 168288847.0, "step": 139890 }, { "entropy": 1.7792581617832184, "epoch": 0.4336777594452397, "grad_norm": 3.6714017391204834, "learning_rate": 3.841586852873958e-06, "loss": 0.4079, "mean_token_accuracy": 0.8592787533998489, "num_tokens": 168301911.0, "step": 139900 }, { "entropy": 1.9583450973033905, "epoch": 0.43370875857028934, "grad_norm": 7.83007287979126, "learning_rate": 3.841449561652531e-06, "loss": 0.4741, "mean_token_accuracy": 0.8499109253287316, "num_tokens": 168312703.0, "step": 139910 }, { "entropy": 1.83574049025774, "epoch": 0.43373975769533907, "grad_norm": 8.328694343566895, "learning_rate": 3.841312285149657e-06, "loss": 0.4001, "mean_token_accuracy": 0.8661501377820968, "num_tokens": 168325343.0, "step": 139920 }, { "entropy": 1.9416332960128784, "epoch": 0.43377075682038874, "grad_norm": 7.570043563842773, "learning_rate": 3.841175023362706e-06, "loss": 0.4759, "mean_token_accuracy": 0.8532981321215629, "num_tokens": 168337277.0, "step": 139930 }, { "entropy": 1.9153632640838623, "epoch": 0.43380175594543846, "grad_norm": 6.462372303009033, "learning_rate": 3.841037776289048e-06, "loss": 0.4737, "mean_token_accuracy": 0.8486651003360748, "num_tokens": 168348776.0, "step": 139940 }, { "entropy": 1.928571656346321, "epoch": 0.43383275507048813, "grad_norm": 8.484127044677734, "learning_rate": 3.840900543926055e-06, "loss": 0.4827, "mean_token_accuracy": 0.8461811229586601, "num_tokens": 168360653.0, "step": 139950 }, { "entropy": 1.9500855028629303, "epoch": 0.43386375419553785, "grad_norm": 7.157684803009033, "learning_rate": 3.8407633262710995e-06, "loss": 0.4805, "mean_token_accuracy": 0.8550040423870087, "num_tokens": 168371201.0, "step": 139960 }, { "entropy": 1.9280799493193626, "epoch": 0.4338947533205875, "grad_norm": 6.877387046813965, "learning_rate": 3.840626123321555e-06, "loss": 0.4664, "mean_token_accuracy": 0.8493270486593246, "num_tokens": 168382043.0, "step": 139970 }, { "entropy": 1.8996494933962822, "epoch": 0.43392575244563725, "grad_norm": 7.173794746398926, "learning_rate": 3.840488935074794e-06, "loss": 0.4746, "mean_token_accuracy": 0.8530132621526718, "num_tokens": 168394684.0, "step": 139980 }, { "entropy": 2.0039018139243128, "epoch": 0.4339567515706869, "grad_norm": 7.884592056274414, "learning_rate": 3.840351761528191e-06, "loss": 0.5109, "mean_token_accuracy": 0.8355337530374527, "num_tokens": 168406211.0, "step": 139990 }, { "entropy": 1.8956574127078056, "epoch": 0.43398775069573664, "grad_norm": 5.687414646148682, "learning_rate": 3.840214602679122e-06, "loss": 0.4171, "mean_token_accuracy": 0.858708880841732, "num_tokens": 168418288.0, "step": 140000 }, { "entropy": 1.913138222694397, "epoch": 0.4340187498207863, "grad_norm": 3.5383858680725098, "learning_rate": 3.840077458524961e-06, "loss": 0.4717, "mean_token_accuracy": 0.8377836272120476, "num_tokens": 168430136.0, "step": 140010 }, { "entropy": 1.957027330994606, "epoch": 0.434049748945836, "grad_norm": 7.953402996063232, "learning_rate": 3.839940329063085e-06, "loss": 0.4643, "mean_token_accuracy": 0.8612416058778762, "num_tokens": 168441409.0, "step": 140020 }, { "entropy": 1.8934217691421509, "epoch": 0.4340807480708857, "grad_norm": 3.684905529022217, "learning_rate": 3.839803214290871e-06, "loss": 0.4254, "mean_token_accuracy": 0.8587592497467995, "num_tokens": 168453595.0, "step": 140030 }, { "entropy": 1.9658619672060014, "epoch": 0.43411174719593537, "grad_norm": 7.9534525871276855, "learning_rate": 3.839666114205696e-06, "loss": 0.4652, "mean_token_accuracy": 0.8584305748343468, "num_tokens": 168464254.0, "step": 140040 }, { "entropy": 1.8513989642262458, "epoch": 0.4341427463209851, "grad_norm": 7.037330150604248, "learning_rate": 3.839529028804939e-06, "loss": 0.3944, "mean_token_accuracy": 0.8667973428964615, "num_tokens": 168476419.0, "step": 140050 }, { "entropy": 1.921988594532013, "epoch": 0.43417374544603476, "grad_norm": 8.005762100219727, "learning_rate": 3.839391958085978e-06, "loss": 0.4736, "mean_token_accuracy": 0.8441802576184273, "num_tokens": 168488005.0, "step": 140060 }, { "entropy": 1.874787649512291, "epoch": 0.4342047445710845, "grad_norm": 9.830438613891602, "learning_rate": 3.839254902046193e-06, "loss": 0.416, "mean_token_accuracy": 0.8617915287613869, "num_tokens": 168500064.0, "step": 140070 }, { "entropy": 1.9190911263227464, "epoch": 0.43423574369613416, "grad_norm": 9.500381469726562, "learning_rate": 3.8391178606829646e-06, "loss": 0.4676, "mean_token_accuracy": 0.8493470534682274, "num_tokens": 168511882.0, "step": 140080 }, { "entropy": 1.8970647498965263, "epoch": 0.4342667428211839, "grad_norm": 6.222733974456787, "learning_rate": 3.838980833993672e-06, "loss": 0.4962, "mean_token_accuracy": 0.845898899435997, "num_tokens": 168525185.0, "step": 140090 }, { "entropy": 1.9249570727348329, "epoch": 0.43429774194623355, "grad_norm": 8.286949157714844, "learning_rate": 3.838843821975697e-06, "loss": 0.4652, "mean_token_accuracy": 0.8432594612240791, "num_tokens": 168536800.0, "step": 140100 }, { "entropy": 1.7830603495240211, "epoch": 0.4343287410712833, "grad_norm": 3.775357723236084, "learning_rate": 3.8387068246264244e-06, "loss": 0.3601, "mean_token_accuracy": 0.8711403638124466, "num_tokens": 168550628.0, "step": 140110 }, { "entropy": 1.913707821071148, "epoch": 0.43435974019633294, "grad_norm": 7.623897075653076, "learning_rate": 3.838569841943234e-06, "loss": 0.4571, "mean_token_accuracy": 0.8579373374581337, "num_tokens": 168562584.0, "step": 140120 }, { "entropy": 1.959540669620037, "epoch": 0.43439073932138267, "grad_norm": 8.677681922912598, "learning_rate": 3.8384328739235095e-06, "loss": 0.4934, "mean_token_accuracy": 0.8414706364274025, "num_tokens": 168574108.0, "step": 140130 }, { "entropy": 1.8708548903465272, "epoch": 0.43442173844643234, "grad_norm": 7.195833683013916, "learning_rate": 3.838295920564638e-06, "loss": 0.4264, "mean_token_accuracy": 0.8590035811066628, "num_tokens": 168586835.0, "step": 140140 }, { "entropy": 1.9178714960813523, "epoch": 0.43445273757148206, "grad_norm": 8.062506675720215, "learning_rate": 3.838158981863999e-06, "loss": 0.4367, "mean_token_accuracy": 0.8575378447771073, "num_tokens": 168599034.0, "step": 140150 }, { "entropy": 1.9800096511840821, "epoch": 0.43448373669653173, "grad_norm": 7.617713451385498, "learning_rate": 3.838022057818983e-06, "loss": 0.4857, "mean_token_accuracy": 0.8560611560940743, "num_tokens": 168610043.0, "step": 140160 }, { "entropy": 1.8751267299056054, "epoch": 0.43451473582158145, "grad_norm": 9.476043701171875, "learning_rate": 3.837885148426972e-06, "loss": 0.4785, "mean_token_accuracy": 0.8532286942005157, "num_tokens": 168622646.0, "step": 140170 }, { "entropy": 1.9622350841760636, "epoch": 0.4345457349466311, "grad_norm": 8.158764839172363, "learning_rate": 3.837748253685356e-06, "loss": 0.4643, "mean_token_accuracy": 0.852800378203392, "num_tokens": 168633945.0, "step": 140180 }, { "entropy": 1.9728653207421303, "epoch": 0.43457673407168085, "grad_norm": 8.091950416564941, "learning_rate": 3.8376113735915195e-06, "loss": 0.4947, "mean_token_accuracy": 0.8437511593103408, "num_tokens": 168645091.0, "step": 140190 }, { "entropy": 1.8134228363633156, "epoch": 0.4346077331967305, "grad_norm": 3.810476541519165, "learning_rate": 3.8374745081428525e-06, "loss": 0.3402, "mean_token_accuracy": 0.8733655110001564, "num_tokens": 168658792.0, "step": 140200 }, { "entropy": 1.9151874303817749, "epoch": 0.43463873232178024, "grad_norm": 7.474352836608887, "learning_rate": 3.837337657336742e-06, "loss": 0.4914, "mean_token_accuracy": 0.8463295727968216, "num_tokens": 168670568.0, "step": 140210 }, { "entropy": 1.9047392055392265, "epoch": 0.4346697314468299, "grad_norm": 8.334789276123047, "learning_rate": 3.8372008211705795e-06, "loss": 0.4684, "mean_token_accuracy": 0.8502994626760483, "num_tokens": 168682642.0, "step": 140220 }, { "entropy": 1.9391117468476295, "epoch": 0.43470073057187963, "grad_norm": 9.910785675048828, "learning_rate": 3.837063999641753e-06, "loss": 0.4548, "mean_token_accuracy": 0.8517929673194885, "num_tokens": 168694378.0, "step": 140230 }, { "entropy": 1.9310035303235054, "epoch": 0.4347317296969293, "grad_norm": 8.19104290008545, "learning_rate": 3.836927192747655e-06, "loss": 0.44, "mean_token_accuracy": 0.8617075026035309, "num_tokens": 168705541.0, "step": 140240 }, { "entropy": 1.8467669546604157, "epoch": 0.43476272882197897, "grad_norm": 9.081491470336914, "learning_rate": 3.8367904004856745e-06, "loss": 0.4153, "mean_token_accuracy": 0.8642741695046425, "num_tokens": 168718141.0, "step": 140250 }, { "entropy": 1.841063352674246, "epoch": 0.4347937279470287, "grad_norm": 7.375804424285889, "learning_rate": 3.836653622853204e-06, "loss": 0.3789, "mean_token_accuracy": 0.8677456378936768, "num_tokens": 168730815.0, "step": 140260 }, { "entropy": 1.8605894804000855, "epoch": 0.43482472707207837, "grad_norm": 6.377718448638916, "learning_rate": 3.836516859847637e-06, "loss": 0.4352, "mean_token_accuracy": 0.858807185292244, "num_tokens": 168742824.0, "step": 140270 }, { "entropy": 1.824122267216444, "epoch": 0.4348557261971281, "grad_norm": 2.3955891132354736, "learning_rate": 3.836380111466366e-06, "loss": 0.3629, "mean_token_accuracy": 0.8775916382670402, "num_tokens": 168755819.0, "step": 140280 }, { "entropy": 1.9054122567176819, "epoch": 0.43488672532217776, "grad_norm": 10.010201454162598, "learning_rate": 3.836243377706786e-06, "loss": 0.4649, "mean_token_accuracy": 0.8460003793239593, "num_tokens": 168767525.0, "step": 140290 }, { "entropy": 1.9120841890573501, "epoch": 0.4349177244472275, "grad_norm": 8.370523452758789, "learning_rate": 3.83610665856629e-06, "loss": 0.4713, "mean_token_accuracy": 0.8461458712816239, "num_tokens": 168779091.0, "step": 140300 }, { "entropy": 1.9416150897741318, "epoch": 0.43494872357227715, "grad_norm": 8.070106506347656, "learning_rate": 3.835969954042273e-06, "loss": 0.4725, "mean_token_accuracy": 0.8444668143987656, "num_tokens": 168790525.0, "step": 140310 }, { "entropy": 1.873001678287983, "epoch": 0.4349797226973269, "grad_norm": 6.8141584396362305, "learning_rate": 3.835833264132131e-06, "loss": 0.3869, "mean_token_accuracy": 0.8707495495676995, "num_tokens": 168801983.0, "step": 140320 }, { "entropy": 1.9271577775478363, "epoch": 0.43501072182237654, "grad_norm": 6.567296504974365, "learning_rate": 3.835696588833263e-06, "loss": 0.4914, "mean_token_accuracy": 0.8497977092862129, "num_tokens": 168814329.0, "step": 140330 }, { "entropy": 1.913456965982914, "epoch": 0.43504172094742627, "grad_norm": 7.978017330169678, "learning_rate": 3.8355599281430635e-06, "loss": 0.4569, "mean_token_accuracy": 0.8430664092302322, "num_tokens": 168826198.0, "step": 140340 }, { "entropy": 1.882510894536972, "epoch": 0.43507272007247594, "grad_norm": 9.187688827514648, "learning_rate": 3.83542328205893e-06, "loss": 0.4491, "mean_token_accuracy": 0.8393991827964783, "num_tokens": 168838743.0, "step": 140350 }, { "entropy": 1.8549660280346871, "epoch": 0.43510371919752566, "grad_norm": 7.461533069610596, "learning_rate": 3.835286650578262e-06, "loss": 0.4404, "mean_token_accuracy": 0.8586645647883415, "num_tokens": 168851245.0, "step": 140360 }, { "entropy": 1.8594406992197037, "epoch": 0.43513471832257533, "grad_norm": 3.6771328449249268, "learning_rate": 3.835150033698458e-06, "loss": 0.3623, "mean_token_accuracy": 0.8606583446264267, "num_tokens": 168864222.0, "step": 140370 }, { "entropy": 1.9253443703055382, "epoch": 0.43516571744762506, "grad_norm": 7.7639241218566895, "learning_rate": 3.835013431416919e-06, "loss": 0.4706, "mean_token_accuracy": 0.8500281646847725, "num_tokens": 168876003.0, "step": 140380 }, { "entropy": 1.9680373221635818, "epoch": 0.4351967165726747, "grad_norm": 6.00819206237793, "learning_rate": 3.834876843731043e-06, "loss": 0.4793, "mean_token_accuracy": 0.848180590569973, "num_tokens": 168886871.0, "step": 140390 }, { "entropy": 1.9534240260720253, "epoch": 0.43522771569772445, "grad_norm": 7.778793811798096, "learning_rate": 3.834740270638232e-06, "loss": 0.476, "mean_token_accuracy": 0.8536478534340859, "num_tokens": 168898090.0, "step": 140400 }, { "entropy": 1.9209111735224724, "epoch": 0.4352587148227741, "grad_norm": 6.560374736785889, "learning_rate": 3.834603712135889e-06, "loss": 0.4202, "mean_token_accuracy": 0.8524667829275131, "num_tokens": 168909996.0, "step": 140410 }, { "entropy": 1.915994156897068, "epoch": 0.43528971394782384, "grad_norm": 4.213437080383301, "learning_rate": 3.834467168221415e-06, "loss": 0.4744, "mean_token_accuracy": 0.8442320972681046, "num_tokens": 168921960.0, "step": 140420 }, { "entropy": 1.9615544810891152, "epoch": 0.4353207130728735, "grad_norm": 8.39879035949707, "learning_rate": 3.834330638892213e-06, "loss": 0.4498, "mean_token_accuracy": 0.8525176584720612, "num_tokens": 168933811.0, "step": 140430 }, { "entropy": 1.8864429205656053, "epoch": 0.43535171219792324, "grad_norm": 7.037664413452148, "learning_rate": 3.834194124145686e-06, "loss": 0.4542, "mean_token_accuracy": 0.8612302899360657, "num_tokens": 168946667.0, "step": 140440 }, { "entropy": 2.0172328650951385, "epoch": 0.4353827113229729, "grad_norm": 4.6347975730896, "learning_rate": 3.834057623979241e-06, "loss": 0.5135, "mean_token_accuracy": 0.842670176923275, "num_tokens": 168958082.0, "step": 140450 }, { "entropy": 2.007206231355667, "epoch": 0.43541371044802263, "grad_norm": 8.676667213439941, "learning_rate": 3.833921138390279e-06, "loss": 0.506, "mean_token_accuracy": 0.8318960726261139, "num_tokens": 168969385.0, "step": 140460 }, { "entropy": 1.9538621738553048, "epoch": 0.4354447095730723, "grad_norm": 7.149422645568848, "learning_rate": 3.833784667376208e-06, "loss": 0.4332, "mean_token_accuracy": 0.8514281839132309, "num_tokens": 168981528.0, "step": 140470 }, { "entropy": 1.9269013822078704, "epoch": 0.435475708698122, "grad_norm": 7.5539350509643555, "learning_rate": 3.833648210934433e-06, "loss": 0.457, "mean_token_accuracy": 0.8525858089327812, "num_tokens": 168994111.0, "step": 140480 }, { "entropy": 2.015850293636322, "epoch": 0.4355067078231717, "grad_norm": 7.674806118011475, "learning_rate": 3.8335117690623616e-06, "loss": 0.5322, "mean_token_accuracy": 0.8376188382506371, "num_tokens": 169004802.0, "step": 140490 }, { "entropy": 1.868292573094368, "epoch": 0.43553770694822136, "grad_norm": 3.716049909591675, "learning_rate": 3.8333753417574014e-06, "loss": 0.4029, "mean_token_accuracy": 0.8568149447441101, "num_tokens": 169017161.0, "step": 140500 }, { "entropy": 1.999607202410698, "epoch": 0.4355687060732711, "grad_norm": 7.225944519042969, "learning_rate": 3.833238929016959e-06, "loss": 0.4686, "mean_token_accuracy": 0.8523830324411392, "num_tokens": 169027960.0, "step": 140510 }, { "entropy": 1.9995219513773919, "epoch": 0.43559970519832075, "grad_norm": 10.309820175170898, "learning_rate": 3.833102530838446e-06, "loss": 0.5616, "mean_token_accuracy": 0.8337021216750145, "num_tokens": 169039707.0, "step": 140520 }, { "entropy": 1.8859604820609093, "epoch": 0.4356307043233705, "grad_norm": 5.491618633270264, "learning_rate": 3.8329661472192686e-06, "loss": 0.4348, "mean_token_accuracy": 0.8571705892682076, "num_tokens": 169052242.0, "step": 140530 }, { "entropy": 1.8706535249948502, "epoch": 0.43566170344842015, "grad_norm": 4.212784767150879, "learning_rate": 3.832829778156839e-06, "loss": 0.4187, "mean_token_accuracy": 0.8539310187101364, "num_tokens": 169065057.0, "step": 140540 }, { "entropy": 1.9854949221014977, "epoch": 0.43569270257346987, "grad_norm": 7.0491461753845215, "learning_rate": 3.8326934236485665e-06, "loss": 0.4578, "mean_token_accuracy": 0.855704678595066, "num_tokens": 169076702.0, "step": 140550 }, { "entropy": 1.8700939059257506, "epoch": 0.43572370169851954, "grad_norm": 7.940951824188232, "learning_rate": 3.832557083691864e-06, "loss": 0.4005, "mean_token_accuracy": 0.8501318633556366, "num_tokens": 169089351.0, "step": 140560 }, { "entropy": 1.975642454624176, "epoch": 0.43575470082356926, "grad_norm": 8.858991622924805, "learning_rate": 3.832420758284142e-06, "loss": 0.5046, "mean_token_accuracy": 0.8454146534204483, "num_tokens": 169100724.0, "step": 140570 }, { "entropy": 1.8851625487208366, "epoch": 0.43578569994861893, "grad_norm": 7.867511749267578, "learning_rate": 3.832284447422814e-06, "loss": 0.4544, "mean_token_accuracy": 0.8477107077836991, "num_tokens": 169112999.0, "step": 140580 }, { "entropy": 1.8516397640109061, "epoch": 0.43581669907366866, "grad_norm": 3.843761682510376, "learning_rate": 3.832148151105293e-06, "loss": 0.4388, "mean_token_accuracy": 0.8512039586901665, "num_tokens": 169126495.0, "step": 140590 }, { "entropy": 2.005415938794613, "epoch": 0.4358476981987183, "grad_norm": 7.267820835113525, "learning_rate": 3.832011869328994e-06, "loss": 0.5148, "mean_token_accuracy": 0.8331288233399391, "num_tokens": 169137715.0, "step": 140600 }, { "entropy": 1.9655996575951575, "epoch": 0.43587869732376805, "grad_norm": 7.318187236785889, "learning_rate": 3.831875602091329e-06, "loss": 0.4687, "mean_token_accuracy": 0.8459896892309189, "num_tokens": 169149332.0, "step": 140610 }, { "entropy": 1.9661901488900184, "epoch": 0.4359096964488177, "grad_norm": 7.033077716827393, "learning_rate": 3.831739349389714e-06, "loss": 0.4418, "mean_token_accuracy": 0.8531352400779724, "num_tokens": 169160520.0, "step": 140620 }, { "entropy": 1.859518001973629, "epoch": 0.43594069557386744, "grad_norm": 8.226398468017578, "learning_rate": 3.8316031112215676e-06, "loss": 0.371, "mean_token_accuracy": 0.866145646572113, "num_tokens": 169173970.0, "step": 140630 }, { "entropy": 1.967048665881157, "epoch": 0.4359716946989171, "grad_norm": 7.1692962646484375, "learning_rate": 3.831466887584302e-06, "loss": 0.4907, "mean_token_accuracy": 0.8470461040735244, "num_tokens": 169184890.0, "step": 140640 }, { "entropy": 1.901542803645134, "epoch": 0.43600269382396684, "grad_norm": 7.0217156410217285, "learning_rate": 3.831330678475338e-06, "loss": 0.4176, "mean_token_accuracy": 0.8561606049537659, "num_tokens": 169197071.0, "step": 140650 }, { "entropy": 1.7790475860238075, "epoch": 0.4360336929490165, "grad_norm": 10.4070405960083, "learning_rate": 3.831194483892091e-06, "loss": 0.3602, "mean_token_accuracy": 0.8745766609907151, "num_tokens": 169210648.0, "step": 140660 }, { "entropy": 1.8767062574625015, "epoch": 0.43606469207406623, "grad_norm": 6.854859352111816, "learning_rate": 3.831058303831981e-06, "loss": 0.4237, "mean_token_accuracy": 0.8566436514258384, "num_tokens": 169222860.0, "step": 140670 }, { "entropy": 1.956781129539013, "epoch": 0.4360956911991159, "grad_norm": 8.431173324584961, "learning_rate": 3.830922138292426e-06, "loss": 0.4847, "mean_token_accuracy": 0.8472963184118271, "num_tokens": 169234554.0, "step": 140680 }, { "entropy": 1.9454703062772751, "epoch": 0.4361266903241656, "grad_norm": 9.45147705078125, "learning_rate": 3.8307859872708455e-06, "loss": 0.5006, "mean_token_accuracy": 0.848207201063633, "num_tokens": 169245918.0, "step": 140690 }, { "entropy": 1.9917195424437524, "epoch": 0.4361576894492153, "grad_norm": 8.706761360168457, "learning_rate": 3.830649850764661e-06, "loss": 0.4878, "mean_token_accuracy": 0.8496665880084038, "num_tokens": 169257011.0, "step": 140700 }, { "entropy": 1.9021431505680084, "epoch": 0.436188688574265, "grad_norm": 8.302827835083008, "learning_rate": 3.830513728771293e-06, "loss": 0.4093, "mean_token_accuracy": 0.8685775399208069, "num_tokens": 169268523.0, "step": 140710 }, { "entropy": 1.8747971653938293, "epoch": 0.4362196876993147, "grad_norm": 10.040637969970703, "learning_rate": 3.830377621288163e-06, "loss": 0.4226, "mean_token_accuracy": 0.8577211305499077, "num_tokens": 169281126.0, "step": 140720 }, { "entropy": 2.011331743001938, "epoch": 0.4362506868243644, "grad_norm": 9.337910652160645, "learning_rate": 3.8302415283126924e-06, "loss": 0.5219, "mean_token_accuracy": 0.8415524423122406, "num_tokens": 169291751.0, "step": 140730 }, { "entropy": 2.007808841764927, "epoch": 0.4362816859494141, "grad_norm": 7.370612144470215, "learning_rate": 3.830105449842306e-06, "loss": 0.5172, "mean_token_accuracy": 0.83756934851408, "num_tokens": 169303260.0, "step": 140740 }, { "entropy": 1.9281100079417228, "epoch": 0.43631268507446375, "grad_norm": 7.993682861328125, "learning_rate": 3.829969385874425e-06, "loss": 0.504, "mean_token_accuracy": 0.8419538453221321, "num_tokens": 169315175.0, "step": 140750 }, { "entropy": 1.8865912228822708, "epoch": 0.43634368419951347, "grad_norm": 5.286984443664551, "learning_rate": 3.829833336406477e-06, "loss": 0.4069, "mean_token_accuracy": 0.8586981207132339, "num_tokens": 169328237.0, "step": 140760 }, { "entropy": 1.9767191842198373, "epoch": 0.43637468332456314, "grad_norm": 7.798305034637451, "learning_rate": 3.8296973014358825e-06, "loss": 0.5088, "mean_token_accuracy": 0.8429146945476532, "num_tokens": 169339917.0, "step": 140770 }, { "entropy": 1.928345987200737, "epoch": 0.43640568244961286, "grad_norm": 3.7796151638031006, "learning_rate": 3.829561280960071e-06, "loss": 0.4115, "mean_token_accuracy": 0.8631057515740395, "num_tokens": 169351691.0, "step": 140780 }, { "entropy": 1.9100732266902924, "epoch": 0.43643668157466253, "grad_norm": 8.390362739562988, "learning_rate": 3.829425274976465e-06, "loss": 0.4309, "mean_token_accuracy": 0.8546838641166687, "num_tokens": 169364207.0, "step": 140790 }, { "entropy": 1.8982257694005966, "epoch": 0.43646768069971226, "grad_norm": 7.438647747039795, "learning_rate": 3.829289283482494e-06, "loss": 0.4037, "mean_token_accuracy": 0.8580241903662682, "num_tokens": 169376638.0, "step": 140800 }, { "entropy": 1.951183719933033, "epoch": 0.4364986798247619, "grad_norm": 3.837440252304077, "learning_rate": 3.829153306475584e-06, "loss": 0.4269, "mean_token_accuracy": 0.8538202360272408, "num_tokens": 169388468.0, "step": 140810 }, { "entropy": 1.9201230362057686, "epoch": 0.43652967894981165, "grad_norm": 8.309687614440918, "learning_rate": 3.829017343953164e-06, "loss": 0.4843, "mean_token_accuracy": 0.8367157101631164, "num_tokens": 169401878.0, "step": 140820 }, { "entropy": 1.9850788608193397, "epoch": 0.4365606780748613, "grad_norm": 8.711220741271973, "learning_rate": 3.828881395912661e-06, "loss": 0.4953, "mean_token_accuracy": 0.8478342741727829, "num_tokens": 169413486.0, "step": 140830 }, { "entropy": 1.9063683211803437, "epoch": 0.43659167719991104, "grad_norm": 10.316767692565918, "learning_rate": 3.828745462351506e-06, "loss": 0.4484, "mean_token_accuracy": 0.8501601129770279, "num_tokens": 169426451.0, "step": 140840 }, { "entropy": 2.031948208808899, "epoch": 0.4366226763249607, "grad_norm": 8.402220726013184, "learning_rate": 3.828609543267129e-06, "loss": 0.5099, "mean_token_accuracy": 0.8418742671608925, "num_tokens": 169437295.0, "step": 140850 }, { "entropy": 1.7779492631554603, "epoch": 0.43665367545001044, "grad_norm": 8.303640365600586, "learning_rate": 3.828473638656959e-06, "loss": 0.3924, "mean_token_accuracy": 0.8546249598264695, "num_tokens": 169451864.0, "step": 140860 }, { "entropy": 1.8658212095499038, "epoch": 0.4366846745750601, "grad_norm": 8.145844459533691, "learning_rate": 3.828337748518429e-06, "loss": 0.4204, "mean_token_accuracy": 0.8555705919861794, "num_tokens": 169465061.0, "step": 140870 }, { "entropy": 1.9951273322105407, "epoch": 0.43671567370010983, "grad_norm": 7.6853461265563965, "learning_rate": 3.8282018728489685e-06, "loss": 0.4929, "mean_token_accuracy": 0.8458278581500054, "num_tokens": 169476203.0, "step": 140880 }, { "entropy": 1.907281294465065, "epoch": 0.4367466728251595, "grad_norm": 4.954738616943359, "learning_rate": 3.828066011646013e-06, "loss": 0.4118, "mean_token_accuracy": 0.84654021859169, "num_tokens": 169488896.0, "step": 140890 }, { "entropy": 2.035962425172329, "epoch": 0.4367776719502092, "grad_norm": 9.046393394470215, "learning_rate": 3.8279301649069935e-06, "loss": 0.5015, "mean_token_accuracy": 0.8471018671989441, "num_tokens": 169500116.0, "step": 140900 }, { "entropy": 1.9954482674598695, "epoch": 0.4368086710752589, "grad_norm": 8.976217269897461, "learning_rate": 3.827794332629344e-06, "loss": 0.483, "mean_token_accuracy": 0.8554398879408837, "num_tokens": 169510982.0, "step": 140910 }, { "entropy": 2.049514207243919, "epoch": 0.4368396702003086, "grad_norm": 9.787312507629395, "learning_rate": 3.8276585148105e-06, "loss": 0.5093, "mean_token_accuracy": 0.8407163426280022, "num_tokens": 169521572.0, "step": 140920 }, { "entropy": 1.923102656006813, "epoch": 0.4368706693253583, "grad_norm": 8.028412818908691, "learning_rate": 3.827522711447895e-06, "loss": 0.4557, "mean_token_accuracy": 0.8506998717784882, "num_tokens": 169533699.0, "step": 140930 }, { "entropy": 2.0024877056479453, "epoch": 0.436901668450408, "grad_norm": 4.008644104003906, "learning_rate": 3.827386922538967e-06, "loss": 0.4489, "mean_token_accuracy": 0.8583690106868744, "num_tokens": 169544829.0, "step": 140940 }, { "entropy": 1.84952729716897, "epoch": 0.4369326675754577, "grad_norm": 9.895330429077148, "learning_rate": 3.827251148081149e-06, "loss": 0.3781, "mean_token_accuracy": 0.8501643449068069, "num_tokens": 169558584.0, "step": 140950 }, { "entropy": 1.9892782807350158, "epoch": 0.4369636667005074, "grad_norm": 7.615428447723389, "learning_rate": 3.827115388071881e-06, "loss": 0.5019, "mean_token_accuracy": 0.844859579205513, "num_tokens": 169569716.0, "step": 140960 }, { "entropy": 2.0157076716423035, "epoch": 0.43699466582555707, "grad_norm": 8.59665298461914, "learning_rate": 3.8269796425086e-06, "loss": 0.5129, "mean_token_accuracy": 0.8417162656784057, "num_tokens": 169580522.0, "step": 140970 }, { "entropy": 1.925472079217434, "epoch": 0.4370256649506068, "grad_norm": 3.7771337032318115, "learning_rate": 3.826843911388742e-06, "loss": 0.4267, "mean_token_accuracy": 0.8646034061908722, "num_tokens": 169592175.0, "step": 140980 }, { "entropy": 1.8540797725319862, "epoch": 0.43705666407565646, "grad_norm": 7.4315056800842285, "learning_rate": 3.826708194709748e-06, "loss": 0.3951, "mean_token_accuracy": 0.8597232013940811, "num_tokens": 169604782.0, "step": 140990 }, { "entropy": 1.9015352100133895, "epoch": 0.43708766320070613, "grad_norm": 4.213851451873779, "learning_rate": 3.826572492469057e-06, "loss": 0.4461, "mean_token_accuracy": 0.8474423259496688, "num_tokens": 169617522.0, "step": 141000 }, { "entropy": 2.016425573825836, "epoch": 0.43711866232575586, "grad_norm": 8.818294525146484, "learning_rate": 3.826436804664109e-06, "loss": 0.5032, "mean_token_accuracy": 0.8433761283755302, "num_tokens": 169628341.0, "step": 141010 }, { "entropy": 1.9073638312518597, "epoch": 0.4371496614508055, "grad_norm": 7.4958014488220215, "learning_rate": 3.826301131292346e-06, "loss": 0.424, "mean_token_accuracy": 0.8657842293381691, "num_tokens": 169641206.0, "step": 141020 }, { "entropy": 1.8773157000541687, "epoch": 0.43718066057585525, "grad_norm": 9.210996627807617, "learning_rate": 3.826165472351208e-06, "loss": 0.4286, "mean_token_accuracy": 0.8486377000808716, "num_tokens": 169652953.0, "step": 141030 }, { "entropy": 1.8629372149705887, "epoch": 0.4372116597009049, "grad_norm": 8.852280616760254, "learning_rate": 3.826029827838136e-06, "loss": 0.436, "mean_token_accuracy": 0.8569909855723381, "num_tokens": 169665732.0, "step": 141040 }, { "entropy": 1.8929132372140884, "epoch": 0.43724265882595464, "grad_norm": 7.813331127166748, "learning_rate": 3.825894197750575e-06, "loss": 0.4269, "mean_token_accuracy": 0.8591722935438156, "num_tokens": 169677613.0, "step": 141050 }, { "entropy": 1.9381250068545341, "epoch": 0.4372736579510043, "grad_norm": 8.72669792175293, "learning_rate": 3.825758582085967e-06, "loss": 0.4255, "mean_token_accuracy": 0.8587992191314697, "num_tokens": 169689244.0, "step": 141060 }, { "entropy": 1.7425117641687393, "epoch": 0.43730465707605404, "grad_norm": 8.140493392944336, "learning_rate": 3.825622980841757e-06, "loss": 0.3859, "mean_token_accuracy": 0.8660785302519798, "num_tokens": 169703991.0, "step": 141070 }, { "entropy": 1.8813914477825164, "epoch": 0.4373356562011037, "grad_norm": 9.09639835357666, "learning_rate": 3.825487394015388e-06, "loss": 0.4414, "mean_token_accuracy": 0.8516086161136627, "num_tokens": 169716782.0, "step": 141080 }, { "entropy": 1.8726382121443748, "epoch": 0.43736665532615343, "grad_norm": 4.414510250091553, "learning_rate": 3.825351821604306e-06, "loss": 0.455, "mean_token_accuracy": 0.8511142015457154, "num_tokens": 169729151.0, "step": 141090 }, { "entropy": 1.905012820661068, "epoch": 0.4373976544512031, "grad_norm": 8.519266128540039, "learning_rate": 3.825216263605957e-06, "loss": 0.477, "mean_token_accuracy": 0.8418341457843781, "num_tokens": 169741404.0, "step": 141100 }, { "entropy": 1.9148242220282554, "epoch": 0.4374286535762528, "grad_norm": 8.985989570617676, "learning_rate": 3.825080720017787e-06, "loss": 0.4941, "mean_token_accuracy": 0.842382799088955, "num_tokens": 169753529.0, "step": 141110 }, { "entropy": 1.8772297248244285, "epoch": 0.4374596527013025, "grad_norm": 3.817615509033203, "learning_rate": 3.824945190837244e-06, "loss": 0.4215, "mean_token_accuracy": 0.8559065282344818, "num_tokens": 169765965.0, "step": 141120 }, { "entropy": 1.9383199408650398, "epoch": 0.4374906518263522, "grad_norm": 3.4553632736206055, "learning_rate": 3.824809676061776e-06, "loss": 0.468, "mean_token_accuracy": 0.8500086024403573, "num_tokens": 169777683.0, "step": 141130 }, { "entropy": 1.9334457024931908, "epoch": 0.4375216509514019, "grad_norm": 4.0566182136535645, "learning_rate": 3.82467417568883e-06, "loss": 0.4545, "mean_token_accuracy": 0.8465969949960709, "num_tokens": 169789469.0, "step": 141140 }, { "entropy": 1.9533229991793633, "epoch": 0.4375526500764516, "grad_norm": 7.992478847503662, "learning_rate": 3.824538689715855e-06, "loss": 0.4447, "mean_token_accuracy": 0.8505326002836228, "num_tokens": 169800995.0, "step": 141150 }, { "entropy": 1.9198670163750648, "epoch": 0.4375836492015013, "grad_norm": 4.070978164672852, "learning_rate": 3.8244032181403015e-06, "loss": 0.5319, "mean_token_accuracy": 0.8470968812704086, "num_tokens": 169813330.0, "step": 141160 }, { "entropy": 1.8759153246879579, "epoch": 0.437614648326551, "grad_norm": 10.018072128295898, "learning_rate": 3.8242677609596205e-06, "loss": 0.4197, "mean_token_accuracy": 0.8579384982585907, "num_tokens": 169826267.0, "step": 141170 }, { "entropy": 1.839848317205906, "epoch": 0.4376456474516007, "grad_norm": 8.282307624816895, "learning_rate": 3.824132318171262e-06, "loss": 0.4227, "mean_token_accuracy": 0.8613649949431419, "num_tokens": 169840238.0, "step": 141180 }, { "entropy": 1.9160371258854867, "epoch": 0.4376766465766504, "grad_norm": 8.569047927856445, "learning_rate": 3.8239968897726755e-06, "loss": 0.4761, "mean_token_accuracy": 0.8511138498783112, "num_tokens": 169852089.0, "step": 141190 }, { "entropy": 1.9733612596988679, "epoch": 0.43770764570170007, "grad_norm": 7.161515712738037, "learning_rate": 3.823861475761317e-06, "loss": 0.4506, "mean_token_accuracy": 0.8522224485874176, "num_tokens": 169863089.0, "step": 141200 }, { "entropy": 1.9556746795773505, "epoch": 0.4377386448267498, "grad_norm": 8.775317192077637, "learning_rate": 3.823726076134636e-06, "loss": 0.4748, "mean_token_accuracy": 0.8549538642168045, "num_tokens": 169875115.0, "step": 141210 }, { "entropy": 1.9240303069353104, "epoch": 0.43776964395179946, "grad_norm": 8.23504638671875, "learning_rate": 3.823590690890089e-06, "loss": 0.469, "mean_token_accuracy": 0.853808656334877, "num_tokens": 169887042.0, "step": 141220 }, { "entropy": 1.8046733349561692, "epoch": 0.4378006430768492, "grad_norm": 3.5888330936431885, "learning_rate": 3.823455320025128e-06, "loss": 0.361, "mean_token_accuracy": 0.8697149589657783, "num_tokens": 169900449.0, "step": 141230 }, { "entropy": 1.8609862461686135, "epoch": 0.43783164220189885, "grad_norm": 3.756772041320801, "learning_rate": 3.823319963537208e-06, "loss": 0.3973, "mean_token_accuracy": 0.8570352002978325, "num_tokens": 169912988.0, "step": 141240 }, { "entropy": 1.8794613614678384, "epoch": 0.4378626413269485, "grad_norm": 3.9449973106384277, "learning_rate": 3.823184621423784e-06, "loss": 0.4047, "mean_token_accuracy": 0.8526434049010276, "num_tokens": 169926018.0, "step": 141250 }, { "entropy": 1.7661522284150124, "epoch": 0.43789364045199825, "grad_norm": 9.034821510314941, "learning_rate": 3.8230492936823135e-06, "loss": 0.355, "mean_token_accuracy": 0.8720221489667892, "num_tokens": 169939851.0, "step": 141260 }, { "entropy": 1.978477604687214, "epoch": 0.4379246395770479, "grad_norm": 7.994080543518066, "learning_rate": 3.822913980310252e-06, "loss": 0.4984, "mean_token_accuracy": 0.8487569943070412, "num_tokens": 169951672.0, "step": 141270 }, { "entropy": 1.9159216687083245, "epoch": 0.43795563870209764, "grad_norm": 7.902944564819336, "learning_rate": 3.822778681305056e-06, "loss": 0.4246, "mean_token_accuracy": 0.8606354176998139, "num_tokens": 169963211.0, "step": 141280 }, { "entropy": 1.8818385779857636, "epoch": 0.4379866378271473, "grad_norm": 4.299989700317383, "learning_rate": 3.822643396664184e-06, "loss": 0.5038, "mean_token_accuracy": 0.8490740895271301, "num_tokens": 169975835.0, "step": 141290 }, { "entropy": 1.9505405515432357, "epoch": 0.43801763695219703, "grad_norm": 7.912982940673828, "learning_rate": 3.822508126385095e-06, "loss": 0.4642, "mean_token_accuracy": 0.8519005611538887, "num_tokens": 169987046.0, "step": 141300 }, { "entropy": 1.8943722754716874, "epoch": 0.4380486360772467, "grad_norm": 9.575815200805664, "learning_rate": 3.822372870465247e-06, "loss": 0.424, "mean_token_accuracy": 0.8552343189716339, "num_tokens": 169999313.0, "step": 141310 }, { "entropy": 2.0350208193063737, "epoch": 0.4380796352022964, "grad_norm": 9.12436580657959, "learning_rate": 3.822237628902101e-06, "loss": 0.5847, "mean_token_accuracy": 0.8212827384471894, "num_tokens": 170010308.0, "step": 141320 }, { "entropy": 1.9424827635288238, "epoch": 0.4381106343273461, "grad_norm": 7.913768291473389, "learning_rate": 3.822102401693118e-06, "loss": 0.4541, "mean_token_accuracy": 0.8558102324604988, "num_tokens": 170021453.0, "step": 141330 }, { "entropy": 1.9400149762630463, "epoch": 0.4381416334523958, "grad_norm": 7.435611724853516, "learning_rate": 3.821967188835756e-06, "loss": 0.4669, "mean_token_accuracy": 0.8472892969846726, "num_tokens": 170033154.0, "step": 141340 }, { "entropy": 1.9589600950479507, "epoch": 0.4381726325774455, "grad_norm": 4.710630416870117, "learning_rate": 3.82183199032748e-06, "loss": 0.4518, "mean_token_accuracy": 0.8444600135087967, "num_tokens": 170044448.0, "step": 141350 }, { "entropy": 2.002592481672764, "epoch": 0.4382036317024952, "grad_norm": 3.6701736450195312, "learning_rate": 3.821696806165749e-06, "loss": 0.4958, "mean_token_accuracy": 0.8414113700389863, "num_tokens": 170055393.0, "step": 141360 }, { "entropy": 1.9110810875892639, "epoch": 0.4382346308275449, "grad_norm": 8.964020729064941, "learning_rate": 3.821561636348028e-06, "loss": 0.4121, "mean_token_accuracy": 0.8573357686400414, "num_tokens": 170067640.0, "step": 141370 }, { "entropy": 1.9764687582850455, "epoch": 0.4382656299525946, "grad_norm": 8.130205154418945, "learning_rate": 3.8214264808717814e-06, "loss": 0.5127, "mean_token_accuracy": 0.8418732464313508, "num_tokens": 170079057.0, "step": 141380 }, { "entropy": 1.8818054497241974, "epoch": 0.4382966290776443, "grad_norm": 4.381757736206055, "learning_rate": 3.82129133973447e-06, "loss": 0.4403, "mean_token_accuracy": 0.8557385444641114, "num_tokens": 170091217.0, "step": 141390 }, { "entropy": 1.999481311440468, "epoch": 0.438327628202694, "grad_norm": 8.559086799621582, "learning_rate": 3.821156212933562e-06, "loss": 0.5061, "mean_token_accuracy": 0.8520580142736435, "num_tokens": 170102107.0, "step": 141400 }, { "entropy": 1.877190275490284, "epoch": 0.43835862732774367, "grad_norm": 7.804668426513672, "learning_rate": 3.8210211004665206e-06, "loss": 0.4605, "mean_token_accuracy": 0.8549025520682335, "num_tokens": 170114007.0, "step": 141410 }, { "entropy": 1.7784633502364158, "epoch": 0.4383896264527934, "grad_norm": 7.331371784210205, "learning_rate": 3.820886002330814e-06, "loss": 0.3455, "mean_token_accuracy": 0.8712356150150299, "num_tokens": 170127600.0, "step": 141420 }, { "entropy": 1.948923571407795, "epoch": 0.43842062557784306, "grad_norm": 9.89287281036377, "learning_rate": 3.820750918523906e-06, "loss": 0.4901, "mean_token_accuracy": 0.8478708654642105, "num_tokens": 170138776.0, "step": 141430 }, { "entropy": 1.9679486066102982, "epoch": 0.4384516247028928, "grad_norm": 9.170439720153809, "learning_rate": 3.820615849043266e-06, "loss": 0.4568, "mean_token_accuracy": 0.8551394686102867, "num_tokens": 170150407.0, "step": 141440 }, { "entropy": 1.9057482168078423, "epoch": 0.43848262382794245, "grad_norm": 9.319578170776367, "learning_rate": 3.820480793886361e-06, "loss": 0.4794, "mean_token_accuracy": 0.8556445702910424, "num_tokens": 170162841.0, "step": 141450 }, { "entropy": 1.872725434601307, "epoch": 0.4385136229529922, "grad_norm": 8.340863227844238, "learning_rate": 3.820345753050659e-06, "loss": 0.4323, "mean_token_accuracy": 0.8682135134935379, "num_tokens": 170175371.0, "step": 141460 }, { "entropy": 1.8802336245775222, "epoch": 0.43854462207804185, "grad_norm": 8.725083351135254, "learning_rate": 3.82021072653363e-06, "loss": 0.4786, "mean_token_accuracy": 0.8557636126875877, "num_tokens": 170187618.0, "step": 141470 }, { "entropy": 1.8927005499601364, "epoch": 0.43857562120309157, "grad_norm": 3.3288369178771973, "learning_rate": 3.820075714332744e-06, "loss": 0.457, "mean_token_accuracy": 0.8532668456435204, "num_tokens": 170199475.0, "step": 141480 }, { "entropy": 1.996702989935875, "epoch": 0.43860662032814124, "grad_norm": 7.832830429077148, "learning_rate": 3.819940716445472e-06, "loss": 0.5113, "mean_token_accuracy": 0.8426687330007553, "num_tokens": 170210531.0, "step": 141490 }, { "entropy": 1.9969799607992171, "epoch": 0.4386376194531909, "grad_norm": 8.912454605102539, "learning_rate": 3.819805732869283e-06, "loss": 0.5241, "mean_token_accuracy": 0.8406167760491371, "num_tokens": 170221403.0, "step": 141500 }, { "entropy": 1.9903255164623261, "epoch": 0.43866861857824063, "grad_norm": 8.586922645568848, "learning_rate": 3.81967076360165e-06, "loss": 0.5062, "mean_token_accuracy": 0.8494381055235862, "num_tokens": 170232697.0, "step": 141510 }, { "entropy": 1.893722152709961, "epoch": 0.4386996177032903, "grad_norm": 7.579135894775391, "learning_rate": 3.819535808640045e-06, "loss": 0.4301, "mean_token_accuracy": 0.8595759615302085, "num_tokens": 170244794.0, "step": 141520 }, { "entropy": 1.9859010726213455, "epoch": 0.43873061682834, "grad_norm": 8.621550559997559, "learning_rate": 3.819400867981941e-06, "loss": 0.5332, "mean_token_accuracy": 0.8447252556681633, "num_tokens": 170255961.0, "step": 141530 }, { "entropy": 1.945717729628086, "epoch": 0.4387616159533897, "grad_norm": 8.728808403015137, "learning_rate": 3.819265941624811e-06, "loss": 0.4971, "mean_token_accuracy": 0.8427330657839776, "num_tokens": 170268226.0, "step": 141540 }, { "entropy": 2.0041126042604445, "epoch": 0.4387926150784394, "grad_norm": 10.106131553649902, "learning_rate": 3.819131029566131e-06, "loss": 0.5313, "mean_token_accuracy": 0.8371750578284264, "num_tokens": 170278720.0, "step": 141550 }, { "entropy": 1.9148709684610368, "epoch": 0.4388236142034891, "grad_norm": 9.913314819335938, "learning_rate": 3.818996131803373e-06, "loss": 0.4727, "mean_token_accuracy": 0.840086530148983, "num_tokens": 170291028.0, "step": 141560 }, { "entropy": 1.9021180346608162, "epoch": 0.4388546133285388, "grad_norm": 7.955324649810791, "learning_rate": 3.818861248334014e-06, "loss": 0.4134, "mean_token_accuracy": 0.8594782516360283, "num_tokens": 170303129.0, "step": 141570 }, { "entropy": 1.8553086191415786, "epoch": 0.4388856124535885, "grad_norm": 4.323651313781738, "learning_rate": 3.81872637915553e-06, "loss": 0.4325, "mean_token_accuracy": 0.854140630364418, "num_tokens": 170316049.0, "step": 141580 }, { "entropy": 1.9587162017822266, "epoch": 0.4389166115786382, "grad_norm": 8.201522827148438, "learning_rate": 3.818591524265398e-06, "loss": 0.4599, "mean_token_accuracy": 0.8508410602807999, "num_tokens": 170327674.0, "step": 141590 }, { "entropy": 1.986707004904747, "epoch": 0.4389476107036879, "grad_norm": 7.647431373596191, "learning_rate": 3.8184566836610944e-06, "loss": 0.4764, "mean_token_accuracy": 0.8480009466409684, "num_tokens": 170339108.0, "step": 141600 }, { "entropy": 1.948198501765728, "epoch": 0.4389786098287376, "grad_norm": 7.071403503417969, "learning_rate": 3.818321857340097e-06, "loss": 0.509, "mean_token_accuracy": 0.8380307227373123, "num_tokens": 170350766.0, "step": 141610 }, { "entropy": 1.9600090399384498, "epoch": 0.43900960895378727, "grad_norm": 9.295860290527344, "learning_rate": 3.818187045299886e-06, "loss": 0.4916, "mean_token_accuracy": 0.8459015518426896, "num_tokens": 170362080.0, "step": 141620 }, { "entropy": 1.9097954377532005, "epoch": 0.439040608078837, "grad_norm": 4.016225337982178, "learning_rate": 3.818052247537938e-06, "loss": 0.4367, "mean_token_accuracy": 0.8569407507777214, "num_tokens": 170374022.0, "step": 141630 }, { "entropy": 1.8951156988739968, "epoch": 0.43907160720388666, "grad_norm": 7.259555339813232, "learning_rate": 3.817917464051734e-06, "loss": 0.4419, "mean_token_accuracy": 0.8556249186396598, "num_tokens": 170386951.0, "step": 141640 }, { "entropy": 1.9594153746962548, "epoch": 0.4391026063289364, "grad_norm": 7.583054065704346, "learning_rate": 3.817782694838756e-06, "loss": 0.4526, "mean_token_accuracy": 0.856129738688469, "num_tokens": 170398778.0, "step": 141650 }, { "entropy": 1.8969098508358002, "epoch": 0.43913360545398605, "grad_norm": 9.536150932312012, "learning_rate": 3.817647939896483e-06, "loss": 0.4578, "mean_token_accuracy": 0.8510769203305244, "num_tokens": 170411665.0, "step": 141660 }, { "entropy": 1.9632080033421517, "epoch": 0.4391646045790358, "grad_norm": 7.881152153015137, "learning_rate": 3.8175131992223965e-06, "loss": 0.4795, "mean_token_accuracy": 0.8432640329003334, "num_tokens": 170423508.0, "step": 141670 }, { "entropy": 1.8790015771985054, "epoch": 0.43919560370408545, "grad_norm": 9.358724594116211, "learning_rate": 3.81737847281398e-06, "loss": 0.4405, "mean_token_accuracy": 0.8637870714068413, "num_tokens": 170435638.0, "step": 141680 }, { "entropy": 1.8561220914125443, "epoch": 0.43922660282913517, "grad_norm": 8.335770606994629, "learning_rate": 3.8172437606687156e-06, "loss": 0.473, "mean_token_accuracy": 0.8470362722873688, "num_tokens": 170449929.0, "step": 141690 }, { "entropy": 1.9746687322854997, "epoch": 0.43925760195418484, "grad_norm": 7.2637248039245605, "learning_rate": 3.817109062784087e-06, "loss": 0.5127, "mean_token_accuracy": 0.842483501136303, "num_tokens": 170461051.0, "step": 141700 }, { "entropy": 1.9206561237573623, "epoch": 0.43928860107923456, "grad_norm": 9.621170997619629, "learning_rate": 3.816974379157578e-06, "loss": 0.4337, "mean_token_accuracy": 0.8608662933111191, "num_tokens": 170472540.0, "step": 141710 }, { "entropy": 1.9312569439411162, "epoch": 0.43931960020428423, "grad_norm": 9.245631217956543, "learning_rate": 3.816839709786675e-06, "loss": 0.4809, "mean_token_accuracy": 0.8509629473090172, "num_tokens": 170484584.0, "step": 141720 }, { "entropy": 1.9452128455042839, "epoch": 0.4393505993293339, "grad_norm": 8.906986236572266, "learning_rate": 3.81670505466886e-06, "loss": 0.4482, "mean_token_accuracy": 0.8519314289093017, "num_tokens": 170496707.0, "step": 141730 }, { "entropy": 1.957577820122242, "epoch": 0.4393815984543836, "grad_norm": 7.901788234710693, "learning_rate": 3.816570413801622e-06, "loss": 0.4578, "mean_token_accuracy": 0.8558757767081261, "num_tokens": 170507715.0, "step": 141740 }, { "entropy": 1.9821037575602531, "epoch": 0.4394125975794333, "grad_norm": 7.910948276519775, "learning_rate": 3.8164357871824466e-06, "loss": 0.4422, "mean_token_accuracy": 0.8592849254608155, "num_tokens": 170518782.0, "step": 141750 }, { "entropy": 1.9237025648355484, "epoch": 0.439443596704483, "grad_norm": 7.395272731781006, "learning_rate": 3.816301174808821e-06, "loss": 0.4709, "mean_token_accuracy": 0.8577270656824112, "num_tokens": 170531656.0, "step": 141760 }, { "entropy": 1.9360353201627731, "epoch": 0.4394745958295327, "grad_norm": 4.2066426277160645, "learning_rate": 3.8161665766782335e-06, "loss": 0.4523, "mean_token_accuracy": 0.8494734823703766, "num_tokens": 170543484.0, "step": 141770 }, { "entropy": 1.969169418513775, "epoch": 0.4395055949545824, "grad_norm": 4.006796360015869, "learning_rate": 3.816031992788171e-06, "loss": 0.4693, "mean_token_accuracy": 0.8522426009178161, "num_tokens": 170555226.0, "step": 141780 }, { "entropy": 1.7786556363105774, "epoch": 0.4395365940796321, "grad_norm": 8.278726577758789, "learning_rate": 3.815897423136125e-06, "loss": 0.3426, "mean_token_accuracy": 0.8771900832653046, "num_tokens": 170568645.0, "step": 141790 }, { "entropy": 1.9889621213078499, "epoch": 0.4395675932046818, "grad_norm": 8.236480712890625, "learning_rate": 3.815762867719584e-06, "loss": 0.4572, "mean_token_accuracy": 0.8558431312441825, "num_tokens": 170580292.0, "step": 141800 }, { "entropy": 1.8943669281899929, "epoch": 0.4395985923297315, "grad_norm": 8.767983436584473, "learning_rate": 3.815628326536037e-06, "loss": 0.462, "mean_token_accuracy": 0.8443626776337624, "num_tokens": 170593285.0, "step": 141810 }, { "entropy": 1.9128671914339066, "epoch": 0.4396295914547812, "grad_norm": 3.5375518798828125, "learning_rate": 3.815493799582977e-06, "loss": 0.4133, "mean_token_accuracy": 0.8669633626937866, "num_tokens": 170605706.0, "step": 141820 }, { "entropy": 1.985709111392498, "epoch": 0.43966059057983087, "grad_norm": 7.396274566650391, "learning_rate": 3.815359286857895e-06, "loss": 0.4849, "mean_token_accuracy": 0.8488274723291397, "num_tokens": 170617185.0, "step": 141830 }, { "entropy": 1.9252109676599503, "epoch": 0.4396915897048806, "grad_norm": 5.221693515777588, "learning_rate": 3.815224788358284e-06, "loss": 0.4519, "mean_token_accuracy": 0.8559987634420395, "num_tokens": 170629091.0, "step": 141840 }, { "entropy": 1.8718436300754546, "epoch": 0.43972258882993026, "grad_norm": 4.875570297241211, "learning_rate": 3.815090304081635e-06, "loss": 0.4358, "mean_token_accuracy": 0.8549114823341369, "num_tokens": 170642049.0, "step": 141850 }, { "entropy": 1.8797766268253326, "epoch": 0.43975358795498, "grad_norm": 7.365478515625, "learning_rate": 3.814955834025442e-06, "loss": 0.4081, "mean_token_accuracy": 0.8585930600762367, "num_tokens": 170654668.0, "step": 141860 }, { "entropy": 1.9316943183541297, "epoch": 0.43978458708002965, "grad_norm": 4.897908687591553, "learning_rate": 3.814821378187199e-06, "loss": 0.4531, "mean_token_accuracy": 0.8544142752885818, "num_tokens": 170666437.0, "step": 141870 }, { "entropy": 1.799608789384365, "epoch": 0.4398155862050794, "grad_norm": 4.87161922454834, "learning_rate": 3.814686936564401e-06, "loss": 0.4168, "mean_token_accuracy": 0.8723533883690834, "num_tokens": 170680355.0, "step": 141880 }, { "entropy": 1.9116061985492707, "epoch": 0.43984658533012905, "grad_norm": 7.900229454040527, "learning_rate": 3.814552509154544e-06, "loss": 0.4451, "mean_token_accuracy": 0.8542070344090462, "num_tokens": 170691960.0, "step": 141890 }, { "entropy": 1.9224073961377144, "epoch": 0.43987758445517877, "grad_norm": 7.187701225280762, "learning_rate": 3.8144180959551223e-06, "loss": 0.4004, "mean_token_accuracy": 0.8590036764740944, "num_tokens": 170703505.0, "step": 141900 }, { "entropy": 1.8964632406830788, "epoch": 0.43990858358022844, "grad_norm": 7.443815231323242, "learning_rate": 3.8142836969636336e-06, "loss": 0.4465, "mean_token_accuracy": 0.8544583544135094, "num_tokens": 170715715.0, "step": 141910 }, { "entropy": 1.8948586672544478, "epoch": 0.43993958270527816, "grad_norm": 8.835453987121582, "learning_rate": 3.8141493121775747e-06, "loss": 0.4289, "mean_token_accuracy": 0.8569506093859672, "num_tokens": 170728395.0, "step": 141920 }, { "entropy": 1.925899577140808, "epoch": 0.43997058183032783, "grad_norm": 7.583258628845215, "learning_rate": 3.814014941594443e-06, "loss": 0.4607, "mean_token_accuracy": 0.850526462495327, "num_tokens": 170740412.0, "step": 141930 }, { "entropy": 1.9305981874465943, "epoch": 0.44000158095537756, "grad_norm": 3.2110064029693604, "learning_rate": 3.8138805852117377e-06, "loss": 0.4387, "mean_token_accuracy": 0.8527848124504089, "num_tokens": 170752268.0, "step": 141940 }, { "entropy": 1.8623799979686737, "epoch": 0.4400325800804272, "grad_norm": 7.726592540740967, "learning_rate": 3.8137462430269565e-06, "loss": 0.4103, "mean_token_accuracy": 0.8616868555545807, "num_tokens": 170764922.0, "step": 141950 }, { "entropy": 1.9484892964363099, "epoch": 0.44006357920547695, "grad_norm": 8.359121322631836, "learning_rate": 3.8136119150376006e-06, "loss": 0.4418, "mean_token_accuracy": 0.8625785291194916, "num_tokens": 170776660.0, "step": 141960 }, { "entropy": 1.898753324151039, "epoch": 0.4400945783305266, "grad_norm": 3.5382606983184814, "learning_rate": 3.8134776012411693e-06, "loss": 0.4573, "mean_token_accuracy": 0.8586586147546769, "num_tokens": 170789396.0, "step": 141970 }, { "entropy": 1.8218749940395356, "epoch": 0.4401255774555763, "grad_norm": 7.84831428527832, "learning_rate": 3.8133433016351636e-06, "loss": 0.3302, "mean_token_accuracy": 0.8637544587254524, "num_tokens": 170802890.0, "step": 141980 }, { "entropy": 1.9783765748143196, "epoch": 0.440156576580626, "grad_norm": 7.985630512237549, "learning_rate": 3.813209016217084e-06, "loss": 0.4766, "mean_token_accuracy": 0.8482358425855636, "num_tokens": 170814592.0, "step": 141990 }, { "entropy": 1.8626375451683999, "epoch": 0.4401875757056757, "grad_norm": 4.535806179046631, "learning_rate": 3.8130747449844342e-06, "loss": 0.3822, "mean_token_accuracy": 0.8576488435268402, "num_tokens": 170828197.0, "step": 142000 }, { "entropy": 1.9288464725017547, "epoch": 0.4402185748307254, "grad_norm": 4.06549596786499, "learning_rate": 3.812940487934716e-06, "loss": 0.4301, "mean_token_accuracy": 0.8626020058989525, "num_tokens": 170840003.0, "step": 142010 }, { "entropy": 1.9120705425739288, "epoch": 0.4402495739557751, "grad_norm": 7.126809597015381, "learning_rate": 3.8128062450654323e-06, "loss": 0.4804, "mean_token_accuracy": 0.8442542180418968, "num_tokens": 170852921.0, "step": 142020 }, { "entropy": 1.9289035245776176, "epoch": 0.4402805730808248, "grad_norm": 7.8247246742248535, "learning_rate": 3.8126720163740884e-06, "loss": 0.4411, "mean_token_accuracy": 0.846360231935978, "num_tokens": 170866003.0, "step": 142030 }, { "entropy": 1.9721984177827836, "epoch": 0.44031157220587447, "grad_norm": 8.725114822387695, "learning_rate": 3.8125378018581864e-06, "loss": 0.4929, "mean_token_accuracy": 0.85105609446764, "num_tokens": 170877690.0, "step": 142040 }, { "entropy": 1.8866790056228637, "epoch": 0.4403425713309242, "grad_norm": 9.154146194458008, "learning_rate": 3.8124036015152344e-06, "loss": 0.4527, "mean_token_accuracy": 0.8527222231030465, "num_tokens": 170890766.0, "step": 142050 }, { "entropy": 1.9038648203015327, "epoch": 0.44037357045597386, "grad_norm": 9.711874008178711, "learning_rate": 3.8122694153427362e-06, "loss": 0.4257, "mean_token_accuracy": 0.8540343955159188, "num_tokens": 170902831.0, "step": 142060 }, { "entropy": 1.946737252175808, "epoch": 0.4404045695810236, "grad_norm": 6.8612380027771, "learning_rate": 3.8121352433381986e-06, "loss": 0.457, "mean_token_accuracy": 0.8524143502116204, "num_tokens": 170914196.0, "step": 142070 }, { "entropy": 1.9659889578819274, "epoch": 0.44043556870607325, "grad_norm": 7.955692768096924, "learning_rate": 3.812001085499129e-06, "loss": 0.556, "mean_token_accuracy": 0.8456687927246094, "num_tokens": 170925736.0, "step": 142080 }, { "entropy": 1.918195514380932, "epoch": 0.440466567831123, "grad_norm": 4.864497184753418, "learning_rate": 3.811866941823035e-06, "loss": 0.4486, "mean_token_accuracy": 0.8531820371747016, "num_tokens": 170937320.0, "step": 142090 }, { "entropy": 1.924280734360218, "epoch": 0.44049756695617265, "grad_norm": 8.45577335357666, "learning_rate": 3.8117328123074237e-06, "loss": 0.4733, "mean_token_accuracy": 0.8453326195478439, "num_tokens": 170949120.0, "step": 142100 }, { "entropy": 1.8904883772134782, "epoch": 0.4405285660812224, "grad_norm": 3.6915700435638428, "learning_rate": 3.8115986969498047e-06, "loss": 0.4394, "mean_token_accuracy": 0.8524854764342308, "num_tokens": 170960950.0, "step": 142110 }, { "entropy": 1.787447765469551, "epoch": 0.44055956520627204, "grad_norm": 2.5302090644836426, "learning_rate": 3.811464595747688e-06, "loss": 0.376, "mean_token_accuracy": 0.8622151434421539, "num_tokens": 170974946.0, "step": 142120 }, { "entropy": 1.7317695260047912, "epoch": 0.44059056433132177, "grad_norm": 4.011857509613037, "learning_rate": 3.811330508698583e-06, "loss": 0.3431, "mean_token_accuracy": 0.8648561984300613, "num_tokens": 170989419.0, "step": 142130 }, { "entropy": 1.9078683421015739, "epoch": 0.44062156345637143, "grad_norm": 4.096642971038818, "learning_rate": 3.8111964358000005e-06, "loss": 0.4389, "mean_token_accuracy": 0.8534543007612229, "num_tokens": 171001053.0, "step": 142140 }, { "entropy": 1.938607893884182, "epoch": 0.44065256258142116, "grad_norm": 8.272358894348145, "learning_rate": 3.8110623770494515e-06, "loss": 0.4844, "mean_token_accuracy": 0.8414226740598678, "num_tokens": 171012985.0, "step": 142150 }, { "entropy": 1.9575382590293884, "epoch": 0.4406835617064708, "grad_norm": 8.397919654846191, "learning_rate": 3.8109283324444484e-06, "loss": 0.4773, "mean_token_accuracy": 0.8492600947618485, "num_tokens": 171023779.0, "step": 142160 }, { "entropy": 1.935109880566597, "epoch": 0.44071456083152055, "grad_norm": 6.095836162567139, "learning_rate": 3.8107943019825027e-06, "loss": 0.4592, "mean_token_accuracy": 0.8479932472109795, "num_tokens": 171035289.0, "step": 142170 }, { "entropy": 1.952261172235012, "epoch": 0.4407455599565702, "grad_norm": 7.735157489776611, "learning_rate": 3.8106602856611296e-06, "loss": 0.579, "mean_token_accuracy": 0.8471168518066406, "num_tokens": 171047459.0, "step": 142180 }, { "entropy": 1.929404976963997, "epoch": 0.44077655908161995, "grad_norm": 3.8500773906707764, "learning_rate": 3.8105262834778404e-06, "loss": 0.4765, "mean_token_accuracy": 0.8393190920352935, "num_tokens": 171059462.0, "step": 142190 }, { "entropy": 1.8857171356678009, "epoch": 0.4408075582066696, "grad_norm": 8.527490615844727, "learning_rate": 3.8103922954301514e-06, "loss": 0.4077, "mean_token_accuracy": 0.8652898028492928, "num_tokens": 171071937.0, "step": 142200 }, { "entropy": 1.9904403433203697, "epoch": 0.44083855733171934, "grad_norm": 9.216706275939941, "learning_rate": 3.810258321515576e-06, "loss": 0.5001, "mean_token_accuracy": 0.8476386234164238, "num_tokens": 171083344.0, "step": 142210 }, { "entropy": 1.8266815394163132, "epoch": 0.440869556456769, "grad_norm": 9.037412643432617, "learning_rate": 3.81012436173163e-06, "loss": 0.4039, "mean_token_accuracy": 0.855592280626297, "num_tokens": 171097604.0, "step": 142220 }, { "entropy": 1.963519898056984, "epoch": 0.4409005555818187, "grad_norm": 7.543654441833496, "learning_rate": 3.809990416075832e-06, "loss": 0.4642, "mean_token_accuracy": 0.8579617440700531, "num_tokens": 171108370.0, "step": 142230 }, { "entropy": 1.9259283766150475, "epoch": 0.4409315547068684, "grad_norm": 7.956765174865723, "learning_rate": 3.809856484545695e-06, "loss": 0.4481, "mean_token_accuracy": 0.8434132248163223, "num_tokens": 171120766.0, "step": 142240 }, { "entropy": 1.9885295629501343, "epoch": 0.44096255383191807, "grad_norm": 8.011740684509277, "learning_rate": 3.8097225671387384e-06, "loss": 0.4909, "mean_token_accuracy": 0.8473825648427009, "num_tokens": 171131762.0, "step": 142250 }, { "entropy": 1.993890830874443, "epoch": 0.4409935529569678, "grad_norm": 8.500561714172363, "learning_rate": 3.80958866385248e-06, "loss": 0.4702, "mean_token_accuracy": 0.8485628053545952, "num_tokens": 171142751.0, "step": 142260 }, { "entropy": 1.937794804573059, "epoch": 0.44102455208201746, "grad_norm": 4.054864883422852, "learning_rate": 3.8094547746844392e-06, "loss": 0.4594, "mean_token_accuracy": 0.8569764137268067, "num_tokens": 171155116.0, "step": 142270 }, { "entropy": 1.9017036750912666, "epoch": 0.4410555512070672, "grad_norm": 7.951387882232666, "learning_rate": 3.809320899632134e-06, "loss": 0.4321, "mean_token_accuracy": 0.8537882640957832, "num_tokens": 171167623.0, "step": 142280 }, { "entropy": 1.9774706676602363, "epoch": 0.44108655033211686, "grad_norm": 9.138680458068848, "learning_rate": 3.809187038693085e-06, "loss": 0.5366, "mean_token_accuracy": 0.8372975870966911, "num_tokens": 171179230.0, "step": 142290 }, { "entropy": 1.9862707808613778, "epoch": 0.4411175494571666, "grad_norm": 7.265309810638428, "learning_rate": 3.809053191864811e-06, "loss": 0.4819, "mean_token_accuracy": 0.8446347713470459, "num_tokens": 171190091.0, "step": 142300 }, { "entropy": 1.9375395745038986, "epoch": 0.44114854858221625, "grad_norm": 8.636645317077637, "learning_rate": 3.808919359144836e-06, "loss": 0.4197, "mean_token_accuracy": 0.8591432020068168, "num_tokens": 171201834.0, "step": 142310 }, { "entropy": 1.905143465101719, "epoch": 0.441179547707266, "grad_norm": 4.498809814453125, "learning_rate": 3.8087855405306796e-06, "loss": 0.4133, "mean_token_accuracy": 0.8690479651093483, "num_tokens": 171213960.0, "step": 142320 }, { "entropy": 1.9409716859459878, "epoch": 0.44121054683231564, "grad_norm": 8.317008972167969, "learning_rate": 3.8086517360198645e-06, "loss": 0.4613, "mean_token_accuracy": 0.8425351768732071, "num_tokens": 171225664.0, "step": 142330 }, { "entropy": 1.9071755453944206, "epoch": 0.44124154595736537, "grad_norm": 7.693809986114502, "learning_rate": 3.8085179456099135e-06, "loss": 0.4228, "mean_token_accuracy": 0.8560735136270523, "num_tokens": 171238142.0, "step": 142340 }, { "entropy": 1.8364718183875084, "epoch": 0.44127254508241504, "grad_norm": 5.97056245803833, "learning_rate": 3.80838416929835e-06, "loss": 0.395, "mean_token_accuracy": 0.8731970608234405, "num_tokens": 171251102.0, "step": 142350 }, { "entropy": 1.8636333227157593, "epoch": 0.44130354420746476, "grad_norm": 4.0348920822143555, "learning_rate": 3.8082504070826986e-06, "loss": 0.4076, "mean_token_accuracy": 0.863330303132534, "num_tokens": 171263656.0, "step": 142360 }, { "entropy": 1.8688743382692337, "epoch": 0.44133454333251443, "grad_norm": 8.717557907104492, "learning_rate": 3.8081166589604846e-06, "loss": 0.3986, "mean_token_accuracy": 0.8605389028787613, "num_tokens": 171276463.0, "step": 142370 }, { "entropy": 1.851096446812153, "epoch": 0.44136554245756415, "grad_norm": 7.275476455688477, "learning_rate": 3.8079829249292312e-06, "loss": 0.4234, "mean_token_accuracy": 0.8582789734005928, "num_tokens": 171289605.0, "step": 142380 }, { "entropy": 1.966257530450821, "epoch": 0.4413965415826138, "grad_norm": 9.199238777160645, "learning_rate": 3.8078492049864664e-06, "loss": 0.4801, "mean_token_accuracy": 0.8538633704185485, "num_tokens": 171300168.0, "step": 142390 }, { "entropy": 1.7878931298851968, "epoch": 0.44142754070766355, "grad_norm": 3.798326253890991, "learning_rate": 3.8077154991297155e-06, "loss": 0.3512, "mean_token_accuracy": 0.863753379881382, "num_tokens": 171314188.0, "step": 142400 }, { "entropy": 1.9457097873091698, "epoch": 0.4414585398327132, "grad_norm": 7.9168853759765625, "learning_rate": 3.8075818073565064e-06, "loss": 0.4897, "mean_token_accuracy": 0.8465751528739929, "num_tokens": 171326103.0, "step": 142410 }, { "entropy": 1.9705891758203506, "epoch": 0.44148953895776294, "grad_norm": 9.16375732421875, "learning_rate": 3.8074481296643662e-06, "loss": 0.4359, "mean_token_accuracy": 0.860154564678669, "num_tokens": 171337413.0, "step": 142420 }, { "entropy": 1.942754976451397, "epoch": 0.4415205380828126, "grad_norm": 8.25278091430664, "learning_rate": 3.807314466050824e-06, "loss": 0.4173, "mean_token_accuracy": 0.8554434284567833, "num_tokens": 171349655.0, "step": 142430 }, { "entropy": 1.9804495930671693, "epoch": 0.44155153720786233, "grad_norm": 7.4566144943237305, "learning_rate": 3.8071808165134087e-06, "loss": 0.5058, "mean_token_accuracy": 0.848778210580349, "num_tokens": 171360943.0, "step": 142440 }, { "entropy": 1.8187919482588768, "epoch": 0.441582536332912, "grad_norm": 7.4357686042785645, "learning_rate": 3.807047181049649e-06, "loss": 0.3858, "mean_token_accuracy": 0.8577719181776047, "num_tokens": 171374928.0, "step": 142450 }, { "entropy": 1.9486502900719642, "epoch": 0.4416135354579617, "grad_norm": 8.6198148727417, "learning_rate": 3.8069135596570757e-06, "loss": 0.4717, "mean_token_accuracy": 0.8480415970087052, "num_tokens": 171387021.0, "step": 142460 }, { "entropy": 1.7527381375432014, "epoch": 0.4416445345830114, "grad_norm": 7.5862135887146, "learning_rate": 3.806779952333219e-06, "loss": 0.3672, "mean_token_accuracy": 0.8725428983569146, "num_tokens": 171401592.0, "step": 142470 }, { "entropy": 1.9407009825110435, "epoch": 0.44167553370806106, "grad_norm": 7.653254985809326, "learning_rate": 3.806646359075612e-06, "loss": 0.4221, "mean_token_accuracy": 0.8616171032190323, "num_tokens": 171413307.0, "step": 142480 }, { "entropy": 1.8926226265728474, "epoch": 0.4417065328331108, "grad_norm": 7.93331241607666, "learning_rate": 3.806512779881785e-06, "loss": 0.4041, "mean_token_accuracy": 0.8604073897004128, "num_tokens": 171427027.0, "step": 142490 }, { "entropy": 1.9086723767220974, "epoch": 0.44173753195816046, "grad_norm": 8.239239692687988, "learning_rate": 3.806379214749271e-06, "loss": 0.5015, "mean_token_accuracy": 0.8534449085593223, "num_tokens": 171440127.0, "step": 142500 }, { "entropy": 1.888797688484192, "epoch": 0.4417685310832102, "grad_norm": 8.310606956481934, "learning_rate": 3.8062456636756035e-06, "loss": 0.3774, "mean_token_accuracy": 0.871643802523613, "num_tokens": 171452073.0, "step": 142510 }, { "entropy": 1.8726611629128456, "epoch": 0.44179953020825985, "grad_norm": 8.1859769821167, "learning_rate": 3.8061121266583157e-06, "loss": 0.4828, "mean_token_accuracy": 0.8519933164119721, "num_tokens": 171465272.0, "step": 142520 }, { "entropy": 1.9565669789910316, "epoch": 0.4418305293333096, "grad_norm": 7.003886699676514, "learning_rate": 3.805978603694943e-06, "loss": 0.4643, "mean_token_accuracy": 0.8577442169189453, "num_tokens": 171476604.0, "step": 142530 }, { "entropy": 1.969398957490921, "epoch": 0.44186152845835924, "grad_norm": 7.602597713470459, "learning_rate": 3.80584509478302e-06, "loss": 0.4394, "mean_token_accuracy": 0.8602959275245666, "num_tokens": 171487864.0, "step": 142540 }, { "entropy": 1.9313815072178842, "epoch": 0.44189252758340897, "grad_norm": 7.627027988433838, "learning_rate": 3.805711599920083e-06, "loss": 0.4469, "mean_token_accuracy": 0.8534037470817566, "num_tokens": 171500018.0, "step": 142550 }, { "entropy": 1.9397203966975212, "epoch": 0.44192352670845864, "grad_norm": 8.17835521697998, "learning_rate": 3.805578119103666e-06, "loss": 0.4608, "mean_token_accuracy": 0.8493491873145104, "num_tokens": 171511356.0, "step": 142560 }, { "entropy": 1.8530723094940185, "epoch": 0.44195452583350836, "grad_norm": 7.996511459350586, "learning_rate": 3.8054446523313083e-06, "loss": 0.4775, "mean_token_accuracy": 0.8453282520174981, "num_tokens": 171524204.0, "step": 142570 }, { "entropy": 1.839161404967308, "epoch": 0.44198552495855803, "grad_norm": 4.661229133605957, "learning_rate": 3.8053111996005454e-06, "loss": 0.3742, "mean_token_accuracy": 0.859870445728302, "num_tokens": 171537124.0, "step": 142580 }, { "entropy": 1.8566135600209237, "epoch": 0.44201652408360775, "grad_norm": 5.89265775680542, "learning_rate": 3.8051777609089174e-06, "loss": 0.4537, "mean_token_accuracy": 0.8573786094784737, "num_tokens": 171550107.0, "step": 142590 }, { "entropy": 1.8885900154709816, "epoch": 0.4420475232086574, "grad_norm": 7.972777366638184, "learning_rate": 3.805044336253962e-06, "loss": 0.4445, "mean_token_accuracy": 0.8563668861985206, "num_tokens": 171562634.0, "step": 142600 }, { "entropy": 1.9973913908004761, "epoch": 0.44207852233370715, "grad_norm": 7.949423789978027, "learning_rate": 3.8049109256332184e-06, "loss": 0.5469, "mean_token_accuracy": 0.8333679780364036, "num_tokens": 171574739.0, "step": 142610 }, { "entropy": 1.9654783993959426, "epoch": 0.4421095214587568, "grad_norm": 8.60360050201416, "learning_rate": 3.804777529044226e-06, "loss": 0.4183, "mean_token_accuracy": 0.8631428495049477, "num_tokens": 171585950.0, "step": 142620 }, { "entropy": 1.9139345183968544, "epoch": 0.44214052058380654, "grad_norm": 8.448734283447266, "learning_rate": 3.804644146484526e-06, "loss": 0.4199, "mean_token_accuracy": 0.850536921620369, "num_tokens": 171597859.0, "step": 142630 }, { "entropy": 1.8951680943369866, "epoch": 0.4421715197088562, "grad_norm": 7.874819278717041, "learning_rate": 3.8045107779516586e-06, "loss": 0.4726, "mean_token_accuracy": 0.8422896340489388, "num_tokens": 171610294.0, "step": 142640 }, { "entropy": 1.9419591963291167, "epoch": 0.44220251883390593, "grad_norm": 3.6570918560028076, "learning_rate": 3.8043774234431667e-06, "loss": 0.4749, "mean_token_accuracy": 0.8482140868902206, "num_tokens": 171621850.0, "step": 142650 }, { "entropy": 1.8761625073850154, "epoch": 0.4422335179589556, "grad_norm": 8.101895332336426, "learning_rate": 3.804244082956592e-06, "loss": 0.4294, "mean_token_accuracy": 0.8565763145685196, "num_tokens": 171634316.0, "step": 142660 }, { "entropy": 1.9388285502791405, "epoch": 0.4422645170840053, "grad_norm": 3.6656668186187744, "learning_rate": 3.8041107564894777e-06, "loss": 0.4821, "mean_token_accuracy": 0.8403656214475632, "num_tokens": 171646140.0, "step": 142670 }, { "entropy": 1.9169174507260323, "epoch": 0.442295516209055, "grad_norm": 7.208846092224121, "learning_rate": 3.803977444039366e-06, "loss": 0.4243, "mean_token_accuracy": 0.8567602500319481, "num_tokens": 171657641.0, "step": 142680 }, { "entropy": 1.7894317671656608, "epoch": 0.4423265153341047, "grad_norm": 4.454036712646484, "learning_rate": 3.803844145603802e-06, "loss": 0.367, "mean_token_accuracy": 0.8561735481023789, "num_tokens": 171671211.0, "step": 142690 }, { "entropy": 1.924549500644207, "epoch": 0.4423575144591544, "grad_norm": 4.878185272216797, "learning_rate": 3.803710861180331e-06, "loss": 0.4273, "mean_token_accuracy": 0.8529748469591141, "num_tokens": 171683021.0, "step": 142700 }, { "entropy": 1.926971609890461, "epoch": 0.4423885135842041, "grad_norm": 8.438478469848633, "learning_rate": 3.803577590766498e-06, "loss": 0.4745, "mean_token_accuracy": 0.8476522505283356, "num_tokens": 171694922.0, "step": 142710 }, { "entropy": 1.869713193178177, "epoch": 0.4424195127092538, "grad_norm": 8.490039825439453, "learning_rate": 3.8034443343598484e-06, "loss": 0.4056, "mean_token_accuracy": 0.8578268960118294, "num_tokens": 171707523.0, "step": 142720 }, { "entropy": 1.89549450725317, "epoch": 0.44245051183430345, "grad_norm": 3.9622802734375, "learning_rate": 3.8033110919579275e-06, "loss": 0.4157, "mean_token_accuracy": 0.8647204831242561, "num_tokens": 171719274.0, "step": 142730 }, { "entropy": 1.9119060546159745, "epoch": 0.4424815109593532, "grad_norm": 10.171487808227539, "learning_rate": 3.8031778635582846e-06, "loss": 0.4752, "mean_token_accuracy": 0.8421938866376877, "num_tokens": 171731484.0, "step": 142740 }, { "entropy": 1.9421584740281106, "epoch": 0.44251251008440284, "grad_norm": 7.809762001037598, "learning_rate": 3.803044649158467e-06, "loss": 0.4406, "mean_token_accuracy": 0.8524343490600585, "num_tokens": 171742555.0, "step": 142750 }, { "entropy": 1.8323628097772597, "epoch": 0.44254350920945257, "grad_norm": 3.0407345294952393, "learning_rate": 3.802911448756022e-06, "loss": 0.3788, "mean_token_accuracy": 0.8646909952163696, "num_tokens": 171756753.0, "step": 142760 }, { "entropy": 1.949809417128563, "epoch": 0.44257450833450224, "grad_norm": 8.330705642700195, "learning_rate": 3.8027782623484994e-06, "loss": 0.4843, "mean_token_accuracy": 0.8445712998509407, "num_tokens": 171768227.0, "step": 142770 }, { "entropy": 1.8988068416714667, "epoch": 0.44260550745955196, "grad_norm": 7.558119297027588, "learning_rate": 3.802645089933448e-06, "loss": 0.4338, "mean_token_accuracy": 0.8599656626582146, "num_tokens": 171779928.0, "step": 142780 }, { "entropy": 1.8191672816872597, "epoch": 0.44263650658460163, "grad_norm": 8.800172805786133, "learning_rate": 3.8025119315084186e-06, "loss": 0.408, "mean_token_accuracy": 0.8557739660143853, "num_tokens": 171793531.0, "step": 142790 }, { "entropy": 1.912230722606182, "epoch": 0.44266750570965135, "grad_norm": 8.029808044433594, "learning_rate": 3.802378787070961e-06, "loss": 0.4513, "mean_token_accuracy": 0.8631566151976585, "num_tokens": 171804841.0, "step": 142800 }, { "entropy": 1.9423863098025322, "epoch": 0.442698504834701, "grad_norm": 7.330270290374756, "learning_rate": 3.8022456566186274e-06, "loss": 0.4391, "mean_token_accuracy": 0.8568160951137542, "num_tokens": 171816045.0, "step": 142810 }, { "entropy": 1.909077961742878, "epoch": 0.44272950395975075, "grad_norm": 8.023503303527832, "learning_rate": 3.8021125401489695e-06, "loss": 0.4576, "mean_token_accuracy": 0.8499825984239578, "num_tokens": 171827873.0, "step": 142820 }, { "entropy": 1.914843738079071, "epoch": 0.4427605030848004, "grad_norm": 7.886246681213379, "learning_rate": 3.8019794376595393e-06, "loss": 0.4809, "mean_token_accuracy": 0.854096032679081, "num_tokens": 171839212.0, "step": 142830 }, { "entropy": 1.747120851278305, "epoch": 0.44279150220985014, "grad_norm": 4.407447338104248, "learning_rate": 3.8018463491478903e-06, "loss": 0.3271, "mean_token_accuracy": 0.8662238627672195, "num_tokens": 171853351.0, "step": 142840 }, { "entropy": 1.9104099452495575, "epoch": 0.4428225013348998, "grad_norm": 7.366025924682617, "learning_rate": 3.801713274611577e-06, "loss": 0.4701, "mean_token_accuracy": 0.8494726955890656, "num_tokens": 171865251.0, "step": 142850 }, { "entropy": 1.9125598162412643, "epoch": 0.44285350045994953, "grad_norm": 8.276859283447266, "learning_rate": 3.8015802140481523e-06, "loss": 0.4449, "mean_token_accuracy": 0.8597826391458512, "num_tokens": 171877204.0, "step": 142860 }, { "entropy": 1.9104377076029777, "epoch": 0.4428844995849992, "grad_norm": 3.7604010105133057, "learning_rate": 3.801447167455171e-06, "loss": 0.4483, "mean_token_accuracy": 0.8584300577640533, "num_tokens": 171889490.0, "step": 142870 }, { "entropy": 1.7467719167470932, "epoch": 0.4429154987100489, "grad_norm": 2.625401258468628, "learning_rate": 3.8013141348301903e-06, "loss": 0.361, "mean_token_accuracy": 0.8759482517838478, "num_tokens": 171904139.0, "step": 142880 }, { "entropy": 1.8149724557995797, "epoch": 0.4429464978350986, "grad_norm": 7.951505184173584, "learning_rate": 3.801181116170765e-06, "loss": 0.381, "mean_token_accuracy": 0.8639876395463943, "num_tokens": 171918069.0, "step": 142890 }, { "entropy": 1.9812359169125557, "epoch": 0.4429774969601483, "grad_norm": 6.9604926109313965, "learning_rate": 3.801048111474452e-06, "loss": 0.4941, "mean_token_accuracy": 0.8483646467328072, "num_tokens": 171929606.0, "step": 142900 }, { "entropy": 1.9642615109682082, "epoch": 0.443008496085198, "grad_norm": 7.556371688842773, "learning_rate": 3.800915120738809e-06, "loss": 0.4788, "mean_token_accuracy": 0.8475595816969872, "num_tokens": 171941190.0, "step": 142910 }, { "entropy": 1.8470994770526885, "epoch": 0.4430394952102477, "grad_norm": 7.835377216339111, "learning_rate": 3.8007821439613928e-06, "loss": 0.3891, "mean_token_accuracy": 0.8662957057356835, "num_tokens": 171954065.0, "step": 142920 }, { "entropy": 1.8816244021058082, "epoch": 0.4430704943352974, "grad_norm": 3.198092222213745, "learning_rate": 3.8006491811397643e-06, "loss": 0.4295, "mean_token_accuracy": 0.8502931743860245, "num_tokens": 171966243.0, "step": 142930 }, { "entropy": 1.8951477900147438, "epoch": 0.4431014934603471, "grad_norm": 6.3186750411987305, "learning_rate": 3.80051623227148e-06, "loss": 0.4193, "mean_token_accuracy": 0.8593979939818382, "num_tokens": 171979052.0, "step": 142940 }, { "entropy": 1.939696778357029, "epoch": 0.4431324925853968, "grad_norm": 9.299203872680664, "learning_rate": 3.8003832973541e-06, "loss": 0.4681, "mean_token_accuracy": 0.8494555294513703, "num_tokens": 171990440.0, "step": 142950 }, { "entropy": 1.8888702988624573, "epoch": 0.4431634917104465, "grad_norm": 3.8916590213775635, "learning_rate": 3.800250376385186e-06, "loss": 0.4636, "mean_token_accuracy": 0.8480198219418525, "num_tokens": 172002952.0, "step": 142960 }, { "entropy": 1.9692298248410225, "epoch": 0.44319449083549617, "grad_norm": 7.702281475067139, "learning_rate": 3.8001174693622976e-06, "loss": 0.4755, "mean_token_accuracy": 0.8490083158016205, "num_tokens": 172013634.0, "step": 142970 }, { "entropy": 1.796003994345665, "epoch": 0.44322548996054584, "grad_norm": 2.3298850059509277, "learning_rate": 3.7999845762829975e-06, "loss": 0.3878, "mean_token_accuracy": 0.8645777225494384, "num_tokens": 172027808.0, "step": 142980 }, { "entropy": 1.8192441150546075, "epoch": 0.44325648908559556, "grad_norm": 8.177521705627441, "learning_rate": 3.7998516971448463e-06, "loss": 0.4098, "mean_token_accuracy": 0.8649537086486816, "num_tokens": 172041098.0, "step": 142990 }, { "entropy": 1.7670658022165298, "epoch": 0.44328748821064523, "grad_norm": 3.8939199447631836, "learning_rate": 3.7997188319454082e-06, "loss": 0.333, "mean_token_accuracy": 0.8773150816559792, "num_tokens": 172054398.0, "step": 143000 }, { "entropy": 1.839756967127323, "epoch": 0.44331848733569496, "grad_norm": 8.327499389648438, "learning_rate": 3.7995859806822448e-06, "loss": 0.4289, "mean_token_accuracy": 0.8523253813385964, "num_tokens": 172067290.0, "step": 143010 }, { "entropy": 1.9034264922142028, "epoch": 0.4433494864607446, "grad_norm": 7.8952531814575195, "learning_rate": 3.7994531433529215e-06, "loss": 0.4566, "mean_token_accuracy": 0.852616871893406, "num_tokens": 172079439.0, "step": 143020 }, { "entropy": 1.9509797424077988, "epoch": 0.44338048558579435, "grad_norm": 8.539716720581055, "learning_rate": 3.7993203199550016e-06, "loss": 0.4845, "mean_token_accuracy": 0.8532881319522858, "num_tokens": 172090400.0, "step": 143030 }, { "entropy": 1.8199230879545212, "epoch": 0.443411484710844, "grad_norm": 7.303118705749512, "learning_rate": 3.7991875104860506e-06, "loss": 0.4115, "mean_token_accuracy": 0.8639900296926498, "num_tokens": 172103631.0, "step": 143040 }, { "entropy": 1.9047269806265832, "epoch": 0.44344248383589374, "grad_norm": 3.573137044906616, "learning_rate": 3.799054714943634e-06, "loss": 0.4069, "mean_token_accuracy": 0.8529732450842857, "num_tokens": 172115897.0, "step": 143050 }, { "entropy": 1.927889946103096, "epoch": 0.4434734829609434, "grad_norm": 6.2143635749816895, "learning_rate": 3.798921933325319e-06, "loss": 0.5202, "mean_token_accuracy": 0.845208078622818, "num_tokens": 172127650.0, "step": 143060 }, { "entropy": 1.8747559115290642, "epoch": 0.44350448208599313, "grad_norm": 8.309805870056152, "learning_rate": 3.798789165628671e-06, "loss": 0.4779, "mean_token_accuracy": 0.849570831656456, "num_tokens": 172139526.0, "step": 143070 }, { "entropy": 1.8134203180670738, "epoch": 0.4435354812110428, "grad_norm": 10.744686126708984, "learning_rate": 3.7986564118512593e-06, "loss": 0.3987, "mean_token_accuracy": 0.8602747023105621, "num_tokens": 172152170.0, "step": 143080 }, { "entropy": 1.920543058216572, "epoch": 0.44356648033609253, "grad_norm": 8.824508666992188, "learning_rate": 3.79852367199065e-06, "loss": 0.4664, "mean_token_accuracy": 0.8488981783390045, "num_tokens": 172163758.0, "step": 143090 }, { "entropy": 1.95814551115036, "epoch": 0.4435974794611422, "grad_norm": 5.424408912658691, "learning_rate": 3.798390946044413e-06, "loss": 0.4833, "mean_token_accuracy": 0.8507681146264077, "num_tokens": 172175098.0, "step": 143100 }, { "entropy": 1.9256742283701898, "epoch": 0.4436284785861919, "grad_norm": 8.446990966796875, "learning_rate": 3.7982582340101166e-06, "loss": 0.4942, "mean_token_accuracy": 0.8471813514828682, "num_tokens": 172186173.0, "step": 143110 }, { "entropy": 1.914665700495243, "epoch": 0.4436594777112416, "grad_norm": 4.053114414215088, "learning_rate": 3.7981255358853308e-06, "loss": 0.4461, "mean_token_accuracy": 0.8550711467862129, "num_tokens": 172198247.0, "step": 143120 }, { "entropy": 1.9240732803940772, "epoch": 0.4436904768362913, "grad_norm": 6.92840576171875, "learning_rate": 3.7979928516676266e-06, "loss": 0.4483, "mean_token_accuracy": 0.8500184774398803, "num_tokens": 172210111.0, "step": 143130 }, { "entropy": 1.9857992932200432, "epoch": 0.443721475961341, "grad_norm": 9.123396873474121, "learning_rate": 3.7978601813545756e-06, "loss": 0.5182, "mean_token_accuracy": 0.8403029024600983, "num_tokens": 172221785.0, "step": 143140 }, { "entropy": 1.88204335719347, "epoch": 0.4437524750863907, "grad_norm": 4.223636627197266, "learning_rate": 3.797727524943748e-06, "loss": 0.4046, "mean_token_accuracy": 0.8584297001361847, "num_tokens": 172234848.0, "step": 143150 }, { "entropy": 1.8985393255949021, "epoch": 0.4437834742114404, "grad_norm": 10.910396575927734, "learning_rate": 3.797594882432716e-06, "loss": 0.4021, "mean_token_accuracy": 0.856754156947136, "num_tokens": 172247385.0, "step": 143160 }, { "entropy": 1.9117623910307884, "epoch": 0.4438144733364901, "grad_norm": 7.727200508117676, "learning_rate": 3.7974622538190527e-06, "loss": 0.4536, "mean_token_accuracy": 0.845762537419796, "num_tokens": 172260013.0, "step": 143170 }, { "entropy": 1.9631491884589196, "epoch": 0.44384547246153977, "grad_norm": 8.514778137207031, "learning_rate": 3.7973296391003324e-06, "loss": 0.4897, "mean_token_accuracy": 0.8480650544166565, "num_tokens": 172271771.0, "step": 143180 }, { "entropy": 1.8932307675480842, "epoch": 0.4438764715865895, "grad_norm": 8.17857551574707, "learning_rate": 3.797197038274128e-06, "loss": 0.427, "mean_token_accuracy": 0.8522116959095001, "num_tokens": 172284879.0, "step": 143190 }, { "entropy": 2.053502270579338, "epoch": 0.44390747071163916, "grad_norm": 8.686943054199219, "learning_rate": 3.7970644513380144e-06, "loss": 0.5244, "mean_token_accuracy": 0.8443288967013359, "num_tokens": 172295803.0, "step": 143200 }, { "entropy": 2.0258347660303118, "epoch": 0.4439384698366889, "grad_norm": 8.006467819213867, "learning_rate": 3.7969318782895674e-06, "loss": 0.536, "mean_token_accuracy": 0.8328389957547188, "num_tokens": 172306930.0, "step": 143210 }, { "entropy": 1.9896105587482453, "epoch": 0.44396946896173856, "grad_norm": 3.218825340270996, "learning_rate": 3.7967993191263625e-06, "loss": 0.4878, "mean_token_accuracy": 0.8439623191952705, "num_tokens": 172318217.0, "step": 143220 }, { "entropy": 1.9521566152572631, "epoch": 0.4440004680867882, "grad_norm": 7.5543389320373535, "learning_rate": 3.7966667738459746e-06, "loss": 0.4407, "mean_token_accuracy": 0.8609140679240227, "num_tokens": 172330536.0, "step": 143230 }, { "entropy": 1.9294056311249732, "epoch": 0.44403146721183795, "grad_norm": 9.052746772766113, "learning_rate": 3.7965342424459822e-06, "loss": 0.4496, "mean_token_accuracy": 0.8596820458769798, "num_tokens": 172341999.0, "step": 143240 }, { "entropy": 1.869072140753269, "epoch": 0.4440624663368876, "grad_norm": 7.041997909545898, "learning_rate": 3.7964017249239623e-06, "loss": 0.4067, "mean_token_accuracy": 0.8588750317692757, "num_tokens": 172355292.0, "step": 143250 }, { "entropy": 1.9508719503879548, "epoch": 0.44409346546193734, "grad_norm": 4.241085529327393, "learning_rate": 3.796269221277493e-06, "loss": 0.4702, "mean_token_accuracy": 0.8481484889984131, "num_tokens": 172367036.0, "step": 143260 }, { "entropy": 1.9314745679497718, "epoch": 0.444124464586987, "grad_norm": 4.061893939971924, "learning_rate": 3.7961367315041535e-06, "loss": 0.425, "mean_token_accuracy": 0.8549939930438996, "num_tokens": 172379192.0, "step": 143270 }, { "entropy": 1.9073988541960716, "epoch": 0.44415546371203674, "grad_norm": 7.23845100402832, "learning_rate": 3.7960042556015226e-06, "loss": 0.4879, "mean_token_accuracy": 0.8475894257426262, "num_tokens": 172391173.0, "step": 143280 }, { "entropy": 1.9345348447561264, "epoch": 0.4441864628370864, "grad_norm": 8.21992015838623, "learning_rate": 3.79587179356718e-06, "loss": 0.4772, "mean_token_accuracy": 0.8488015040755272, "num_tokens": 172403416.0, "step": 143290 }, { "entropy": 1.9360866978764535, "epoch": 0.44421746196213613, "grad_norm": 6.462368011474609, "learning_rate": 3.7957393453987075e-06, "loss": 0.4593, "mean_token_accuracy": 0.8466382816433906, "num_tokens": 172415684.0, "step": 143300 }, { "entropy": 1.9160262137651443, "epoch": 0.4442484610871858, "grad_norm": 6.9671454429626465, "learning_rate": 3.795606911093684e-06, "loss": 0.4454, "mean_token_accuracy": 0.8536192387342453, "num_tokens": 172427475.0, "step": 143310 }, { "entropy": 1.9292744249105453, "epoch": 0.4442794602122355, "grad_norm": 3.4620540142059326, "learning_rate": 3.7954744906496932e-06, "loss": 0.4252, "mean_token_accuracy": 0.8594009026885032, "num_tokens": 172439631.0, "step": 143320 }, { "entropy": 1.9033573985099792, "epoch": 0.4443104593372852, "grad_norm": 8.030845642089844, "learning_rate": 3.795342084064316e-06, "loss": 0.4125, "mean_token_accuracy": 0.86111601293087, "num_tokens": 172451878.0, "step": 143330 }, { "entropy": 1.851330216228962, "epoch": 0.4443414584623349, "grad_norm": 8.43782901763916, "learning_rate": 3.795209691335136e-06, "loss": 0.3983, "mean_token_accuracy": 0.8640111222863197, "num_tokens": 172465066.0, "step": 143340 }, { "entropy": 1.991729763150215, "epoch": 0.4443724575873846, "grad_norm": 7.826473236083984, "learning_rate": 3.7950773124597375e-06, "loss": 0.488, "mean_token_accuracy": 0.8467871636152268, "num_tokens": 172476252.0, "step": 143350 }, { "entropy": 1.9685329020023346, "epoch": 0.4444034567124343, "grad_norm": 8.118627548217773, "learning_rate": 3.794944947435702e-06, "loss": 0.4874, "mean_token_accuracy": 0.8508369252085686, "num_tokens": 172487279.0, "step": 143360 }, { "entropy": 2.045986759662628, "epoch": 0.444434455837484, "grad_norm": 8.419778823852539, "learning_rate": 3.794812596260616e-06, "loss": 0.4839, "mean_token_accuracy": 0.8484291791915893, "num_tokens": 172498181.0, "step": 143370 }, { "entropy": 1.9107683286070825, "epoch": 0.4444654549625337, "grad_norm": 8.121953964233398, "learning_rate": 3.794680258932064e-06, "loss": 0.4352, "mean_token_accuracy": 0.8511162519454956, "num_tokens": 172510612.0, "step": 143380 }, { "entropy": 1.9096698313951492, "epoch": 0.44449645408758337, "grad_norm": 9.873578071594238, "learning_rate": 3.7945479354476334e-06, "loss": 0.3986, "mean_token_accuracy": 0.867861407995224, "num_tokens": 172522684.0, "step": 143390 }, { "entropy": 1.9415227890014648, "epoch": 0.4445274532126331, "grad_norm": 9.934027671813965, "learning_rate": 3.794415625804908e-06, "loss": 0.477, "mean_token_accuracy": 0.8525919511914253, "num_tokens": 172533993.0, "step": 143400 }, { "entropy": 1.9405535489320755, "epoch": 0.44455845233768276, "grad_norm": 7.328090667724609, "learning_rate": 3.794283330001477e-06, "loss": 0.4226, "mean_token_accuracy": 0.8587682485580445, "num_tokens": 172545914.0, "step": 143410 }, { "entropy": 1.984012272953987, "epoch": 0.4445894514627325, "grad_norm": 8.643956184387207, "learning_rate": 3.7941510480349265e-06, "loss": 0.4748, "mean_token_accuracy": 0.8469142064452171, "num_tokens": 172556997.0, "step": 143420 }, { "entropy": 1.9300741687417031, "epoch": 0.44462045058778216, "grad_norm": 4.657692909240723, "learning_rate": 3.794018779902845e-06, "loss": 0.4216, "mean_token_accuracy": 0.860441392660141, "num_tokens": 172568929.0, "step": 143430 }, { "entropy": 1.8880173966288567, "epoch": 0.4446514497128319, "grad_norm": 8.421919822692871, "learning_rate": 3.793886525602822e-06, "loss": 0.395, "mean_token_accuracy": 0.8604597702622414, "num_tokens": 172580791.0, "step": 143440 }, { "entropy": 1.933186987042427, "epoch": 0.44468244883788155, "grad_norm": 3.5404715538024902, "learning_rate": 3.793754285132446e-06, "loss": 0.4859, "mean_token_accuracy": 0.8485592529177666, "num_tokens": 172593075.0, "step": 143450 }, { "entropy": 1.950406338274479, "epoch": 0.4447134479629312, "grad_norm": 8.130172729492188, "learning_rate": 3.7936220584893074e-06, "loss": 0.4427, "mean_token_accuracy": 0.8546596810221672, "num_tokens": 172604452.0, "step": 143460 }, { "entropy": 1.9698896750807762, "epoch": 0.44474444708798094, "grad_norm": 8.126727104187012, "learning_rate": 3.7934898456709963e-06, "loss": 0.4745, "mean_token_accuracy": 0.8500417619943619, "num_tokens": 172616103.0, "step": 143470 }, { "entropy": 1.8846001833677293, "epoch": 0.4447754462130306, "grad_norm": 7.103678226470947, "learning_rate": 3.793357646675104e-06, "loss": 0.4099, "mean_token_accuracy": 0.8594605028629303, "num_tokens": 172627939.0, "step": 143480 }, { "entropy": 1.9624834671616553, "epoch": 0.44480644533808034, "grad_norm": 7.368971824645996, "learning_rate": 3.7932254614992225e-06, "loss": 0.4464, "mean_token_accuracy": 0.853089140355587, "num_tokens": 172639602.0, "step": 143490 }, { "entropy": 2.0040226548910143, "epoch": 0.44483744446313, "grad_norm": 8.010761260986328, "learning_rate": 3.7930932901409443e-06, "loss": 0.5038, "mean_token_accuracy": 0.8468669816851616, "num_tokens": 172650585.0, "step": 143500 }, { "entropy": 1.9335386529564857, "epoch": 0.44486844358817973, "grad_norm": 8.675997734069824, "learning_rate": 3.7929611325978612e-06, "loss": 0.4349, "mean_token_accuracy": 0.8535260155797004, "num_tokens": 172662166.0, "step": 143510 }, { "entropy": 1.975813153386116, "epoch": 0.4448994427132294, "grad_norm": 9.182839393615723, "learning_rate": 3.7928289888675668e-06, "loss": 0.4709, "mean_token_accuracy": 0.855238126218319, "num_tokens": 172673577.0, "step": 143520 }, { "entropy": 1.982006560266018, "epoch": 0.4449304418382791, "grad_norm": 8.914770126342773, "learning_rate": 3.7926968589476558e-06, "loss": 0.4923, "mean_token_accuracy": 0.8514761239290237, "num_tokens": 172685618.0, "step": 143530 }, { "entropy": 2.004455064237118, "epoch": 0.4449614409633288, "grad_norm": 8.590579986572266, "learning_rate": 3.7925647428357226e-06, "loss": 0.4645, "mean_token_accuracy": 0.8540130868554116, "num_tokens": 172697021.0, "step": 143540 }, { "entropy": 2.02477196007967, "epoch": 0.4449924400883785, "grad_norm": 8.24437427520752, "learning_rate": 3.7924326405293627e-06, "loss": 0.4715, "mean_token_accuracy": 0.8468568831682205, "num_tokens": 172708199.0, "step": 143550 }, { "entropy": 1.9295995250344276, "epoch": 0.4450234392134282, "grad_norm": 4.574173927307129, "learning_rate": 3.7923005520261708e-06, "loss": 0.4307, "mean_token_accuracy": 0.8547417059540748, "num_tokens": 172720296.0, "step": 143560 }, { "entropy": 1.7906879603862762, "epoch": 0.4450544383384779, "grad_norm": 8.181215286254883, "learning_rate": 3.792168477323745e-06, "loss": 0.3569, "mean_token_accuracy": 0.8623343601822853, "num_tokens": 172734098.0, "step": 143570 }, { "entropy": 1.7868395507335664, "epoch": 0.4450854374635276, "grad_norm": 4.160261631011963, "learning_rate": 3.7920364164196805e-06, "loss": 0.3373, "mean_token_accuracy": 0.8650955274701119, "num_tokens": 172748197.0, "step": 143580 }, { "entropy": 1.906904463469982, "epoch": 0.4451164365885773, "grad_norm": 7.085610389709473, "learning_rate": 3.7919043693115757e-06, "loss": 0.425, "mean_token_accuracy": 0.857354860007763, "num_tokens": 172760940.0, "step": 143590 }, { "entropy": 1.8628742694854736, "epoch": 0.44514743571362697, "grad_norm": 7.324219226837158, "learning_rate": 3.7917723359970293e-06, "loss": 0.3814, "mean_token_accuracy": 0.8632989749312401, "num_tokens": 172773595.0, "step": 143600 }, { "entropy": 1.9510155633091926, "epoch": 0.4451784348386767, "grad_norm": 7.612024784088135, "learning_rate": 3.791640316473639e-06, "loss": 0.4609, "mean_token_accuracy": 0.8439678400754929, "num_tokens": 172784596.0, "step": 143610 }, { "entropy": 1.9409842744469643, "epoch": 0.44520943396372636, "grad_norm": 7.95268440246582, "learning_rate": 3.791508310739005e-06, "loss": 0.4449, "mean_token_accuracy": 0.8481158643960953, "num_tokens": 172796663.0, "step": 143620 }, { "entropy": 1.8901426151394844, "epoch": 0.4452404330887761, "grad_norm": 7.622109889984131, "learning_rate": 3.7913763187907265e-06, "loss": 0.4143, "mean_token_accuracy": 0.8628189340233803, "num_tokens": 172809208.0, "step": 143630 }, { "entropy": 1.9054123714566231, "epoch": 0.44527143221382576, "grad_norm": 8.614324569702148, "learning_rate": 3.791244340626404e-06, "loss": 0.4353, "mean_token_accuracy": 0.8581098973751068, "num_tokens": 172821386.0, "step": 143640 }, { "entropy": 1.8443352833390236, "epoch": 0.4453024313388755, "grad_norm": 4.017133712768555, "learning_rate": 3.7911123762436393e-06, "loss": 0.4023, "mean_token_accuracy": 0.8598392441868782, "num_tokens": 172834753.0, "step": 143650 }, { "entropy": 1.9353301167488097, "epoch": 0.44533343046392515, "grad_norm": 4.68195104598999, "learning_rate": 3.7909804256400327e-06, "loss": 0.444, "mean_token_accuracy": 0.8590727686882019, "num_tokens": 172846687.0, "step": 143660 }, { "entropy": 2.013177511096001, "epoch": 0.4453644295889749, "grad_norm": 8.711843490600586, "learning_rate": 3.7908484888131875e-06, "loss": 0.4639, "mean_token_accuracy": 0.8534877151250839, "num_tokens": 172857233.0, "step": 143670 }, { "entropy": 1.9266994565725326, "epoch": 0.44539542871402454, "grad_norm": 8.333199501037598, "learning_rate": 3.7907165657607066e-06, "loss": 0.4196, "mean_token_accuracy": 0.8665392875671387, "num_tokens": 172869433.0, "step": 143680 }, { "entropy": 1.9714792117476463, "epoch": 0.44542642783907427, "grad_norm": 8.7294340133667, "learning_rate": 3.7905846564801923e-06, "loss": 0.4595, "mean_token_accuracy": 0.8482206985354424, "num_tokens": 172880658.0, "step": 143690 }, { "entropy": 1.8752370357513428, "epoch": 0.44545742696412394, "grad_norm": 8.233158111572266, "learning_rate": 3.7904527609692506e-06, "loss": 0.3987, "mean_token_accuracy": 0.8617415338754654, "num_tokens": 172892973.0, "step": 143700 }, { "entropy": 2.0074197858572007, "epoch": 0.4454884260891736, "grad_norm": 7.99542760848999, "learning_rate": 3.7903208792254844e-06, "loss": 0.4904, "mean_token_accuracy": 0.8521378964185715, "num_tokens": 172904130.0, "step": 143710 }, { "entropy": 1.99031350761652, "epoch": 0.44551942521422333, "grad_norm": 8.73692798614502, "learning_rate": 3.7901890112464982e-06, "loss": 0.4322, "mean_token_accuracy": 0.8600781843066215, "num_tokens": 172915284.0, "step": 143720 }, { "entropy": 1.9789996802806855, "epoch": 0.445550424339273, "grad_norm": 9.458690643310547, "learning_rate": 3.7900571570299e-06, "loss": 0.4956, "mean_token_accuracy": 0.8530536726117134, "num_tokens": 172926970.0, "step": 143730 }, { "entropy": 1.834419848024845, "epoch": 0.4455814234643227, "grad_norm": 4.028777599334717, "learning_rate": 3.789925316573294e-06, "loss": 0.4093, "mean_token_accuracy": 0.8613479465246201, "num_tokens": 172939899.0, "step": 143740 }, { "entropy": 1.945051771402359, "epoch": 0.4456124225893724, "grad_norm": 4.0096306800842285, "learning_rate": 3.7897934898742885e-06, "loss": 0.4787, "mean_token_accuracy": 0.8497995778918266, "num_tokens": 172951805.0, "step": 143750 }, { "entropy": 1.8734006211161613, "epoch": 0.4456434217144221, "grad_norm": 3.48857045173645, "learning_rate": 3.7896616769304905e-06, "loss": 0.3735, "mean_token_accuracy": 0.8659715816378594, "num_tokens": 172964673.0, "step": 143760 }, { "entropy": 1.9327120184898376, "epoch": 0.4456744208394718, "grad_norm": 8.14692497253418, "learning_rate": 3.7895298777395078e-06, "loss": 0.46, "mean_token_accuracy": 0.8533953115344047, "num_tokens": 172975682.0, "step": 143770 }, { "entropy": 1.9580781385302544, "epoch": 0.4457054199645215, "grad_norm": 8.688072204589844, "learning_rate": 3.7893980922989496e-06, "loss": 0.4946, "mean_token_accuracy": 0.839683572947979, "num_tokens": 172987936.0, "step": 143780 }, { "entropy": 1.908944171667099, "epoch": 0.4457364190895712, "grad_norm": 3.971160888671875, "learning_rate": 3.7892663206064244e-06, "loss": 0.4127, "mean_token_accuracy": 0.8549481183290482, "num_tokens": 173000930.0, "step": 143790 }, { "entropy": 2.0028712913393973, "epoch": 0.4457674182146209, "grad_norm": 8.951005935668945, "learning_rate": 3.789134562659543e-06, "loss": 0.4679, "mean_token_accuracy": 0.845504654943943, "num_tokens": 173012916.0, "step": 143800 }, { "entropy": 1.9837856590747833, "epoch": 0.44579841733967057, "grad_norm": 7.293043613433838, "learning_rate": 3.7890028184559154e-06, "loss": 0.4261, "mean_token_accuracy": 0.8591191872954369, "num_tokens": 173024193.0, "step": 143810 }, { "entropy": 1.9127214059233666, "epoch": 0.4458294164647203, "grad_norm": 3.8465218544006348, "learning_rate": 3.7888710879931517e-06, "loss": 0.4417, "mean_token_accuracy": 0.8534463986754417, "num_tokens": 173035733.0, "step": 143820 }, { "entropy": 1.949730084836483, "epoch": 0.44586041558976997, "grad_norm": 10.93399429321289, "learning_rate": 3.788739371268865e-06, "loss": 0.4748, "mean_token_accuracy": 0.8493082284927368, "num_tokens": 173047215.0, "step": 143830 }, { "entropy": 2.0025800704956054, "epoch": 0.4458914147148197, "grad_norm": 8.645318984985352, "learning_rate": 3.788607668280666e-06, "loss": 0.5141, "mean_token_accuracy": 0.8457403063774109, "num_tokens": 173058282.0, "step": 143840 }, { "entropy": 1.910311257839203, "epoch": 0.44592241383986936, "grad_norm": 3.202796220779419, "learning_rate": 3.7884759790261683e-06, "loss": 0.4434, "mean_token_accuracy": 0.8490109801292419, "num_tokens": 173070834.0, "step": 143850 }, { "entropy": 1.9238232627511025, "epoch": 0.4459534129649191, "grad_norm": 8.433602333068848, "learning_rate": 3.7883443035029844e-06, "loss": 0.4331, "mean_token_accuracy": 0.8548846036195755, "num_tokens": 173083400.0, "step": 143860 }, { "entropy": 1.961086443066597, "epoch": 0.44598441208996875, "grad_norm": 3.7970497608184814, "learning_rate": 3.7882126417087294e-06, "loss": 0.4884, "mean_token_accuracy": 0.8421978339552879, "num_tokens": 173094716.0, "step": 143870 }, { "entropy": 1.9548719599843025, "epoch": 0.4460154112150185, "grad_norm": 8.201971054077148, "learning_rate": 3.788080993641017e-06, "loss": 0.5078, "mean_token_accuracy": 0.8327627673745155, "num_tokens": 173107004.0, "step": 143880 }, { "entropy": 1.8781091898679734, "epoch": 0.44604641034006814, "grad_norm": 3.5687875747680664, "learning_rate": 3.7879493592974612e-06, "loss": 0.4453, "mean_token_accuracy": 0.8458937138319016, "num_tokens": 173119649.0, "step": 143890 }, { "entropy": 1.9691094398498534, "epoch": 0.44607740946511787, "grad_norm": 10.030198097229004, "learning_rate": 3.7878177386756796e-06, "loss": 0.516, "mean_token_accuracy": 0.8466410353779793, "num_tokens": 173131103.0, "step": 143900 }, { "entropy": 2.0002141326665877, "epoch": 0.44610840859016754, "grad_norm": 8.030385971069336, "learning_rate": 3.7876861317732877e-06, "loss": 0.5004, "mean_token_accuracy": 0.8476643145084382, "num_tokens": 173142013.0, "step": 143910 }, { "entropy": 1.9814749881625175, "epoch": 0.44613940771521726, "grad_norm": 9.209390640258789, "learning_rate": 3.787554538587902e-06, "loss": 0.4712, "mean_token_accuracy": 0.8513798832893371, "num_tokens": 173153145.0, "step": 143920 }, { "entropy": 1.953411616384983, "epoch": 0.44617040684026693, "grad_norm": 10.11378288269043, "learning_rate": 3.7874229591171395e-06, "loss": 0.467, "mean_token_accuracy": 0.845984798669815, "num_tokens": 173165304.0, "step": 143930 }, { "entropy": 1.8993015423417092, "epoch": 0.44620140596531666, "grad_norm": 7.429069995880127, "learning_rate": 3.787291393358619e-06, "loss": 0.4043, "mean_token_accuracy": 0.8644404307007789, "num_tokens": 173177675.0, "step": 143940 }, { "entropy": 1.8486544996500016, "epoch": 0.4462324050903663, "grad_norm": 8.14201545715332, "learning_rate": 3.7871598413099593e-06, "loss": 0.379, "mean_token_accuracy": 0.8645230799913406, "num_tokens": 173191122.0, "step": 143950 }, { "entropy": 2.023305447399616, "epoch": 0.446263404215416, "grad_norm": 10.556729316711426, "learning_rate": 3.7870283029687777e-06, "loss": 0.4801, "mean_token_accuracy": 0.847667895257473, "num_tokens": 173202142.0, "step": 143960 }, { "entropy": 1.910950092971325, "epoch": 0.4462944033404657, "grad_norm": 7.93691349029541, "learning_rate": 3.7868967783326956e-06, "loss": 0.4291, "mean_token_accuracy": 0.856875829398632, "num_tokens": 173214829.0, "step": 143970 }, { "entropy": 1.9166125014424324, "epoch": 0.4463254024655154, "grad_norm": 9.42168140411377, "learning_rate": 3.7867652673993318e-06, "loss": 0.4674, "mean_token_accuracy": 0.8535972073674202, "num_tokens": 173226844.0, "step": 143980 }, { "entropy": 1.8531441867351532, "epoch": 0.4463564015905651, "grad_norm": 4.800891399383545, "learning_rate": 3.7866337701663088e-06, "loss": 0.4391, "mean_token_accuracy": 0.8521735593676567, "num_tokens": 173239753.0, "step": 143990 }, { "entropy": 1.881640262901783, "epoch": 0.4463874007156148, "grad_norm": 9.612931251525879, "learning_rate": 3.7865022866312468e-06, "loss": 0.4001, "mean_token_accuracy": 0.8631109833717346, "num_tokens": 173252177.0, "step": 144000 }, { "entropy": 1.8542085379362105, "epoch": 0.4464183998406645, "grad_norm": 5.085047245025635, "learning_rate": 3.7863708167917686e-06, "loss": 0.3643, "mean_token_accuracy": 0.8695595130324364, "num_tokens": 173264694.0, "step": 144010 }, { "entropy": 2.0253135934472084, "epoch": 0.4464493989657142, "grad_norm": 9.341069221496582, "learning_rate": 3.7862393606454958e-06, "loss": 0.4986, "mean_token_accuracy": 0.8394965454936028, "num_tokens": 173275727.0, "step": 144020 }, { "entropy": 1.8591209024190902, "epoch": 0.4464803980907639, "grad_norm": 8.249305725097656, "learning_rate": 3.786107918190052e-06, "loss": 0.4193, "mean_token_accuracy": 0.8542803794145584, "num_tokens": 173288795.0, "step": 144030 }, { "entropy": 1.9186673119664193, "epoch": 0.44651139721581357, "grad_norm": 9.003576278686523, "learning_rate": 3.785976489423061e-06, "loss": 0.4394, "mean_token_accuracy": 0.8566552370786666, "num_tokens": 173300153.0, "step": 144040 }, { "entropy": 1.9807184368371964, "epoch": 0.4465423963408633, "grad_norm": 8.3126859664917, "learning_rate": 3.785845074342148e-06, "loss": 0.4589, "mean_token_accuracy": 0.8607570424675941, "num_tokens": 173311456.0, "step": 144050 }, { "entropy": 1.9973660945892333, "epoch": 0.44657339546591296, "grad_norm": 8.058760643005371, "learning_rate": 3.7857136729449356e-06, "loss": 0.4827, "mean_token_accuracy": 0.8486983701586723, "num_tokens": 173322684.0, "step": 144060 }, { "entropy": 1.9315310716629028, "epoch": 0.4466043945909627, "grad_norm": 9.327740669250488, "learning_rate": 3.7855822852290523e-06, "loss": 0.4784, "mean_token_accuracy": 0.8483046740293503, "num_tokens": 173334375.0, "step": 144070 }, { "entropy": 1.854925161600113, "epoch": 0.44663539371601235, "grad_norm": 8.95000171661377, "learning_rate": 3.785450911192121e-06, "loss": 0.4349, "mean_token_accuracy": 0.8594622611999512, "num_tokens": 173347587.0, "step": 144080 }, { "entropy": 1.9255126759409904, "epoch": 0.4466663928410621, "grad_norm": 9.044776916503906, "learning_rate": 3.7853195508317707e-06, "loss": 0.5041, "mean_token_accuracy": 0.8483678221702575, "num_tokens": 173359548.0, "step": 144090 }, { "entropy": 1.9388061597943307, "epoch": 0.44669739196611175, "grad_norm": 8.007100105285645, "learning_rate": 3.785188204145627e-06, "loss": 0.4604, "mean_token_accuracy": 0.8521351292729378, "num_tokens": 173371042.0, "step": 144100 }, { "entropy": 1.9809336930513382, "epoch": 0.44672839109116147, "grad_norm": 8.529678344726562, "learning_rate": 3.7850568711313192e-06, "loss": 0.4665, "mean_token_accuracy": 0.8555900052189827, "num_tokens": 173382290.0, "step": 144110 }, { "entropy": 1.9656935811042786, "epoch": 0.44675939021621114, "grad_norm": 4.3543219566345215, "learning_rate": 3.7849255517864743e-06, "loss": 0.4852, "mean_token_accuracy": 0.8531507521867752, "num_tokens": 173393550.0, "step": 144120 }, { "entropy": 1.8287263602018355, "epoch": 0.44679038934126086, "grad_norm": 3.763613700866699, "learning_rate": 3.784794246108721e-06, "loss": 0.3894, "mean_token_accuracy": 0.859270915389061, "num_tokens": 173406666.0, "step": 144130 }, { "entropy": 1.93349888920784, "epoch": 0.44682138846631053, "grad_norm": 8.177099227905273, "learning_rate": 3.784662954095691e-06, "loss": 0.4772, "mean_token_accuracy": 0.8471749663352967, "num_tokens": 173418993.0, "step": 144140 }, { "entropy": 1.9891784444451333, "epoch": 0.44685238759136026, "grad_norm": 9.054378509521484, "learning_rate": 3.7845316757450114e-06, "loss": 0.4624, "mean_token_accuracy": 0.8486263379454613, "num_tokens": 173430436.0, "step": 144150 }, { "entropy": 1.9303235545754434, "epoch": 0.4468833867164099, "grad_norm": 8.891922950744629, "learning_rate": 3.7844004110543148e-06, "loss": 0.4609, "mean_token_accuracy": 0.8510625973343849, "num_tokens": 173442415.0, "step": 144160 }, { "entropy": 1.94441519677639, "epoch": 0.44691438584145965, "grad_norm": 8.909342765808105, "learning_rate": 3.7842691600212316e-06, "loss": 0.4587, "mean_token_accuracy": 0.8438147768378258, "num_tokens": 173454807.0, "step": 144170 }, { "entropy": 2.0386078268289567, "epoch": 0.4469453849665093, "grad_norm": 10.605982780456543, "learning_rate": 3.784137922643394e-06, "loss": 0.5531, "mean_token_accuracy": 0.8398728102445603, "num_tokens": 173465671.0, "step": 144180 }, { "entropy": 1.9403854593634606, "epoch": 0.44697638409155904, "grad_norm": 7.102710723876953, "learning_rate": 3.7840066989184345e-06, "loss": 0.4316, "mean_token_accuracy": 0.8576330199837685, "num_tokens": 173478051.0, "step": 144190 }, { "entropy": 1.9223571211099624, "epoch": 0.4470073832166087, "grad_norm": 7.868150234222412, "learning_rate": 3.7838754888439857e-06, "loss": 0.4203, "mean_token_accuracy": 0.8626696810126304, "num_tokens": 173489382.0, "step": 144200 }, { "entropy": 1.9799730449914932, "epoch": 0.4470383823416584, "grad_norm": 6.588014602661133, "learning_rate": 3.7837442924176804e-06, "loss": 0.5044, "mean_token_accuracy": 0.8521138474345207, "num_tokens": 173501335.0, "step": 144210 }, { "entropy": 1.9104075238108635, "epoch": 0.4470693814667081, "grad_norm": 8.533629417419434, "learning_rate": 3.783613109637155e-06, "loss": 0.4194, "mean_token_accuracy": 0.8599554657936096, "num_tokens": 173513423.0, "step": 144220 }, { "entropy": 1.884183020889759, "epoch": 0.4471003805917578, "grad_norm": 2.680689573287964, "learning_rate": 3.7834819405000404e-06, "loss": 0.4075, "mean_token_accuracy": 0.8564264923334122, "num_tokens": 173525703.0, "step": 144230 }, { "entropy": 1.998404061794281, "epoch": 0.4471313797168075, "grad_norm": 7.732211112976074, "learning_rate": 3.7833507850039757e-06, "loss": 0.4948, "mean_token_accuracy": 0.8460897743701935, "num_tokens": 173536387.0, "step": 144240 }, { "entropy": 1.888758347928524, "epoch": 0.44716237884185717, "grad_norm": 9.650225639343262, "learning_rate": 3.783219643146595e-06, "loss": 0.4148, "mean_token_accuracy": 0.8614796027541161, "num_tokens": 173548843.0, "step": 144250 }, { "entropy": 1.9674888044595717, "epoch": 0.4471933779669069, "grad_norm": 7.974256992340088, "learning_rate": 3.7830885149255346e-06, "loss": 0.4328, "mean_token_accuracy": 0.8619727566838264, "num_tokens": 173560036.0, "step": 144260 }, { "entropy": 1.854830276221037, "epoch": 0.44722437709195656, "grad_norm": 4.305238246917725, "learning_rate": 3.7829574003384317e-06, "loss": 0.4224, "mean_token_accuracy": 0.8533999145030975, "num_tokens": 173572981.0, "step": 144270 }, { "entropy": 1.8603426963090897, "epoch": 0.4472553762170063, "grad_norm": 8.774975776672363, "learning_rate": 3.7828262993829235e-06, "loss": 0.4179, "mean_token_accuracy": 0.8556037560105324, "num_tokens": 173586083.0, "step": 144280 }, { "entropy": 1.8292605131864548, "epoch": 0.44728637534205595, "grad_norm": 3.801150321960449, "learning_rate": 3.7826952120566485e-06, "loss": 0.3767, "mean_token_accuracy": 0.8639505878090858, "num_tokens": 173599424.0, "step": 144290 }, { "entropy": 1.8792633160948753, "epoch": 0.4473173744671057, "grad_norm": 4.3725409507751465, "learning_rate": 3.7825641383572448e-06, "loss": 0.4181, "mean_token_accuracy": 0.8518676936626435, "num_tokens": 173611821.0, "step": 144300 }, { "entropy": 1.9342765644192697, "epoch": 0.44734837359215535, "grad_norm": 9.642680168151855, "learning_rate": 3.7824330782823524e-06, "loss": 0.4445, "mean_token_accuracy": 0.8533917725086212, "num_tokens": 173623485.0, "step": 144310 }, { "entropy": 2.012094184756279, "epoch": 0.44737937271720507, "grad_norm": 7.291985988616943, "learning_rate": 3.7823020318296104e-06, "loss": 0.5282, "mean_token_accuracy": 0.8319395646452904, "num_tokens": 173634217.0, "step": 144320 }, { "entropy": 1.9318621635437012, "epoch": 0.44741037184225474, "grad_norm": 6.990114212036133, "learning_rate": 3.7821709989966605e-06, "loss": 0.4082, "mean_token_accuracy": 0.8600582510232926, "num_tokens": 173646108.0, "step": 144330 }, { "entropy": 1.9527502954006195, "epoch": 0.44744137096730446, "grad_norm": 7.637629508972168, "learning_rate": 3.782039979781142e-06, "loss": 0.4795, "mean_token_accuracy": 0.8539629653096199, "num_tokens": 173657100.0, "step": 144340 }, { "entropy": 1.969818153977394, "epoch": 0.44747237009235413, "grad_norm": 8.846341133117676, "learning_rate": 3.7819089741806974e-06, "loss": 0.5181, "mean_token_accuracy": 0.8485133573412895, "num_tokens": 173668376.0, "step": 144350 }, { "entropy": 1.903004801273346, "epoch": 0.44750336921740386, "grad_norm": 6.75057315826416, "learning_rate": 3.781777982192969e-06, "loss": 0.5438, "mean_token_accuracy": 0.8346272155642509, "num_tokens": 173681042.0, "step": 144360 }, { "entropy": 1.8782558515667915, "epoch": 0.4475343683424535, "grad_norm": 6.844717025756836, "learning_rate": 3.781647003815598e-06, "loss": 0.4359, "mean_token_accuracy": 0.8552762180566787, "num_tokens": 173693000.0, "step": 144370 }, { "entropy": 1.9393670424818992, "epoch": 0.44756536746750325, "grad_norm": 9.035807609558105, "learning_rate": 3.7815160390462298e-06, "loss": 0.4454, "mean_token_accuracy": 0.8559998840093612, "num_tokens": 173704983.0, "step": 144380 }, { "entropy": 1.9297925919294356, "epoch": 0.4475963665925529, "grad_norm": 6.4936137199401855, "learning_rate": 3.7813850878825064e-06, "loss": 0.4642, "mean_token_accuracy": 0.8614618703722954, "num_tokens": 173716355.0, "step": 144390 }, { "entropy": 1.8133862063288688, "epoch": 0.44762736571760264, "grad_norm": 4.11335563659668, "learning_rate": 3.7812541503220733e-06, "loss": 0.3879, "mean_token_accuracy": 0.8672507256269455, "num_tokens": 173730318.0, "step": 144400 }, { "entropy": 1.956637793779373, "epoch": 0.4476583648426523, "grad_norm": 6.994054794311523, "learning_rate": 3.7811232263625753e-06, "loss": 0.4681, "mean_token_accuracy": 0.8508543521165848, "num_tokens": 173741321.0, "step": 144410 }, { "entropy": 1.92156320810318, "epoch": 0.44768936396770204, "grad_norm": 7.512271404266357, "learning_rate": 3.780992316001657e-06, "loss": 0.5093, "mean_token_accuracy": 0.845100137591362, "num_tokens": 173753512.0, "step": 144420 }, { "entropy": 1.8369450643658638, "epoch": 0.4477203630927517, "grad_norm": 8.064924240112305, "learning_rate": 3.7808614192369664e-06, "loss": 0.402, "mean_token_accuracy": 0.8641338348388672, "num_tokens": 173767543.0, "step": 144430 }, { "entropy": 1.8546362400054932, "epoch": 0.44775136221780143, "grad_norm": 9.19071102142334, "learning_rate": 3.7807305360661485e-06, "loss": 0.3772, "mean_token_accuracy": 0.8610058188438415, "num_tokens": 173780631.0, "step": 144440 }, { "entropy": 1.9356670066714288, "epoch": 0.4477823613428511, "grad_norm": 11.3148832321167, "learning_rate": 3.780599666486851e-06, "loss": 0.4627, "mean_token_accuracy": 0.852401140332222, "num_tokens": 173792247.0, "step": 144450 }, { "entropy": 1.8100459277629852, "epoch": 0.44781336046790077, "grad_norm": 3.687837839126587, "learning_rate": 3.780468810496722e-06, "loss": 0.3704, "mean_token_accuracy": 0.8491495341062546, "num_tokens": 173806105.0, "step": 144460 }, { "entropy": 1.9653054475784302, "epoch": 0.4478443595929505, "grad_norm": 8.540299415588379, "learning_rate": 3.7803379680934092e-06, "loss": 0.4749, "mean_token_accuracy": 0.8488686487078667, "num_tokens": 173818024.0, "step": 144470 }, { "entropy": 1.9154264122247695, "epoch": 0.44787535871800016, "grad_norm": 8.398823738098145, "learning_rate": 3.7802071392745628e-06, "loss": 0.4439, "mean_token_accuracy": 0.8501715928316116, "num_tokens": 173830403.0, "step": 144480 }, { "entropy": 1.9817367240786552, "epoch": 0.4479063578430499, "grad_norm": 8.067160606384277, "learning_rate": 3.7800763240378307e-06, "loss": 0.4427, "mean_token_accuracy": 0.8624659746885299, "num_tokens": 173841447.0, "step": 144490 }, { "entropy": 1.826142853498459, "epoch": 0.44793735696809955, "grad_norm": 8.450629234313965, "learning_rate": 3.7799455223808647e-06, "loss": 0.3764, "mean_token_accuracy": 0.8624396830797195, "num_tokens": 173854816.0, "step": 144500 }, { "entropy": 1.9071953654289246, "epoch": 0.4479683560931493, "grad_norm": 8.542773246765137, "learning_rate": 3.7798147343013134e-06, "loss": 0.4273, "mean_token_accuracy": 0.8587081953883171, "num_tokens": 173866813.0, "step": 144510 }, { "entropy": 1.9302286952733994, "epoch": 0.44799935521819895, "grad_norm": 8.922464370727539, "learning_rate": 3.7796839597968305e-06, "loss": 0.4607, "mean_token_accuracy": 0.8490835353732109, "num_tokens": 173878864.0, "step": 144520 }, { "entropy": 1.841838812828064, "epoch": 0.44803035434324867, "grad_norm": 10.363226890563965, "learning_rate": 3.7795531988650663e-06, "loss": 0.4245, "mean_token_accuracy": 0.8650688961148262, "num_tokens": 173892098.0, "step": 144530 }, { "entropy": 1.9519219264388084, "epoch": 0.44806135346829834, "grad_norm": 8.940476417541504, "learning_rate": 3.7794224515036733e-06, "loss": 0.4974, "mean_token_accuracy": 0.8444177433848381, "num_tokens": 173904195.0, "step": 144540 }, { "entropy": 1.998878961801529, "epoch": 0.44809235259334806, "grad_norm": 8.424077033996582, "learning_rate": 3.7792917177103043e-06, "loss": 0.5687, "mean_token_accuracy": 0.8384242698550224, "num_tokens": 173915400.0, "step": 144550 }, { "entropy": 1.9215024933218956, "epoch": 0.44812335171839773, "grad_norm": 8.416211128234863, "learning_rate": 3.7791609974826136e-06, "loss": 0.4661, "mean_token_accuracy": 0.8600714936852455, "num_tokens": 173927090.0, "step": 144560 }, { "entropy": 1.891715730726719, "epoch": 0.44815435084344746, "grad_norm": 9.589639663696289, "learning_rate": 3.7790302908182543e-06, "loss": 0.4641, "mean_token_accuracy": 0.8505370214581489, "num_tokens": 173938920.0, "step": 144570 }, { "entropy": 1.8214015498757363, "epoch": 0.4481853499684971, "grad_norm": 8.072266578674316, "learning_rate": 3.7788995977148823e-06, "loss": 0.348, "mean_token_accuracy": 0.8764893263578415, "num_tokens": 173951952.0, "step": 144580 }, { "entropy": 1.9523023292422295, "epoch": 0.44821634909354685, "grad_norm": 8.023730278015137, "learning_rate": 3.7787689181701514e-06, "loss": 0.4498, "mean_token_accuracy": 0.8551726579666138, "num_tokens": 173963990.0, "step": 144590 }, { "entropy": 1.975418707728386, "epoch": 0.4482473482185965, "grad_norm": 8.128300666809082, "learning_rate": 3.7786382521817178e-06, "loss": 0.4957, "mean_token_accuracy": 0.8471471086144448, "num_tokens": 173974720.0, "step": 144600 }, { "entropy": 1.935040408372879, "epoch": 0.44827834734364624, "grad_norm": 8.031977653503418, "learning_rate": 3.7785075997472385e-06, "loss": 0.4166, "mean_token_accuracy": 0.8642361879348754, "num_tokens": 173986483.0, "step": 144610 }, { "entropy": 1.9660716861486436, "epoch": 0.4483093464686959, "grad_norm": 7.869020938873291, "learning_rate": 3.7783769608643696e-06, "loss": 0.4754, "mean_token_accuracy": 0.8499971255660057, "num_tokens": 173998365.0, "step": 144620 }, { "entropy": 1.998292076587677, "epoch": 0.44834034559374564, "grad_norm": 8.130206108093262, "learning_rate": 3.778246335530769e-06, "loss": 0.5028, "mean_token_accuracy": 0.8486690282821655, "num_tokens": 174009884.0, "step": 144630 }, { "entropy": 1.890485832095146, "epoch": 0.4483713447187953, "grad_norm": 7.7400946617126465, "learning_rate": 3.778115723744095e-06, "loss": 0.4081, "mean_token_accuracy": 0.8541572690010071, "num_tokens": 174022402.0, "step": 144640 }, { "entropy": 1.897654327750206, "epoch": 0.44840234384384503, "grad_norm": 4.250067710876465, "learning_rate": 3.7779851255020057e-06, "loss": 0.4291, "mean_token_accuracy": 0.8561486795544624, "num_tokens": 174034391.0, "step": 144650 }, { "entropy": 1.917440117895603, "epoch": 0.4484333429688947, "grad_norm": 4.559934139251709, "learning_rate": 3.7778545408021607e-06, "loss": 0.4682, "mean_token_accuracy": 0.8511192619800567, "num_tokens": 174046427.0, "step": 144660 }, { "entropy": 1.9716055259108543, "epoch": 0.4484643420939444, "grad_norm": 7.858667850494385, "learning_rate": 3.77772396964222e-06, "loss": 0.4878, "mean_token_accuracy": 0.8432918503880501, "num_tokens": 174057999.0, "step": 144670 }, { "entropy": 1.928077156841755, "epoch": 0.4484953412189941, "grad_norm": 9.245266914367676, "learning_rate": 3.777593412019842e-06, "loss": 0.4646, "mean_token_accuracy": 0.8530062898993492, "num_tokens": 174069219.0, "step": 144680 }, { "entropy": 2.0191250085830688, "epoch": 0.4485263403440438, "grad_norm": 8.808843612670898, "learning_rate": 3.7774628679326895e-06, "loss": 0.5707, "mean_token_accuracy": 0.8358566120266915, "num_tokens": 174079632.0, "step": 144690 }, { "entropy": 1.9445554435253143, "epoch": 0.4485573394690935, "grad_norm": 10.58156681060791, "learning_rate": 3.7773323373784244e-06, "loss": 0.4865, "mean_token_accuracy": 0.8430225223302841, "num_tokens": 174091815.0, "step": 144700 }, { "entropy": 1.9787197232246398, "epoch": 0.44858833859414315, "grad_norm": 8.029949188232422, "learning_rate": 3.777201820354707e-06, "loss": 0.4623, "mean_token_accuracy": 0.8574790298938751, "num_tokens": 174102183.0, "step": 144710 }, { "entropy": 1.8836344301700592, "epoch": 0.4486193377191929, "grad_norm": 4.910955429077148, "learning_rate": 3.777071316859201e-06, "loss": 0.4363, "mean_token_accuracy": 0.8546113297343254, "num_tokens": 174114589.0, "step": 144720 }, { "entropy": 1.8803539738059043, "epoch": 0.44865033684424255, "grad_norm": 7.87885046005249, "learning_rate": 3.776940826889569e-06, "loss": 0.4525, "mean_token_accuracy": 0.8489265456795693, "num_tokens": 174126765.0, "step": 144730 }, { "entropy": 1.9403909876942635, "epoch": 0.44868133596929227, "grad_norm": 9.444135665893555, "learning_rate": 3.776810350443475e-06, "loss": 0.489, "mean_token_accuracy": 0.8454124689102173, "num_tokens": 174138010.0, "step": 144740 }, { "entropy": 1.869876691699028, "epoch": 0.44871233509434194, "grad_norm": 10.500418663024902, "learning_rate": 3.776679887518583e-06, "loss": 0.4856, "mean_token_accuracy": 0.8416117459535599, "num_tokens": 174151284.0, "step": 144750 }, { "entropy": 1.8063198134303093, "epoch": 0.44874333421939167, "grad_norm": 7.03587532043457, "learning_rate": 3.776549438112559e-06, "loss": 0.3784, "mean_token_accuracy": 0.8664273172616959, "num_tokens": 174164715.0, "step": 144760 }, { "entropy": 1.9691897720098495, "epoch": 0.44877433334444133, "grad_norm": 8.313720703125, "learning_rate": 3.7764190022230658e-06, "loss": 0.4791, "mean_token_accuracy": 0.8534282594919205, "num_tokens": 174175915.0, "step": 144770 }, { "entropy": 1.8201162710785865, "epoch": 0.44880533246949106, "grad_norm": 3.464089870452881, "learning_rate": 3.7762885798477715e-06, "loss": 0.3616, "mean_token_accuracy": 0.8693333268165588, "num_tokens": 174188460.0, "step": 144780 }, { "entropy": 1.968066155910492, "epoch": 0.4488363315945407, "grad_norm": 9.63811206817627, "learning_rate": 3.776158170984343e-06, "loss": 0.4864, "mean_token_accuracy": 0.8475179970264435, "num_tokens": 174199357.0, "step": 144790 }, { "entropy": 1.8865119606256484, "epoch": 0.44886733071959045, "grad_norm": 8.431649208068848, "learning_rate": 3.7760277756304458e-06, "loss": 0.4675, "mean_token_accuracy": 0.8538785234093667, "num_tokens": 174211184.0, "step": 144800 }, { "entropy": 1.9904786467552185, "epoch": 0.4488983298446401, "grad_norm": 8.829815864562988, "learning_rate": 3.775897393783749e-06, "loss": 0.4995, "mean_token_accuracy": 0.849235288798809, "num_tokens": 174222072.0, "step": 144810 }, { "entropy": 1.8386689230799675, "epoch": 0.44892932896968984, "grad_norm": 8.532641410827637, "learning_rate": 3.775767025441919e-06, "loss": 0.412, "mean_token_accuracy": 0.8608880117535591, "num_tokens": 174235310.0, "step": 144820 }, { "entropy": 1.9428915694355964, "epoch": 0.4489603280947395, "grad_norm": 8.705090522766113, "learning_rate": 3.7756366706026264e-06, "loss": 0.5264, "mean_token_accuracy": 0.8452722027897834, "num_tokens": 174247069.0, "step": 144830 }, { "entropy": 1.9664770871400834, "epoch": 0.44899132721978924, "grad_norm": 9.778718948364258, "learning_rate": 3.7755063292635396e-06, "loss": 0.5638, "mean_token_accuracy": 0.8389669686555863, "num_tokens": 174257829.0, "step": 144840 }, { "entropy": 1.9240783050656318, "epoch": 0.4490223263448389, "grad_norm": 7.943138122558594, "learning_rate": 3.775376001422329e-06, "loss": 0.504, "mean_token_accuracy": 0.8448940262198448, "num_tokens": 174269741.0, "step": 144850 }, { "entropy": 1.9432480692863465, "epoch": 0.44905332546988863, "grad_norm": 3.412449598312378, "learning_rate": 3.7752456870766646e-06, "loss": 0.4416, "mean_token_accuracy": 0.8651240512728691, "num_tokens": 174280859.0, "step": 144860 }, { "entropy": 1.8958119705319405, "epoch": 0.4490843245949383, "grad_norm": 4.008243083953857, "learning_rate": 3.775115386224218e-06, "loss": 0.4582, "mean_token_accuracy": 0.8487641841173172, "num_tokens": 174293162.0, "step": 144870 }, { "entropy": 1.9293140321969986, "epoch": 0.449115323719988, "grad_norm": 8.646997451782227, "learning_rate": 3.7749850988626597e-06, "loss": 0.4743, "mean_token_accuracy": 0.8494552314281464, "num_tokens": 174304631.0, "step": 144880 }, { "entropy": 1.8826806560158729, "epoch": 0.4491463228450377, "grad_norm": 7.910927772521973, "learning_rate": 3.7748548249896632e-06, "loss": 0.3843, "mean_token_accuracy": 0.8651323825120926, "num_tokens": 174316973.0, "step": 144890 }, { "entropy": 1.9384344890713692, "epoch": 0.4491773219700874, "grad_norm": 8.303495407104492, "learning_rate": 3.7747245646029e-06, "loss": 0.4715, "mean_token_accuracy": 0.8473970159888268, "num_tokens": 174328666.0, "step": 144900 }, { "entropy": 1.9702891185879707, "epoch": 0.4492083210951371, "grad_norm": 7.635629177093506, "learning_rate": 3.7745943177000442e-06, "loss": 0.4746, "mean_token_accuracy": 0.84871247112751, "num_tokens": 174340268.0, "step": 144910 }, { "entropy": 1.9801568925380706, "epoch": 0.4492393202201868, "grad_norm": 7.393730640411377, "learning_rate": 3.7744640842787706e-06, "loss": 0.4899, "mean_token_accuracy": 0.8527787283062935, "num_tokens": 174350919.0, "step": 144920 }, { "entropy": 1.8521282449364662, "epoch": 0.4492703193452365, "grad_norm": 7.78226900100708, "learning_rate": 3.774333864336751e-06, "loss": 0.3754, "mean_token_accuracy": 0.8685169115662574, "num_tokens": 174363562.0, "step": 144930 }, { "entropy": 1.8877981454133987, "epoch": 0.44930131847028615, "grad_norm": 7.9879302978515625, "learning_rate": 3.774203657871663e-06, "loss": 0.4636, "mean_token_accuracy": 0.8551206976175308, "num_tokens": 174375369.0, "step": 144940 }, { "entropy": 1.8575106739997864, "epoch": 0.4493323175953359, "grad_norm": 7.526092529296875, "learning_rate": 3.7740734648811805e-06, "loss": 0.42, "mean_token_accuracy": 0.8627775356173515, "num_tokens": 174387464.0, "step": 144950 }, { "entropy": 1.873704120516777, "epoch": 0.44936331672038554, "grad_norm": 8.647795677185059, "learning_rate": 3.7739432853629796e-06, "loss": 0.4119, "mean_token_accuracy": 0.8583318084478379, "num_tokens": 174399542.0, "step": 144960 }, { "entropy": 1.9029436275362968, "epoch": 0.44939431584543527, "grad_norm": 7.637712001800537, "learning_rate": 3.773813119314738e-06, "loss": 0.454, "mean_token_accuracy": 0.8516765266656876, "num_tokens": 174410772.0, "step": 144970 }, { "entropy": 1.8933783307671548, "epoch": 0.44942531497048493, "grad_norm": 4.113735675811768, "learning_rate": 3.7736829667341326e-06, "loss": 0.5039, "mean_token_accuracy": 0.843044999241829, "num_tokens": 174423995.0, "step": 144980 }, { "entropy": 1.9160526722669602, "epoch": 0.44945631409553466, "grad_norm": 9.520600318908691, "learning_rate": 3.7735528276188404e-06, "loss": 0.4766, "mean_token_accuracy": 0.8556877493858337, "num_tokens": 174435604.0, "step": 144990 }, { "entropy": 1.9604547709226607, "epoch": 0.44948731322058433, "grad_norm": 8.132418632507324, "learning_rate": 3.773422701966541e-06, "loss": 0.4974, "mean_token_accuracy": 0.8487904667854309, "num_tokens": 174446584.0, "step": 145000 }, { "entropy": 1.8689698219299316, "epoch": 0.44951831234563405, "grad_norm": 8.19633960723877, "learning_rate": 3.773292589774912e-06, "loss": 0.4134, "mean_token_accuracy": 0.8521596968173981, "num_tokens": 174458869.0, "step": 145010 }, { "entropy": 1.9306217849254608, "epoch": 0.4495493114706837, "grad_norm": 6.501941204071045, "learning_rate": 3.773162491041633e-06, "loss": 0.4743, "mean_token_accuracy": 0.8544813707470894, "num_tokens": 174470488.0, "step": 145020 }, { "entropy": 1.9626944810152054, "epoch": 0.44958031059573345, "grad_norm": 6.954222202301025, "learning_rate": 3.7730324057643857e-06, "loss": 0.4843, "mean_token_accuracy": 0.8428614303469658, "num_tokens": 174480946.0, "step": 145030 }, { "entropy": 1.9757573395967483, "epoch": 0.4496113097207831, "grad_norm": 7.623712539672852, "learning_rate": 3.7729023339408476e-06, "loss": 0.5138, "mean_token_accuracy": 0.8365162685513496, "num_tokens": 174492039.0, "step": 145040 }, { "entropy": 1.8950822830200196, "epoch": 0.44964230884583284, "grad_norm": 4.679478645324707, "learning_rate": 3.7727722755687034e-06, "loss": 0.4306, "mean_token_accuracy": 0.84790218770504, "num_tokens": 174504318.0, "step": 145050 }, { "entropy": 1.8425507709383964, "epoch": 0.4496733079708825, "grad_norm": 3.7281038761138916, "learning_rate": 3.7726422306456324e-06, "loss": 0.421, "mean_token_accuracy": 0.8493483513593674, "num_tokens": 174516826.0, "step": 145060 }, { "entropy": 1.8533750414848327, "epoch": 0.44970430709593223, "grad_norm": 3.924395799636841, "learning_rate": 3.7725121991693183e-06, "loss": 0.4451, "mean_token_accuracy": 0.8450315102934838, "num_tokens": 174530110.0, "step": 145070 }, { "entropy": 1.9302792519330978, "epoch": 0.4497353062209819, "grad_norm": 12.66701889038086, "learning_rate": 3.772382181137442e-06, "loss": 0.4751, "mean_token_accuracy": 0.846200980246067, "num_tokens": 174542159.0, "step": 145080 }, { "entropy": 1.9027525156736373, "epoch": 0.4497663053460316, "grad_norm": 7.88311243057251, "learning_rate": 3.7722521765476877e-06, "loss": 0.4395, "mean_token_accuracy": 0.8578101888298988, "num_tokens": 174552779.0, "step": 145090 }, { "entropy": 1.8144456431269647, "epoch": 0.4497973044710813, "grad_norm": 7.007596015930176, "learning_rate": 3.77212218539774e-06, "loss": 0.4167, "mean_token_accuracy": 0.8665218591690064, "num_tokens": 174566254.0, "step": 145100 }, { "entropy": 1.9618958830833435, "epoch": 0.449828303596131, "grad_norm": 7.736656665802002, "learning_rate": 3.771992207685284e-06, "loss": 0.4956, "mean_token_accuracy": 0.836806908249855, "num_tokens": 174578147.0, "step": 145110 }, { "entropy": 1.8933872073888778, "epoch": 0.4498593027211807, "grad_norm": 7.00977087020874, "learning_rate": 3.771862243408003e-06, "loss": 0.3912, "mean_token_accuracy": 0.8770283341407776, "num_tokens": 174589833.0, "step": 145120 }, { "entropy": 1.9726043611764907, "epoch": 0.4498903018462304, "grad_norm": 8.317329406738281, "learning_rate": 3.7717322925635836e-06, "loss": 0.4768, "mean_token_accuracy": 0.8538321673870086, "num_tokens": 174600627.0, "step": 145130 }, { "entropy": 1.8826549246907234, "epoch": 0.4499213009712801, "grad_norm": 4.149418354034424, "learning_rate": 3.7716023551497116e-06, "loss": 0.3934, "mean_token_accuracy": 0.8519825637340546, "num_tokens": 174613125.0, "step": 145140 }, { "entropy": 2.011317655444145, "epoch": 0.4499523000963298, "grad_norm": 7.87576961517334, "learning_rate": 3.7714724311640744e-06, "loss": 0.4786, "mean_token_accuracy": 0.8533690482378006, "num_tokens": 174623674.0, "step": 145150 }, { "entropy": 1.9144355922937393, "epoch": 0.4499832992213795, "grad_norm": 7.214294910430908, "learning_rate": 3.771342520604358e-06, "loss": 0.4861, "mean_token_accuracy": 0.851697339117527, "num_tokens": 174635535.0, "step": 145160 }, { "entropy": 1.9184280708432198, "epoch": 0.4500142983464292, "grad_norm": 4.311148166656494, "learning_rate": 3.771212623468252e-06, "loss": 0.4744, "mean_token_accuracy": 0.8484656348824501, "num_tokens": 174647262.0, "step": 145170 }, { "entropy": 1.9903872221708299, "epoch": 0.45004529747147887, "grad_norm": 8.605646133422852, "learning_rate": 3.771082739753443e-06, "loss": 0.4741, "mean_token_accuracy": 0.8503182783722878, "num_tokens": 174658244.0, "step": 145180 }, { "entropy": 1.9099340721964837, "epoch": 0.45007629659652854, "grad_norm": 9.464656829833984, "learning_rate": 3.7709528694576214e-06, "loss": 0.4633, "mean_token_accuracy": 0.8493123814463616, "num_tokens": 174670230.0, "step": 145190 }, { "entropy": 1.962883660197258, "epoch": 0.45010729572157826, "grad_norm": 8.372581481933594, "learning_rate": 3.7708230125784757e-06, "loss": 0.4834, "mean_token_accuracy": 0.8490772858262062, "num_tokens": 174681923.0, "step": 145200 }, { "entropy": 1.8974821627140046, "epoch": 0.45013829484662793, "grad_norm": 3.493914842605591, "learning_rate": 3.7706931691136962e-06, "loss": 0.4276, "mean_token_accuracy": 0.861706106364727, "num_tokens": 174693772.0, "step": 145210 }, { "entropy": 1.9498993948101997, "epoch": 0.45016929397167765, "grad_norm": 7.030703067779541, "learning_rate": 3.7705633390609737e-06, "loss": 0.4377, "mean_token_accuracy": 0.8613954395055771, "num_tokens": 174705327.0, "step": 145220 }, { "entropy": 1.92320823520422, "epoch": 0.4502002930967273, "grad_norm": 11.262289047241211, "learning_rate": 3.770433522418e-06, "loss": 0.4846, "mean_token_accuracy": 0.8487863168120384, "num_tokens": 174717121.0, "step": 145230 }, { "entropy": 1.8713699102401733, "epoch": 0.45023129222177705, "grad_norm": 9.125288963317871, "learning_rate": 3.770303719182465e-06, "loss": 0.4408, "mean_token_accuracy": 0.8564379021525383, "num_tokens": 174729447.0, "step": 145240 }, { "entropy": 1.916731895506382, "epoch": 0.4502622913468267, "grad_norm": 8.092860221862793, "learning_rate": 3.7701739293520634e-06, "loss": 0.446, "mean_token_accuracy": 0.8585136890411377, "num_tokens": 174741652.0, "step": 145250 }, { "entropy": 1.8323576033115387, "epoch": 0.45029329047187644, "grad_norm": 5.573619365692139, "learning_rate": 3.7700441529244865e-06, "loss": 0.4288, "mean_token_accuracy": 0.858129957318306, "num_tokens": 174754464.0, "step": 145260 }, { "entropy": 1.8868038043379785, "epoch": 0.4503242895969261, "grad_norm": 7.117466926574707, "learning_rate": 3.7699143898974273e-06, "loss": 0.4213, "mean_token_accuracy": 0.8598243370652199, "num_tokens": 174767280.0, "step": 145270 }, { "entropy": 1.9327538147568704, "epoch": 0.45035528872197583, "grad_norm": 7.857034206390381, "learning_rate": 3.769784640268581e-06, "loss": 0.5033, "mean_token_accuracy": 0.8374062940478325, "num_tokens": 174779680.0, "step": 145280 }, { "entropy": 1.8759731560945512, "epoch": 0.4503862878470255, "grad_norm": 9.070082664489746, "learning_rate": 3.769654904035642e-06, "loss": 0.4414, "mean_token_accuracy": 0.8461429521441459, "num_tokens": 174792617.0, "step": 145290 }, { "entropy": 1.9318383768200875, "epoch": 0.4504172869720752, "grad_norm": 8.199836730957031, "learning_rate": 3.7695251811963052e-06, "loss": 0.4324, "mean_token_accuracy": 0.8561635687947273, "num_tokens": 174804624.0, "step": 145300 }, { "entropy": 1.9782269150018692, "epoch": 0.4504482860971249, "grad_norm": 3.9492764472961426, "learning_rate": 3.7693954717482656e-06, "loss": 0.4845, "mean_token_accuracy": 0.847953063249588, "num_tokens": 174816707.0, "step": 145310 }, { "entropy": 1.8973633468151092, "epoch": 0.4504792852221746, "grad_norm": 8.541032791137695, "learning_rate": 3.76926577568922e-06, "loss": 0.407, "mean_token_accuracy": 0.8600034147500992, "num_tokens": 174829220.0, "step": 145320 }, { "entropy": 1.868040455877781, "epoch": 0.4505102843472243, "grad_norm": 8.232982635498047, "learning_rate": 3.769136093016865e-06, "loss": 0.4017, "mean_token_accuracy": 0.860238878428936, "num_tokens": 174841040.0, "step": 145330 }, { "entropy": 1.8973990380764008, "epoch": 0.450541283472274, "grad_norm": 7.8517165184021, "learning_rate": 3.769006423728897e-06, "loss": 0.4159, "mean_token_accuracy": 0.8573246464133263, "num_tokens": 174852510.0, "step": 145340 }, { "entropy": 1.88846056163311, "epoch": 0.4505722825973237, "grad_norm": 7.867887496948242, "learning_rate": 3.7688767678230155e-06, "loss": 0.4817, "mean_token_accuracy": 0.8506530284881592, "num_tokens": 174864689.0, "step": 145350 }, { "entropy": 1.9104204386472703, "epoch": 0.4506032817223734, "grad_norm": 9.259328842163086, "learning_rate": 3.768747125296918e-06, "loss": 0.463, "mean_token_accuracy": 0.8554364711046218, "num_tokens": 174876233.0, "step": 145360 }, { "entropy": 1.9462157368659974, "epoch": 0.4506342808474231, "grad_norm": 8.897783279418945, "learning_rate": 3.7686174961483033e-06, "loss": 0.4928, "mean_token_accuracy": 0.8470346301794052, "num_tokens": 174886851.0, "step": 145370 }, { "entropy": 1.9113350063562393, "epoch": 0.4506652799724728, "grad_norm": 9.343399047851562, "learning_rate": 3.768487880374872e-06, "loss": 0.4542, "mean_token_accuracy": 0.8516996219754219, "num_tokens": 174898934.0, "step": 145380 }, { "entropy": 1.9105153515934945, "epoch": 0.45069627909752247, "grad_norm": 8.577310562133789, "learning_rate": 3.768358277974323e-06, "loss": 0.4668, "mean_token_accuracy": 0.8571393460035324, "num_tokens": 174910924.0, "step": 145390 }, { "entropy": 1.7790193900465965, "epoch": 0.4507272782225722, "grad_norm": 3.691340446472168, "learning_rate": 3.7682286889443563e-06, "loss": 0.3487, "mean_token_accuracy": 0.8683654963970184, "num_tokens": 174924748.0, "step": 145400 }, { "entropy": 1.853318177163601, "epoch": 0.45075827734762186, "grad_norm": 3.717491388320923, "learning_rate": 3.768099113282675e-06, "loss": 0.419, "mean_token_accuracy": 0.8503245025873184, "num_tokens": 174937623.0, "step": 145410 }, { "entropy": 1.8263938404619693, "epoch": 0.4507892764726716, "grad_norm": 3.7551968097686768, "learning_rate": 3.7679695509869798e-06, "loss": 0.4215, "mean_token_accuracy": 0.8548484072089195, "num_tokens": 174951248.0, "step": 145420 }, { "entropy": 1.9366182684898376, "epoch": 0.45082027559772125, "grad_norm": 7.332760810852051, "learning_rate": 3.7678400020549727e-06, "loss": 0.5017, "mean_token_accuracy": 0.8432886257767678, "num_tokens": 174962653.0, "step": 145430 }, { "entropy": 1.9753572344779968, "epoch": 0.4508512747227709, "grad_norm": 8.431512832641602, "learning_rate": 3.767710466484357e-06, "loss": 0.5259, "mean_token_accuracy": 0.839150819182396, "num_tokens": 174973717.0, "step": 145440 }, { "entropy": 1.9610534459352493, "epoch": 0.45088227384782065, "grad_norm": 6.784049034118652, "learning_rate": 3.767580944272836e-06, "loss": 0.5492, "mean_token_accuracy": 0.845959635078907, "num_tokens": 174985280.0, "step": 145450 }, { "entropy": 1.8077823013067245, "epoch": 0.4509132729728703, "grad_norm": 9.173667907714844, "learning_rate": 3.767451435418114e-06, "loss": 0.3778, "mean_token_accuracy": 0.8598848000168801, "num_tokens": 174999259.0, "step": 145460 }, { "entropy": 1.8944556072354317, "epoch": 0.45094427209792004, "grad_norm": 8.268648147583008, "learning_rate": 3.767321939917894e-06, "loss": 0.4506, "mean_token_accuracy": 0.861379300057888, "num_tokens": 175010366.0, "step": 145470 }, { "entropy": 1.8945633813738822, "epoch": 0.4509752712229697, "grad_norm": 3.7836310863494873, "learning_rate": 3.7671924577698832e-06, "loss": 0.4324, "mean_token_accuracy": 0.8532457053661346, "num_tokens": 175022036.0, "step": 145480 }, { "entropy": 1.9272257044911385, "epoch": 0.45100627034801943, "grad_norm": 9.02260971069336, "learning_rate": 3.767062988971786e-06, "loss": 0.4369, "mean_token_accuracy": 0.8532265886664391, "num_tokens": 175034153.0, "step": 145490 }, { "entropy": 1.9854293823242188, "epoch": 0.4510372694730691, "grad_norm": 7.62827730178833, "learning_rate": 3.766933533521308e-06, "loss": 0.4918, "mean_token_accuracy": 0.8465104550123215, "num_tokens": 175045152.0, "step": 145500 }, { "entropy": 1.888518001139164, "epoch": 0.4510682685981188, "grad_norm": 7.6335883140563965, "learning_rate": 3.766804091416157e-06, "loss": 0.4595, "mean_token_accuracy": 0.8494478777050972, "num_tokens": 175057796.0, "step": 145510 }, { "entropy": 1.916823922097683, "epoch": 0.4510992677231685, "grad_norm": 9.0162935256958, "learning_rate": 3.76667466265404e-06, "loss": 0.4461, "mean_token_accuracy": 0.8522469475865364, "num_tokens": 175069430.0, "step": 145520 }, { "entropy": 1.8178383886814118, "epoch": 0.4511302668482182, "grad_norm": 8.224920272827148, "learning_rate": 3.766545247232664e-06, "loss": 0.4528, "mean_token_accuracy": 0.8601442322134971, "num_tokens": 175082985.0, "step": 145530 }, { "entropy": 1.898731729388237, "epoch": 0.4511612659732679, "grad_norm": 3.8143157958984375, "learning_rate": 3.7664158451497383e-06, "loss": 0.4486, "mean_token_accuracy": 0.8475686386227608, "num_tokens": 175095045.0, "step": 145540 }, { "entropy": 1.9468153685331344, "epoch": 0.4511922650983176, "grad_norm": 6.286230564117432, "learning_rate": 3.766286456402971e-06, "loss": 0.4355, "mean_token_accuracy": 0.8552075073122978, "num_tokens": 175106619.0, "step": 145550 }, { "entropy": 1.8661633163690567, "epoch": 0.4512232642233673, "grad_norm": 3.8550450801849365, "learning_rate": 3.766157080990073e-06, "loss": 0.4514, "mean_token_accuracy": 0.8487115263938904, "num_tokens": 175119296.0, "step": 145560 }, { "entropy": 1.928366206586361, "epoch": 0.451254263348417, "grad_norm": 7.825737953186035, "learning_rate": 3.7660277189087524e-06, "loss": 0.4441, "mean_token_accuracy": 0.8565060004591942, "num_tokens": 175130424.0, "step": 145570 }, { "entropy": 1.9019853085279466, "epoch": 0.4512852624734667, "grad_norm": 4.290144920349121, "learning_rate": 3.7658983701567215e-06, "loss": 0.4669, "mean_token_accuracy": 0.8472353518009186, "num_tokens": 175143118.0, "step": 145580 }, { "entropy": 1.811234064400196, "epoch": 0.4513162615985164, "grad_norm": 8.767404556274414, "learning_rate": 3.7657690347316896e-06, "loss": 0.4132, "mean_token_accuracy": 0.859492601454258, "num_tokens": 175155763.0, "step": 145590 }, { "entropy": 1.9169629096984864, "epoch": 0.45134726072356607, "grad_norm": 8.782833099365234, "learning_rate": 3.7656397126313704e-06, "loss": 0.4829, "mean_token_accuracy": 0.8517961174249649, "num_tokens": 175167606.0, "step": 145600 }, { "entropy": 1.9656015813350678, "epoch": 0.4513782598486158, "grad_norm": 7.532365322113037, "learning_rate": 3.765510403853474e-06, "loss": 0.5174, "mean_token_accuracy": 0.8420636489987373, "num_tokens": 175178779.0, "step": 145610 }, { "entropy": 1.89768455773592, "epoch": 0.45140925897366546, "grad_norm": 7.2777419090271, "learning_rate": 3.765381108395715e-06, "loss": 0.3982, "mean_token_accuracy": 0.8631177753210068, "num_tokens": 175190689.0, "step": 145620 }, { "entropy": 1.8941746070981025, "epoch": 0.4514402580987152, "grad_norm": 7.515166282653809, "learning_rate": 3.7652518262558054e-06, "loss": 0.4465, "mean_token_accuracy": 0.8515808820724488, "num_tokens": 175202846.0, "step": 145630 }, { "entropy": 1.8148242741823197, "epoch": 0.45147125722376485, "grad_norm": 3.4633376598358154, "learning_rate": 3.7651225574314597e-06, "loss": 0.3591, "mean_token_accuracy": 0.8688716858625412, "num_tokens": 175215871.0, "step": 145640 }, { "entropy": 1.897299911081791, "epoch": 0.4515022563488146, "grad_norm": 8.746367454528809, "learning_rate": 3.764993301920392e-06, "loss": 0.4639, "mean_token_accuracy": 0.8508375376462937, "num_tokens": 175227683.0, "step": 145650 }, { "entropy": 1.9884761601686478, "epoch": 0.45153325547386425, "grad_norm": 7.514569282531738, "learning_rate": 3.764864059720317e-06, "loss": 0.4895, "mean_token_accuracy": 0.8484263613820076, "num_tokens": 175238305.0, "step": 145660 }, { "entropy": 1.8774521455168725, "epoch": 0.45156425459891397, "grad_norm": 4.357245922088623, "learning_rate": 3.7647348308289522e-06, "loss": 0.4094, "mean_token_accuracy": 0.8594071462750434, "num_tokens": 175250175.0, "step": 145670 }, { "entropy": 1.9650772213935852, "epoch": 0.45159525372396364, "grad_norm": 9.48290729522705, "learning_rate": 3.7646056152440104e-06, "loss": 0.5147, "mean_token_accuracy": 0.8337369963526726, "num_tokens": 175261255.0, "step": 145680 }, { "entropy": 1.8208548158407212, "epoch": 0.4516262528490133, "grad_norm": 9.698999404907227, "learning_rate": 3.7644764129632104e-06, "loss": 0.404, "mean_token_accuracy": 0.8611343666911125, "num_tokens": 175274442.0, "step": 145690 }, { "entropy": 1.8542212471365929, "epoch": 0.45165725197406303, "grad_norm": 8.707727432250977, "learning_rate": 3.7643472239842692e-06, "loss": 0.4699, "mean_token_accuracy": 0.8457312777638435, "num_tokens": 175287615.0, "step": 145700 }, { "entropy": 1.947345346212387, "epoch": 0.4516882510991127, "grad_norm": 9.62697696685791, "learning_rate": 3.7642180483049036e-06, "loss": 0.5165, "mean_token_accuracy": 0.8280424475669861, "num_tokens": 175299235.0, "step": 145710 }, { "entropy": 1.7479452803730964, "epoch": 0.4517192502241624, "grad_norm": 7.1266350746154785, "learning_rate": 3.7640888859228326e-06, "loss": 0.3448, "mean_token_accuracy": 0.8756140992045403, "num_tokens": 175312946.0, "step": 145720 }, { "entropy": 1.9307225465774536, "epoch": 0.4517502493492121, "grad_norm": 10.900723457336426, "learning_rate": 3.7639597368357745e-06, "loss": 0.4533, "mean_token_accuracy": 0.8503560811281204, "num_tokens": 175324628.0, "step": 145730 }, { "entropy": 1.9207808002829552, "epoch": 0.4517812484742618, "grad_norm": 8.007662773132324, "learning_rate": 3.763830601041448e-06, "loss": 0.4391, "mean_token_accuracy": 0.8547277525067329, "num_tokens": 175336268.0, "step": 145740 }, { "entropy": 1.9525502398610115, "epoch": 0.4518122475993115, "grad_norm": 7.941483974456787, "learning_rate": 3.763701478537576e-06, "loss": 0.4872, "mean_token_accuracy": 0.8508265346288681, "num_tokens": 175347435.0, "step": 145750 }, { "entropy": 1.8660466879606248, "epoch": 0.4518432467243612, "grad_norm": 4.241132736206055, "learning_rate": 3.763572369321876e-06, "loss": 0.4325, "mean_token_accuracy": 0.8603238686919212, "num_tokens": 175359501.0, "step": 145760 }, { "entropy": 1.9864515900611877, "epoch": 0.4518742458494109, "grad_norm": 8.52209758758545, "learning_rate": 3.763443273392069e-06, "loss": 0.539, "mean_token_accuracy": 0.8401314124464989, "num_tokens": 175370454.0, "step": 145770 }, { "entropy": 1.917511025071144, "epoch": 0.4519052449744606, "grad_norm": 4.007366180419922, "learning_rate": 3.7633141907458774e-06, "loss": 0.4836, "mean_token_accuracy": 0.8430694580078125, "num_tokens": 175382097.0, "step": 145780 }, { "entropy": 1.8464987218379973, "epoch": 0.4519362440995103, "grad_norm": 4.244717121124268, "learning_rate": 3.7631851213810237e-06, "loss": 0.4654, "mean_token_accuracy": 0.8494131848216057, "num_tokens": 175396376.0, "step": 145790 }, { "entropy": 1.8551280453801156, "epoch": 0.45196724322456, "grad_norm": 8.366869926452637, "learning_rate": 3.76305606529523e-06, "loss": 0.4078, "mean_token_accuracy": 0.8622759610414505, "num_tokens": 175408942.0, "step": 145800 }, { "entropy": 1.8460657626390458, "epoch": 0.45199824234960967, "grad_norm": 7.980034351348877, "learning_rate": 3.7629270224862198e-06, "loss": 0.3803, "mean_token_accuracy": 0.8638658255338669, "num_tokens": 175421526.0, "step": 145810 }, { "entropy": 1.8811449334025383, "epoch": 0.4520292414746594, "grad_norm": 8.846792221069336, "learning_rate": 3.762797992951716e-06, "loss": 0.4201, "mean_token_accuracy": 0.8668080389499664, "num_tokens": 175434163.0, "step": 145820 }, { "entropy": 1.8398293122649192, "epoch": 0.45206024059970906, "grad_norm": 8.336902618408203, "learning_rate": 3.762668976689443e-06, "loss": 0.4324, "mean_token_accuracy": 0.8575309097766877, "num_tokens": 175447165.0, "step": 145830 }, { "entropy": 1.8911280773580075, "epoch": 0.4520912397247588, "grad_norm": 3.6037964820861816, "learning_rate": 3.7625399736971264e-06, "loss": 0.4532, "mean_token_accuracy": 0.8530662402510643, "num_tokens": 175458732.0, "step": 145840 }, { "entropy": 1.937391071021557, "epoch": 0.45212223884980846, "grad_norm": 7.425451755523682, "learning_rate": 3.7624109839724915e-06, "loss": 0.4557, "mean_token_accuracy": 0.8544071182608605, "num_tokens": 175470385.0, "step": 145850 }, { "entropy": 1.9437742054462432, "epoch": 0.4521532379748582, "grad_norm": 7.55548095703125, "learning_rate": 3.762282007513263e-06, "loss": 0.459, "mean_token_accuracy": 0.8576256543397903, "num_tokens": 175482257.0, "step": 145860 }, { "entropy": 1.879883836209774, "epoch": 0.45218423709990785, "grad_norm": 9.295843124389648, "learning_rate": 3.7621530443171695e-06, "loss": 0.4445, "mean_token_accuracy": 0.8495224505662918, "num_tokens": 175494991.0, "step": 145870 }, { "entropy": 1.8533268101513385, "epoch": 0.4522152362249576, "grad_norm": 2.5859084129333496, "learning_rate": 3.7620240943819353e-06, "loss": 0.4178, "mean_token_accuracy": 0.8556357622146606, "num_tokens": 175507885.0, "step": 145880 }, { "entropy": 1.8991135403513908, "epoch": 0.45224623535000724, "grad_norm": 8.07573127746582, "learning_rate": 3.7618951577052897e-06, "loss": 0.4416, "mean_token_accuracy": 0.8474540561437607, "num_tokens": 175519719.0, "step": 145890 }, { "entropy": 1.9330607324838638, "epoch": 0.45227723447505697, "grad_norm": 7.152818202972412, "learning_rate": 3.7617662342849608e-06, "loss": 0.4621, "mean_token_accuracy": 0.8559790030121803, "num_tokens": 175531157.0, "step": 145900 }, { "entropy": 1.875030305981636, "epoch": 0.45230823360010664, "grad_norm": 10.458932876586914, "learning_rate": 3.7616373241186765e-06, "loss": 0.4402, "mean_token_accuracy": 0.85587347894907, "num_tokens": 175542756.0, "step": 145910 }, { "entropy": 1.9337833374738693, "epoch": 0.45233923272515636, "grad_norm": 9.064762115478516, "learning_rate": 3.7615084272041664e-06, "loss": 0.4848, "mean_token_accuracy": 0.8471284449100495, "num_tokens": 175555072.0, "step": 145920 }, { "entropy": 1.9829118058085442, "epoch": 0.45237023185020603, "grad_norm": 7.892855644226074, "learning_rate": 3.7613795435391603e-06, "loss": 0.4718, "mean_token_accuracy": 0.8524794027209281, "num_tokens": 175566512.0, "step": 145930 }, { "entropy": 1.8689216911792754, "epoch": 0.4524012309752557, "grad_norm": 7.025180816650391, "learning_rate": 3.761250673121388e-06, "loss": 0.4345, "mean_token_accuracy": 0.8556947037577629, "num_tokens": 175577952.0, "step": 145940 }, { "entropy": 1.8746563777327538, "epoch": 0.4524322301003054, "grad_norm": 8.948837280273438, "learning_rate": 3.7611218159485807e-06, "loss": 0.4173, "mean_token_accuracy": 0.8540759429335594, "num_tokens": 175589918.0, "step": 145950 }, { "entropy": 1.906599646806717, "epoch": 0.4524632292253551, "grad_norm": 8.048293113708496, "learning_rate": 3.7609929720184695e-06, "loss": 0.447, "mean_token_accuracy": 0.8491839274764061, "num_tokens": 175602264.0, "step": 145960 }, { "entropy": 1.8783759981393815, "epoch": 0.4524942283504048, "grad_norm": 8.030713081359863, "learning_rate": 3.7608641413287865e-06, "loss": 0.3877, "mean_token_accuracy": 0.8676090762019157, "num_tokens": 175613895.0, "step": 145970 }, { "entropy": 1.8621134147047997, "epoch": 0.4525252274754545, "grad_norm": 9.925078392028809, "learning_rate": 3.760735323877264e-06, "loss": 0.4474, "mean_token_accuracy": 0.8541931763291359, "num_tokens": 175625978.0, "step": 145980 }, { "entropy": 1.9505731910467148, "epoch": 0.4525562266005042, "grad_norm": 8.451035499572754, "learning_rate": 3.7606065196616353e-06, "loss": 0.4809, "mean_token_accuracy": 0.8430397853255271, "num_tokens": 175637809.0, "step": 145990 }, { "entropy": 1.9288512140512466, "epoch": 0.4525872257255539, "grad_norm": 9.2060546875, "learning_rate": 3.7604777286796333e-06, "loss": 0.4286, "mean_token_accuracy": 0.8581225946545601, "num_tokens": 175649771.0, "step": 146000 }, { "entropy": 1.9088809505105018, "epoch": 0.4526182248506036, "grad_norm": 9.723130226135254, "learning_rate": 3.7603489509289924e-06, "loss": 0.4687, "mean_token_accuracy": 0.8498413473367691, "num_tokens": 175661361.0, "step": 146010 }, { "entropy": 1.8807017832994462, "epoch": 0.45264922397565327, "grad_norm": 7.284520626068115, "learning_rate": 3.7602201864074476e-06, "loss": 0.4296, "mean_token_accuracy": 0.8585941612720489, "num_tokens": 175674036.0, "step": 146020 }, { "entropy": 1.9656174287199975, "epoch": 0.452680223100703, "grad_norm": 9.173661231994629, "learning_rate": 3.7600914351127343e-06, "loss": 0.4841, "mean_token_accuracy": 0.8418927356600762, "num_tokens": 175685277.0, "step": 146030 }, { "entropy": 1.9602088913321496, "epoch": 0.45271122222575266, "grad_norm": 8.204484939575195, "learning_rate": 3.759962697042587e-06, "loss": 0.4398, "mean_token_accuracy": 0.8636955425143242, "num_tokens": 175696654.0, "step": 146040 }, { "entropy": 1.886138205230236, "epoch": 0.4527422213508024, "grad_norm": 8.34501838684082, "learning_rate": 3.7598339721947426e-06, "loss": 0.4302, "mean_token_accuracy": 0.85477264970541, "num_tokens": 175708775.0, "step": 146050 }, { "entropy": 1.8968783617019653, "epoch": 0.45277322047585206, "grad_norm": 8.124290466308594, "learning_rate": 3.7597052605669376e-06, "loss": 0.4263, "mean_token_accuracy": 0.8634179666638374, "num_tokens": 175720239.0, "step": 146060 }, { "entropy": 1.9461852222681046, "epoch": 0.4528042196009018, "grad_norm": 10.085100173950195, "learning_rate": 3.7595765621569098e-06, "loss": 0.4547, "mean_token_accuracy": 0.8562173038721085, "num_tokens": 175731786.0, "step": 146070 }, { "entropy": 1.8739292353391648, "epoch": 0.45283521872595145, "grad_norm": 8.647810935974121, "learning_rate": 3.7594478769623967e-06, "loss": 0.4165, "mean_token_accuracy": 0.8552744179964066, "num_tokens": 175744743.0, "step": 146080 }, { "entropy": 1.972387745976448, "epoch": 0.4528662178510012, "grad_norm": 8.841172218322754, "learning_rate": 3.759319204981136e-06, "loss": 0.4821, "mean_token_accuracy": 0.8444529309868812, "num_tokens": 175756143.0, "step": 146090 }, { "entropy": 1.804822953045368, "epoch": 0.45289721697605084, "grad_norm": 7.164949893951416, "learning_rate": 3.7591905462108686e-06, "loss": 0.374, "mean_token_accuracy": 0.8675896748900414, "num_tokens": 175768824.0, "step": 146100 }, { "entropy": 1.858502623438835, "epoch": 0.45292821610110057, "grad_norm": 7.452975749969482, "learning_rate": 3.7590619006493333e-06, "loss": 0.4396, "mean_token_accuracy": 0.8505475029349328, "num_tokens": 175781202.0, "step": 146110 }, { "entropy": 1.9403677567839623, "epoch": 0.45295921522615024, "grad_norm": 9.553077697753906, "learning_rate": 3.7589332682942687e-06, "loss": 0.4697, "mean_token_accuracy": 0.8545985773205758, "num_tokens": 175792433.0, "step": 146120 }, { "entropy": 1.929755797982216, "epoch": 0.45299021435119996, "grad_norm": 7.850072860717773, "learning_rate": 3.7588046491434164e-06, "loss": 0.4831, "mean_token_accuracy": 0.8516537502408028, "num_tokens": 175803014.0, "step": 146130 }, { "entropy": 1.8702018111944199, "epoch": 0.45302121347624963, "grad_norm": 4.102813243865967, "learning_rate": 3.758676043194518e-06, "loss": 0.3885, "mean_token_accuracy": 0.8598198860883712, "num_tokens": 175814852.0, "step": 146140 }, { "entropy": 1.875620885193348, "epoch": 0.45305221260129935, "grad_norm": 7.794433116912842, "learning_rate": 3.7585474504453145e-06, "loss": 0.4321, "mean_token_accuracy": 0.8569299295544625, "num_tokens": 175826845.0, "step": 146150 }, { "entropy": 1.9491507172584535, "epoch": 0.453083211726349, "grad_norm": 8.696412086486816, "learning_rate": 3.758418870893548e-06, "loss": 0.479, "mean_token_accuracy": 0.8512806192040443, "num_tokens": 175838457.0, "step": 146160 }, { "entropy": 1.8456061393022538, "epoch": 0.45311421085139875, "grad_norm": 3.9336066246032715, "learning_rate": 3.758290304536962e-06, "loss": 0.3994, "mean_token_accuracy": 0.861776913702488, "num_tokens": 175850893.0, "step": 146170 }, { "entropy": 1.8336857661604882, "epoch": 0.4531452099764484, "grad_norm": 4.638808727264404, "learning_rate": 3.7581617513732994e-06, "loss": 0.3938, "mean_token_accuracy": 0.8632370933890343, "num_tokens": 175863816.0, "step": 146180 }, { "entropy": 1.8227697402238845, "epoch": 0.4531762091014981, "grad_norm": 8.638792991638184, "learning_rate": 3.7580332114003033e-06, "loss": 0.475, "mean_token_accuracy": 0.8531124874949455, "num_tokens": 175877462.0, "step": 146190 }, { "entropy": 1.938033263385296, "epoch": 0.4532072082265478, "grad_norm": 9.167952537536621, "learning_rate": 3.757904684615719e-06, "loss": 0.4254, "mean_token_accuracy": 0.8607913628220558, "num_tokens": 175889063.0, "step": 146200 }, { "entropy": 1.8250967353582381, "epoch": 0.4532382073515975, "grad_norm": 7.031805515289307, "learning_rate": 3.7577761710172912e-06, "loss": 0.3626, "mean_token_accuracy": 0.871262151002884, "num_tokens": 175902376.0, "step": 146210 }, { "entropy": 1.872345322370529, "epoch": 0.4532692064766472, "grad_norm": 7.401670932769775, "learning_rate": 3.757647670602764e-06, "loss": 0.4387, "mean_token_accuracy": 0.8529538735747337, "num_tokens": 175915132.0, "step": 146220 }, { "entropy": 1.8786262601613999, "epoch": 0.45330020560169687, "grad_norm": 4.830646514892578, "learning_rate": 3.757519183369886e-06, "loss": 0.4165, "mean_token_accuracy": 0.85924152135849, "num_tokens": 175927680.0, "step": 146230 }, { "entropy": 1.9301627531647683, "epoch": 0.4533312047267466, "grad_norm": 4.050126552581787, "learning_rate": 3.757390709316402e-06, "loss": 0.457, "mean_token_accuracy": 0.8585702002048492, "num_tokens": 175938798.0, "step": 146240 }, { "entropy": 1.909451249241829, "epoch": 0.45336220385179626, "grad_norm": 7.195918560028076, "learning_rate": 3.757262248440059e-06, "loss": 0.4654, "mean_token_accuracy": 0.8493908762931823, "num_tokens": 175949970.0, "step": 146250 }, { "entropy": 1.9241393342614175, "epoch": 0.453393202976846, "grad_norm": 3.823732614517212, "learning_rate": 3.7571338007386053e-06, "loss": 0.5309, "mean_token_accuracy": 0.830666047334671, "num_tokens": 175961893.0, "step": 146260 }, { "entropy": 1.9584316313266754, "epoch": 0.45342420210189566, "grad_norm": 7.582314968109131, "learning_rate": 3.7570053662097884e-06, "loss": 0.4784, "mean_token_accuracy": 0.8390909880399704, "num_tokens": 175973692.0, "step": 146270 }, { "entropy": 1.8017342045903206, "epoch": 0.4534552012269454, "grad_norm": 4.599339008331299, "learning_rate": 3.7568769448513577e-06, "loss": 0.336, "mean_token_accuracy": 0.87629035115242, "num_tokens": 175987073.0, "step": 146280 }, { "entropy": 1.9104434236884118, "epoch": 0.45348620035199505, "grad_norm": 5.8684210777282715, "learning_rate": 3.756748536661061e-06, "loss": 0.4021, "mean_token_accuracy": 0.8615088537335396, "num_tokens": 175999179.0, "step": 146290 }, { "entropy": 1.9020302399992943, "epoch": 0.4535171994770448, "grad_norm": 8.238286972045898, "learning_rate": 3.7566201416366497e-06, "loss": 0.4678, "mean_token_accuracy": 0.850467374920845, "num_tokens": 176010704.0, "step": 146300 }, { "entropy": 1.9250214383006097, "epoch": 0.45354819860209444, "grad_norm": 7.398449897766113, "learning_rate": 3.756491759775874e-06, "loss": 0.4741, "mean_token_accuracy": 0.8439469993114471, "num_tokens": 176023049.0, "step": 146310 }, { "entropy": 1.9645772278308868, "epoch": 0.45357919772714417, "grad_norm": 7.6212334632873535, "learning_rate": 3.7563633910764837e-06, "loss": 0.5461, "mean_token_accuracy": 0.8408754646778107, "num_tokens": 176034032.0, "step": 146320 }, { "entropy": 1.9224450066685677, "epoch": 0.45361019685219384, "grad_norm": 5.688684463500977, "learning_rate": 3.7562350355362297e-06, "loss": 0.5091, "mean_token_accuracy": 0.8347508668899536, "num_tokens": 176046264.0, "step": 146330 }, { "entropy": 1.9145216554403306, "epoch": 0.45364119597724356, "grad_norm": 8.109264373779297, "learning_rate": 3.7561066931528657e-06, "loss": 0.4583, "mean_token_accuracy": 0.8559110507369041, "num_tokens": 176057654.0, "step": 146340 }, { "entropy": 1.903997114300728, "epoch": 0.45367219510229323, "grad_norm": 7.708643436431885, "learning_rate": 3.755978363924143e-06, "loss": 0.4314, "mean_token_accuracy": 0.8482723668217659, "num_tokens": 176070619.0, "step": 146350 }, { "entropy": 1.8557177141308785, "epoch": 0.45370319422734295, "grad_norm": 4.477802753448486, "learning_rate": 3.755850047847815e-06, "loss": 0.4151, "mean_token_accuracy": 0.8525144010782242, "num_tokens": 176083576.0, "step": 146360 }, { "entropy": 1.9063315868377686, "epoch": 0.4537341933523926, "grad_norm": 8.484750747680664, "learning_rate": 3.7557217449216354e-06, "loss": 0.4557, "mean_token_accuracy": 0.8580427676439285, "num_tokens": 176095193.0, "step": 146370 }, { "entropy": 1.8667533531785012, "epoch": 0.45376519247744235, "grad_norm": 9.92103099822998, "learning_rate": 3.755593455143357e-06, "loss": 0.4305, "mean_token_accuracy": 0.8585796818137169, "num_tokens": 176107763.0, "step": 146380 }, { "entropy": 1.9320159569382667, "epoch": 0.453796191602492, "grad_norm": 9.859814643859863, "learning_rate": 3.7554651785107367e-06, "loss": 0.4778, "mean_token_accuracy": 0.8518664732575416, "num_tokens": 176119971.0, "step": 146390 }, { "entropy": 1.8948560282588005, "epoch": 0.45382719072754174, "grad_norm": 3.578590154647827, "learning_rate": 3.755336915021527e-06, "loss": 0.4498, "mean_token_accuracy": 0.8504968300461769, "num_tokens": 176132352.0, "step": 146400 }, { "entropy": 1.9224641382694245, "epoch": 0.4538581898525914, "grad_norm": 9.709887504577637, "learning_rate": 3.755208664673485e-06, "loss": 0.452, "mean_token_accuracy": 0.849474447965622, "num_tokens": 176145215.0, "step": 146410 }, { "entropy": 1.9500353157520294, "epoch": 0.45388918897764113, "grad_norm": 7.521528720855713, "learning_rate": 3.7550804274643673e-06, "loss": 0.4387, "mean_token_accuracy": 0.8548279941082001, "num_tokens": 176156902.0, "step": 146420 }, { "entropy": 1.9636742144823074, "epoch": 0.4539201881026908, "grad_norm": 9.059736251831055, "learning_rate": 3.7549522033919293e-06, "loss": 0.4779, "mean_token_accuracy": 0.8536644339561462, "num_tokens": 176167981.0, "step": 146430 }, { "entropy": 1.9579711258411407, "epoch": 0.45395118722774047, "grad_norm": 7.5242390632629395, "learning_rate": 3.7548239924539294e-06, "loss": 0.4569, "mean_token_accuracy": 0.855312067270279, "num_tokens": 176179862.0, "step": 146440 }, { "entropy": 1.9306029558181763, "epoch": 0.4539821863527902, "grad_norm": 8.735157012939453, "learning_rate": 3.754695794648125e-06, "loss": 0.4603, "mean_token_accuracy": 0.8461642101407051, "num_tokens": 176191558.0, "step": 146450 }, { "entropy": 1.8903416648507119, "epoch": 0.45401318547783986, "grad_norm": 8.93207836151123, "learning_rate": 3.7545676099722737e-06, "loss": 0.4224, "mean_token_accuracy": 0.8568781584501266, "num_tokens": 176203878.0, "step": 146460 }, { "entropy": 1.918420398235321, "epoch": 0.4540441846028896, "grad_norm": 8.027023315429688, "learning_rate": 3.7544394384241366e-06, "loss": 0.4635, "mean_token_accuracy": 0.8519258230924607, "num_tokens": 176215578.0, "step": 146470 }, { "entropy": 1.9900656551122666, "epoch": 0.45407518372793926, "grad_norm": 6.475451469421387, "learning_rate": 3.754311280001471e-06, "loss": 0.4894, "mean_token_accuracy": 0.8443711012601852, "num_tokens": 176226440.0, "step": 146480 }, { "entropy": 1.9735152557492257, "epoch": 0.454106182852989, "grad_norm": 8.966446876525879, "learning_rate": 3.7541831347020374e-06, "loss": 0.5237, "mean_token_accuracy": 0.8361330017447471, "num_tokens": 176237579.0, "step": 146490 }, { "entropy": 1.8347552955150603, "epoch": 0.45413718197803865, "grad_norm": 4.222280502319336, "learning_rate": 3.754055002523596e-06, "loss": 0.4493, "mean_token_accuracy": 0.8571354284882545, "num_tokens": 176251272.0, "step": 146500 }, { "entropy": 1.8998313397169113, "epoch": 0.4541681811030884, "grad_norm": 8.564806938171387, "learning_rate": 3.7539268834639085e-06, "loss": 0.4753, "mean_token_accuracy": 0.8468169465661048, "num_tokens": 176263481.0, "step": 146510 }, { "entropy": 1.912764126062393, "epoch": 0.45419918022813804, "grad_norm": 8.024970054626465, "learning_rate": 3.7537987775207373e-06, "loss": 0.3862, "mean_token_accuracy": 0.8708274886012077, "num_tokens": 176275846.0, "step": 146520 }, { "entropy": 1.9775932729244232, "epoch": 0.45423017935318777, "grad_norm": 10.816838264465332, "learning_rate": 3.753670684691842e-06, "loss": 0.5207, "mean_token_accuracy": 0.842138460278511, "num_tokens": 176286757.0, "step": 146530 }, { "entropy": 1.9837248116731643, "epoch": 0.45426117847823744, "grad_norm": 8.28238296508789, "learning_rate": 3.7535426049749867e-06, "loss": 0.4965, "mean_token_accuracy": 0.8460395529866218, "num_tokens": 176297210.0, "step": 146540 }, { "entropy": 1.9528759866952896, "epoch": 0.45429217760328716, "grad_norm": 8.31442642211914, "learning_rate": 3.7534145383679354e-06, "loss": 0.4845, "mean_token_accuracy": 0.8538484081625939, "num_tokens": 176309108.0, "step": 146550 }, { "entropy": 1.8652475848793983, "epoch": 0.45432317672833683, "grad_norm": 8.39297866821289, "learning_rate": 3.7532864848684496e-06, "loss": 0.4237, "mean_token_accuracy": 0.8631422132253647, "num_tokens": 176322014.0, "step": 146560 }, { "entropy": 1.9841240167617797, "epoch": 0.45435417585338655, "grad_norm": 9.03348159790039, "learning_rate": 3.7531584444742956e-06, "loss": 0.4887, "mean_token_accuracy": 0.8504109963774681, "num_tokens": 176332381.0, "step": 146570 }, { "entropy": 1.8902300730347634, "epoch": 0.4543851749784362, "grad_norm": 3.9884934425354004, "learning_rate": 3.753030417183237e-06, "loss": 0.4038, "mean_token_accuracy": 0.8569736734032631, "num_tokens": 176345267.0, "step": 146580 }, { "entropy": 1.8768309980630875, "epoch": 0.45441617410348595, "grad_norm": 8.625255584716797, "learning_rate": 3.75290240299304e-06, "loss": 0.4368, "mean_token_accuracy": 0.8578463882207871, "num_tokens": 176357705.0, "step": 146590 }, { "entropy": 1.9548759251832961, "epoch": 0.4544471732285356, "grad_norm": 4.450130462646484, "learning_rate": 3.7527744019014693e-06, "loss": 0.4713, "mean_token_accuracy": 0.855126628279686, "num_tokens": 176369275.0, "step": 146600 }, { "entropy": 1.8565620198845862, "epoch": 0.45447817235358534, "grad_norm": 8.501686096191406, "learning_rate": 3.7526464139062918e-06, "loss": 0.405, "mean_token_accuracy": 0.863280688226223, "num_tokens": 176382234.0, "step": 146610 }, { "entropy": 1.8947626411914826, "epoch": 0.454509171478635, "grad_norm": 9.0901460647583, "learning_rate": 3.752518439005274e-06, "loss": 0.4351, "mean_token_accuracy": 0.8616373479366303, "num_tokens": 176394636.0, "step": 146620 }, { "entropy": 1.9476829752326013, "epoch": 0.45454017060368473, "grad_norm": 8.009326934814453, "learning_rate": 3.752390477196185e-06, "loss": 0.4846, "mean_token_accuracy": 0.8480390936136246, "num_tokens": 176406091.0, "step": 146630 }, { "entropy": 1.9551826044917107, "epoch": 0.4545711697287344, "grad_norm": 3.430755853652954, "learning_rate": 3.752262528476791e-06, "loss": 0.5021, "mean_token_accuracy": 0.8439038008451462, "num_tokens": 176417335.0, "step": 146640 }, { "entropy": 1.9948366075754165, "epoch": 0.4546021688537841, "grad_norm": 6.840229034423828, "learning_rate": 3.752134592844861e-06, "loss": 0.5027, "mean_token_accuracy": 0.8471700206398964, "num_tokens": 176428379.0, "step": 146650 }, { "entropy": 1.964817936718464, "epoch": 0.4546331679788338, "grad_norm": 4.002964496612549, "learning_rate": 3.7520066702981637e-06, "loss": 0.4591, "mean_token_accuracy": 0.8483250498771667, "num_tokens": 176439775.0, "step": 146660 }, { "entropy": 1.9319669753313065, "epoch": 0.45466416710388347, "grad_norm": 4.155783653259277, "learning_rate": 3.7518787608344694e-06, "loss": 0.4414, "mean_token_accuracy": 0.8619429409503937, "num_tokens": 176451549.0, "step": 146670 }, { "entropy": 1.9431686520576477, "epoch": 0.4546951662289332, "grad_norm": 8.80432415008545, "learning_rate": 3.7517508644515476e-06, "loss": 0.4715, "mean_token_accuracy": 0.8515708222985268, "num_tokens": 176463221.0, "step": 146680 }, { "entropy": 1.912970869243145, "epoch": 0.45472616535398286, "grad_norm": 9.149775505065918, "learning_rate": 3.7516229811471686e-06, "loss": 0.4347, "mean_token_accuracy": 0.8584818720817566, "num_tokens": 176475849.0, "step": 146690 }, { "entropy": 1.948892466723919, "epoch": 0.4547571644790326, "grad_norm": 8.775398254394531, "learning_rate": 3.751495110919105e-06, "loss": 0.427, "mean_token_accuracy": 0.8604862332344055, "num_tokens": 176487394.0, "step": 146700 }, { "entropy": 1.879041202366352, "epoch": 0.45478816360408225, "grad_norm": 8.988895416259766, "learning_rate": 3.7513672537651273e-06, "loss": 0.4596, "mean_token_accuracy": 0.8561202257871627, "num_tokens": 176499781.0, "step": 146710 }, { "entropy": 1.9582669615745545, "epoch": 0.454819162729132, "grad_norm": 10.033254623413086, "learning_rate": 3.751239409683008e-06, "loss": 0.5125, "mean_token_accuracy": 0.8357150718569756, "num_tokens": 176512409.0, "step": 146720 }, { "entropy": 1.9103184998035432, "epoch": 0.45485016185418164, "grad_norm": 8.347360610961914, "learning_rate": 3.75111157867052e-06, "loss": 0.4322, "mean_token_accuracy": 0.8483709633350373, "num_tokens": 176524784.0, "step": 146730 }, { "entropy": 1.9160448759794235, "epoch": 0.45488116097923137, "grad_norm": 4.41226863861084, "learning_rate": 3.7509837607254356e-06, "loss": 0.4349, "mean_token_accuracy": 0.8565223976969719, "num_tokens": 176536691.0, "step": 146740 }, { "entropy": 1.8885368049144744, "epoch": 0.45491216010428104, "grad_norm": 7.640499591827393, "learning_rate": 3.75085595584553e-06, "loss": 0.4101, "mean_token_accuracy": 0.8599068447947502, "num_tokens": 176549077.0, "step": 146750 }, { "entropy": 1.9121224999427795, "epoch": 0.45494315922933076, "grad_norm": 8.662951469421387, "learning_rate": 3.750728164028577e-06, "loss": 0.4483, "mean_token_accuracy": 0.8458693385124206, "num_tokens": 176561257.0, "step": 146760 }, { "entropy": 1.8658764064311981, "epoch": 0.45497415835438043, "grad_norm": 8.34899616241455, "learning_rate": 3.7506003852723517e-06, "loss": 0.4581, "mean_token_accuracy": 0.8492805927991867, "num_tokens": 176574547.0, "step": 146770 }, { "entropy": 2.010634405910969, "epoch": 0.45500515747943016, "grad_norm": 4.921254634857178, "learning_rate": 3.7504726195746287e-06, "loss": 0.4775, "mean_token_accuracy": 0.8483109161257744, "num_tokens": 176585789.0, "step": 146780 }, { "entropy": 2.0231185287237166, "epoch": 0.4550361566044798, "grad_norm": 7.859384536743164, "learning_rate": 3.750344866933185e-06, "loss": 0.482, "mean_token_accuracy": 0.854818707704544, "num_tokens": 176596403.0, "step": 146790 }, { "entropy": 1.809226544201374, "epoch": 0.45506715572952955, "grad_norm": 7.986112594604492, "learning_rate": 3.750217127345796e-06, "loss": 0.3894, "mean_token_accuracy": 0.8633719727396965, "num_tokens": 176610424.0, "step": 146800 }, { "entropy": 1.9953494429588319, "epoch": 0.4550981548545792, "grad_norm": 8.195384979248047, "learning_rate": 3.7500894008102395e-06, "loss": 0.5193, "mean_token_accuracy": 0.8488559857010841, "num_tokens": 176621493.0, "step": 146810 }, { "entropy": 1.7815378695726394, "epoch": 0.45512915397962894, "grad_norm": 6.289517402648926, "learning_rate": 3.7499616873242926e-06, "loss": 0.3925, "mean_token_accuracy": 0.8637578442692757, "num_tokens": 176635418.0, "step": 146820 }, { "entropy": 1.9626459717750548, "epoch": 0.4551601531046786, "grad_norm": 8.439818382263184, "learning_rate": 3.7498339868857342e-06, "loss": 0.508, "mean_token_accuracy": 0.852840892970562, "num_tokens": 176646632.0, "step": 146830 }, { "entropy": 1.906586892902851, "epoch": 0.45519115222972834, "grad_norm": 8.007729530334473, "learning_rate": 3.749706299492342e-06, "loss": 0.4267, "mean_token_accuracy": 0.8653773680329323, "num_tokens": 176657728.0, "step": 146840 }, { "entropy": 1.9015120595693589, "epoch": 0.455222151354778, "grad_norm": 7.211781978607178, "learning_rate": 3.749578625141895e-06, "loss": 0.4305, "mean_token_accuracy": 0.8567000299692153, "num_tokens": 176670194.0, "step": 146850 }, { "entropy": 1.9847611114382744, "epoch": 0.45525315047982773, "grad_norm": 6.767532825469971, "learning_rate": 3.7494509638321734e-06, "loss": 0.4531, "mean_token_accuracy": 0.8509671688079834, "num_tokens": 176681480.0, "step": 146860 }, { "entropy": 1.8333649218082428, "epoch": 0.4552841496048774, "grad_norm": 7.665855407714844, "learning_rate": 3.749323315560957e-06, "loss": 0.4189, "mean_token_accuracy": 0.856324379146099, "num_tokens": 176694603.0, "step": 146870 }, { "entropy": 1.9811863422393798, "epoch": 0.4553151487299271, "grad_norm": 8.544697761535645, "learning_rate": 3.7491956803260273e-06, "loss": 0.4924, "mean_token_accuracy": 0.8474650055170059, "num_tokens": 176705780.0, "step": 146880 }, { "entropy": 1.8731940254569053, "epoch": 0.4553461478549768, "grad_norm": 8.085098266601562, "learning_rate": 3.7490680581251637e-06, "loss": 0.4266, "mean_token_accuracy": 0.8562427669763565, "num_tokens": 176717194.0, "step": 146890 }, { "entropy": 1.862533800303936, "epoch": 0.4553771469800265, "grad_norm": 4.403972625732422, "learning_rate": 3.74894044895615e-06, "loss": 0.4545, "mean_token_accuracy": 0.8544600263237954, "num_tokens": 176730041.0, "step": 146900 }, { "entropy": 1.8781135827302933, "epoch": 0.4554081461050762, "grad_norm": 3.9804108142852783, "learning_rate": 3.7488128528167672e-06, "loss": 0.428, "mean_token_accuracy": 0.8630628094077111, "num_tokens": 176742512.0, "step": 146910 }, { "entropy": 1.9072934925556182, "epoch": 0.45543914523012585, "grad_norm": 7.206721305847168, "learning_rate": 3.7486852697047988e-06, "loss": 0.4205, "mean_token_accuracy": 0.8565845042467117, "num_tokens": 176754331.0, "step": 146920 }, { "entropy": 1.9471127331256866, "epoch": 0.4554701443551756, "grad_norm": 6.824718952178955, "learning_rate": 3.748557699618028e-06, "loss": 0.4853, "mean_token_accuracy": 0.8565745607018471, "num_tokens": 176765360.0, "step": 146930 }, { "entropy": 1.888777793943882, "epoch": 0.45550114348022525, "grad_norm": 8.23228645324707, "learning_rate": 3.748430142554238e-06, "loss": 0.4229, "mean_token_accuracy": 0.865308640897274, "num_tokens": 176777052.0, "step": 146940 }, { "entropy": 1.881396123766899, "epoch": 0.45553214260527497, "grad_norm": 7.634214878082275, "learning_rate": 3.748302598511214e-06, "loss": 0.4877, "mean_token_accuracy": 0.8448997780680656, "num_tokens": 176789075.0, "step": 146950 }, { "entropy": 1.888343983888626, "epoch": 0.45556314173032464, "grad_norm": 8.560070991516113, "learning_rate": 3.748175067486742e-06, "loss": 0.447, "mean_token_accuracy": 0.8476789712905883, "num_tokens": 176801688.0, "step": 146960 }, { "entropy": 1.8695935264229775, "epoch": 0.45559414085537436, "grad_norm": 4.059643268585205, "learning_rate": 3.7480475494786045e-06, "loss": 0.3986, "mean_token_accuracy": 0.8586703911423683, "num_tokens": 176814689.0, "step": 146970 }, { "entropy": 1.8886354252696038, "epoch": 0.45562513998042403, "grad_norm": 3.8732738494873047, "learning_rate": 3.7479200444845893e-06, "loss": 0.4933, "mean_token_accuracy": 0.8417670831084252, "num_tokens": 176827493.0, "step": 146980 }, { "entropy": 1.8563087373971938, "epoch": 0.45565613910547376, "grad_norm": 8.856595993041992, "learning_rate": 3.7477925525024837e-06, "loss": 0.4056, "mean_token_accuracy": 0.8550028428435326, "num_tokens": 176840753.0, "step": 146990 }, { "entropy": 1.941939078271389, "epoch": 0.4556871382305234, "grad_norm": 4.509472846984863, "learning_rate": 3.7476650735300728e-06, "loss": 0.4409, "mean_token_accuracy": 0.8547850877046586, "num_tokens": 176852241.0, "step": 147000 }, { "entropy": 1.9420098468661309, "epoch": 0.45571813735557315, "grad_norm": 4.272128582000732, "learning_rate": 3.747537607565146e-06, "loss": 0.4371, "mean_token_accuracy": 0.8548914834856987, "num_tokens": 176863804.0, "step": 147010 }, { "entropy": 1.897740714251995, "epoch": 0.4557491364806228, "grad_norm": 9.78155517578125, "learning_rate": 3.7474101546054897e-06, "loss": 0.4496, "mean_token_accuracy": 0.8575510829687119, "num_tokens": 176876007.0, "step": 147020 }, { "entropy": 1.901134905219078, "epoch": 0.45578013560567254, "grad_norm": 8.032532691955566, "learning_rate": 3.747282714648894e-06, "loss": 0.4319, "mean_token_accuracy": 0.8620973736047745, "num_tokens": 176887951.0, "step": 147030 }, { "entropy": 1.9228925719857215, "epoch": 0.4558111347307222, "grad_norm": 3.396730422973633, "learning_rate": 3.747155287693148e-06, "loss": 0.4387, "mean_token_accuracy": 0.8453203931450843, "num_tokens": 176899964.0, "step": 147040 }, { "entropy": 1.9790107786655426, "epoch": 0.45584213385577194, "grad_norm": 6.975090026855469, "learning_rate": 3.7470278737360395e-06, "loss": 0.4874, "mean_token_accuracy": 0.8480637580156326, "num_tokens": 176911678.0, "step": 147050 }, { "entropy": 1.8822043985128403, "epoch": 0.4558731329808216, "grad_norm": 9.182622909545898, "learning_rate": 3.7469004727753605e-06, "loss": 0.4168, "mean_token_accuracy": 0.8651730179786682, "num_tokens": 176923300.0, "step": 147060 }, { "entropy": 1.9158610820770263, "epoch": 0.45590413210587133, "grad_norm": 3.5178349018096924, "learning_rate": 3.746773084808901e-06, "loss": 0.4353, "mean_token_accuracy": 0.8595368161797523, "num_tokens": 176935060.0, "step": 147070 }, { "entropy": 1.968504549562931, "epoch": 0.455935131230921, "grad_norm": 7.308728218078613, "learning_rate": 3.7466457098344528e-06, "loss": 0.4946, "mean_token_accuracy": 0.8455461367964745, "num_tokens": 176946490.0, "step": 147080 }, { "entropy": 1.8836671754717826, "epoch": 0.4559661303559707, "grad_norm": 8.155224800109863, "learning_rate": 3.7465183478498068e-06, "loss": 0.4312, "mean_token_accuracy": 0.8532806396484375, "num_tokens": 176959124.0, "step": 147090 }, { "entropy": 1.9047341987490654, "epoch": 0.4559971294810204, "grad_norm": 7.964391708374023, "learning_rate": 3.7463909988527563e-06, "loss": 0.4336, "mean_token_accuracy": 0.8509239420294762, "num_tokens": 176971598.0, "step": 147100 }, { "entropy": 1.9551147490739822, "epoch": 0.4560281286060701, "grad_norm": 7.997689723968506, "learning_rate": 3.7462636628410933e-06, "loss": 0.5301, "mean_token_accuracy": 0.8321620270609855, "num_tokens": 176982580.0, "step": 147110 }, { "entropy": 1.898515647649765, "epoch": 0.4560591277311198, "grad_norm": 2.4085474014282227, "learning_rate": 3.7461363398126123e-06, "loss": 0.4427, "mean_token_accuracy": 0.8540950760245323, "num_tokens": 176995367.0, "step": 147120 }, { "entropy": 1.841650950908661, "epoch": 0.4560901268561695, "grad_norm": 7.601720333099365, "learning_rate": 3.746009029765105e-06, "loss": 0.3595, "mean_token_accuracy": 0.8667580097913742, "num_tokens": 177008617.0, "step": 147130 }, { "entropy": 1.8938813239336014, "epoch": 0.4561211259812192, "grad_norm": 8.31351375579834, "learning_rate": 3.745881732696369e-06, "loss": 0.4297, "mean_token_accuracy": 0.8541634380817413, "num_tokens": 177021433.0, "step": 147140 }, { "entropy": 1.9346230700612068, "epoch": 0.4561521251062689, "grad_norm": 8.50265884399414, "learning_rate": 3.745754448604195e-06, "loss": 0.4375, "mean_token_accuracy": 0.8484914928674698, "num_tokens": 177033364.0, "step": 147150 }, { "entropy": 1.8723743125796317, "epoch": 0.45618312423131857, "grad_norm": 3.476085901260376, "learning_rate": 3.745627177486383e-06, "loss": 0.4031, "mean_token_accuracy": 0.8580270186066628, "num_tokens": 177046006.0, "step": 147160 }, { "entropy": 1.8366812959313392, "epoch": 0.45621412335636824, "grad_norm": 3.9904587268829346, "learning_rate": 3.745499919340726e-06, "loss": 0.3737, "mean_token_accuracy": 0.8696741983294487, "num_tokens": 177059038.0, "step": 147170 }, { "entropy": 1.8440161734819411, "epoch": 0.45624512248141796, "grad_norm": 9.400425910949707, "learning_rate": 3.745372674165021e-06, "loss": 0.3684, "mean_token_accuracy": 0.8613582566380501, "num_tokens": 177072384.0, "step": 147180 }, { "entropy": 1.9192558169364928, "epoch": 0.45627612160646763, "grad_norm": 10.3735933303833, "learning_rate": 3.7452454419570656e-06, "loss": 0.4259, "mean_token_accuracy": 0.8616964593529701, "num_tokens": 177084461.0, "step": 147190 }, { "entropy": 1.8800915464758874, "epoch": 0.45630712073151736, "grad_norm": 3.897484302520752, "learning_rate": 3.745118222714657e-06, "loss": 0.4315, "mean_token_accuracy": 0.8533887133002281, "num_tokens": 177096842.0, "step": 147200 }, { "entropy": 1.945913690328598, "epoch": 0.456338119856567, "grad_norm": 2.707284688949585, "learning_rate": 3.7449910164355936e-06, "loss": 0.4759, "mean_token_accuracy": 0.847554586827755, "num_tokens": 177109235.0, "step": 147210 }, { "entropy": 1.9585506185889243, "epoch": 0.45636911898161675, "grad_norm": 7.910290718078613, "learning_rate": 3.7448638231176737e-06, "loss": 0.4517, "mean_token_accuracy": 0.8551441594958306, "num_tokens": 177120779.0, "step": 147220 }, { "entropy": 1.8726617515087127, "epoch": 0.4564001181066664, "grad_norm": 3.750462770462036, "learning_rate": 3.7447366427586964e-06, "loss": 0.4599, "mean_token_accuracy": 0.8569039270281792, "num_tokens": 177133126.0, "step": 147230 }, { "entropy": 1.91754612326622, "epoch": 0.45643111723171614, "grad_norm": 7.520792007446289, "learning_rate": 3.7446094753564614e-06, "loss": 0.4543, "mean_token_accuracy": 0.8505095258355141, "num_tokens": 177145098.0, "step": 147240 }, { "entropy": 1.9441439002752303, "epoch": 0.4564621163567658, "grad_norm": 10.833855628967285, "learning_rate": 3.7444823209087682e-06, "loss": 0.4681, "mean_token_accuracy": 0.8531385481357574, "num_tokens": 177156290.0, "step": 147250 }, { "entropy": 1.912196746468544, "epoch": 0.45649311548181554, "grad_norm": 9.136136054992676, "learning_rate": 3.744355179413419e-06, "loss": 0.4201, "mean_token_accuracy": 0.8659820929169655, "num_tokens": 177167879.0, "step": 147260 }, { "entropy": 1.7876418299973011, "epoch": 0.4565241146068652, "grad_norm": 7.6198039054870605, "learning_rate": 3.7442280508682134e-06, "loss": 0.3973, "mean_token_accuracy": 0.8576739236712456, "num_tokens": 177181505.0, "step": 147270 }, { "entropy": 1.925425273180008, "epoch": 0.45655511373191493, "grad_norm": 7.062708377838135, "learning_rate": 3.7441009352709544e-06, "loss": 0.4873, "mean_token_accuracy": 0.8492991134524346, "num_tokens": 177193315.0, "step": 147280 }, { "entropy": 1.847182884812355, "epoch": 0.4565861128569646, "grad_norm": 7.065945625305176, "learning_rate": 3.7439738326194437e-06, "loss": 0.402, "mean_token_accuracy": 0.8619561657309532, "num_tokens": 177205609.0, "step": 147290 }, { "entropy": 1.8271872013807298, "epoch": 0.4566171119820143, "grad_norm": 4.04852294921875, "learning_rate": 3.7438467429114837e-06, "loss": 0.3711, "mean_token_accuracy": 0.8713533550500869, "num_tokens": 177219174.0, "step": 147300 }, { "entropy": 1.9408521130681038, "epoch": 0.456648111107064, "grad_norm": 8.535037994384766, "learning_rate": 3.743719666144879e-06, "loss": 0.5051, "mean_token_accuracy": 0.843614149093628, "num_tokens": 177230752.0, "step": 147310 }, { "entropy": 1.8946807518601418, "epoch": 0.4566791102321137, "grad_norm": 3.0620830059051514, "learning_rate": 3.743592602317431e-06, "loss": 0.4103, "mean_token_accuracy": 0.8616632342338562, "num_tokens": 177243742.0, "step": 147320 }, { "entropy": 1.8989785492420197, "epoch": 0.4567101093571634, "grad_norm": 7.388391017913818, "learning_rate": 3.743465551426947e-06, "loss": 0.4324, "mean_token_accuracy": 0.8464514210820198, "num_tokens": 177256265.0, "step": 147330 }, { "entropy": 1.7777711182832718, "epoch": 0.4567411084822131, "grad_norm": 8.797972679138184, "learning_rate": 3.7433385134712295e-06, "loss": 0.4015, "mean_token_accuracy": 0.8580346912145614, "num_tokens": 177270945.0, "step": 147340 }, { "entropy": 2.003285530209541, "epoch": 0.4567721076072628, "grad_norm": 8.642237663269043, "learning_rate": 3.7432114884480853e-06, "loss": 0.5276, "mean_token_accuracy": 0.8421431943774224, "num_tokens": 177281633.0, "step": 147350 }, { "entropy": 1.9756213307380677, "epoch": 0.4568031067323125, "grad_norm": 9.405720710754395, "learning_rate": 3.743084476355319e-06, "loss": 0.4912, "mean_token_accuracy": 0.8499099537730217, "num_tokens": 177292337.0, "step": 147360 }, { "entropy": 1.8737393349409104, "epoch": 0.45683410585736217, "grad_norm": 6.993408203125, "learning_rate": 3.742957477190739e-06, "loss": 0.4167, "mean_token_accuracy": 0.8587880119681358, "num_tokens": 177304763.0, "step": 147370 }, { "entropy": 1.934411568939686, "epoch": 0.4568651049824119, "grad_norm": 8.104722023010254, "learning_rate": 3.74283049095215e-06, "loss": 0.4842, "mean_token_accuracy": 0.8433166891336441, "num_tokens": 177316276.0, "step": 147380 }, { "entropy": 1.9655498325824738, "epoch": 0.45689610410746156, "grad_norm": 8.411802291870117, "learning_rate": 3.742703517637361e-06, "loss": 0.4673, "mean_token_accuracy": 0.8529037058353424, "num_tokens": 177327855.0, "step": 147390 }, { "entropy": 1.848972088098526, "epoch": 0.4569271032325113, "grad_norm": 8.530176162719727, "learning_rate": 3.74257655724418e-06, "loss": 0.4828, "mean_token_accuracy": 0.8518965750932693, "num_tokens": 177340947.0, "step": 147400 }, { "entropy": 1.8785428568720817, "epoch": 0.45695810235756096, "grad_norm": 8.014692306518555, "learning_rate": 3.742449609770415e-06, "loss": 0.4692, "mean_token_accuracy": 0.8454515695571899, "num_tokens": 177352583.0, "step": 147410 }, { "entropy": 1.9009640589356422, "epoch": 0.4569891014826106, "grad_norm": 8.411507606506348, "learning_rate": 3.7423226752138736e-06, "loss": 0.4361, "mean_token_accuracy": 0.8564955353736877, "num_tokens": 177364796.0, "step": 147420 }, { "entropy": 1.8695711359381675, "epoch": 0.45702010060766035, "grad_norm": 8.759578704833984, "learning_rate": 3.7421957535723686e-06, "loss": 0.4138, "mean_token_accuracy": 0.8556355476379395, "num_tokens": 177377130.0, "step": 147430 }, { "entropy": 1.882589338719845, "epoch": 0.45705109973271, "grad_norm": 5.764090538024902, "learning_rate": 3.7420688448437077e-06, "loss": 0.4659, "mean_token_accuracy": 0.8518973112106323, "num_tokens": 177389852.0, "step": 147440 }, { "entropy": 1.9209093794226646, "epoch": 0.45708209885775974, "grad_norm": 9.359416007995605, "learning_rate": 3.7419419490257012e-06, "loss": 0.475, "mean_token_accuracy": 0.8516506165266037, "num_tokens": 177401433.0, "step": 147450 }, { "entropy": 1.86415656208992, "epoch": 0.4571130979828094, "grad_norm": 8.562514305114746, "learning_rate": 3.741815066116162e-06, "loss": 0.4233, "mean_token_accuracy": 0.8509325221180916, "num_tokens": 177414478.0, "step": 147460 }, { "entropy": 1.745816995203495, "epoch": 0.45714409710785914, "grad_norm": 6.630068302154541, "learning_rate": 3.7416881961129002e-06, "loss": 0.381, "mean_token_accuracy": 0.8737103626132011, "num_tokens": 177428315.0, "step": 147470 }, { "entropy": 1.8828457549214364, "epoch": 0.4571750962329088, "grad_norm": 4.396876335144043, "learning_rate": 3.7415613390137295e-06, "loss": 0.442, "mean_token_accuracy": 0.8575210615992546, "num_tokens": 177440392.0, "step": 147480 }, { "entropy": 1.9914973288774491, "epoch": 0.45720609535795853, "grad_norm": 6.813479423522949, "learning_rate": 3.7414344948164604e-06, "loss": 0.5172, "mean_token_accuracy": 0.8524721592664719, "num_tokens": 177451268.0, "step": 147490 }, { "entropy": 1.8895151317119598, "epoch": 0.4572370944830082, "grad_norm": 3.594588279724121, "learning_rate": 3.741307663518908e-06, "loss": 0.4189, "mean_token_accuracy": 0.8583090662956238, "num_tokens": 177463489.0, "step": 147500 }, { "entropy": 1.9084604054689407, "epoch": 0.4572680936080579, "grad_norm": 4.002728462219238, "learning_rate": 3.741180845118885e-06, "loss": 0.4889, "mean_token_accuracy": 0.8463861241936683, "num_tokens": 177475847.0, "step": 147510 }, { "entropy": 1.9138479217886926, "epoch": 0.4572990927331076, "grad_norm": 9.18737506866455, "learning_rate": 3.7410540396142063e-06, "loss": 0.4332, "mean_token_accuracy": 0.8641066178679466, "num_tokens": 177487582.0, "step": 147520 }, { "entropy": 1.87220705896616, "epoch": 0.4573300918581573, "grad_norm": 7.512433052062988, "learning_rate": 3.740927247002686e-06, "loss": 0.4415, "mean_token_accuracy": 0.858481977880001, "num_tokens": 177499543.0, "step": 147530 }, { "entropy": 1.9340161770582198, "epoch": 0.457361090983207, "grad_norm": 7.679035186767578, "learning_rate": 3.74080046728214e-06, "loss": 0.4543, "mean_token_accuracy": 0.8467412024736405, "num_tokens": 177510615.0, "step": 147540 }, { "entropy": 1.8590017393231393, "epoch": 0.4573920901082567, "grad_norm": 9.347676277160645, "learning_rate": 3.7406737004503834e-06, "loss": 0.429, "mean_token_accuracy": 0.8628332495689393, "num_tokens": 177522638.0, "step": 147550 }, { "entropy": 1.911301201581955, "epoch": 0.4574230892333064, "grad_norm": 8.779614448547363, "learning_rate": 3.740546946505233e-06, "loss": 0.4369, "mean_token_accuracy": 0.8550707563757897, "num_tokens": 177534749.0, "step": 147560 }, { "entropy": 1.9754537165164947, "epoch": 0.4574540883583561, "grad_norm": 8.753040313720703, "learning_rate": 3.740420205444505e-06, "loss": 0.4771, "mean_token_accuracy": 0.853889499604702, "num_tokens": 177546067.0, "step": 147570 }, { "entropy": 1.9112480387091637, "epoch": 0.4574850874834058, "grad_norm": 4.434845447540283, "learning_rate": 3.740293477266017e-06, "loss": 0.4801, "mean_token_accuracy": 0.8533158525824547, "num_tokens": 177557969.0, "step": 147580 }, { "entropy": 1.877299964427948, "epoch": 0.4575160866084555, "grad_norm": 7.680180072784424, "learning_rate": 3.7401667619675876e-06, "loss": 0.3953, "mean_token_accuracy": 0.8607863545417785, "num_tokens": 177570545.0, "step": 147590 }, { "entropy": 1.9265247106552124, "epoch": 0.45754708573350517, "grad_norm": 8.468583106994629, "learning_rate": 3.7400400595470337e-06, "loss": 0.4765, "mean_token_accuracy": 0.8503778129816055, "num_tokens": 177582395.0, "step": 147600 }, { "entropy": 1.8167890459299088, "epoch": 0.4575780848585549, "grad_norm": 8.333720207214355, "learning_rate": 3.7399133700021756e-06, "loss": 0.4504, "mean_token_accuracy": 0.8408757716417312, "num_tokens": 177596261.0, "step": 147610 }, { "entropy": 1.8485788330435753, "epoch": 0.45760908398360456, "grad_norm": 7.62937593460083, "learning_rate": 3.7397866933308325e-06, "loss": 0.4469, "mean_token_accuracy": 0.8608444139361382, "num_tokens": 177608763.0, "step": 147620 }, { "entropy": 1.9072691813111304, "epoch": 0.4576400831086543, "grad_norm": 9.699867248535156, "learning_rate": 3.7396600295308234e-06, "loss": 0.447, "mean_token_accuracy": 0.8526267111301422, "num_tokens": 177621101.0, "step": 147630 }, { "entropy": 1.9000159114599229, "epoch": 0.45767108223370395, "grad_norm": 7.382110118865967, "learning_rate": 3.7395333785999692e-06, "loss": 0.4659, "mean_token_accuracy": 0.8545347273349762, "num_tokens": 177633245.0, "step": 147640 }, { "entropy": 1.9029620870947839, "epoch": 0.4577020813587537, "grad_norm": 9.210521697998047, "learning_rate": 3.739406740536092e-06, "loss": 0.4738, "mean_token_accuracy": 0.8504366457462311, "num_tokens": 177643987.0, "step": 147650 }, { "entropy": 1.9046365901827813, "epoch": 0.45773308048380335, "grad_norm": 4.034806251525879, "learning_rate": 3.739280115337011e-06, "loss": 0.4549, "mean_token_accuracy": 0.8487413614988327, "num_tokens": 177655795.0, "step": 147660 }, { "entropy": 1.8236022911965848, "epoch": 0.457764079608853, "grad_norm": 4.0919623374938965, "learning_rate": 3.73915350300055e-06, "loss": 0.3661, "mean_token_accuracy": 0.8643067702651024, "num_tokens": 177668862.0, "step": 147670 }, { "entropy": 1.9250768944621086, "epoch": 0.45779507873390274, "grad_norm": 8.812362670898438, "learning_rate": 3.7390269035245302e-06, "loss": 0.4477, "mean_token_accuracy": 0.8513668492436409, "num_tokens": 177681125.0, "step": 147680 }, { "entropy": 1.8930511608719827, "epoch": 0.4578260778589524, "grad_norm": 7.733847141265869, "learning_rate": 3.738900316906776e-06, "loss": 0.4254, "mean_token_accuracy": 0.8619498953223228, "num_tokens": 177693301.0, "step": 147690 }, { "entropy": 1.9794332563877106, "epoch": 0.45785707698400213, "grad_norm": 7.670042514801025, "learning_rate": 3.7387737431451097e-06, "loss": 0.504, "mean_token_accuracy": 0.8443345949053764, "num_tokens": 177704344.0, "step": 147700 }, { "entropy": 1.9156940042972566, "epoch": 0.4578880761090518, "grad_norm": 7.951246738433838, "learning_rate": 3.738647182237357e-06, "loss": 0.4712, "mean_token_accuracy": 0.8424468949437142, "num_tokens": 177716411.0, "step": 147710 }, { "entropy": 1.8918796986341477, "epoch": 0.4579190752341015, "grad_norm": 6.88615083694458, "learning_rate": 3.738520634181341e-06, "loss": 0.414, "mean_token_accuracy": 0.861275652050972, "num_tokens": 177728341.0, "step": 147720 }, { "entropy": 1.849881935119629, "epoch": 0.4579500743591512, "grad_norm": 3.4234378337860107, "learning_rate": 3.738394098974886e-06, "loss": 0.396, "mean_token_accuracy": 0.8636976391077041, "num_tokens": 177740803.0, "step": 147730 }, { "entropy": 1.8762715697288512, "epoch": 0.4579810734842009, "grad_norm": 8.60593318939209, "learning_rate": 3.7382675766158196e-06, "loss": 0.4829, "mean_token_accuracy": 0.8438662976026535, "num_tokens": 177752757.0, "step": 147740 }, { "entropy": 1.9411585584282876, "epoch": 0.4580120726092506, "grad_norm": 9.615153312683105, "learning_rate": 3.738141067101967e-06, "loss": 0.4989, "mean_token_accuracy": 0.8427041709423065, "num_tokens": 177764050.0, "step": 147750 }, { "entropy": 1.8888093829154968, "epoch": 0.4580430717343003, "grad_norm": 8.258987426757812, "learning_rate": 3.7380145704311548e-06, "loss": 0.4782, "mean_token_accuracy": 0.85269236266613, "num_tokens": 177775788.0, "step": 147760 }, { "entropy": 1.857784403860569, "epoch": 0.45807407085935, "grad_norm": 3.8808376789093018, "learning_rate": 3.73788808660121e-06, "loss": 0.4144, "mean_token_accuracy": 0.8614834606647491, "num_tokens": 177787727.0, "step": 147770 }, { "entropy": 1.9264632612466812, "epoch": 0.4581050699843997, "grad_norm": 8.179401397705078, "learning_rate": 3.7377616156099605e-06, "loss": 0.4947, "mean_token_accuracy": 0.8487723514437675, "num_tokens": 177799380.0, "step": 147780 }, { "entropy": 1.9536435410380364, "epoch": 0.4581360691094494, "grad_norm": 8.652231216430664, "learning_rate": 3.737635157455235e-06, "loss": 0.5055, "mean_token_accuracy": 0.8477381393313408, "num_tokens": 177810671.0, "step": 147790 }, { "entropy": 1.8677313759922982, "epoch": 0.4581670682344991, "grad_norm": 8.161964416503906, "learning_rate": 3.7375087121348613e-06, "loss": 0.4361, "mean_token_accuracy": 0.8476336553692818, "num_tokens": 177823218.0, "step": 147800 }, { "entropy": 1.8325920164585114, "epoch": 0.45819806735954877, "grad_norm": 7.421534538269043, "learning_rate": 3.737382279646669e-06, "loss": 0.367, "mean_token_accuracy": 0.8645919308066368, "num_tokens": 177836151.0, "step": 147810 }, { "entropy": 1.9766230046749116, "epoch": 0.4582290664845985, "grad_norm": 8.750480651855469, "learning_rate": 3.7372558599884873e-06, "loss": 0.5172, "mean_token_accuracy": 0.8399770334362984, "num_tokens": 177847098.0, "step": 147820 }, { "entropy": 1.9309228926897049, "epoch": 0.45826006560964816, "grad_norm": 7.476661205291748, "learning_rate": 3.737129453158147e-06, "loss": 0.4841, "mean_token_accuracy": 0.842421543598175, "num_tokens": 177859514.0, "step": 147830 }, { "entropy": 1.847913098335266, "epoch": 0.4582910647346979, "grad_norm": 6.960815906524658, "learning_rate": 3.737003059153479e-06, "loss": 0.5784, "mean_token_accuracy": 0.8363130848854781, "num_tokens": 177873072.0, "step": 147840 }, { "entropy": 1.8798335790634155, "epoch": 0.45832206385974755, "grad_norm": 7.6284894943237305, "learning_rate": 3.7368766779723135e-06, "loss": 0.4554, "mean_token_accuracy": 0.8519891336560249, "num_tokens": 177884925.0, "step": 147850 }, { "entropy": 1.9396116152405738, "epoch": 0.4583530629847973, "grad_norm": 7.631931304931641, "learning_rate": 3.7367503096124836e-06, "loss": 0.4735, "mean_token_accuracy": 0.8487934455275535, "num_tokens": 177896276.0, "step": 147860 }, { "entropy": 1.9332348197698592, "epoch": 0.45838406210984695, "grad_norm": 9.10966682434082, "learning_rate": 3.7366239540718206e-06, "loss": 0.4686, "mean_token_accuracy": 0.8453480824828148, "num_tokens": 177907658.0, "step": 147870 }, { "entropy": 1.847151155769825, "epoch": 0.45841506123489667, "grad_norm": 7.519127368927002, "learning_rate": 3.736497611348158e-06, "loss": 0.4284, "mean_token_accuracy": 0.8468550607562065, "num_tokens": 177919761.0, "step": 147880 }, { "entropy": 1.854358959197998, "epoch": 0.45844606035994634, "grad_norm": 7.146642684936523, "learning_rate": 3.7363712814393296e-06, "loss": 0.4346, "mean_token_accuracy": 0.8587031260132789, "num_tokens": 177931858.0, "step": 147890 }, { "entropy": 1.8741928294301033, "epoch": 0.45847705948499606, "grad_norm": 8.542024612426758, "learning_rate": 3.7362449643431677e-06, "loss": 0.449, "mean_token_accuracy": 0.8568659141659737, "num_tokens": 177943416.0, "step": 147900 }, { "entropy": 1.9269966036081314, "epoch": 0.45850805861004573, "grad_norm": 5.178137302398682, "learning_rate": 3.736118660057506e-06, "loss": 0.5051, "mean_token_accuracy": 0.8503173589706421, "num_tokens": 177954691.0, "step": 147910 }, { "entropy": 1.856522725522518, "epoch": 0.4585390577350954, "grad_norm": 7.2768330574035645, "learning_rate": 3.735992368580183e-06, "loss": 0.4206, "mean_token_accuracy": 0.8675465360283852, "num_tokens": 177966749.0, "step": 147920 }, { "entropy": 1.8835629358887673, "epoch": 0.4585700568601451, "grad_norm": 3.9212841987609863, "learning_rate": 3.73586608990903e-06, "loss": 0.4512, "mean_token_accuracy": 0.853912553191185, "num_tokens": 177978879.0, "step": 147930 }, { "entropy": 1.8709630161523818, "epoch": 0.4586010559851948, "grad_norm": 9.633736610412598, "learning_rate": 3.735739824041885e-06, "loss": 0.4579, "mean_token_accuracy": 0.8498531699180603, "num_tokens": 177990951.0, "step": 147940 }, { "entropy": 1.9850156009197235, "epoch": 0.4586320551102445, "grad_norm": 8.021490097045898, "learning_rate": 3.735613570976584e-06, "loss": 0.5309, "mean_token_accuracy": 0.8361830487847328, "num_tokens": 178001723.0, "step": 147950 }, { "entropy": 1.9277898535132407, "epoch": 0.4586630542352942, "grad_norm": 8.133795738220215, "learning_rate": 3.735487330710964e-06, "loss": 0.5754, "mean_token_accuracy": 0.8342059582471848, "num_tokens": 178013683.0, "step": 147960 }, { "entropy": 1.9039981633424758, "epoch": 0.4586940533603439, "grad_norm": 9.529333114624023, "learning_rate": 3.735361103242862e-06, "loss": 0.4942, "mean_token_accuracy": 0.8599925115704536, "num_tokens": 178025035.0, "step": 147970 }, { "entropy": 1.8207675963640213, "epoch": 0.4587250524853936, "grad_norm": 4.995032787322998, "learning_rate": 3.735234888570117e-06, "loss": 0.3791, "mean_token_accuracy": 0.8648058965802192, "num_tokens": 178038165.0, "step": 147980 }, { "entropy": 1.7604272417724132, "epoch": 0.4587560516104433, "grad_norm": 7.621956825256348, "learning_rate": 3.735108686690566e-06, "loss": 0.3362, "mean_token_accuracy": 0.8773904070258141, "num_tokens": 178051753.0, "step": 147990 }, { "entropy": 1.8666868284344673, "epoch": 0.458787050735493, "grad_norm": 8.41880989074707, "learning_rate": 3.7349824976020483e-06, "loss": 0.4879, "mean_token_accuracy": 0.8527174681425095, "num_tokens": 178063598.0, "step": 148000 }, { "entropy": 1.8387210443615913, "epoch": 0.4588180498605427, "grad_norm": 3.875763416290283, "learning_rate": 3.7348563213024038e-06, "loss": 0.4166, "mean_token_accuracy": 0.8554558053612709, "num_tokens": 178075606.0, "step": 148010 }, { "entropy": 1.8273346543312072, "epoch": 0.45884904898559237, "grad_norm": 9.78944206237793, "learning_rate": 3.7347301577894716e-06, "loss": 0.4098, "mean_token_accuracy": 0.8571007132530213, "num_tokens": 178088272.0, "step": 148020 }, { "entropy": 1.8279028117656708, "epoch": 0.4588800481106421, "grad_norm": 7.993279933929443, "learning_rate": 3.7346040070610935e-06, "loss": 0.4273, "mean_token_accuracy": 0.8609201908111572, "num_tokens": 178100592.0, "step": 148030 }, { "entropy": 1.9010518863797188, "epoch": 0.45891104723569176, "grad_norm": 8.148711204528809, "learning_rate": 3.73447786911511e-06, "loss": 0.4772, "mean_token_accuracy": 0.8482613220810891, "num_tokens": 178112256.0, "step": 148040 }, { "entropy": 1.84859948605299, "epoch": 0.4589420463607415, "grad_norm": 6.481073379516602, "learning_rate": 3.734351743949362e-06, "loss": 0.4582, "mean_token_accuracy": 0.8522376015782356, "num_tokens": 178124130.0, "step": 148050 }, { "entropy": 1.9222456485033035, "epoch": 0.45897304548579115, "grad_norm": 9.769478797912598, "learning_rate": 3.734225631561692e-06, "loss": 0.4303, "mean_token_accuracy": 0.858902907371521, "num_tokens": 178135001.0, "step": 148060 }, { "entropy": 1.9226790130138398, "epoch": 0.4590040446108409, "grad_norm": 7.749213218688965, "learning_rate": 3.7340995319499424e-06, "loss": 0.4852, "mean_token_accuracy": 0.8433895200490952, "num_tokens": 178146194.0, "step": 148070 }, { "entropy": 1.7752541199326515, "epoch": 0.45903504373589055, "grad_norm": 9.610624313354492, "learning_rate": 3.7339734451119556e-06, "loss": 0.423, "mean_token_accuracy": 0.8526219561696052, "num_tokens": 178159988.0, "step": 148080 }, { "entropy": 1.7483320951461792, "epoch": 0.45906604286094027, "grad_norm": 8.351910591125488, "learning_rate": 3.733847371045577e-06, "loss": 0.3534, "mean_token_accuracy": 0.8714915245771409, "num_tokens": 178173336.0, "step": 148090 }, { "entropy": 1.8959960505366324, "epoch": 0.45909704198598994, "grad_norm": 4.344943523406982, "learning_rate": 3.7337213097486484e-06, "loss": 0.4555, "mean_token_accuracy": 0.852771207690239, "num_tokens": 178184748.0, "step": 148100 }, { "entropy": 1.8576713547110557, "epoch": 0.45912804111103966, "grad_norm": 8.485532760620117, "learning_rate": 3.733595261219016e-06, "loss": 0.4161, "mean_token_accuracy": 0.8570611268281937, "num_tokens": 178196913.0, "step": 148110 }, { "entropy": 1.8324482500553132, "epoch": 0.45915904023608933, "grad_norm": 3.7649433612823486, "learning_rate": 3.7334692254545234e-06, "loss": 0.4328, "mean_token_accuracy": 0.8562913209199905, "num_tokens": 178209039.0, "step": 148120 }, { "entropy": 1.8635966286063195, "epoch": 0.45919003936113906, "grad_norm": 3.6343321800231934, "learning_rate": 3.733343202453017e-06, "loss": 0.4415, "mean_token_accuracy": 0.8553838461637497, "num_tokens": 178220749.0, "step": 148130 }, { "entropy": 1.8851610526442528, "epoch": 0.4592210384861887, "grad_norm": 6.960264682769775, "learning_rate": 3.7332171922123433e-06, "loss": 0.458, "mean_token_accuracy": 0.8590750455856323, "num_tokens": 178232494.0, "step": 148140 }, { "entropy": 1.8172919273376464, "epoch": 0.4592520376112384, "grad_norm": 8.392343521118164, "learning_rate": 3.733091194730348e-06, "loss": 0.4938, "mean_token_accuracy": 0.844761498272419, "num_tokens": 178245770.0, "step": 148150 }, { "entropy": 1.8854141429066658, "epoch": 0.4592830367362881, "grad_norm": 8.518744468688965, "learning_rate": 3.7329652100048796e-06, "loss": 0.4558, "mean_token_accuracy": 0.8478623673319816, "num_tokens": 178257483.0, "step": 148160 }, { "entropy": 1.9081405073404312, "epoch": 0.4593140358613378, "grad_norm": 8.384743690490723, "learning_rate": 3.7328392380337842e-06, "loss": 0.4752, "mean_token_accuracy": 0.8525558218359948, "num_tokens": 178268727.0, "step": 148170 }, { "entropy": 1.8962554574012755, "epoch": 0.4593450349863875, "grad_norm": 9.84249496459961, "learning_rate": 3.73271327881491e-06, "loss": 0.4361, "mean_token_accuracy": 0.8616734966635704, "num_tokens": 178280654.0, "step": 148180 }, { "entropy": 1.9595209866762162, "epoch": 0.4593760341114372, "grad_norm": 7.133771896362305, "learning_rate": 3.732587332346106e-06, "loss": 0.4789, "mean_token_accuracy": 0.8539301365613937, "num_tokens": 178292341.0, "step": 148190 }, { "entropy": 1.9443857610225677, "epoch": 0.4594070332364869, "grad_norm": 7.489109516143799, "learning_rate": 3.732461398625222e-06, "loss": 0.4481, "mean_token_accuracy": 0.8486480563879013, "num_tokens": 178303553.0, "step": 148200 }, { "entropy": 1.8499363631010055, "epoch": 0.4594380323615366, "grad_norm": 9.401103019714355, "learning_rate": 3.7323354776501063e-06, "loss": 0.4375, "mean_token_accuracy": 0.8625474691390991, "num_tokens": 178315044.0, "step": 148210 }, { "entropy": 1.9178618222475052, "epoch": 0.4594690314865863, "grad_norm": 7.870799541473389, "learning_rate": 3.73220956941861e-06, "loss": 0.4987, "mean_token_accuracy": 0.8432634264230728, "num_tokens": 178326928.0, "step": 148220 }, { "entropy": 1.948700188100338, "epoch": 0.45950003061163597, "grad_norm": 9.164449691772461, "learning_rate": 3.7320836739285838e-06, "loss": 0.4742, "mean_token_accuracy": 0.8515329852700233, "num_tokens": 178338112.0, "step": 148230 }, { "entropy": 1.9085903450846673, "epoch": 0.4595310297366857, "grad_norm": 8.133758544921875, "learning_rate": 3.731957791177878e-06, "loss": 0.4241, "mean_token_accuracy": 0.8663553714752197, "num_tokens": 178349859.0, "step": 148240 }, { "entropy": 1.8798146709799766, "epoch": 0.45956202886173536, "grad_norm": 3.9677722454071045, "learning_rate": 3.7318319211643456e-06, "loss": 0.4692, "mean_token_accuracy": 0.8445356607437133, "num_tokens": 178362252.0, "step": 148250 }, { "entropy": 1.843547348678112, "epoch": 0.4595930279867851, "grad_norm": 9.012165069580078, "learning_rate": 3.731706063885837e-06, "loss": 0.4151, "mean_token_accuracy": 0.8570879876613617, "num_tokens": 178374338.0, "step": 148260 }, { "entropy": 1.897803196310997, "epoch": 0.45962402711183475, "grad_norm": 2.348179817199707, "learning_rate": 3.731580219340206e-06, "loss": 0.4171, "mean_token_accuracy": 0.8629708766937256, "num_tokens": 178385867.0, "step": 148270 }, { "entropy": 1.755355989933014, "epoch": 0.4596550262368845, "grad_norm": 5.742542743682861, "learning_rate": 3.7314543875253065e-06, "loss": 0.359, "mean_token_accuracy": 0.8633789956569672, "num_tokens": 178400091.0, "step": 148280 }, { "entropy": 1.8990287438035012, "epoch": 0.45968602536193415, "grad_norm": 8.405163764953613, "learning_rate": 3.731328568438991e-06, "loss": 0.48, "mean_token_accuracy": 0.8494486227631569, "num_tokens": 178412761.0, "step": 148290 }, { "entropy": 1.8492143407464028, "epoch": 0.45971702448698387, "grad_norm": 8.039115905761719, "learning_rate": 3.731202762079114e-06, "loss": 0.3856, "mean_token_accuracy": 0.8641363605856895, "num_tokens": 178425735.0, "step": 148300 }, { "entropy": 1.9216509327292441, "epoch": 0.45974802361203354, "grad_norm": 8.291874885559082, "learning_rate": 3.7310769684435306e-06, "loss": 0.4854, "mean_token_accuracy": 0.8502373903989792, "num_tokens": 178437058.0, "step": 148310 }, { "entropy": 1.925723034143448, "epoch": 0.45977902273708326, "grad_norm": 9.9407320022583, "learning_rate": 3.730951187530095e-06, "loss": 0.4756, "mean_token_accuracy": 0.8500989750027657, "num_tokens": 178448170.0, "step": 148320 }, { "entropy": 1.8789965465664864, "epoch": 0.45981002186213293, "grad_norm": 8.22014045715332, "learning_rate": 3.7308254193366646e-06, "loss": 0.4538, "mean_token_accuracy": 0.8639925360679627, "num_tokens": 178461198.0, "step": 148330 }, { "entropy": 1.9338722795248031, "epoch": 0.45984102098718266, "grad_norm": 7.892792224884033, "learning_rate": 3.7306996638610936e-06, "loss": 0.4897, "mean_token_accuracy": 0.8488512441515923, "num_tokens": 178471923.0, "step": 148340 }, { "entropy": 1.9255221277475356, "epoch": 0.4598720201122323, "grad_norm": 4.1723198890686035, "learning_rate": 3.7305739211012404e-06, "loss": 0.483, "mean_token_accuracy": 0.8413741737604141, "num_tokens": 178483299.0, "step": 148350 }, { "entropy": 1.9141563907265664, "epoch": 0.45990301923728205, "grad_norm": 7.0736260414123535, "learning_rate": 3.7304481910549613e-06, "loss": 0.4735, "mean_token_accuracy": 0.8529133036732673, "num_tokens": 178494625.0, "step": 148360 }, { "entropy": 1.8494025871157647, "epoch": 0.4599340183623317, "grad_norm": 7.612475872039795, "learning_rate": 3.7303224737201137e-06, "loss": 0.4706, "mean_token_accuracy": 0.8510186776518822, "num_tokens": 178507700.0, "step": 148370 }, { "entropy": 1.8061942994594573, "epoch": 0.45996501748738144, "grad_norm": 4.09743595123291, "learning_rate": 3.730196769094557e-06, "loss": 0.4241, "mean_token_accuracy": 0.8509301677346229, "num_tokens": 178521259.0, "step": 148380 }, { "entropy": 1.9035452425479888, "epoch": 0.4599960166124311, "grad_norm": 3.9658563137054443, "learning_rate": 3.73007107717615e-06, "loss": 0.4427, "mean_token_accuracy": 0.8570003524422646, "num_tokens": 178532802.0, "step": 148390 }, { "entropy": 1.821213935315609, "epoch": 0.4600270157374808, "grad_norm": 3.9787087440490723, "learning_rate": 3.729945397962751e-06, "loss": 0.3726, "mean_token_accuracy": 0.869298306107521, "num_tokens": 178545572.0, "step": 148400 }, { "entropy": 1.796069860458374, "epoch": 0.4600580148625305, "grad_norm": 4.056951999664307, "learning_rate": 3.72981973145222e-06, "loss": 0.4046, "mean_token_accuracy": 0.8616714045405388, "num_tokens": 178558915.0, "step": 148410 }, { "entropy": 1.9034935608506203, "epoch": 0.4600890139875802, "grad_norm": 8.9136381149292, "learning_rate": 3.7296940776424174e-06, "loss": 0.4599, "mean_token_accuracy": 0.851823453605175, "num_tokens": 178570522.0, "step": 148420 }, { "entropy": 1.921525527536869, "epoch": 0.4601200131126299, "grad_norm": 8.047652244567871, "learning_rate": 3.7295684365312045e-06, "loss": 0.4648, "mean_token_accuracy": 0.8542478293180465, "num_tokens": 178581965.0, "step": 148430 }, { "entropy": 1.8355329155921936, "epoch": 0.46015101223767957, "grad_norm": 4.0909223556518555, "learning_rate": 3.7294428081164413e-06, "loss": 0.3998, "mean_token_accuracy": 0.8662302419543266, "num_tokens": 178594538.0, "step": 148440 }, { "entropy": 1.859887145459652, "epoch": 0.4601820113627293, "grad_norm": 9.011685371398926, "learning_rate": 3.729317192395991e-06, "loss": 0.453, "mean_token_accuracy": 0.8597676187753678, "num_tokens": 178606405.0, "step": 148450 }, { "entropy": 1.8634124889969825, "epoch": 0.46021301048777896, "grad_norm": 7.429311752319336, "learning_rate": 3.729191589367715e-06, "loss": 0.4054, "mean_token_accuracy": 0.8525898218154907, "num_tokens": 178619148.0, "step": 148460 }, { "entropy": 1.8275597229599954, "epoch": 0.4602440096128287, "grad_norm": 3.9588801860809326, "learning_rate": 3.729065999029476e-06, "loss": 0.3897, "mean_token_accuracy": 0.8704280957579613, "num_tokens": 178630852.0, "step": 148470 }, { "entropy": 1.8700731366872787, "epoch": 0.46027500873787836, "grad_norm": 9.530686378479004, "learning_rate": 3.728940421379138e-06, "loss": 0.4343, "mean_token_accuracy": 0.8597807481884956, "num_tokens": 178643067.0, "step": 148480 }, { "entropy": 1.8858630776405334, "epoch": 0.4603060078629281, "grad_norm": 8.52836799621582, "learning_rate": 3.7288148564145645e-06, "loss": 0.4496, "mean_token_accuracy": 0.84766735881567, "num_tokens": 178654791.0, "step": 148490 }, { "entropy": 1.9085488021373749, "epoch": 0.46033700698797775, "grad_norm": 7.8866095542907715, "learning_rate": 3.7286893041336187e-06, "loss": 0.512, "mean_token_accuracy": 0.8404016211628914, "num_tokens": 178666290.0, "step": 148500 }, { "entropy": 1.9258504971861838, "epoch": 0.4603680061130275, "grad_norm": 8.244267463684082, "learning_rate": 3.728563764534167e-06, "loss": 0.4795, "mean_token_accuracy": 0.8527882546186447, "num_tokens": 178677958.0, "step": 148510 }, { "entropy": 1.93827313631773, "epoch": 0.46039900523807714, "grad_norm": 7.484049320220947, "learning_rate": 3.728438237614075e-06, "loss": 0.4855, "mean_token_accuracy": 0.8486429572105407, "num_tokens": 178689438.0, "step": 148520 }, { "entropy": 1.8482612371444702, "epoch": 0.46043000436312687, "grad_norm": 6.9296650886535645, "learning_rate": 3.7283127233712067e-06, "loss": 0.405, "mean_token_accuracy": 0.8683997794985772, "num_tokens": 178702457.0, "step": 148530 }, { "entropy": 1.9950364649295806, "epoch": 0.46046100348817653, "grad_norm": 7.755855560302734, "learning_rate": 3.7281872218034292e-06, "loss": 0.5214, "mean_token_accuracy": 0.838994313776493, "num_tokens": 178713062.0, "step": 148540 }, { "entropy": 1.7965051412582398, "epoch": 0.46049200261322626, "grad_norm": 7.688485622406006, "learning_rate": 3.7280617329086093e-06, "loss": 0.3361, "mean_token_accuracy": 0.8706543311476708, "num_tokens": 178726109.0, "step": 148550 }, { "entropy": 1.939668095111847, "epoch": 0.46052300173827593, "grad_norm": 8.922786712646484, "learning_rate": 3.7279362566846154e-06, "loss": 0.4764, "mean_token_accuracy": 0.8509280979633331, "num_tokens": 178737184.0, "step": 148560 }, { "entropy": 1.8518592774868012, "epoch": 0.46055400086332565, "grad_norm": 8.265620231628418, "learning_rate": 3.7278107931293138e-06, "loss": 0.387, "mean_token_accuracy": 0.8692864283919335, "num_tokens": 178749576.0, "step": 148570 }, { "entropy": 1.9392090559005737, "epoch": 0.4605849999883753, "grad_norm": 8.548821449279785, "learning_rate": 3.7276853422405733e-06, "loss": 0.5045, "mean_token_accuracy": 0.8460419103503227, "num_tokens": 178760264.0, "step": 148580 }, { "entropy": 1.9111363351345063, "epoch": 0.46061599911342505, "grad_norm": 7.530226230621338, "learning_rate": 3.7275599040162634e-06, "loss": 0.4283, "mean_token_accuracy": 0.8527784243226051, "num_tokens": 178771695.0, "step": 148590 }, { "entropy": 1.8285814180970192, "epoch": 0.4606469982384747, "grad_norm": 3.894634485244751, "learning_rate": 3.727434478454252e-06, "loss": 0.4168, "mean_token_accuracy": 0.8559230253100395, "num_tokens": 178784229.0, "step": 148600 }, { "entropy": 1.8875297084450722, "epoch": 0.46067799736352444, "grad_norm": 7.381993293762207, "learning_rate": 3.7273090655524108e-06, "loss": 0.4304, "mean_token_accuracy": 0.8570609837770462, "num_tokens": 178796492.0, "step": 148610 }, { "entropy": 1.8404451578855514, "epoch": 0.4607089964885741, "grad_norm": 7.309342384338379, "learning_rate": 3.7271836653086084e-06, "loss": 0.4021, "mean_token_accuracy": 0.8731897443532943, "num_tokens": 178808742.0, "step": 148620 }, { "entropy": 1.879277119040489, "epoch": 0.46073999561362383, "grad_norm": 9.954212188720703, "learning_rate": 3.7270582777207166e-06, "loss": 0.4689, "mean_token_accuracy": 0.8552733480930328, "num_tokens": 178820065.0, "step": 148630 }, { "entropy": 1.909400151669979, "epoch": 0.4607709947386735, "grad_norm": 8.673418045043945, "learning_rate": 3.7269329027866064e-06, "loss": 0.4915, "mean_token_accuracy": 0.8421248093247413, "num_tokens": 178832605.0, "step": 148640 }, { "entropy": 1.9523657143115998, "epoch": 0.46080199386372317, "grad_norm": 7.831915378570557, "learning_rate": 3.7268075405041496e-06, "loss": 0.4772, "mean_token_accuracy": 0.8484080314636231, "num_tokens": 178844397.0, "step": 148650 }, { "entropy": 1.7859274119138717, "epoch": 0.4608329929887729, "grad_norm": 8.42618465423584, "learning_rate": 3.7266821908712185e-06, "loss": 0.4208, "mean_token_accuracy": 0.8656513899564743, "num_tokens": 178856946.0, "step": 148660 }, { "entropy": 1.9320579081773759, "epoch": 0.46086399211382256, "grad_norm": 7.774157524108887, "learning_rate": 3.7265568538856866e-06, "loss": 0.4744, "mean_token_accuracy": 0.8523464262485504, "num_tokens": 178868317.0, "step": 148670 }, { "entropy": 1.8498662784695625, "epoch": 0.4608949912388723, "grad_norm": 8.220366477966309, "learning_rate": 3.726431529545426e-06, "loss": 0.4639, "mean_token_accuracy": 0.8491094589233399, "num_tokens": 178880925.0, "step": 148680 }, { "entropy": 1.9150644198060036, "epoch": 0.46092599036392196, "grad_norm": 9.27624225616455, "learning_rate": 3.726306217848312e-06, "loss": 0.4911, "mean_token_accuracy": 0.8403692692518234, "num_tokens": 178892546.0, "step": 148690 }, { "entropy": 1.7894390970468521, "epoch": 0.4609569894889717, "grad_norm": 9.18440055847168, "learning_rate": 3.726180918792218e-06, "loss": 0.3835, "mean_token_accuracy": 0.8709027290344238, "num_tokens": 178904334.0, "step": 148700 }, { "entropy": 1.858834259212017, "epoch": 0.46098798861402135, "grad_norm": 3.9686553478240967, "learning_rate": 3.7260556323750185e-06, "loss": 0.3973, "mean_token_accuracy": 0.8583307415246964, "num_tokens": 178916596.0, "step": 148710 }, { "entropy": 1.8790298044681548, "epoch": 0.4610189877390711, "grad_norm": 8.098514556884766, "learning_rate": 3.72593035859459e-06, "loss": 0.4591, "mean_token_accuracy": 0.8534575864672661, "num_tokens": 178929270.0, "step": 148720 }, { "entropy": 1.8725609213113785, "epoch": 0.46104998686412074, "grad_norm": 8.010581016540527, "learning_rate": 3.725805097448807e-06, "loss": 0.4111, "mean_token_accuracy": 0.8627361282706261, "num_tokens": 178941517.0, "step": 148730 }, { "entropy": 1.9305458396673203, "epoch": 0.46108098598917047, "grad_norm": 8.154109954833984, "learning_rate": 3.7256798489355474e-06, "loss": 0.4493, "mean_token_accuracy": 0.8501837208867074, "num_tokens": 178953101.0, "step": 148740 }, { "entropy": 1.9796837285161017, "epoch": 0.46111198511422014, "grad_norm": 9.379487991333008, "learning_rate": 3.7255546130526867e-06, "loss": 0.508, "mean_token_accuracy": 0.8350688204169273, "num_tokens": 178964485.0, "step": 148750 }, { "entropy": 1.951874351501465, "epoch": 0.46114298423926986, "grad_norm": 7.641496658325195, "learning_rate": 3.7254293897981025e-06, "loss": 0.4815, "mean_token_accuracy": 0.843595664203167, "num_tokens": 178975573.0, "step": 148760 }, { "entropy": 1.8258162140846252, "epoch": 0.46117398336431953, "grad_norm": 2.5984456539154053, "learning_rate": 3.725304179169673e-06, "loss": 0.4267, "mean_token_accuracy": 0.8542465507984162, "num_tokens": 178988605.0, "step": 148770 }, { "entropy": 1.8872524976730347, "epoch": 0.46120498248936925, "grad_norm": 7.967230319976807, "learning_rate": 3.7251789811652763e-06, "loss": 0.4397, "mean_token_accuracy": 0.8548190101981163, "num_tokens": 179001072.0, "step": 148780 }, { "entropy": 1.9689855933189393, "epoch": 0.4612359816144189, "grad_norm": 4.094521522521973, "learning_rate": 3.7250537957827913e-06, "loss": 0.5026, "mean_token_accuracy": 0.8418999388813972, "num_tokens": 179012080.0, "step": 148790 }, { "entropy": 1.9371369108557701, "epoch": 0.46126698073946865, "grad_norm": 7.492170333862305, "learning_rate": 3.7249286230200974e-06, "loss": 0.4804, "mean_token_accuracy": 0.854214908182621, "num_tokens": 179023118.0, "step": 148800 }, { "entropy": 1.941811603307724, "epoch": 0.4612979798645183, "grad_norm": 7.5748491287231445, "learning_rate": 3.7248034628750744e-06, "loss": 0.4423, "mean_token_accuracy": 0.85891852080822, "num_tokens": 179034894.0, "step": 148810 }, { "entropy": 1.8954156935214996, "epoch": 0.46132897898956804, "grad_norm": 3.9654111862182617, "learning_rate": 3.7246783153456033e-06, "loss": 0.4425, "mean_token_accuracy": 0.8546635210514069, "num_tokens": 179047196.0, "step": 148820 }, { "entropy": 1.9124370649456979, "epoch": 0.4613599781146177, "grad_norm": 8.43816089630127, "learning_rate": 3.7245531804295637e-06, "loss": 0.4774, "mean_token_accuracy": 0.8510984659194947, "num_tokens": 179058196.0, "step": 148830 }, { "entropy": 1.949829702079296, "epoch": 0.46139097723966743, "grad_norm": 9.092519760131836, "learning_rate": 3.724428058124837e-06, "loss": 0.4881, "mean_token_accuracy": 0.8465055644512176, "num_tokens": 179069758.0, "step": 148840 }, { "entropy": 1.9349031627178193, "epoch": 0.4614219763647171, "grad_norm": 7.756466865539551, "learning_rate": 3.7243029484293057e-06, "loss": 0.4625, "mean_token_accuracy": 0.8493330150842666, "num_tokens": 179081712.0, "step": 148850 }, { "entropy": 1.8835980251431466, "epoch": 0.4614529754897668, "grad_norm": 5.772395610809326, "learning_rate": 3.724177851340852e-06, "loss": 0.5075, "mean_token_accuracy": 0.8446671605110169, "num_tokens": 179095186.0, "step": 148860 }, { "entropy": 1.9193114623427392, "epoch": 0.4614839746148165, "grad_norm": 7.486071586608887, "learning_rate": 3.724052766857359e-06, "loss": 0.4239, "mean_token_accuracy": 0.8581682324409485, "num_tokens": 179107120.0, "step": 148870 }, { "entropy": 1.9004424124956132, "epoch": 0.4615149737398662, "grad_norm": 4.124865531921387, "learning_rate": 3.7239276949767094e-06, "loss": 0.4304, "mean_token_accuracy": 0.8557851612567902, "num_tokens": 179119606.0, "step": 148880 }, { "entropy": 1.8963897615671157, "epoch": 0.4615459728649159, "grad_norm": 3.557903289794922, "learning_rate": 3.7238026356967873e-06, "loss": 0.4233, "mean_token_accuracy": 0.8588035330176353, "num_tokens": 179132126.0, "step": 148890 }, { "entropy": 1.9891429126262665, "epoch": 0.46157697198996556, "grad_norm": 8.919794082641602, "learning_rate": 3.7236775890154765e-06, "loss": 0.5074, "mean_token_accuracy": 0.8473231881856919, "num_tokens": 179143132.0, "step": 148900 }, { "entropy": 1.9493281915783882, "epoch": 0.4616079711150153, "grad_norm": 9.03589153289795, "learning_rate": 3.723552554930663e-06, "loss": 0.5008, "mean_token_accuracy": 0.8437724366784096, "num_tokens": 179154839.0, "step": 148910 }, { "entropy": 1.879402995109558, "epoch": 0.46163897024006495, "grad_norm": 7.5339741706848145, "learning_rate": 3.7234275334402314e-06, "loss": 0.4215, "mean_token_accuracy": 0.8598219409585, "num_tokens": 179167503.0, "step": 148920 }, { "entropy": 1.8404528819024564, "epoch": 0.4616699693651147, "grad_norm": 5.325761795043945, "learning_rate": 3.7233025245420666e-06, "loss": 0.4685, "mean_token_accuracy": 0.8565814971923829, "num_tokens": 179180748.0, "step": 148930 }, { "entropy": 1.8871916115283967, "epoch": 0.46170096849016434, "grad_norm": 7.421195983886719, "learning_rate": 3.7231775282340564e-06, "loss": 0.4556, "mean_token_accuracy": 0.8600098505616188, "num_tokens": 179192693.0, "step": 148940 }, { "entropy": 1.9641156986355781, "epoch": 0.46173196761521407, "grad_norm": 7.706519603729248, "learning_rate": 3.7230525445140874e-06, "loss": 0.4721, "mean_token_accuracy": 0.8449445277452469, "num_tokens": 179203723.0, "step": 148950 }, { "entropy": 1.9411569505929946, "epoch": 0.46176296674026374, "grad_norm": 9.283172607421875, "learning_rate": 3.7229275733800462e-06, "loss": 0.4603, "mean_token_accuracy": 0.8541247725486756, "num_tokens": 179215213.0, "step": 148960 }, { "entropy": 2.000536176562309, "epoch": 0.46179396586531346, "grad_norm": 7.330437183380127, "learning_rate": 3.722802614829821e-06, "loss": 0.4743, "mean_token_accuracy": 0.8524016052484512, "num_tokens": 179225916.0, "step": 148970 }, { "entropy": 1.841599515080452, "epoch": 0.46182496499036313, "grad_norm": 8.324941635131836, "learning_rate": 3.7226776688612994e-06, "loss": 0.4567, "mean_token_accuracy": 0.8502395913004875, "num_tokens": 179238822.0, "step": 148980 }, { "entropy": 1.8596518978476524, "epoch": 0.46185596411541285, "grad_norm": 9.439504623413086, "learning_rate": 3.722552735472371e-06, "loss": 0.3836, "mean_token_accuracy": 0.8619119867682457, "num_tokens": 179251678.0, "step": 148990 }, { "entropy": 1.9302584543824195, "epoch": 0.4618869632404625, "grad_norm": 9.211128234863281, "learning_rate": 3.722427814660926e-06, "loss": 0.4525, "mean_token_accuracy": 0.8613484084606171, "num_tokens": 179263263.0, "step": 149000 }, { "entropy": 1.840495379269123, "epoch": 0.46191796236551225, "grad_norm": 7.943533897399902, "learning_rate": 3.722302906424852e-06, "loss": 0.3887, "mean_token_accuracy": 0.8663021042943001, "num_tokens": 179275505.0, "step": 149010 }, { "entropy": 1.9889112055301665, "epoch": 0.4619489614905619, "grad_norm": 8.111519813537598, "learning_rate": 3.7221780107620397e-06, "loss": 0.4651, "mean_token_accuracy": 0.8498691335320473, "num_tokens": 179286521.0, "step": 149020 }, { "entropy": 1.9183252349495887, "epoch": 0.46197996061561164, "grad_norm": 7.588090419769287, "learning_rate": 3.7220531276703815e-06, "loss": 0.4665, "mean_token_accuracy": 0.8486698940396309, "num_tokens": 179297674.0, "step": 149030 }, { "entropy": 1.8954298749566079, "epoch": 0.4620109597406613, "grad_norm": 7.996798515319824, "learning_rate": 3.7219282571477677e-06, "loss": 0.4566, "mean_token_accuracy": 0.852338932454586, "num_tokens": 179309717.0, "step": 149040 }, { "entropy": 1.902578841149807, "epoch": 0.46204195886571103, "grad_norm": 7.982056140899658, "learning_rate": 3.7218033991920895e-06, "loss": 0.4607, "mean_token_accuracy": 0.8511950179934502, "num_tokens": 179321639.0, "step": 149050 }, { "entropy": 1.8236243352293968, "epoch": 0.4620729579907607, "grad_norm": 8.151361465454102, "learning_rate": 3.7216785538012397e-06, "loss": 0.3998, "mean_token_accuracy": 0.8600339099764824, "num_tokens": 179334706.0, "step": 149060 }, { "entropy": 1.9069883227348328, "epoch": 0.4621039571158104, "grad_norm": 8.232033729553223, "learning_rate": 3.7215537209731107e-06, "loss": 0.4365, "mean_token_accuracy": 0.860856780409813, "num_tokens": 179346482.0, "step": 149070 }, { "entropy": 1.8350558295845985, "epoch": 0.4621349562408601, "grad_norm": 8.054594039916992, "learning_rate": 3.7214289007055965e-06, "loss": 0.3866, "mean_token_accuracy": 0.8655603528022766, "num_tokens": 179359849.0, "step": 149080 }, { "entropy": 1.972979885339737, "epoch": 0.4621659553659098, "grad_norm": 10.824267387390137, "learning_rate": 3.7213040929965905e-06, "loss": 0.4864, "mean_token_accuracy": 0.8447522640228271, "num_tokens": 179371308.0, "step": 149090 }, { "entropy": 1.8467707887291909, "epoch": 0.4621969544909595, "grad_norm": 8.265110969543457, "learning_rate": 3.721179297843986e-06, "loss": 0.4606, "mean_token_accuracy": 0.8495004117488861, "num_tokens": 179384138.0, "step": 149100 }, { "entropy": 1.9475996479392053, "epoch": 0.4622279536160092, "grad_norm": 7.331584453582764, "learning_rate": 3.721054515245679e-06, "loss": 0.5038, "mean_token_accuracy": 0.848460578918457, "num_tokens": 179395588.0, "step": 149110 }, { "entropy": 1.879881000518799, "epoch": 0.4622589527410589, "grad_norm": 7.338623523712158, "learning_rate": 3.720929745199565e-06, "loss": 0.4278, "mean_token_accuracy": 0.8511856839060783, "num_tokens": 179407656.0, "step": 149120 }, { "entropy": 1.9637582659721375, "epoch": 0.4622899518661086, "grad_norm": 9.248175621032715, "learning_rate": 3.720804987703538e-06, "loss": 0.4885, "mean_token_accuracy": 0.8401120364665985, "num_tokens": 179418775.0, "step": 149130 }, { "entropy": 1.9078269064426423, "epoch": 0.4623209509911583, "grad_norm": 8.055317878723145, "learning_rate": 3.7206802427554957e-06, "loss": 0.4194, "mean_token_accuracy": 0.856049720942974, "num_tokens": 179431246.0, "step": 149140 }, { "entropy": 1.880311058461666, "epoch": 0.46235195011620794, "grad_norm": 3.6950716972351074, "learning_rate": 3.7205555103533336e-06, "loss": 0.4157, "mean_token_accuracy": 0.862914027273655, "num_tokens": 179443865.0, "step": 149150 }, { "entropy": 1.9153502866625787, "epoch": 0.46238294924125767, "grad_norm": 3.731424331665039, "learning_rate": 3.7204307904949507e-06, "loss": 0.4726, "mean_token_accuracy": 0.849963067471981, "num_tokens": 179455278.0, "step": 149160 }, { "entropy": 1.9533527433872222, "epoch": 0.46241394836630734, "grad_norm": 8.858798027038574, "learning_rate": 3.7203060831782423e-06, "loss": 0.4618, "mean_token_accuracy": 0.8534232169389725, "num_tokens": 179467185.0, "step": 149170 }, { "entropy": 1.9469057649374009, "epoch": 0.46244494749135706, "grad_norm": 7.934726715087891, "learning_rate": 3.7201813884011084e-06, "loss": 0.4779, "mean_token_accuracy": 0.8529333204030991, "num_tokens": 179478428.0, "step": 149180 }, { "entropy": 1.8080351307988167, "epoch": 0.46247594661640673, "grad_norm": 7.475351810455322, "learning_rate": 3.7200567061614475e-06, "loss": 0.4169, "mean_token_accuracy": 0.8639614179730415, "num_tokens": 179491614.0, "step": 149190 }, { "entropy": 1.9022365421056748, "epoch": 0.46250694574145645, "grad_norm": 8.010424613952637, "learning_rate": 3.719932036457157e-06, "loss": 0.4343, "mean_token_accuracy": 0.8514123097062111, "num_tokens": 179502942.0, "step": 149200 }, { "entropy": 2.004026171565056, "epoch": 0.4625379448665061, "grad_norm": 12.545236587524414, "learning_rate": 3.71980737928614e-06, "loss": 0.5794, "mean_token_accuracy": 0.830876411497593, "num_tokens": 179513755.0, "step": 149210 }, { "entropy": 1.8916426077485085, "epoch": 0.46256894399155585, "grad_norm": 5.487705230712891, "learning_rate": 3.719682734646293e-06, "loss": 0.4147, "mean_token_accuracy": 0.8689695119857788, "num_tokens": 179525126.0, "step": 149220 }, { "entropy": 1.8946526229381562, "epoch": 0.4625999431166055, "grad_norm": 9.722243309020996, "learning_rate": 3.7195581025355194e-06, "loss": 0.4581, "mean_token_accuracy": 0.8481919586658477, "num_tokens": 179536892.0, "step": 149230 }, { "entropy": 1.9386965110898018, "epoch": 0.46263094224165524, "grad_norm": 7.750062942504883, "learning_rate": 3.719433482951718e-06, "loss": 0.4443, "mean_token_accuracy": 0.8568326279520988, "num_tokens": 179548377.0, "step": 149240 }, { "entropy": 1.8785508409142495, "epoch": 0.4626619413667049, "grad_norm": 4.4261393547058105, "learning_rate": 3.7193088758927924e-06, "loss": 0.3965, "mean_token_accuracy": 0.8603580877184868, "num_tokens": 179560882.0, "step": 149250 }, { "entropy": 1.863745103776455, "epoch": 0.46269294049175463, "grad_norm": 7.138288974761963, "learning_rate": 3.7191842813566436e-06, "loss": 0.4085, "mean_token_accuracy": 0.8604954317212105, "num_tokens": 179573767.0, "step": 149260 }, { "entropy": 1.848945789039135, "epoch": 0.4627239396168043, "grad_norm": 5.103929042816162, "learning_rate": 3.7190596993411744e-06, "loss": 0.4499, "mean_token_accuracy": 0.8594519913196563, "num_tokens": 179587556.0, "step": 149270 }, { "entropy": 1.8771667987108231, "epoch": 0.462754938741854, "grad_norm": 9.511100769042969, "learning_rate": 3.7189351298442885e-06, "loss": 0.4375, "mean_token_accuracy": 0.8482055693864823, "num_tokens": 179599698.0, "step": 149280 }, { "entropy": 1.8984940245747566, "epoch": 0.4627859378669037, "grad_norm": 3.7801976203918457, "learning_rate": 3.718810572863889e-06, "loss": 0.4823, "mean_token_accuracy": 0.8494816675782204, "num_tokens": 179611306.0, "step": 149290 }, { "entropy": 1.8457211047410964, "epoch": 0.4628169369919534, "grad_norm": 8.371335983276367, "learning_rate": 3.718686028397879e-06, "loss": 0.425, "mean_token_accuracy": 0.8612516298890114, "num_tokens": 179623572.0, "step": 149300 }, { "entropy": 1.8669260188937187, "epoch": 0.4628479361170031, "grad_norm": 4.057291507720947, "learning_rate": 3.718561496444166e-06, "loss": 0.4648, "mean_token_accuracy": 0.8506672963500023, "num_tokens": 179635689.0, "step": 149310 }, { "entropy": 2.0177447497844696, "epoch": 0.4628789352420528, "grad_norm": 7.82882022857666, "learning_rate": 3.718436977000652e-06, "loss": 0.5247, "mean_token_accuracy": 0.8370867878198623, "num_tokens": 179646327.0, "step": 149320 }, { "entropy": 1.9195616245269775, "epoch": 0.4629099343671025, "grad_norm": 4.400636196136475, "learning_rate": 3.7183124700652433e-06, "loss": 0.4436, "mean_token_accuracy": 0.8610096588730812, "num_tokens": 179658283.0, "step": 149330 }, { "entropy": 1.9372329905629158, "epoch": 0.4629409334921522, "grad_norm": 7.58730936050415, "learning_rate": 3.7181879756358472e-06, "loss": 0.4956, "mean_token_accuracy": 0.8437316343188286, "num_tokens": 179670274.0, "step": 149340 }, { "entropy": 1.8574361830949784, "epoch": 0.4629719326172019, "grad_norm": 6.498873233795166, "learning_rate": 3.7180634937103697e-06, "loss": 0.4185, "mean_token_accuracy": 0.8578757628798485, "num_tokens": 179682351.0, "step": 149350 }, { "entropy": 1.9330038219690322, "epoch": 0.4630029317422516, "grad_norm": 8.56723690032959, "learning_rate": 3.717939024286717e-06, "loss": 0.4655, "mean_token_accuracy": 0.8528059482574463, "num_tokens": 179693716.0, "step": 149360 }, { "entropy": 1.9517970651388168, "epoch": 0.46303393086730127, "grad_norm": 7.569751262664795, "learning_rate": 3.7178145673627976e-06, "loss": 0.4461, "mean_token_accuracy": 0.8599812477827072, "num_tokens": 179705054.0, "step": 149370 }, { "entropy": 1.8608211636543275, "epoch": 0.463064929992351, "grad_norm": 4.166712760925293, "learning_rate": 3.717690122936518e-06, "loss": 0.4196, "mean_token_accuracy": 0.8539978817105294, "num_tokens": 179718580.0, "step": 149380 }, { "entropy": 1.797726885974407, "epoch": 0.46309592911740066, "grad_norm": 8.749974250793457, "learning_rate": 3.7175656910057894e-06, "loss": 0.4225, "mean_token_accuracy": 0.8570615857839584, "num_tokens": 179732381.0, "step": 149390 }, { "entropy": 1.863544289022684, "epoch": 0.46312692824245033, "grad_norm": 6.951138496398926, "learning_rate": 3.717441271568518e-06, "loss": 0.4032, "mean_token_accuracy": 0.8617607593536377, "num_tokens": 179745055.0, "step": 149400 }, { "entropy": 1.8385921865701675, "epoch": 0.46315792736750006, "grad_norm": 3.46244215965271, "learning_rate": 3.7173168646226155e-06, "loss": 0.3779, "mean_token_accuracy": 0.8700759828090667, "num_tokens": 179757818.0, "step": 149410 }, { "entropy": 1.88816240131855, "epoch": 0.4631889264925497, "grad_norm": 6.927772521972656, "learning_rate": 3.7171924701659907e-06, "loss": 0.4714, "mean_token_accuracy": 0.8400983780622482, "num_tokens": 179770718.0, "step": 149420 }, { "entropy": 1.9157932326197624, "epoch": 0.46321992561759945, "grad_norm": 6.942224025726318, "learning_rate": 3.717068088196554e-06, "loss": 0.4132, "mean_token_accuracy": 0.8620764970779419, "num_tokens": 179782310.0, "step": 149430 }, { "entropy": 1.925161102414131, "epoch": 0.4632509247426491, "grad_norm": 6.985175609588623, "learning_rate": 3.7169437187122166e-06, "loss": 0.4419, "mean_token_accuracy": 0.8568544924259186, "num_tokens": 179793811.0, "step": 149440 }, { "entropy": 1.8122023433446883, "epoch": 0.46328192386769884, "grad_norm": 8.966583251953125, "learning_rate": 3.71681936171089e-06, "loss": 0.4477, "mean_token_accuracy": 0.8570965170860291, "num_tokens": 179807530.0, "step": 149450 }, { "entropy": 1.8448025688529015, "epoch": 0.4633129229927485, "grad_norm": 8.287588119506836, "learning_rate": 3.716695017190486e-06, "loss": 0.3767, "mean_token_accuracy": 0.8626005664467812, "num_tokens": 179820550.0, "step": 149460 }, { "entropy": 1.963685867190361, "epoch": 0.46334392211779823, "grad_norm": 7.667242050170898, "learning_rate": 3.716570685148917e-06, "loss": 0.4761, "mean_token_accuracy": 0.85397337526083, "num_tokens": 179830999.0, "step": 149470 }, { "entropy": 1.8665792480111123, "epoch": 0.4633749212428479, "grad_norm": 8.30051040649414, "learning_rate": 3.716446365584096e-06, "loss": 0.3937, "mean_token_accuracy": 0.8599151030182839, "num_tokens": 179843005.0, "step": 149480 }, { "entropy": 1.8342208310961723, "epoch": 0.46340592036789763, "grad_norm": 3.5341451168060303, "learning_rate": 3.7163220584939363e-06, "loss": 0.402, "mean_token_accuracy": 0.8594845876097679, "num_tokens": 179855385.0, "step": 149490 }, { "entropy": 1.9173177301883697, "epoch": 0.4634369194929473, "grad_norm": 8.4180326461792, "learning_rate": 3.7161977638763523e-06, "loss": 0.4628, "mean_token_accuracy": 0.8482234224677085, "num_tokens": 179867801.0, "step": 149500 }, { "entropy": 1.9616335391998292, "epoch": 0.463467918617997, "grad_norm": 6.9208221435546875, "learning_rate": 3.716073481729258e-06, "loss": 0.5128, "mean_token_accuracy": 0.8467522487044334, "num_tokens": 179879398.0, "step": 149510 }, { "entropy": 1.8291648626327515, "epoch": 0.4634989177430467, "grad_norm": 3.6951959133148193, "learning_rate": 3.715949212050568e-06, "loss": 0.4187, "mean_token_accuracy": 0.8521149665117264, "num_tokens": 179892903.0, "step": 149520 }, { "entropy": 1.9397112756967545, "epoch": 0.4635299168680964, "grad_norm": 7.79329252243042, "learning_rate": 3.715824954838198e-06, "loss": 0.4978, "mean_token_accuracy": 0.8460395872592926, "num_tokens": 179903789.0, "step": 149530 }, { "entropy": 1.8958303049206733, "epoch": 0.4635609159931461, "grad_norm": 9.17414665222168, "learning_rate": 3.715700710090064e-06, "loss": 0.4351, "mean_token_accuracy": 0.8583477824926377, "num_tokens": 179915857.0, "step": 149540 }, { "entropy": 1.934301419556141, "epoch": 0.4635919151181958, "grad_norm": 11.262062072753906, "learning_rate": 3.7155764778040813e-06, "loss": 0.4945, "mean_token_accuracy": 0.8361470863223076, "num_tokens": 179927748.0, "step": 149550 }, { "entropy": 2.001458024978638, "epoch": 0.4636229142432455, "grad_norm": 9.550740242004395, "learning_rate": 3.7154522579781682e-06, "loss": 0.531, "mean_token_accuracy": 0.8417502701282501, "num_tokens": 179938659.0, "step": 149560 }, { "entropy": 1.838100890815258, "epoch": 0.4636539133682952, "grad_norm": 7.261234760284424, "learning_rate": 3.715328050610241e-06, "loss": 0.3895, "mean_token_accuracy": 0.8724177911877632, "num_tokens": 179951437.0, "step": 149570 }, { "entropy": 1.8873604014515877, "epoch": 0.46368491249334487, "grad_norm": 4.315876007080078, "learning_rate": 3.715203855698218e-06, "loss": 0.4199, "mean_token_accuracy": 0.854177550971508, "num_tokens": 179964006.0, "step": 149580 }, { "entropy": 1.938150754570961, "epoch": 0.4637159116183946, "grad_norm": 8.612800598144531, "learning_rate": 3.715079673240017e-06, "loss": 0.4698, "mean_token_accuracy": 0.8499852314591407, "num_tokens": 179975104.0, "step": 149590 }, { "entropy": 1.9041712909936905, "epoch": 0.46374691074344426, "grad_norm": 3.8847134113311768, "learning_rate": 3.714955503233557e-06, "loss": 0.4501, "mean_token_accuracy": 0.8547717839479446, "num_tokens": 179987110.0, "step": 149600 }, { "entropy": 1.951756013929844, "epoch": 0.463777909868494, "grad_norm": 6.442052364349365, "learning_rate": 3.714831345676757e-06, "loss": 0.4659, "mean_token_accuracy": 0.8537251800298691, "num_tokens": 179998453.0, "step": 149610 }, { "entropy": 1.8288054898381234, "epoch": 0.46380890899354366, "grad_norm": 6.619274139404297, "learning_rate": 3.714707200567537e-06, "loss": 0.3848, "mean_token_accuracy": 0.8710165083408355, "num_tokens": 180010976.0, "step": 149620 }, { "entropy": 1.9353380858898164, "epoch": 0.4638399081185934, "grad_norm": 7.440566062927246, "learning_rate": 3.7145830679038177e-06, "loss": 0.495, "mean_token_accuracy": 0.8452064722776413, "num_tokens": 180022973.0, "step": 149630 }, { "entropy": 1.857330885529518, "epoch": 0.46387090724364305, "grad_norm": 5.837486267089844, "learning_rate": 3.714458947683519e-06, "loss": 0.3873, "mean_token_accuracy": 0.8586672931909561, "num_tokens": 180035450.0, "step": 149640 }, { "entropy": 1.9173126935958862, "epoch": 0.4639019063686927, "grad_norm": 9.367176055908203, "learning_rate": 3.714334839904563e-06, "loss": 0.4133, "mean_token_accuracy": 0.8661222815513611, "num_tokens": 180046863.0, "step": 149650 }, { "entropy": 1.8740798771381377, "epoch": 0.46393290549374244, "grad_norm": 8.433083534240723, "learning_rate": 3.7142107445648706e-06, "loss": 0.4236, "mean_token_accuracy": 0.8610092878341675, "num_tokens": 180059176.0, "step": 149660 }, { "entropy": 1.9137323945760727, "epoch": 0.4639639046187921, "grad_norm": 8.501178741455078, "learning_rate": 3.714086661662364e-06, "loss": 0.443, "mean_token_accuracy": 0.8547899782657623, "num_tokens": 180071303.0, "step": 149670 }, { "entropy": 1.8415597707033158, "epoch": 0.46399490374384184, "grad_norm": 9.41646957397461, "learning_rate": 3.713962591194966e-06, "loss": 0.4168, "mean_token_accuracy": 0.8580908700823784, "num_tokens": 180084425.0, "step": 149680 }, { "entropy": 1.8512480854988098, "epoch": 0.4640259028688915, "grad_norm": 4.0245442390441895, "learning_rate": 3.7138385331606e-06, "loss": 0.3873, "mean_token_accuracy": 0.8679956078529358, "num_tokens": 180097243.0, "step": 149690 }, { "entropy": 1.908530332148075, "epoch": 0.46405690199394123, "grad_norm": 8.557743072509766, "learning_rate": 3.713714487557189e-06, "loss": 0.4529, "mean_token_accuracy": 0.852016007900238, "num_tokens": 180108962.0, "step": 149700 }, { "entropy": 1.8267366781830787, "epoch": 0.4640879011189909, "grad_norm": 4.43588924407959, "learning_rate": 3.713590454382658e-06, "loss": 0.4163, "mean_token_accuracy": 0.8512490093708038, "num_tokens": 180121542.0, "step": 149710 }, { "entropy": 1.8044231168925762, "epoch": 0.4641189002440406, "grad_norm": 8.573354721069336, "learning_rate": 3.71346643363493e-06, "loss": 0.388, "mean_token_accuracy": 0.8628219202160835, "num_tokens": 180134671.0, "step": 149720 }, { "entropy": 1.9123155683279038, "epoch": 0.4641498993690903, "grad_norm": 3.89583158493042, "learning_rate": 3.7133424253119323e-06, "loss": 0.4387, "mean_token_accuracy": 0.8472074523568154, "num_tokens": 180146781.0, "step": 149730 }, { "entropy": 1.9491783201694488, "epoch": 0.46418089849414, "grad_norm": 7.883488655090332, "learning_rate": 3.713218429411589e-06, "loss": 0.5038, "mean_token_accuracy": 0.8498224407434464, "num_tokens": 180158375.0, "step": 149740 }, { "entropy": 1.8938617929816246, "epoch": 0.4642118976191897, "grad_norm": 7.663883209228516, "learning_rate": 3.713094445931827e-06, "loss": 0.4595, "mean_token_accuracy": 0.8473029389977456, "num_tokens": 180170450.0, "step": 149750 }, { "entropy": 1.8819366082549096, "epoch": 0.4642428967442394, "grad_norm": 8.43736457824707, "learning_rate": 3.712970474870572e-06, "loss": 0.4776, "mean_token_accuracy": 0.8485167771577835, "num_tokens": 180182097.0, "step": 149760 }, { "entropy": 1.8832198321819305, "epoch": 0.4642738958692891, "grad_norm": 6.826776504516602, "learning_rate": 3.7128465162257517e-06, "loss": 0.4203, "mean_token_accuracy": 0.8600822359323501, "num_tokens": 180194184.0, "step": 149770 }, { "entropy": 1.9311166375875473, "epoch": 0.4643048949943388, "grad_norm": 8.055139541625977, "learning_rate": 3.712722569995293e-06, "loss": 0.4818, "mean_token_accuracy": 0.850020831823349, "num_tokens": 180205786.0, "step": 149780 }, { "entropy": 1.9121816590428353, "epoch": 0.46433589411938847, "grad_norm": 8.332880973815918, "learning_rate": 3.712598636177124e-06, "loss": 0.4429, "mean_token_accuracy": 0.8503255888819694, "num_tokens": 180218001.0, "step": 149790 }, { "entropy": 1.9609209045767784, "epoch": 0.4643668932444382, "grad_norm": 4.948374271392822, "learning_rate": 3.7124747147691733e-06, "loss": 0.4832, "mean_token_accuracy": 0.8423975124955178, "num_tokens": 180229921.0, "step": 149800 }, { "entropy": 1.9410631626844406, "epoch": 0.46439789236948786, "grad_norm": 8.004473686218262, "learning_rate": 3.71235080576937e-06, "loss": 0.4916, "mean_token_accuracy": 0.8438965693116188, "num_tokens": 180242344.0, "step": 149810 }, { "entropy": 1.8821017548441887, "epoch": 0.4644288914945376, "grad_norm": 5.065506935119629, "learning_rate": 3.7122269091756436e-06, "loss": 0.4566, "mean_token_accuracy": 0.8540616199374199, "num_tokens": 180254354.0, "step": 149820 }, { "entropy": 1.9584839269518852, "epoch": 0.46445989061958726, "grad_norm": 7.649283409118652, "learning_rate": 3.7121030249859243e-06, "loss": 0.512, "mean_token_accuracy": 0.8388520136475563, "num_tokens": 180265796.0, "step": 149830 }, { "entropy": 1.8087536610662938, "epoch": 0.464490889744637, "grad_norm": 4.455826282501221, "learning_rate": 3.7119791531981408e-06, "loss": 0.3759, "mean_token_accuracy": 0.8636887416243553, "num_tokens": 180279611.0, "step": 149840 }, { "entropy": 1.9264666765928269, "epoch": 0.46452188886968665, "grad_norm": 7.5778985023498535, "learning_rate": 3.711855293810227e-06, "loss": 0.479, "mean_token_accuracy": 0.8526265174150467, "num_tokens": 180290829.0, "step": 149850 }, { "entropy": 1.9072339527308941, "epoch": 0.4645528879947364, "grad_norm": 6.993530750274658, "learning_rate": 3.711731446820111e-06, "loss": 0.4546, "mean_token_accuracy": 0.8573938056826591, "num_tokens": 180303038.0, "step": 149860 }, { "entropy": 1.9101138547062875, "epoch": 0.46458388711978604, "grad_norm": 8.588576316833496, "learning_rate": 3.7116076122257273e-06, "loss": 0.4783, "mean_token_accuracy": 0.8481775879859924, "num_tokens": 180314807.0, "step": 149870 }, { "entropy": 1.7950350888073445, "epoch": 0.4646148862448357, "grad_norm": 4.491353511810303, "learning_rate": 3.7114837900250068e-06, "loss": 0.4137, "mean_token_accuracy": 0.8682404264807702, "num_tokens": 180328419.0, "step": 149880 }, { "entropy": 1.9566629469394683, "epoch": 0.46464588536988544, "grad_norm": 7.9985198974609375, "learning_rate": 3.7113599802158823e-06, "loss": 0.4799, "mean_token_accuracy": 0.8541330322623253, "num_tokens": 180339648.0, "step": 149890 }, { "entropy": 1.9067189604043961, "epoch": 0.4646768844949351, "grad_norm": 7.05324649810791, "learning_rate": 3.711236182796288e-06, "loss": 0.427, "mean_token_accuracy": 0.8619942396879197, "num_tokens": 180351473.0, "step": 149900 }, { "entropy": 1.9476832181215287, "epoch": 0.46470788361998483, "grad_norm": 3.14349365234375, "learning_rate": 3.711112397764157e-06, "loss": 0.4888, "mean_token_accuracy": 0.8488188147544861, "num_tokens": 180362723.0, "step": 149910 }, { "entropy": 1.7743832789361478, "epoch": 0.4647388827450345, "grad_norm": 7.153512001037598, "learning_rate": 3.7109886251174236e-06, "loss": 0.3569, "mean_token_accuracy": 0.8677819207310676, "num_tokens": 180376546.0, "step": 149920 }, { "entropy": 1.8927229553461076, "epoch": 0.4647698818700842, "grad_norm": 2.545341968536377, "learning_rate": 3.710864864854023e-06, "loss": 0.4332, "mean_token_accuracy": 0.8583212435245514, "num_tokens": 180388347.0, "step": 149930 }, { "entropy": 1.8385560512542725, "epoch": 0.4648008809951339, "grad_norm": 7.90435791015625, "learning_rate": 3.71074111697189e-06, "loss": 0.4113, "mean_token_accuracy": 0.8564158409833909, "num_tokens": 180400665.0, "step": 149940 }, { "entropy": 1.8350412741303443, "epoch": 0.4648318801201836, "grad_norm": 4.03927755355835, "learning_rate": 3.7106173814689606e-06, "loss": 0.4009, "mean_token_accuracy": 0.8569675713777543, "num_tokens": 180412723.0, "step": 149950 }, { "entropy": 1.8664705529808998, "epoch": 0.4648628792452333, "grad_norm": 2.9268198013305664, "learning_rate": 3.710493658343171e-06, "loss": 0.5036, "mean_token_accuracy": 0.8416856169700623, "num_tokens": 180426131.0, "step": 149960 }, { "entropy": 1.922463881969452, "epoch": 0.464893878370283, "grad_norm": 7.96071195602417, "learning_rate": 3.7103699475924576e-06, "loss": 0.4839, "mean_token_accuracy": 0.844486691057682, "num_tokens": 180437621.0, "step": 149970 }, { "entropy": 1.8738661885261536, "epoch": 0.4649248774953327, "grad_norm": 7.6994404792785645, "learning_rate": 3.710246249214757e-06, "loss": 0.4296, "mean_token_accuracy": 0.859145550429821, "num_tokens": 180449886.0, "step": 149980 }, { "entropy": 1.854253427684307, "epoch": 0.4649558766203824, "grad_norm": 3.478299856185913, "learning_rate": 3.7101225632080085e-06, "loss": 0.4088, "mean_token_accuracy": 0.8661620482802391, "num_tokens": 180461919.0, "step": 149990 }, { "entropy": 1.9357280641794206, "epoch": 0.46498687574543207, "grad_norm": 10.878789901733398, "learning_rate": 3.709998889570149e-06, "loss": 0.464, "mean_token_accuracy": 0.8454708248376847, "num_tokens": 180473902.0, "step": 150000 } ], "logging_steps": 10, "max_steps": 258072, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.432957984332841e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }