{ "best_global_step": 50000, "best_metric": 0.27366578578948975, "best_model_checkpoint": "/workspace/llm-storage/output/Qwen3-32B-LoRA/checkpoint-50000", "epoch": 0.7696762933929063, "eval_steps": 10000, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5919100336730481, "epoch": 0.00015393525867858125, "grad_norm": 0.3611195683479309, "learning_rate": 9.235979270357638e-09, "loss": 0.8773, "mean_token_accuracy": 0.7774614050984383, "num_tokens": 62034.0, "step": 10 }, { "entropy": 0.5695753637701273, "epoch": 0.0003078705173571625, "grad_norm": 0.2273644208908081, "learning_rate": 1.9498178459643902e-08, "loss": 0.8494, "mean_token_accuracy": 0.7914455957710743, "num_tokens": 134895.0, "step": 20 }, { "entropy": 0.561776302009821, "epoch": 0.0004618057760357438, "grad_norm": 0.2052762806415558, "learning_rate": 2.9760377648930166e-08, "loss": 0.8389, "mean_token_accuracy": 0.7925515361130238, "num_tokens": 210101.0, "step": 30 }, { "entropy": 0.5628706313669681, "epoch": 0.000615741034714325, "grad_norm": 0.24249267578125, "learning_rate": 4.002257683821643e-08, "loss": 0.8472, "mean_token_accuracy": 0.7916671767830848, "num_tokens": 284741.0, "step": 40 }, { "entropy": 0.5647880837321282, "epoch": 0.0007696762933929063, "grad_norm": 0.24632197618484497, "learning_rate": 5.02847760275027e-08, "loss": 0.8443, "mean_token_accuracy": 0.7913129165768623, "num_tokens": 356635.0, "step": 50 }, { "entropy": 0.5738844621926547, "epoch": 0.0009236115520714876, "grad_norm": 0.3095531165599823, "learning_rate": 6.054697521678896e-08, "loss": 0.8685, "mean_token_accuracy": 0.7850640155375004, "num_tokens": 420502.0, "step": 60 }, { "entropy": 0.5717844899743796, "epoch": 0.0010775468107500688, "grad_norm": 0.2198827862739563, "learning_rate": 7.080917440607522e-08, "loss": 0.8582, "mean_token_accuracy": 0.7902551002800464, "num_tokens": 489284.0, "step": 70 }, { "entropy": 0.5798778992146254, "epoch": 0.00123148206942865, "grad_norm": 0.2500256896018982, "learning_rate": 8.107137359536149e-08, "loss": 0.8697, "mean_token_accuracy": 0.7830588243901729, "num_tokens": 556283.0, "step": 80 }, { "entropy": 0.5745068699121475, "epoch": 0.0013854173281072314, "grad_norm": 0.38751161098480225, "learning_rate": 9.133357278464776e-08, "loss": 0.8956, "mean_token_accuracy": 0.7861850716173648, "num_tokens": 622214.0, "step": 90 }, { "entropy": 0.5733911626040935, "epoch": 0.0015393525867858126, "grad_norm": 0.1644970327615738, "learning_rate": 1.0159577197393403e-07, "loss": 0.8439, "mean_token_accuracy": 0.7884633563458919, "num_tokens": 693969.0, "step": 100 }, { "entropy": 0.56932178363204, "epoch": 0.0016932878454643937, "grad_norm": 0.2642531991004944, "learning_rate": 1.1185797116322028e-07, "loss": 0.88, "mean_token_accuracy": 0.7883102469146251, "num_tokens": 758016.0, "step": 110 }, { "entropy": 0.5665425475686788, "epoch": 0.0018472231041429751, "grad_norm": 0.28076693415641785, "learning_rate": 1.2212017035250654e-07, "loss": 0.8845, "mean_token_accuracy": 0.7915772706270218, "num_tokens": 824096.0, "step": 120 }, { "entropy": 0.5781571220606565, "epoch": 0.0020011583628215565, "grad_norm": 0.23735640943050385, "learning_rate": 1.3238236954179283e-07, "loss": 0.8886, "mean_token_accuracy": 0.783251203596592, "num_tokens": 886419.0, "step": 130 }, { "entropy": 0.5908713575452567, "epoch": 0.0021550936215001377, "grad_norm": 0.27820247411727905, "learning_rate": 1.4264456873107907e-07, "loss": 0.895, "mean_token_accuracy": 0.7805007271468639, "num_tokens": 955533.0, "step": 140 }, { "entropy": 0.5671172108501196, "epoch": 0.002309028880178719, "grad_norm": 0.23224300146102905, "learning_rate": 1.5290676792036534e-07, "loss": 0.8388, "mean_token_accuracy": 0.7931008167564869, "num_tokens": 1025326.0, "step": 150 }, { "entropy": 0.5744707442820072, "epoch": 0.0024629641388573, "grad_norm": 0.2711655795574188, "learning_rate": 1.631689671096516e-07, "loss": 0.8582, "mean_token_accuracy": 0.7878938525915146, "num_tokens": 1091851.0, "step": 160 }, { "entropy": 0.5759819090366364, "epoch": 0.002616899397535881, "grad_norm": 0.2822760045528412, "learning_rate": 1.7343116629893787e-07, "loss": 0.8567, "mean_token_accuracy": 0.7887153863906861, "num_tokens": 1163514.0, "step": 170 }, { "entropy": 0.5824167054146528, "epoch": 0.0027708346562144628, "grad_norm": 0.22880521416664124, "learning_rate": 1.8369336548822413e-07, "loss": 0.8835, "mean_token_accuracy": 0.7806915760040283, "num_tokens": 1227242.0, "step": 180 }, { "entropy": 0.5953403405845166, "epoch": 0.002924769914893044, "grad_norm": 0.30674031376838684, "learning_rate": 1.9395556467751042e-07, "loss": 0.8909, "mean_token_accuracy": 0.7770949319005013, "num_tokens": 1289901.0, "step": 190 }, { "entropy": 0.5859499953687191, "epoch": 0.003078705173571625, "grad_norm": 0.3147706985473633, "learning_rate": 2.0421776386679668e-07, "loss": 0.8853, "mean_token_accuracy": 0.7808834180235863, "num_tokens": 1354311.0, "step": 200 }, { "entropy": 0.5712988469749689, "epoch": 0.0032326404322502063, "grad_norm": 0.2416708916425705, "learning_rate": 2.1447996305608295e-07, "loss": 0.8856, "mean_token_accuracy": 0.7868104137480258, "num_tokens": 1415066.0, "step": 210 }, { "entropy": 0.5775881927460432, "epoch": 0.0033865756909287874, "grad_norm": 0.26698336005210876, "learning_rate": 2.247421622453692e-07, "loss": 0.8531, "mean_token_accuracy": 0.7884304463863373, "num_tokens": 1484159.0, "step": 220 }, { "entropy": 0.5648292921483516, "epoch": 0.003540510949607369, "grad_norm": 0.32108744978904724, "learning_rate": 2.3500436143465548e-07, "loss": 0.8623, "mean_token_accuracy": 0.7905273355543614, "num_tokens": 1554528.0, "step": 230 }, { "entropy": 0.5757384546101093, "epoch": 0.0036944462082859502, "grad_norm": 0.24875187873840332, "learning_rate": 2.452665606239417e-07, "loss": 0.8537, "mean_token_accuracy": 0.7857667058706284, "num_tokens": 1623049.0, "step": 240 }, { "entropy": 0.5629590973258018, "epoch": 0.0038483814669645314, "grad_norm": 0.38581788539886475, "learning_rate": 2.55528759813228e-07, "loss": 0.8707, "mean_token_accuracy": 0.7894535154104233, "num_tokens": 1693346.0, "step": 250 }, { "entropy": 0.5630147006362677, "epoch": 0.004002316725643113, "grad_norm": 0.27262187004089355, "learning_rate": 2.6579095900251424e-07, "loss": 0.8811, "mean_token_accuracy": 0.791602236777544, "num_tokens": 1753394.0, "step": 260 }, { "entropy": 0.5739092491567135, "epoch": 0.004156251984321694, "grad_norm": 0.2657138407230377, "learning_rate": 2.7605315819180056e-07, "loss": 0.8783, "mean_token_accuracy": 0.7878672853112221, "num_tokens": 1817008.0, "step": 270 }, { "entropy": 0.5919722057878971, "epoch": 0.004310187243000275, "grad_norm": 0.3061436414718628, "learning_rate": 2.863153573810868e-07, "loss": 0.931, "mean_token_accuracy": 0.7785788096487523, "num_tokens": 1878636.0, "step": 280 }, { "entropy": 0.6055156271904707, "epoch": 0.0044641225016788565, "grad_norm": 0.21834532916545868, "learning_rate": 2.965775565703731e-07, "loss": 0.8897, "mean_token_accuracy": 0.7740088667720556, "num_tokens": 1941616.0, "step": 290 }, { "entropy": 0.5785886142402887, "epoch": 0.004618057760357438, "grad_norm": 0.23088103532791138, "learning_rate": 3.068397557596593e-07, "loss": 0.8546, "mean_token_accuracy": 0.7840256042778492, "num_tokens": 2008503.0, "step": 300 }, { "entropy": 0.5990664754062891, "epoch": 0.004771993019036019, "grad_norm": 0.33749788999557495, "learning_rate": 3.1710195494894556e-07, "loss": 0.8981, "mean_token_accuracy": 0.7758807107806206, "num_tokens": 2070942.0, "step": 310 }, { "entropy": 0.5751361843198538, "epoch": 0.0049259282777146, "grad_norm": 0.2882656157016754, "learning_rate": 3.2736415413823183e-07, "loss": 0.8597, "mean_token_accuracy": 0.7894277334213257, "num_tokens": 2141018.0, "step": 320 }, { "entropy": 0.580964620038867, "epoch": 0.005079863536393181, "grad_norm": 0.38300976157188416, "learning_rate": 3.3762635332751815e-07, "loss": 0.8606, "mean_token_accuracy": 0.7832713477313519, "num_tokens": 2206602.0, "step": 330 }, { "entropy": 0.5678949642926454, "epoch": 0.005233798795071762, "grad_norm": 0.26981285214424133, "learning_rate": 3.4788855251680436e-07, "loss": 0.8296, "mean_token_accuracy": 0.7895029321312904, "num_tokens": 2277499.0, "step": 340 }, { "entropy": 0.5874428879469633, "epoch": 0.005387734053750344, "grad_norm": 0.5994871854782104, "learning_rate": 3.581507517060907e-07, "loss": 0.9315, "mean_token_accuracy": 0.780988372862339, "num_tokens": 2334248.0, "step": 350 }, { "entropy": 0.5603416085243225, "epoch": 0.0055416693124289255, "grad_norm": 0.3228195607662201, "learning_rate": 3.684129508953769e-07, "loss": 0.8335, "mean_token_accuracy": 0.7959635682404042, "num_tokens": 2401992.0, "step": 360 }, { "entropy": 0.581711046397686, "epoch": 0.005695604571107507, "grad_norm": 0.23580802977085114, "learning_rate": 3.786751500846632e-07, "loss": 0.8591, "mean_token_accuracy": 0.7823587693274021, "num_tokens": 2470169.0, "step": 370 }, { "entropy": 0.5775255162268877, "epoch": 0.005849539829786088, "grad_norm": 0.27742230892181396, "learning_rate": 3.889373492739494e-07, "loss": 0.8839, "mean_token_accuracy": 0.7844514586031437, "num_tokens": 2533944.0, "step": 380 }, { "entropy": 0.5847948793321848, "epoch": 0.006003475088464669, "grad_norm": 0.2722921073436737, "learning_rate": 3.9919954846323573e-07, "loss": 0.8508, "mean_token_accuracy": 0.7849290184676647, "num_tokens": 2599547.0, "step": 390 }, { "entropy": 0.5586138781160116, "epoch": 0.00615741034714325, "grad_norm": 0.24430985748767853, "learning_rate": 4.09461747652522e-07, "loss": 0.7995, "mean_token_accuracy": 0.7976797498762608, "num_tokens": 2675674.0, "step": 400 }, { "entropy": 0.5621580857783556, "epoch": 0.006311345605821831, "grad_norm": 0.255104660987854, "learning_rate": 4.197239468418082e-07, "loss": 0.8095, "mean_token_accuracy": 0.7943671174347401, "num_tokens": 2751892.0, "step": 410 }, { "entropy": 0.5726390678435564, "epoch": 0.0064652808645004126, "grad_norm": 0.28243693709373474, "learning_rate": 4.299861460310945e-07, "loss": 0.8241, "mean_token_accuracy": 0.7895980916917325, "num_tokens": 2821430.0, "step": 420 }, { "entropy": 0.5653122279793024, "epoch": 0.006619216123178994, "grad_norm": 0.21811330318450928, "learning_rate": 4.4024834522038073e-07, "loss": 0.8476, "mean_token_accuracy": 0.7920493863523006, "num_tokens": 2887969.0, "step": 430 }, { "entropy": 0.5972476743161679, "epoch": 0.006773151381857575, "grad_norm": 0.3025626838207245, "learning_rate": 4.5051054440966705e-07, "loss": 0.8879, "mean_token_accuracy": 0.7763870328664779, "num_tokens": 2950376.0, "step": 440 }, { "entropy": 0.5704224471002817, "epoch": 0.006927086640536157, "grad_norm": 0.22052714228630066, "learning_rate": 4.6077274359895326e-07, "loss": 0.8318, "mean_token_accuracy": 0.791762662678957, "num_tokens": 3020629.0, "step": 450 }, { "entropy": 0.5763071745634079, "epoch": 0.007081021899214738, "grad_norm": 0.24483585357666016, "learning_rate": 4.710349427882396e-07, "loss": 0.8182, "mean_token_accuracy": 0.7908934511244297, "num_tokens": 3087391.0, "step": 460 }, { "entropy": 0.5907505482435227, "epoch": 0.007234957157893319, "grad_norm": 0.3237786889076233, "learning_rate": 4.812971419775258e-07, "loss": 0.8394, "mean_token_accuracy": 0.7913903385400772, "num_tokens": 3159289.0, "step": 470 }, { "entropy": 0.5875931903719902, "epoch": 0.0073888924165719004, "grad_norm": 0.2980842888355255, "learning_rate": 4.915593411668121e-07, "loss": 0.8576, "mean_token_accuracy": 0.7897975154221057, "num_tokens": 3219148.0, "step": 480 }, { "entropy": 0.5801454056054354, "epoch": 0.007542827675250482, "grad_norm": 0.38076719641685486, "learning_rate": 5.018215403560984e-07, "loss": 0.8086, "mean_token_accuracy": 0.7943312354385853, "num_tokens": 3287831.0, "step": 490 }, { "entropy": 0.5990536075085402, "epoch": 0.007696762933929063, "grad_norm": 0.3474610149860382, "learning_rate": 5.120837395453846e-07, "loss": 0.8256, "mean_token_accuracy": 0.787221422791481, "num_tokens": 3350402.0, "step": 500 }, { "entropy": 0.5774485882371664, "epoch": 0.007850698192607645, "grad_norm": 0.3027697503566742, "learning_rate": 5.223459387346709e-07, "loss": 0.8137, "mean_token_accuracy": 0.798531173914671, "num_tokens": 3416554.0, "step": 510 }, { "entropy": 0.5724255967885256, "epoch": 0.008004633451286226, "grad_norm": 0.2963491976261139, "learning_rate": 5.326081379239572e-07, "loss": 0.8147, "mean_token_accuracy": 0.7999022766947746, "num_tokens": 3480485.0, "step": 520 }, { "entropy": 0.6055585689842701, "epoch": 0.008158568709964807, "grad_norm": 0.38398170471191406, "learning_rate": 5.428703371132434e-07, "loss": 0.8509, "mean_token_accuracy": 0.7852301500737667, "num_tokens": 3538643.0, "step": 530 }, { "entropy": 0.6024080418050289, "epoch": 0.008312503968643388, "grad_norm": 0.3418945074081421, "learning_rate": 5.531325363025297e-07, "loss": 0.8253, "mean_token_accuracy": 0.7884251795709133, "num_tokens": 3597620.0, "step": 540 }, { "entropy": 0.5789231721311807, "epoch": 0.00846643922732197, "grad_norm": 0.2518658936023712, "learning_rate": 5.63394735491816e-07, "loss": 0.8049, "mean_token_accuracy": 0.7960966594517231, "num_tokens": 3664276.0, "step": 550 }, { "entropy": 0.581622064858675, "epoch": 0.00862037448600055, "grad_norm": 0.279163658618927, "learning_rate": 5.736569346811022e-07, "loss": 0.7823, "mean_token_accuracy": 0.8012498214840889, "num_tokens": 3728394.0, "step": 560 }, { "entropy": 0.5901626937091351, "epoch": 0.008774309744679132, "grad_norm": 0.21117708086967468, "learning_rate": 5.839191338703885e-07, "loss": 0.7937, "mean_token_accuracy": 0.7999440379440784, "num_tokens": 3794846.0, "step": 570 }, { "entropy": 0.5772449899464845, "epoch": 0.008928245003357713, "grad_norm": 0.43725132942199707, "learning_rate": 5.941813330596748e-07, "loss": 0.781, "mean_token_accuracy": 0.8007708184421063, "num_tokens": 3862863.0, "step": 580 }, { "entropy": 0.5918231148272752, "epoch": 0.009082180262036294, "grad_norm": 0.3120638430118561, "learning_rate": 6.04443532248961e-07, "loss": 0.7606, "mean_token_accuracy": 0.7995412901043892, "num_tokens": 3937702.0, "step": 590 }, { "entropy": 0.6080436132848263, "epoch": 0.009236115520714875, "grad_norm": 0.29440855979919434, "learning_rate": 6.147057314382473e-07, "loss": 0.78, "mean_token_accuracy": 0.7907770864665509, "num_tokens": 4002080.0, "step": 600 }, { "entropy": 0.5953700684010983, "epoch": 0.009390050779393457, "grad_norm": 0.29938367009162903, "learning_rate": 6.249679306275335e-07, "loss": 0.7998, "mean_token_accuracy": 0.7963499970734119, "num_tokens": 4060582.0, "step": 610 }, { "entropy": 0.6018656741827726, "epoch": 0.009543986038072038, "grad_norm": 0.3602679669857025, "learning_rate": 6.352301298168198e-07, "loss": 0.787, "mean_token_accuracy": 0.7965037129819393, "num_tokens": 4119460.0, "step": 620 }, { "entropy": 0.5884046301245689, "epoch": 0.009697921296750619, "grad_norm": 0.1880338340997696, "learning_rate": 6.454923290061061e-07, "loss": 0.7406, "mean_token_accuracy": 0.8052979663014412, "num_tokens": 4184233.0, "step": 630 }, { "entropy": 0.6124087158590555, "epoch": 0.0098518565554292, "grad_norm": 0.28648313879966736, "learning_rate": 6.557545281953922e-07, "loss": 0.7487, "mean_token_accuracy": 0.7985076539218425, "num_tokens": 4246127.0, "step": 640 }, { "entropy": 0.5954696122556925, "epoch": 0.010005791814107781, "grad_norm": 0.25778713822364807, "learning_rate": 6.660167273846786e-07, "loss": 0.7228, "mean_token_accuracy": 0.8081704720854759, "num_tokens": 4319347.0, "step": 650 }, { "entropy": 0.5947488233447075, "epoch": 0.010159727072786362, "grad_norm": 0.3250480592250824, "learning_rate": 6.762789265739649e-07, "loss": 0.7341, "mean_token_accuracy": 0.80690588504076, "num_tokens": 4387570.0, "step": 660 }, { "entropy": 0.5889777861535549, "epoch": 0.010313662331464944, "grad_norm": 0.22442400455474854, "learning_rate": 6.865411257632511e-07, "loss": 0.7084, "mean_token_accuracy": 0.8085587866604328, "num_tokens": 4455964.0, "step": 670 }, { "entropy": 0.5706839349120856, "epoch": 0.010467597590143525, "grad_norm": 0.22939367592334747, "learning_rate": 6.968033249525374e-07, "loss": 0.694, "mean_token_accuracy": 0.8173183344304562, "num_tokens": 4526239.0, "step": 680 }, { "entropy": 0.579495744779706, "epoch": 0.010621532848822106, "grad_norm": 0.32433322072029114, "learning_rate": 7.070655241418237e-07, "loss": 0.6855, "mean_token_accuracy": 0.8145696967840195, "num_tokens": 4595282.0, "step": 690 }, { "entropy": 0.5654889680445194, "epoch": 0.010775468107500689, "grad_norm": 0.2458539754152298, "learning_rate": 7.173277233311099e-07, "loss": 0.6815, "mean_token_accuracy": 0.817343182861805, "num_tokens": 4673894.0, "step": 700 }, { "entropy": 0.5825007408857346, "epoch": 0.01092940336617927, "grad_norm": 0.23871126770973206, "learning_rate": 7.275899225203961e-07, "loss": 0.6873, "mean_token_accuracy": 0.8104559756815434, "num_tokens": 4744248.0, "step": 710 }, { "entropy": 0.5903335172683001, "epoch": 0.011083338624857851, "grad_norm": 0.302754282951355, "learning_rate": 7.378521217096824e-07, "loss": 0.6804, "mean_token_accuracy": 0.8155585058033467, "num_tokens": 4804640.0, "step": 720 }, { "entropy": 0.5674149308353662, "epoch": 0.011237273883536432, "grad_norm": 0.20674283802509308, "learning_rate": 7.481143208989687e-07, "loss": 0.6561, "mean_token_accuracy": 0.8202797174453735, "num_tokens": 4871885.0, "step": 730 }, { "entropy": 0.5710958525538444, "epoch": 0.011391209142215013, "grad_norm": 0.33155515789985657, "learning_rate": 7.58376520088255e-07, "loss": 0.6519, "mean_token_accuracy": 0.8175522983074188, "num_tokens": 4938930.0, "step": 740 }, { "entropy": 0.579977649077773, "epoch": 0.011545144400893595, "grad_norm": 0.2054048329591751, "learning_rate": 7.686387192775413e-07, "loss": 0.6641, "mean_token_accuracy": 0.816803714632988, "num_tokens": 4999444.0, "step": 750 }, { "entropy": 0.5899416249245405, "epoch": 0.011699079659572176, "grad_norm": 0.23790483176708221, "learning_rate": 7.789009184668275e-07, "loss": 0.6533, "mean_token_accuracy": 0.8144042409956456, "num_tokens": 5065099.0, "step": 760 }, { "entropy": 0.5952073518186808, "epoch": 0.011853014918250757, "grad_norm": 0.3782675862312317, "learning_rate": 7.891631176561138e-07, "loss": 0.6549, "mean_token_accuracy": 0.8115997463464737, "num_tokens": 5125101.0, "step": 770 }, { "entropy": 0.5837824650108814, "epoch": 0.012006950176929338, "grad_norm": 0.2830575406551361, "learning_rate": 7.994253168453999e-07, "loss": 0.6431, "mean_token_accuracy": 0.8151948042213917, "num_tokens": 5188010.0, "step": 780 }, { "entropy": 0.5804638035595417, "epoch": 0.01216088543560792, "grad_norm": 0.20560069382190704, "learning_rate": 8.096875160346863e-07, "loss": 0.617, "mean_token_accuracy": 0.8237873263657093, "num_tokens": 5256017.0, "step": 790 }, { "entropy": 0.5535229567438364, "epoch": 0.0123148206942865, "grad_norm": 0.18642188608646393, "learning_rate": 8.199497152239726e-07, "loss": 0.5882, "mean_token_accuracy": 0.8303017787635326, "num_tokens": 5325183.0, "step": 800 }, { "entropy": 0.5633439477533102, "epoch": 0.012468755952965082, "grad_norm": 0.2673782706260681, "learning_rate": 8.302119144132588e-07, "loss": 0.6015, "mean_token_accuracy": 0.8260761052370071, "num_tokens": 5397182.0, "step": 810 }, { "entropy": 0.5712029971182346, "epoch": 0.012622691211643663, "grad_norm": 0.32098516821861267, "learning_rate": 8.404741136025452e-07, "loss": 0.6155, "mean_token_accuracy": 0.823362759500742, "num_tokens": 5463199.0, "step": 820 }, { "entropy": 0.5782248791307211, "epoch": 0.012776626470322244, "grad_norm": 0.3076578378677368, "learning_rate": 8.507363127918314e-07, "loss": 0.6032, "mean_token_accuracy": 0.8211322546005249, "num_tokens": 5528094.0, "step": 830 }, { "entropy": 0.5812303598970174, "epoch": 0.012930561729000825, "grad_norm": 0.31803205609321594, "learning_rate": 8.609985119811176e-07, "loss": 0.6116, "mean_token_accuracy": 0.8218949317932129, "num_tokens": 5594792.0, "step": 840 }, { "entropy": 0.5621976125985384, "epoch": 0.013084496987679406, "grad_norm": 0.284589558839798, "learning_rate": 8.712607111704039e-07, "loss": 0.595, "mean_token_accuracy": 0.8299679301679135, "num_tokens": 5658839.0, "step": 850 }, { "entropy": 0.5618294723331928, "epoch": 0.013238432246357987, "grad_norm": 0.2638741135597229, "learning_rate": 8.815229103596903e-07, "loss": 0.5896, "mean_token_accuracy": 0.8317821763455868, "num_tokens": 5731743.0, "step": 860 }, { "entropy": 0.5802610736340285, "epoch": 0.013392367505036569, "grad_norm": 0.21083752810955048, "learning_rate": 8.917851095489764e-07, "loss": 0.6063, "mean_token_accuracy": 0.8291312985122203, "num_tokens": 5794448.0, "step": 870 }, { "entropy": 0.5641825083643198, "epoch": 0.01354630276371515, "grad_norm": 0.21583209931850433, "learning_rate": 9.020473087382627e-07, "loss": 0.5632, "mean_token_accuracy": 0.8351070746779442, "num_tokens": 5865590.0, "step": 880 }, { "entropy": 0.5706600766628981, "epoch": 0.013700238022393731, "grad_norm": 0.19863373041152954, "learning_rate": 9.123095079275488e-07, "loss": 0.5693, "mean_token_accuracy": 0.8353834792971611, "num_tokens": 5934690.0, "step": 890 }, { "entropy": 0.5750227101147175, "epoch": 0.013854173281072314, "grad_norm": 0.22449633479118347, "learning_rate": 9.225717071168352e-07, "loss": 0.5749, "mean_token_accuracy": 0.8357964880764485, "num_tokens": 6001292.0, "step": 900 }, { "entropy": 0.5683520775288343, "epoch": 0.014008108539750895, "grad_norm": 0.2452477216720581, "learning_rate": 9.328339063061215e-07, "loss": 0.5648, "mean_token_accuracy": 0.8375016011297702, "num_tokens": 6073805.0, "step": 910 }, { "entropy": 0.5826122537255287, "epoch": 0.014162043798429476, "grad_norm": 0.19749343395233154, "learning_rate": 9.430961054954077e-07, "loss": 0.5701, "mean_token_accuracy": 0.8349515549838543, "num_tokens": 6136119.0, "step": 920 }, { "entropy": 0.5817314147949219, "epoch": 0.014315979057108057, "grad_norm": 0.2198537290096283, "learning_rate": 9.533583046846941e-07, "loss": 0.5727, "mean_token_accuracy": 0.8348510041832924, "num_tokens": 6204444.0, "step": 930 }, { "entropy": 0.5906016174703836, "epoch": 0.014469914315786639, "grad_norm": 0.3212786018848419, "learning_rate": 9.636205038739804e-07, "loss": 0.575, "mean_token_accuracy": 0.8307074151933194, "num_tokens": 6266317.0, "step": 940 }, { "entropy": 0.5749761175364256, "epoch": 0.01462384957446522, "grad_norm": 0.2105342000722885, "learning_rate": 9.738827030632665e-07, "loss": 0.5658, "mean_token_accuracy": 0.8339016251266003, "num_tokens": 6329026.0, "step": 950 }, { "entropy": 0.5872229471802711, "epoch": 0.014777784833143801, "grad_norm": 0.2847383916378021, "learning_rate": 9.841449022525527e-07, "loss": 0.5855, "mean_token_accuracy": 0.835652182251215, "num_tokens": 6395191.0, "step": 960 }, { "entropy": 0.568607385084033, "epoch": 0.014931720091822382, "grad_norm": 0.20542892813682556, "learning_rate": 9.94407101441839e-07, "loss": 0.5583, "mean_token_accuracy": 0.8382078535854817, "num_tokens": 6467260.0, "step": 970 }, { "entropy": 0.5805447198450565, "epoch": 0.015085655350500963, "grad_norm": 0.30979448556900024, "learning_rate": 1.0046693006311254e-06, "loss": 0.5658, "mean_token_accuracy": 0.8349114634096623, "num_tokens": 6527169.0, "step": 980 }, { "entropy": 0.5868692245334387, "epoch": 0.015239590609179544, "grad_norm": 0.21536371111869812, "learning_rate": 1.0149314998204116e-06, "loss": 0.5752, "mean_token_accuracy": 0.8322969645261764, "num_tokens": 6591755.0, "step": 990 }, { "entropy": 0.5727530140429735, "epoch": 0.015393525867858126, "grad_norm": 0.42073047161102295, "learning_rate": 1.025193699009698e-06, "loss": 0.5684, "mean_token_accuracy": 0.8380010016262531, "num_tokens": 6663664.0, "step": 1000 }, { "entropy": 0.5736542452126742, "epoch": 0.015547461126536707, "grad_norm": 0.22397920489311218, "learning_rate": 1.0354558981989841e-06, "loss": 0.562, "mean_token_accuracy": 0.8370812490582467, "num_tokens": 6729618.0, "step": 1010 }, { "entropy": 0.580896618962288, "epoch": 0.01570139638521529, "grad_norm": 0.34280356764793396, "learning_rate": 1.0457180973882705e-06, "loss": 0.5781, "mean_token_accuracy": 0.8341807328164578, "num_tokens": 6797011.0, "step": 1020 }, { "entropy": 0.5814141903072596, "epoch": 0.01585533164389387, "grad_norm": 0.3303168714046478, "learning_rate": 1.0559802965775566e-06, "loss": 0.5551, "mean_token_accuracy": 0.834510674327612, "num_tokens": 6863418.0, "step": 1030 }, { "entropy": 0.5539891216903925, "epoch": 0.016009266902572452, "grad_norm": 0.328671932220459, "learning_rate": 1.066242495766843e-06, "loss": 0.5442, "mean_token_accuracy": 0.8454210713505745, "num_tokens": 6927537.0, "step": 1040 }, { "entropy": 0.5650980964303016, "epoch": 0.01616320216125103, "grad_norm": 0.35914215445518494, "learning_rate": 1.0765046949561292e-06, "loss": 0.5471, "mean_token_accuracy": 0.8428660087287426, "num_tokens": 6992980.0, "step": 1050 }, { "entropy": 0.5762754313647747, "epoch": 0.016317137419929614, "grad_norm": 0.807826042175293, "learning_rate": 1.0867668941454153e-06, "loss": 0.5688, "mean_token_accuracy": 0.8367544539272785, "num_tokens": 7053511.0, "step": 1060 }, { "entropy": 0.5708387818187475, "epoch": 0.016471072678608194, "grad_norm": 0.19690760970115662, "learning_rate": 1.0970290933347017e-06, "loss": 0.56, "mean_token_accuracy": 0.8358969882130622, "num_tokens": 7121874.0, "step": 1070 }, { "entropy": 0.5586157351732254, "epoch": 0.016625007937286777, "grad_norm": 0.6106492877006531, "learning_rate": 1.107291292523988e-06, "loss": 0.546, "mean_token_accuracy": 0.840894202888012, "num_tokens": 7183181.0, "step": 1080 }, { "entropy": 0.5624430205672979, "epoch": 0.016778943195965356, "grad_norm": 0.27873215079307556, "learning_rate": 1.1175534917132742e-06, "loss": 0.5485, "mean_token_accuracy": 0.8402581870555877, "num_tokens": 7249781.0, "step": 1090 }, { "entropy": 0.5605059988796711, "epoch": 0.01693287845464394, "grad_norm": 0.219707652926445, "learning_rate": 1.1278156909025604e-06, "loss": 0.5432, "mean_token_accuracy": 0.8408319845795631, "num_tokens": 7315129.0, "step": 1100 }, { "entropy": 0.5751965824514628, "epoch": 0.01708681371332252, "grad_norm": 0.6972764134407043, "learning_rate": 1.1380778900918468e-06, "loss": 0.5603, "mean_token_accuracy": 0.8381582148373127, "num_tokens": 7378689.0, "step": 1110 }, { "entropy": 0.5564561605453491, "epoch": 0.0172407489720011, "grad_norm": 0.3115769028663635, "learning_rate": 1.1483400892811331e-06, "loss": 0.5343, "mean_token_accuracy": 0.8441146217286587, "num_tokens": 7438339.0, "step": 1120 }, { "entropy": 0.5680774599313736, "epoch": 0.01739468423067968, "grad_norm": 0.331267774105072, "learning_rate": 1.1586022884704193e-06, "loss": 0.5535, "mean_token_accuracy": 0.8354248732328415, "num_tokens": 7503056.0, "step": 1130 }, { "entropy": 0.5660824920982123, "epoch": 0.017548619489358264, "grad_norm": 0.3444870114326477, "learning_rate": 1.1688644876597054e-06, "loss": 0.5547, "mean_token_accuracy": 0.836595393717289, "num_tokens": 7572149.0, "step": 1140 }, { "entropy": 0.5610816046595574, "epoch": 0.017702554748036843, "grad_norm": 0.43003037571907043, "learning_rate": 1.1791266868489918e-06, "loss": 0.5489, "mean_token_accuracy": 0.8385420113801956, "num_tokens": 7637837.0, "step": 1150 }, { "entropy": 0.55774677246809, "epoch": 0.017856490006715426, "grad_norm": 0.5127699971199036, "learning_rate": 1.1893888860382782e-06, "loss": 0.5506, "mean_token_accuracy": 0.8416648909449578, "num_tokens": 7700364.0, "step": 1160 }, { "entropy": 0.5588976331055164, "epoch": 0.018010425265394005, "grad_norm": 0.4313962459564209, "learning_rate": 1.1996510852275643e-06, "loss": 0.5569, "mean_token_accuracy": 0.8388491250574589, "num_tokens": 7769275.0, "step": 1170 }, { "entropy": 0.5567357525229454, "epoch": 0.01816436052407259, "grad_norm": 0.41337382793426514, "learning_rate": 1.2099132844168507e-06, "loss": 0.555, "mean_token_accuracy": 0.8402017563581466, "num_tokens": 7837680.0, "step": 1180 }, { "entropy": 0.5557991713285446, "epoch": 0.018318295782751168, "grad_norm": 0.4462444484233856, "learning_rate": 1.2201754836061369e-06, "loss": 0.5488, "mean_token_accuracy": 0.84064222574234, "num_tokens": 7899097.0, "step": 1190 }, { "entropy": 0.5794588129967451, "epoch": 0.01847223104142975, "grad_norm": 0.3214733898639679, "learning_rate": 1.2304376827954232e-06, "loss": 0.5649, "mean_token_accuracy": 0.8327567972242832, "num_tokens": 7966287.0, "step": 1200 }, { "entropy": 0.5648639090359211, "epoch": 0.018626166300108334, "grad_norm": 0.3796241581439972, "learning_rate": 1.2406998819847094e-06, "loss": 0.5524, "mean_token_accuracy": 0.8410445056855679, "num_tokens": 8023229.0, "step": 1210 }, { "entropy": 0.56885349676013, "epoch": 0.018780101558786913, "grad_norm": 0.837009072303772, "learning_rate": 1.2509620811739956e-06, "loss": 0.5614, "mean_token_accuracy": 0.8355847328901291, "num_tokens": 8092123.0, "step": 1220 }, { "entropy": 0.5266733232885599, "epoch": 0.018934036817465496, "grad_norm": 0.6339938640594482, "learning_rate": 1.2612242803632817e-06, "loss": 0.5241, "mean_token_accuracy": 0.8532371610403061, "num_tokens": 8159356.0, "step": 1230 }, { "entropy": 0.5407309867441654, "epoch": 0.019087972076144075, "grad_norm": 0.39163440465927124, "learning_rate": 1.2714864795525683e-06, "loss": 0.5371, "mean_token_accuracy": 0.8464967824518681, "num_tokens": 8232546.0, "step": 1240 }, { "entropy": 0.5554712042212486, "epoch": 0.019241907334822658, "grad_norm": 0.3747962415218353, "learning_rate": 1.2817486787418545e-06, "loss": 0.5469, "mean_token_accuracy": 0.8406825013458729, "num_tokens": 8293789.0, "step": 1250 }, { "entropy": 0.5424840893596411, "epoch": 0.019395842593501238, "grad_norm": 0.41734445095062256, "learning_rate": 1.2920108779311408e-06, "loss": 0.5277, "mean_token_accuracy": 0.8451595656573773, "num_tokens": 8362679.0, "step": 1260 }, { "entropy": 0.5431670799851418, "epoch": 0.01954977785217982, "grad_norm": 0.3911650478839874, "learning_rate": 1.302273077120427e-06, "loss": 0.5286, "mean_token_accuracy": 0.8444758154451847, "num_tokens": 8436466.0, "step": 1270 }, { "entropy": 0.5634684193879366, "epoch": 0.0197037131108584, "grad_norm": 0.4117857813835144, "learning_rate": 1.3125352763097131e-06, "loss": 0.5479, "mean_token_accuracy": 0.8355170890688897, "num_tokens": 8505493.0, "step": 1280 }, { "entropy": 0.5582088928669691, "epoch": 0.019857648369536983, "grad_norm": 0.5035058259963989, "learning_rate": 1.3227974754989995e-06, "loss": 0.537, "mean_token_accuracy": 0.8411084197461605, "num_tokens": 8572983.0, "step": 1290 }, { "entropy": 0.5415021900087595, "epoch": 0.020011583628215562, "grad_norm": 0.32829996943473816, "learning_rate": 1.3330596746882857e-06, "loss": 0.5422, "mean_token_accuracy": 0.8464752838015557, "num_tokens": 8642727.0, "step": 1300 }, { "entropy": 0.5416510287672281, "epoch": 0.020165518886894145, "grad_norm": 0.3520232141017914, "learning_rate": 1.3433218738775722e-06, "loss": 0.5285, "mean_token_accuracy": 0.8443135373294354, "num_tokens": 8716248.0, "step": 1310 }, { "entropy": 0.5634296510368586, "epoch": 0.020319454145572725, "grad_norm": 0.34143540263175964, "learning_rate": 1.3535840730668584e-06, "loss": 0.5367, "mean_token_accuracy": 0.8377905361354351, "num_tokens": 8792400.0, "step": 1320 }, { "entropy": 0.5543441720306873, "epoch": 0.020473389404251308, "grad_norm": 0.34941935539245605, "learning_rate": 1.3638462722561446e-06, "loss": 0.5333, "mean_token_accuracy": 0.8431934580206871, "num_tokens": 8853778.0, "step": 1330 }, { "entropy": 0.562688821181655, "epoch": 0.020627324662929887, "grad_norm": 0.5376359820365906, "learning_rate": 1.374108471445431e-06, "loss": 0.5478, "mean_token_accuracy": 0.8377990789711476, "num_tokens": 8913470.0, "step": 1340 }, { "entropy": 0.5664670672267675, "epoch": 0.02078125992160847, "grad_norm": 0.4118405878543854, "learning_rate": 1.384370670634717e-06, "loss": 0.5379, "mean_token_accuracy": 0.8409068696200848, "num_tokens": 8981135.0, "step": 1350 }, { "entropy": 0.547214786708355, "epoch": 0.02093519518028705, "grad_norm": 0.42955267429351807, "learning_rate": 1.3946328698240033e-06, "loss": 0.5332, "mean_token_accuracy": 0.8455484814941883, "num_tokens": 9050929.0, "step": 1360 }, { "entropy": 0.5551504231989384, "epoch": 0.021089130438965632, "grad_norm": 0.5034512877464294, "learning_rate": 1.4048950690132896e-06, "loss": 0.539, "mean_token_accuracy": 0.8435608685016632, "num_tokens": 9122866.0, "step": 1370 }, { "entropy": 0.5419036217033864, "epoch": 0.02124306569764421, "grad_norm": 0.7381494045257568, "learning_rate": 1.415157268202576e-06, "loss": 0.5454, "mean_token_accuracy": 0.8476443752646446, "num_tokens": 9182913.0, "step": 1380 }, { "entropy": 0.5454111706465483, "epoch": 0.021397000956322795, "grad_norm": 0.3690260946750641, "learning_rate": 1.4254194673918624e-06, "loss": 0.5243, "mean_token_accuracy": 0.844329472631216, "num_tokens": 9244141.0, "step": 1390 }, { "entropy": 0.5463173754513264, "epoch": 0.021550936215001378, "grad_norm": 0.41608718037605286, "learning_rate": 1.4356816665811485e-06, "loss": 0.5381, "mean_token_accuracy": 0.8461933605372906, "num_tokens": 9315354.0, "step": 1400 }, { "entropy": 0.5589236166328192, "epoch": 0.021704871473679957, "grad_norm": 0.625901460647583, "learning_rate": 1.4459438657704347e-06, "loss": 0.5443, "mean_token_accuracy": 0.8408835545182228, "num_tokens": 9378435.0, "step": 1410 }, { "entropy": 0.5625868238508701, "epoch": 0.02185880673235854, "grad_norm": 1.4490634202957153, "learning_rate": 1.456206064959721e-06, "loss": 0.5493, "mean_token_accuracy": 0.8380113519728184, "num_tokens": 9438614.0, "step": 1420 }, { "entropy": 0.544154429063201, "epoch": 0.02201274199103712, "grad_norm": 0.8526740074157715, "learning_rate": 1.4664682641490072e-06, "loss": 0.5299, "mean_token_accuracy": 0.8456660859286785, "num_tokens": 9499088.0, "step": 1430 }, { "entropy": 0.569502591714263, "epoch": 0.022166677249715702, "grad_norm": 1.0708087682724, "learning_rate": 1.4767304633382934e-06, "loss": 0.5548, "mean_token_accuracy": 0.8374017395079136, "num_tokens": 9563314.0, "step": 1440 }, { "entropy": 0.5543609332293272, "epoch": 0.02232061250839428, "grad_norm": 0.5648512840270996, "learning_rate": 1.48699266252758e-06, "loss": 0.5291, "mean_token_accuracy": 0.8446345172822476, "num_tokens": 9620514.0, "step": 1450 }, { "entropy": 0.5327244058251381, "epoch": 0.022474547767072865, "grad_norm": 0.42342913150787354, "learning_rate": 1.4972548617168661e-06, "loss": 0.5069, "mean_token_accuracy": 0.8523367621004582, "num_tokens": 9685287.0, "step": 1460 }, { "entropy": 0.556696479395032, "epoch": 0.022628483025751444, "grad_norm": 0.4053494334220886, "learning_rate": 1.5075170609061523e-06, "loss": 0.532, "mean_token_accuracy": 0.8409239575266838, "num_tokens": 9748852.0, "step": 1470 }, { "entropy": 0.5689851250499487, "epoch": 0.022782418284430027, "grad_norm": 0.3695392906665802, "learning_rate": 1.5177792600954386e-06, "loss": 0.5438, "mean_token_accuracy": 0.8362425953149796, "num_tokens": 9807695.0, "step": 1480 }, { "entropy": 0.5493887331336736, "epoch": 0.022936353543108606, "grad_norm": 0.581062376499176, "learning_rate": 1.5280414592847248e-06, "loss": 0.5375, "mean_token_accuracy": 0.8446490645408631, "num_tokens": 9878398.0, "step": 1490 }, { "entropy": 0.5402518838644028, "epoch": 0.02309028880178719, "grad_norm": 0.32712429761886597, "learning_rate": 1.538303658474011e-06, "loss": 0.5315, "mean_token_accuracy": 0.8465850926935673, "num_tokens": 9940965.0, "step": 1500 }, { "entropy": 0.5394793681800365, "epoch": 0.02324422406046577, "grad_norm": 0.4485490918159485, "learning_rate": 1.5485658576632973e-06, "loss": 0.513, "mean_token_accuracy": 0.8464520052075386, "num_tokens": 10018582.0, "step": 1510 }, { "entropy": 0.5471213325858116, "epoch": 0.02339815931914435, "grad_norm": 0.450368195772171, "learning_rate": 1.5588280568525837e-06, "loss": 0.5318, "mean_token_accuracy": 0.8445595934987068, "num_tokens": 10090747.0, "step": 1520 }, { "entropy": 0.5592851422727108, "epoch": 0.02355209457782293, "grad_norm": 0.4147779047489166, "learning_rate": 1.56909025604187e-06, "loss": 0.53, "mean_token_accuracy": 0.8421593427658081, "num_tokens": 10154964.0, "step": 1530 }, { "entropy": 0.5707831926643848, "epoch": 0.023706029836501514, "grad_norm": 1.5432734489440918, "learning_rate": 1.5793524552311562e-06, "loss": 0.5481, "mean_token_accuracy": 0.8414090037345886, "num_tokens": 10214657.0, "step": 1540 }, { "entropy": 0.5239541664719581, "epoch": 0.023859965095180093, "grad_norm": 0.4119246304035187, "learning_rate": 1.5896146544204424e-06, "loss": 0.5078, "mean_token_accuracy": 0.8498368702828885, "num_tokens": 10290447.0, "step": 1550 }, { "entropy": 0.5670832943171262, "epoch": 0.024013900353858676, "grad_norm": 0.4431641399860382, "learning_rate": 1.5998768536097287e-06, "loss": 0.5469, "mean_token_accuracy": 0.8391273215413093, "num_tokens": 10352359.0, "step": 1560 }, { "entropy": 0.5635010324418545, "epoch": 0.024167835612537256, "grad_norm": 0.7251086831092834, "learning_rate": 1.610139052799015e-06, "loss": 0.5445, "mean_token_accuracy": 0.8404857434332371, "num_tokens": 10416235.0, "step": 1570 }, { "entropy": 0.5455088254064322, "epoch": 0.02432177087121584, "grad_norm": 0.7295903563499451, "learning_rate": 1.620401251988301e-06, "loss": 0.5316, "mean_token_accuracy": 0.8446026094257831, "num_tokens": 10487165.0, "step": 1580 }, { "entropy": 0.5368785060942173, "epoch": 0.024475706129894418, "grad_norm": 0.5106576085090637, "learning_rate": 1.6306634511775876e-06, "loss": 0.5219, "mean_token_accuracy": 0.8490719094872474, "num_tokens": 10562891.0, "step": 1590 }, { "entropy": 0.5433236934244633, "epoch": 0.024629641388573, "grad_norm": 0.5136491060256958, "learning_rate": 1.6409256503668738e-06, "loss": 0.5165, "mean_token_accuracy": 0.8479399651288986, "num_tokens": 10631644.0, "step": 1600 }, { "entropy": 0.5463156070560217, "epoch": 0.024783576647251584, "grad_norm": 0.365967333316803, "learning_rate": 1.6511878495561602e-06, "loss": 0.5299, "mean_token_accuracy": 0.8467783831059933, "num_tokens": 10696393.0, "step": 1610 }, { "entropy": 0.5447094734758139, "epoch": 0.024937511905930163, "grad_norm": 0.7062378525733948, "learning_rate": 1.6614500487454463e-06, "loss": 0.5264, "mean_token_accuracy": 0.8450621575117111, "num_tokens": 10756423.0, "step": 1620 }, { "entropy": 0.5590496104210615, "epoch": 0.025091447164608746, "grad_norm": 0.5271353721618652, "learning_rate": 1.6717122479347325e-06, "loss": 0.5377, "mean_token_accuracy": 0.841252314299345, "num_tokens": 10815834.0, "step": 1630 }, { "entropy": 0.5497857701033354, "epoch": 0.025245382423287326, "grad_norm": 0.6618366241455078, "learning_rate": 1.6819744471240187e-06, "loss": 0.5419, "mean_token_accuracy": 0.8462118171155453, "num_tokens": 10880214.0, "step": 1640 }, { "entropy": 0.551817687600851, "epoch": 0.02539931768196591, "grad_norm": 0.5673640966415405, "learning_rate": 1.692236646313305e-06, "loss": 0.5216, "mean_token_accuracy": 0.8424673900008202, "num_tokens": 10956536.0, "step": 1650 }, { "entropy": 0.5281467322260142, "epoch": 0.025553252940644488, "grad_norm": 0.41285285353660583, "learning_rate": 1.7024988455025914e-06, "loss": 0.501, "mean_token_accuracy": 0.8510936543345451, "num_tokens": 11029376.0, "step": 1660 }, { "entropy": 0.538617591932416, "epoch": 0.02570718819932307, "grad_norm": 0.49293142557144165, "learning_rate": 1.7127610446918778e-06, "loss": 0.511, "mean_token_accuracy": 0.8471094354987144, "num_tokens": 11101986.0, "step": 1670 }, { "entropy": 0.5412086550146341, "epoch": 0.02586112345800165, "grad_norm": 0.33844277262687683, "learning_rate": 1.723023243881164e-06, "loss": 0.5177, "mean_token_accuracy": 0.8484695665538311, "num_tokens": 11174849.0, "step": 1680 }, { "entropy": 0.5352476324886084, "epoch": 0.026015058716680233, "grad_norm": 0.5516999959945679, "learning_rate": 1.73328544307045e-06, "loss": 0.519, "mean_token_accuracy": 0.8494764685630798, "num_tokens": 11238614.0, "step": 1690 }, { "entropy": 0.550838853046298, "epoch": 0.026168993975358813, "grad_norm": 0.44671446084976196, "learning_rate": 1.7435476422597364e-06, "loss": 0.515, "mean_token_accuracy": 0.8454580582678318, "num_tokens": 11309838.0, "step": 1700 }, { "entropy": 0.5478288542479277, "epoch": 0.026322929234037395, "grad_norm": 0.5337618589401245, "learning_rate": 1.7538098414490226e-06, "loss": 0.5201, "mean_token_accuracy": 0.8466238118708134, "num_tokens": 11387166.0, "step": 1710 }, { "entropy": 0.5470820013433695, "epoch": 0.026476864492715975, "grad_norm": 0.36631977558135986, "learning_rate": 1.7640720406383088e-06, "loss": 0.5428, "mean_token_accuracy": 0.8456047914922238, "num_tokens": 11453129.0, "step": 1720 }, { "entropy": 0.5499895934015513, "epoch": 0.026630799751394558, "grad_norm": 0.5128644704818726, "learning_rate": 1.7743342398275951e-06, "loss": 0.5305, "mean_token_accuracy": 0.8469660237431527, "num_tokens": 11521440.0, "step": 1730 }, { "entropy": 0.5525856669992208, "epoch": 0.026784735010073137, "grad_norm": 0.48873525857925415, "learning_rate": 1.7845964390168815e-06, "loss": 0.5364, "mean_token_accuracy": 0.8417168721556664, "num_tokens": 11587121.0, "step": 1740 }, { "entropy": 0.5514168586581946, "epoch": 0.02693867026875172, "grad_norm": 0.7496342062950134, "learning_rate": 1.7948586382061679e-06, "loss": 0.5303, "mean_token_accuracy": 0.8447843372821808, "num_tokens": 11654335.0, "step": 1750 }, { "entropy": 0.5489217091351748, "epoch": 0.0270926055274303, "grad_norm": 0.5287875533103943, "learning_rate": 1.805120837395454e-06, "loss": 0.5328, "mean_token_accuracy": 0.8471907287836075, "num_tokens": 11715224.0, "step": 1760 }, { "entropy": 0.5551241960376501, "epoch": 0.027246540786108882, "grad_norm": 0.44238126277923584, "learning_rate": 1.8153830365847402e-06, "loss": 0.531, "mean_token_accuracy": 0.8433790393173695, "num_tokens": 11783276.0, "step": 1770 }, { "entropy": 0.5391635961830616, "epoch": 0.027400476044787462, "grad_norm": 0.3819088041782379, "learning_rate": 1.8256452357740266e-06, "loss": 0.5151, "mean_token_accuracy": 0.8497254334390163, "num_tokens": 11841325.0, "step": 1780 }, { "entropy": 0.535517729818821, "epoch": 0.027554411303466045, "grad_norm": 0.3325054943561554, "learning_rate": 1.8359074349633127e-06, "loss": 0.509, "mean_token_accuracy": 0.8508121982216835, "num_tokens": 11912753.0, "step": 1790 }, { "entropy": 0.5371477752923965, "epoch": 0.027708346562144628, "grad_norm": 0.42143240571022034, "learning_rate": 1.8461696341525989e-06, "loss": 0.5234, "mean_token_accuracy": 0.8480089411139489, "num_tokens": 11970836.0, "step": 1800 }, { "entropy": 0.5346264071762562, "epoch": 0.027862281820823207, "grad_norm": 0.38573354482650757, "learning_rate": 1.8564318333418855e-06, "loss": 0.5102, "mean_token_accuracy": 0.8516876585781574, "num_tokens": 12042003.0, "step": 1810 }, { "entropy": 0.5525003287941217, "epoch": 0.02801621707950179, "grad_norm": 0.5890640020370483, "learning_rate": 1.8666940325311716e-06, "loss": 0.5281, "mean_token_accuracy": 0.84368095099926, "num_tokens": 12110928.0, "step": 1820 }, { "entropy": 0.5416143383830786, "epoch": 0.02817015233818037, "grad_norm": 0.5008258819580078, "learning_rate": 1.8769562317204578e-06, "loss": 0.5214, "mean_token_accuracy": 0.8476007498800755, "num_tokens": 12176374.0, "step": 1830 }, { "entropy": 0.5236173801124095, "epoch": 0.028324087596858952, "grad_norm": 0.5492127537727356, "learning_rate": 1.8872184309097441e-06, "loss": 0.4973, "mean_token_accuracy": 0.8538195744156838, "num_tokens": 12241850.0, "step": 1840 }, { "entropy": 0.5388133291155099, "epoch": 0.028478022855537532, "grad_norm": 0.574918806552887, "learning_rate": 1.8974806300990303e-06, "loss": 0.5146, "mean_token_accuracy": 0.8469141066074372, "num_tokens": 12306357.0, "step": 1850 }, { "entropy": 0.5334253013134003, "epoch": 0.028631958114216115, "grad_norm": 0.4284028708934784, "learning_rate": 1.9077428292883167e-06, "loss": 0.5137, "mean_token_accuracy": 0.8507631063461304, "num_tokens": 12364278.0, "step": 1860 }, { "entropy": 0.5377473220229149, "epoch": 0.028785893372894694, "grad_norm": 0.557306706905365, "learning_rate": 1.9180050284776026e-06, "loss": 0.5115, "mean_token_accuracy": 0.8488616824150086, "num_tokens": 12431572.0, "step": 1870 }, { "entropy": 0.5432716846466065, "epoch": 0.028939828631573277, "grad_norm": 0.46545207500457764, "learning_rate": 1.9282672276668894e-06, "loss": 0.5325, "mean_token_accuracy": 0.8455002710223198, "num_tokens": 12494195.0, "step": 1880 }, { "entropy": 0.5463311459869147, "epoch": 0.029093763890251857, "grad_norm": 0.40986013412475586, "learning_rate": 1.9385294268561754e-06, "loss": 0.5023, "mean_token_accuracy": 0.8486354663968086, "num_tokens": 12557802.0, "step": 1890 }, { "entropy": 0.5544444002211094, "epoch": 0.02924769914893044, "grad_norm": 0.5629920363426208, "learning_rate": 1.9487916260454617e-06, "loss": 0.5298, "mean_token_accuracy": 0.8436767742037773, "num_tokens": 12621433.0, "step": 1900 }, { "entropy": 0.5329311300069094, "epoch": 0.02940163440760902, "grad_norm": 0.4564446806907654, "learning_rate": 1.959053825234748e-06, "loss": 0.5068, "mean_token_accuracy": 0.8504777267575264, "num_tokens": 12685029.0, "step": 1910 }, { "entropy": 0.5368456728756428, "epoch": 0.029555569666287602, "grad_norm": 0.6358550190925598, "learning_rate": 1.969316024424034e-06, "loss": 0.5145, "mean_token_accuracy": 0.849727138876915, "num_tokens": 12747268.0, "step": 1920 }, { "entropy": 0.5514231469482184, "epoch": 0.02970950492496618, "grad_norm": 0.5135514736175537, "learning_rate": 1.9795782236133204e-06, "loss": 0.5384, "mean_token_accuracy": 0.8450152948498726, "num_tokens": 12812124.0, "step": 1930 }, { "entropy": 0.5268879655748606, "epoch": 0.029863440183644764, "grad_norm": 0.5031663179397583, "learning_rate": 1.989840422802607e-06, "loss": 0.4983, "mean_token_accuracy": 0.8550363518297672, "num_tokens": 12882372.0, "step": 1940 }, { "entropy": 0.5413805730640888, "epoch": 0.030017375442323344, "grad_norm": 0.4025253653526306, "learning_rate": 2.000102621991893e-06, "loss": 0.5181, "mean_token_accuracy": 0.8478816770017147, "num_tokens": 12954806.0, "step": 1950 }, { "entropy": 0.545957625284791, "epoch": 0.030171310701001926, "grad_norm": 0.5439096093177795, "learning_rate": 2.0103648211811795e-06, "loss": 0.5218, "mean_token_accuracy": 0.8448628783226013, "num_tokens": 13024408.0, "step": 1960 }, { "entropy": 0.5383702229708434, "epoch": 0.030325245959680506, "grad_norm": 0.7378926277160645, "learning_rate": 2.0206270203704655e-06, "loss": 0.5127, "mean_token_accuracy": 0.8499216236174106, "num_tokens": 13084767.0, "step": 1970 }, { "entropy": 0.5378314692527055, "epoch": 0.03047918121835909, "grad_norm": 0.34279897809028625, "learning_rate": 2.030889219559752e-06, "loss": 0.5265, "mean_token_accuracy": 0.8462032608687877, "num_tokens": 13158673.0, "step": 1980 }, { "entropy": 0.5208405170589685, "epoch": 0.030633116477037668, "grad_norm": 0.3206946849822998, "learning_rate": 2.0411514187490382e-06, "loss": 0.4858, "mean_token_accuracy": 0.8530828878283501, "num_tokens": 13230508.0, "step": 1990 }, { "entropy": 0.5365557678043842, "epoch": 0.03078705173571625, "grad_norm": 0.3557003140449524, "learning_rate": 2.051413617938324e-06, "loss": 0.5105, "mean_token_accuracy": 0.8504032664000988, "num_tokens": 13301740.0, "step": 2000 }, { "entropy": 0.5410757973790169, "epoch": 0.030940986994394834, "grad_norm": 0.7600478529930115, "learning_rate": 2.0616758171276105e-06, "loss": 0.5128, "mean_token_accuracy": 0.8473672799766063, "num_tokens": 13363884.0, "step": 2010 }, { "entropy": 0.5512894354760647, "epoch": 0.031094922253073413, "grad_norm": 0.46306130290031433, "learning_rate": 2.071938016316897e-06, "loss": 0.5189, "mean_token_accuracy": 0.8463278122246265, "num_tokens": 13430359.0, "step": 2020 }, { "entropy": 0.5278117209672928, "epoch": 0.031248857511751996, "grad_norm": 0.6800028085708618, "learning_rate": 2.0822002155061833e-06, "loss": 0.4968, "mean_token_accuracy": 0.8523270964622498, "num_tokens": 13495194.0, "step": 2030 }, { "entropy": 0.525285629555583, "epoch": 0.03140279277043058, "grad_norm": 0.4372037947177887, "learning_rate": 2.0924624146954696e-06, "loss": 0.5113, "mean_token_accuracy": 0.8515525847673416, "num_tokens": 13555895.0, "step": 2040 }, { "entropy": 0.5351852223277092, "epoch": 0.03155672802910916, "grad_norm": 0.49968820810317993, "learning_rate": 2.1027246138847556e-06, "loss": 0.5058, "mean_token_accuracy": 0.8480390325188637, "num_tokens": 13617971.0, "step": 2050 }, { "entropy": 0.5462989289313555, "epoch": 0.03171066328778774, "grad_norm": 0.5065450072288513, "learning_rate": 2.112986813074042e-06, "loss": 0.5126, "mean_token_accuracy": 0.8457625821232796, "num_tokens": 13679233.0, "step": 2060 }, { "entropy": 0.5469788275659084, "epoch": 0.03186459854646632, "grad_norm": 0.4488777220249176, "learning_rate": 2.1232490122633283e-06, "loss": 0.5284, "mean_token_accuracy": 0.8462889738380909, "num_tokens": 13741663.0, "step": 2070 }, { "entropy": 0.5234031520783902, "epoch": 0.032018533805144904, "grad_norm": 0.41374754905700684, "learning_rate": 2.1335112114526143e-06, "loss": 0.5174, "mean_token_accuracy": 0.8517357923090458, "num_tokens": 13820403.0, "step": 2080 }, { "entropy": 0.5282654907554388, "epoch": 0.03217246906382348, "grad_norm": 0.4113750159740448, "learning_rate": 2.143773410641901e-06, "loss": 0.4943, "mean_token_accuracy": 0.8507110550999641, "num_tokens": 13896918.0, "step": 2090 }, { "entropy": 0.5274000048637391, "epoch": 0.03232640432250206, "grad_norm": 0.3817043900489807, "learning_rate": 2.154035609831187e-06, "loss": 0.4999, "mean_token_accuracy": 0.8542025357484817, "num_tokens": 13967642.0, "step": 2100 }, { "entropy": 0.548486365005374, "epoch": 0.03248033958118064, "grad_norm": 0.47256988286972046, "learning_rate": 2.1642978090204734e-06, "loss": 0.5293, "mean_token_accuracy": 0.8441834710538387, "num_tokens": 14023434.0, "step": 2110 }, { "entropy": 0.5379079256206751, "epoch": 0.03263427483985923, "grad_norm": 0.6844898462295532, "learning_rate": 2.1745600082097593e-06, "loss": 0.5034, "mean_token_accuracy": 0.8507031433284282, "num_tokens": 14083747.0, "step": 2120 }, { "entropy": 0.5396283466368914, "epoch": 0.03278821009853781, "grad_norm": 0.6394866704940796, "learning_rate": 2.1848222073990457e-06, "loss": 0.5227, "mean_token_accuracy": 0.8482442460954189, "num_tokens": 14134840.0, "step": 2130 }, { "entropy": 0.5204352997243404, "epoch": 0.03294214535721639, "grad_norm": 0.35353049635887146, "learning_rate": 2.195084406588332e-06, "loss": 0.49, "mean_token_accuracy": 0.8548033952713012, "num_tokens": 14204016.0, "step": 2140 }, { "entropy": 0.553047152236104, "epoch": 0.03309608061589497, "grad_norm": 0.40099021792411804, "learning_rate": 2.205346605777618e-06, "loss": 0.5242, "mean_token_accuracy": 0.8476227879524231, "num_tokens": 14267942.0, "step": 2150 }, { "entropy": 0.519257691130042, "epoch": 0.03325001587457355, "grad_norm": 0.4614008069038391, "learning_rate": 2.215608804966905e-06, "loss": 0.4909, "mean_token_accuracy": 0.8554672382771968, "num_tokens": 14336396.0, "step": 2160 }, { "entropy": 0.5420856468379498, "epoch": 0.03340395113325213, "grad_norm": 0.3716961145401001, "learning_rate": 2.2258710041561908e-06, "loss": 0.5097, "mean_token_accuracy": 0.8453212685883045, "num_tokens": 14402041.0, "step": 2170 }, { "entropy": 0.5372737713158131, "epoch": 0.03355788639193071, "grad_norm": 0.6902923583984375, "learning_rate": 2.236133203345477e-06, "loss": 0.5037, "mean_token_accuracy": 0.849596357345581, "num_tokens": 14461737.0, "step": 2180 }, { "entropy": 0.5454431146383285, "epoch": 0.03371182165060929, "grad_norm": 0.42740628123283386, "learning_rate": 2.2463954025347635e-06, "loss": 0.528, "mean_token_accuracy": 0.845777689665556, "num_tokens": 14526905.0, "step": 2190 }, { "entropy": 0.5485272821038961, "epoch": 0.03386575690928788, "grad_norm": 0.8392878770828247, "learning_rate": 2.2566576017240494e-06, "loss": 0.513, "mean_token_accuracy": 0.847602279484272, "num_tokens": 14581674.0, "step": 2200 }, { "entropy": 0.539389343932271, "epoch": 0.03401969216796646, "grad_norm": 0.38500136137008667, "learning_rate": 2.266919800913336e-06, "loss": 0.5221, "mean_token_accuracy": 0.8473579108715057, "num_tokens": 14647440.0, "step": 2210 }, { "entropy": 0.5432325772941112, "epoch": 0.03417362742664504, "grad_norm": 0.4707469344139099, "learning_rate": 2.277182000102622e-06, "loss": 0.5132, "mean_token_accuracy": 0.8453107759356498, "num_tokens": 14707857.0, "step": 2220 }, { "entropy": 0.5141806758940219, "epoch": 0.03432756268532362, "grad_norm": 0.45236846804618835, "learning_rate": 2.287444199291908e-06, "loss": 0.4924, "mean_token_accuracy": 0.8547611981630325, "num_tokens": 14787214.0, "step": 2230 }, { "entropy": 0.5175751883536577, "epoch": 0.0344814979440022, "grad_norm": 0.46245115995407104, "learning_rate": 2.297706398481195e-06, "loss": 0.5029, "mean_token_accuracy": 0.850262375921011, "num_tokens": 14848918.0, "step": 2240 }, { "entropy": 0.5222197994589806, "epoch": 0.03463543320268078, "grad_norm": 0.5476166605949402, "learning_rate": 2.307968597670481e-06, "loss": 0.4873, "mean_token_accuracy": 0.8568701945245266, "num_tokens": 14912915.0, "step": 2250 }, { "entropy": 0.5345869816839695, "epoch": 0.03478936846135936, "grad_norm": 0.5041570663452148, "learning_rate": 2.3182307968597672e-06, "loss": 0.4984, "mean_token_accuracy": 0.8520513974130154, "num_tokens": 14987582.0, "step": 2260 }, { "entropy": 0.5435335248708725, "epoch": 0.03494330372003795, "grad_norm": 0.37886735796928406, "learning_rate": 2.3284929960490536e-06, "loss": 0.5171, "mean_token_accuracy": 0.8456149287521839, "num_tokens": 15053662.0, "step": 2270 }, { "entropy": 0.531095540523529, "epoch": 0.03509723897871653, "grad_norm": 0.4508146345615387, "learning_rate": 2.3387551952383396e-06, "loss": 0.5041, "mean_token_accuracy": 0.8518501356244087, "num_tokens": 15117853.0, "step": 2280 }, { "entropy": 0.5165737014263868, "epoch": 0.03525117423739511, "grad_norm": 0.4709516763687134, "learning_rate": 2.349017394427626e-06, "loss": 0.4871, "mean_token_accuracy": 0.8556084915995598, "num_tokens": 15180771.0, "step": 2290 }, { "entropy": 0.5135207865387201, "epoch": 0.035405109496073686, "grad_norm": 0.359210342168808, "learning_rate": 2.3592795936169123e-06, "loss": 0.4946, "mean_token_accuracy": 0.8540414966642856, "num_tokens": 15250604.0, "step": 2300 }, { "entropy": 0.5164542838931083, "epoch": 0.03555904475475227, "grad_norm": 0.41648733615875244, "learning_rate": 2.3695417928061987e-06, "loss": 0.5043, "mean_token_accuracy": 0.852924108505249, "num_tokens": 15313384.0, "step": 2310 }, { "entropy": 0.5221754506230354, "epoch": 0.03571298001343085, "grad_norm": 0.7027745246887207, "learning_rate": 2.379803991995485e-06, "loss": 0.4998, "mean_token_accuracy": 0.8500079862773419, "num_tokens": 15388964.0, "step": 2320 }, { "entropy": 0.528238458558917, "epoch": 0.03586691527210943, "grad_norm": 0.42871326208114624, "learning_rate": 2.390066191184771e-06, "loss": 0.5051, "mean_token_accuracy": 0.8516973853111267, "num_tokens": 15456109.0, "step": 2330 }, { "entropy": 0.5255803897976875, "epoch": 0.03602085053078801, "grad_norm": 0.3979966938495636, "learning_rate": 2.4003283903740574e-06, "loss": 0.4895, "mean_token_accuracy": 0.8540043011307716, "num_tokens": 15521225.0, "step": 2340 }, { "entropy": 0.5250278271734714, "epoch": 0.0361747857894666, "grad_norm": 0.6622047424316406, "learning_rate": 2.4105905895633437e-06, "loss": 0.5023, "mean_token_accuracy": 0.8537747249007225, "num_tokens": 15582862.0, "step": 2350 }, { "entropy": 0.5213152717798948, "epoch": 0.03632872104814518, "grad_norm": 0.3488015830516815, "learning_rate": 2.4208527887526297e-06, "loss": 0.5028, "mean_token_accuracy": 0.8516898192465305, "num_tokens": 15660293.0, "step": 2360 }, { "entropy": 0.5378400158137083, "epoch": 0.036482656306823756, "grad_norm": 0.42587700486183167, "learning_rate": 2.431114987941916e-06, "loss": 0.4947, "mean_token_accuracy": 0.8522972002625465, "num_tokens": 15733481.0, "step": 2370 }, { "entropy": 0.5303870067000389, "epoch": 0.036636591565502336, "grad_norm": 0.5812597274780273, "learning_rate": 2.4413771871312024e-06, "loss": 0.5163, "mean_token_accuracy": 0.8509713977575302, "num_tokens": 15796686.0, "step": 2380 }, { "entropy": 0.5497906047850847, "epoch": 0.03679052682418092, "grad_norm": 0.437288761138916, "learning_rate": 2.4516393863204888e-06, "loss": 0.5196, "mean_token_accuracy": 0.8452339954674244, "num_tokens": 15863367.0, "step": 2390 }, { "entropy": 0.5218137551099062, "epoch": 0.0369444620828595, "grad_norm": 0.49609145522117615, "learning_rate": 2.461901585509775e-06, "loss": 0.4936, "mean_token_accuracy": 0.8532713204622269, "num_tokens": 15928062.0, "step": 2400 }, { "entropy": 0.5072901017963887, "epoch": 0.03709839734153808, "grad_norm": 0.40391993522644043, "learning_rate": 2.472163784699061e-06, "loss": 0.4808, "mean_token_accuracy": 0.859038432687521, "num_tokens": 15997686.0, "step": 2410 }, { "entropy": 0.518788518384099, "epoch": 0.03725233260021667, "grad_norm": 0.43725088238716125, "learning_rate": 2.4824259838883475e-06, "loss": 0.4948, "mean_token_accuracy": 0.8543360240757465, "num_tokens": 16064954.0, "step": 2420 }, { "entropy": 0.5142477825284004, "epoch": 0.03740626785889525, "grad_norm": 0.5483034253120422, "learning_rate": 2.492688183077634e-06, "loss": 0.4912, "mean_token_accuracy": 0.8541486620903015, "num_tokens": 16130435.0, "step": 2430 }, { "entropy": 0.5315693970769644, "epoch": 0.037560203117573826, "grad_norm": 0.5418652296066284, "learning_rate": 2.5029503822669198e-06, "loss": 0.4905, "mean_token_accuracy": 0.8493229895830154, "num_tokens": 16194831.0, "step": 2440 }, { "entropy": 0.509003971517086, "epoch": 0.037714138376252405, "grad_norm": 0.3317453861236572, "learning_rate": 2.5132125814562066e-06, "loss": 0.4713, "mean_token_accuracy": 0.8581799998879432, "num_tokens": 16267208.0, "step": 2450 }, { "entropy": 0.5122886069118977, "epoch": 0.03786807363493099, "grad_norm": 0.5180391669273376, "learning_rate": 2.5234747806454925e-06, "loss": 0.4875, "mean_token_accuracy": 0.8574409708380699, "num_tokens": 16333733.0, "step": 2460 }, { "entropy": 0.5200671505182981, "epoch": 0.03802200889360957, "grad_norm": 0.46580827236175537, "learning_rate": 2.533736979834779e-06, "loss": 0.4946, "mean_token_accuracy": 0.8547688968479633, "num_tokens": 16396313.0, "step": 2470 }, { "entropy": 0.5299093306064606, "epoch": 0.03817594415228815, "grad_norm": 0.3789680302143097, "learning_rate": 2.543999179024065e-06, "loss": 0.4982, "mean_token_accuracy": 0.8495425410568714, "num_tokens": 16462102.0, "step": 2480 }, { "entropy": 0.4984320055693388, "epoch": 0.03832987941096673, "grad_norm": 0.5113778710365295, "learning_rate": 2.5542613782133512e-06, "loss": 0.4834, "mean_token_accuracy": 0.8580108858644963, "num_tokens": 16520105.0, "step": 2490 }, { "entropy": 0.504259018972516, "epoch": 0.038483814669645316, "grad_norm": 0.4065929651260376, "learning_rate": 2.564523577402638e-06, "loss": 0.4678, "mean_token_accuracy": 0.8601178348064422, "num_tokens": 16587842.0, "step": 2500 }, { "entropy": 0.5069845620542764, "epoch": 0.038637749928323896, "grad_norm": 0.4221937656402588, "learning_rate": 2.574785776591924e-06, "loss": 0.4859, "mean_token_accuracy": 0.8566009886562824, "num_tokens": 16656192.0, "step": 2510 }, { "entropy": 0.5169490776956082, "epoch": 0.038791685187002475, "grad_norm": 0.5785375237464905, "learning_rate": 2.5850479757812103e-06, "loss": 0.4682, "mean_token_accuracy": 0.8559546187520027, "num_tokens": 16729390.0, "step": 2520 }, { "entropy": 0.5197915256023407, "epoch": 0.038945620445681055, "grad_norm": 0.3815048038959503, "learning_rate": 2.5953101749704963e-06, "loss": 0.488, "mean_token_accuracy": 0.8523627310991287, "num_tokens": 16794060.0, "step": 2530 }, { "entropy": 0.504789412021637, "epoch": 0.03909955570435964, "grad_norm": 0.47336724400520325, "learning_rate": 2.6055723741597826e-06, "loss": 0.4642, "mean_token_accuracy": 0.8598459124565124, "num_tokens": 16855288.0, "step": 2540 }, { "entropy": 0.5064449049532413, "epoch": 0.03925349096303822, "grad_norm": 0.5699107050895691, "learning_rate": 2.6158345733490686e-06, "loss": 0.4884, "mean_token_accuracy": 0.8577068731188774, "num_tokens": 16914784.0, "step": 2550 }, { "entropy": 0.5148227557539939, "epoch": 0.0394074262217168, "grad_norm": 0.48402586579322815, "learning_rate": 2.626096772538355e-06, "loss": 0.473, "mean_token_accuracy": 0.8551864750683308, "num_tokens": 16982820.0, "step": 2560 }, { "entropy": 0.5144754089415073, "epoch": 0.03956136148039538, "grad_norm": 0.461405873298645, "learning_rate": 2.6363589717276418e-06, "loss": 0.4995, "mean_token_accuracy": 0.8517455905675888, "num_tokens": 17045556.0, "step": 2570 }, { "entropy": 0.5082943011075258, "epoch": 0.039715296739073966, "grad_norm": 0.3916703760623932, "learning_rate": 2.6466211709169277e-06, "loss": 0.4853, "mean_token_accuracy": 0.8561552189290523, "num_tokens": 17117838.0, "step": 2580 }, { "entropy": 0.4987948052585125, "epoch": 0.039869231997752545, "grad_norm": 0.4296155571937561, "learning_rate": 2.656883370106214e-06, "loss": 0.479, "mean_token_accuracy": 0.8569062046706677, "num_tokens": 17186132.0, "step": 2590 }, { "entropy": 0.4979901500046253, "epoch": 0.040023167256431125, "grad_norm": 0.42422518134117126, "learning_rate": 2.6671455692955e-06, "loss": 0.4764, "mean_token_accuracy": 0.8578812628984451, "num_tokens": 17259563.0, "step": 2600 }, { "entropy": 0.5093794520944357, "epoch": 0.04017710251510971, "grad_norm": 0.34887516498565674, "learning_rate": 2.6774077684847864e-06, "loss": 0.4988, "mean_token_accuracy": 0.8546256564557553, "num_tokens": 17323588.0, "step": 2610 }, { "entropy": 0.500843757390976, "epoch": 0.04033103777378829, "grad_norm": 0.4194866716861725, "learning_rate": 2.6876699676740723e-06, "loss": 0.489, "mean_token_accuracy": 0.8557326443493366, "num_tokens": 17387458.0, "step": 2620 }, { "entropy": 0.5015277545899153, "epoch": 0.04048497303246687, "grad_norm": 0.40585359930992126, "learning_rate": 2.697932166863359e-06, "loss": 0.4865, "mean_token_accuracy": 0.8529739171266556, "num_tokens": 17445712.0, "step": 2630 }, { "entropy": 0.481063986197114, "epoch": 0.04063890829114545, "grad_norm": 0.36301103234291077, "learning_rate": 2.7081943660526455e-06, "loss": 0.4676, "mean_token_accuracy": 0.8637635000050068, "num_tokens": 17503255.0, "step": 2640 }, { "entropy": 0.5084440354257822, "epoch": 0.040792843549824036, "grad_norm": 0.42435991764068604, "learning_rate": 2.7184565652419314e-06, "loss": 0.4957, "mean_token_accuracy": 0.848768736422062, "num_tokens": 17579437.0, "step": 2650 }, { "entropy": 0.49696084298193455, "epoch": 0.040946778808502615, "grad_norm": 0.5531730651855469, "learning_rate": 2.728718764431218e-06, "loss": 0.4765, "mean_token_accuracy": 0.8580297961831093, "num_tokens": 17644847.0, "step": 2660 }, { "entropy": 0.4982036974281073, "epoch": 0.041100714067181195, "grad_norm": 0.37142860889434814, "learning_rate": 2.7389809636205038e-06, "loss": 0.4827, "mean_token_accuracy": 0.8566086515784264, "num_tokens": 17707488.0, "step": 2670 }, { "entropy": 0.5031122393906117, "epoch": 0.041254649325859774, "grad_norm": 0.4325660765171051, "learning_rate": 2.7492431628097906e-06, "loss": 0.4843, "mean_token_accuracy": 0.8566097974777221, "num_tokens": 17770730.0, "step": 2680 }, { "entropy": 0.49298719726502893, "epoch": 0.04140858458453836, "grad_norm": 0.44080278277397156, "learning_rate": 2.7595053619990765e-06, "loss": 0.4778, "mean_token_accuracy": 0.8568992793560029, "num_tokens": 17832739.0, "step": 2690 }, { "entropy": 0.5054962817579508, "epoch": 0.04156251984321694, "grad_norm": 0.446456640958786, "learning_rate": 2.769767561188363e-06, "loss": 0.4976, "mean_token_accuracy": 0.8531206406652927, "num_tokens": 17897516.0, "step": 2700 }, { "entropy": 0.48145605698227883, "epoch": 0.04171645510189552, "grad_norm": 0.4042004942893982, "learning_rate": 2.7800297603776492e-06, "loss": 0.4689, "mean_token_accuracy": 0.8605678334832192, "num_tokens": 17964343.0, "step": 2710 }, { "entropy": 0.48498384542763234, "epoch": 0.0418703903605741, "grad_norm": 0.5461635589599609, "learning_rate": 2.790291959566935e-06, "loss": 0.4765, "mean_token_accuracy": 0.8591228000819683, "num_tokens": 18028185.0, "step": 2720 }, { "entropy": 0.5041380483657122, "epoch": 0.042024325619252685, "grad_norm": 0.416968435049057, "learning_rate": 2.800554158756222e-06, "loss": 0.4809, "mean_token_accuracy": 0.8519018441438675, "num_tokens": 18094692.0, "step": 2730 }, { "entropy": 0.49318751841783526, "epoch": 0.042178260877931265, "grad_norm": 0.36322054266929626, "learning_rate": 2.810816357945508e-06, "loss": 0.4747, "mean_token_accuracy": 0.8568210758268833, "num_tokens": 18153146.0, "step": 2740 }, { "entropy": 0.47899128794670104, "epoch": 0.042332196136609844, "grad_norm": 0.39410400390625, "learning_rate": 2.8210785571347943e-06, "loss": 0.4694, "mean_token_accuracy": 0.8612624652683735, "num_tokens": 18220603.0, "step": 2750 }, { "entropy": 0.49078089408576486, "epoch": 0.04248613139528842, "grad_norm": 0.3688376843929291, "learning_rate": 2.8313407563240802e-06, "loss": 0.4834, "mean_token_accuracy": 0.8563191562891006, "num_tokens": 18286928.0, "step": 2760 }, { "entropy": 0.4871336441487074, "epoch": 0.04264006665396701, "grad_norm": 0.49200931191444397, "learning_rate": 2.8416029555133666e-06, "loss": 0.4654, "mean_token_accuracy": 0.8561928808689118, "num_tokens": 18352901.0, "step": 2770 }, { "entropy": 0.4969511557370424, "epoch": 0.04279400191264559, "grad_norm": 0.3549100160598755, "learning_rate": 2.8518651547026534e-06, "loss": 0.4815, "mean_token_accuracy": 0.8561749376356602, "num_tokens": 18426435.0, "step": 2780 }, { "entropy": 0.5152549266815185, "epoch": 0.04294793717132417, "grad_norm": 0.4580998718738556, "learning_rate": 2.8621273538919394e-06, "loss": 0.4901, "mean_token_accuracy": 0.8515326030552387, "num_tokens": 18494424.0, "step": 2790 }, { "entropy": 0.48918520137667654, "epoch": 0.043101872430002755, "grad_norm": 0.4392269253730774, "learning_rate": 2.8723895530812257e-06, "loss": 0.4689, "mean_token_accuracy": 0.8565140224993228, "num_tokens": 18569637.0, "step": 2800 }, { "entropy": 0.47668228670954704, "epoch": 0.043255807688681334, "grad_norm": 0.4084360897541046, "learning_rate": 2.8826517522705117e-06, "loss": 0.4702, "mean_token_accuracy": 0.860350401699543, "num_tokens": 18633320.0, "step": 2810 }, { "entropy": 0.4766545008867979, "epoch": 0.043409742947359914, "grad_norm": 0.34576714038848877, "learning_rate": 2.892913951459798e-06, "loss": 0.4658, "mean_token_accuracy": 0.8594283573329449, "num_tokens": 18698010.0, "step": 2820 }, { "entropy": 0.48372769095003604, "epoch": 0.04356367820603849, "grad_norm": 0.39566606283187866, "learning_rate": 2.903176150649084e-06, "loss": 0.4728, "mean_token_accuracy": 0.8584145598113537, "num_tokens": 18774466.0, "step": 2830 }, { "entropy": 0.4932734340429306, "epoch": 0.04371761346471708, "grad_norm": 0.3711501657962799, "learning_rate": 2.9134383498383708e-06, "loss": 0.4716, "mean_token_accuracy": 0.8559127449989319, "num_tokens": 18845015.0, "step": 2840 }, { "entropy": 0.47330547869205475, "epoch": 0.04387154872339566, "grad_norm": 0.3661205470561981, "learning_rate": 2.923700549027657e-06, "loss": 0.4712, "mean_token_accuracy": 0.8614593788981437, "num_tokens": 18907676.0, "step": 2850 }, { "entropy": 0.4851850405335426, "epoch": 0.04402548398207424, "grad_norm": 0.39886966347694397, "learning_rate": 2.933962748216943e-06, "loss": 0.4768, "mean_token_accuracy": 0.8584970690310001, "num_tokens": 18967662.0, "step": 2860 }, { "entropy": 0.5064614836126566, "epoch": 0.04417941924075282, "grad_norm": 0.362210214138031, "learning_rate": 2.9442249474062295e-06, "loss": 0.4981, "mean_token_accuracy": 0.8526823811233044, "num_tokens": 19035972.0, "step": 2870 }, { "entropy": 0.48068327233195307, "epoch": 0.044333354499431404, "grad_norm": 0.536994218826294, "learning_rate": 2.9544871465955154e-06, "loss": 0.4727, "mean_token_accuracy": 0.8580154769122601, "num_tokens": 19095967.0, "step": 2880 }, { "entropy": 0.49514865018427373, "epoch": 0.044487289758109984, "grad_norm": 0.47831016778945923, "learning_rate": 2.964749345784802e-06, "loss": 0.4946, "mean_token_accuracy": 0.8548254914581775, "num_tokens": 19159382.0, "step": 2890 }, { "entropy": 0.4962038930505514, "epoch": 0.04464122501678856, "grad_norm": 0.3985232710838318, "learning_rate": 2.975011544974088e-06, "loss": 0.485, "mean_token_accuracy": 0.8550618976354599, "num_tokens": 19220649.0, "step": 2900 }, { "entropy": 0.48330639079213145, "epoch": 0.04479516027546714, "grad_norm": 0.3650174140930176, "learning_rate": 2.9852737441633745e-06, "loss": 0.4612, "mean_token_accuracy": 0.8621611192822456, "num_tokens": 19294624.0, "step": 2910 }, { "entropy": 0.492469260841608, "epoch": 0.04494909553414573, "grad_norm": 0.37588176131248474, "learning_rate": 2.995535943352661e-06, "loss": 0.4837, "mean_token_accuracy": 0.8549105599522591, "num_tokens": 19365724.0, "step": 2920 }, { "entropy": 0.46695651337504385, "epoch": 0.04510303079282431, "grad_norm": 0.44310060143470764, "learning_rate": 3.005798142541947e-06, "loss": 0.4563, "mean_token_accuracy": 0.8628789156675338, "num_tokens": 19432574.0, "step": 2930 }, { "entropy": 0.5018115993589163, "epoch": 0.04525696605150289, "grad_norm": 0.41544172167778015, "learning_rate": 3.016060341731233e-06, "loss": 0.4927, "mean_token_accuracy": 0.8501388780772686, "num_tokens": 19497250.0, "step": 2940 }, { "entropy": 0.47896330431103706, "epoch": 0.04541090131018147, "grad_norm": 0.4193185567855835, "learning_rate": 3.026322540920519e-06, "loss": 0.4621, "mean_token_accuracy": 0.8604184970259666, "num_tokens": 19554490.0, "step": 2950 }, { "entropy": 0.4780110865831375, "epoch": 0.045564836568860054, "grad_norm": 0.3728090226650238, "learning_rate": 3.036584740109806e-06, "loss": 0.4562, "mean_token_accuracy": 0.862579096108675, "num_tokens": 19618339.0, "step": 2960 }, { "entropy": 0.47614218257367613, "epoch": 0.04571877182753863, "grad_norm": 0.43533238768577576, "learning_rate": 3.046846939299092e-06, "loss": 0.46, "mean_token_accuracy": 0.8603674508631229, "num_tokens": 19689014.0, "step": 2970 }, { "entropy": 0.49885031692683696, "epoch": 0.04587270708621721, "grad_norm": 0.4194674491882324, "learning_rate": 3.0571091384883783e-06, "loss": 0.4851, "mean_token_accuracy": 0.8495530150830746, "num_tokens": 19754130.0, "step": 2980 }, { "entropy": 0.47561506628990174, "epoch": 0.04602664234489579, "grad_norm": 0.414552241563797, "learning_rate": 3.0673713376776646e-06, "loss": 0.4614, "mean_token_accuracy": 0.8624397486448288, "num_tokens": 19814304.0, "step": 2990 }, { "entropy": 0.4736238345503807, "epoch": 0.04618057760357438, "grad_norm": 0.42118075489997864, "learning_rate": 3.0776335368669506e-06, "loss": 0.4581, "mean_token_accuracy": 0.8609484910964966, "num_tokens": 19881152.0, "step": 3000 }, { "entropy": 0.46562346182763575, "epoch": 0.04633451286225296, "grad_norm": 0.3987255096435547, "learning_rate": 3.0878957360562374e-06, "loss": 0.4548, "mean_token_accuracy": 0.8625684142112732, "num_tokens": 19950889.0, "step": 3010 }, { "entropy": 0.4806033708155155, "epoch": 0.04648844812093154, "grad_norm": 0.4630914032459259, "learning_rate": 3.0981579352455233e-06, "loss": 0.4736, "mean_token_accuracy": 0.8618213221430778, "num_tokens": 20015172.0, "step": 3020 }, { "entropy": 0.4608029305934906, "epoch": 0.046642383379610124, "grad_norm": 0.5229613780975342, "learning_rate": 3.1084201344348097e-06, "loss": 0.4505, "mean_token_accuracy": 0.8635811656713486, "num_tokens": 20087160.0, "step": 3030 }, { "entropy": 0.4786300778388977, "epoch": 0.0467963186382887, "grad_norm": 0.5432091951370239, "learning_rate": 3.1186823336240956e-06, "loss": 0.4742, "mean_token_accuracy": 0.8594743512570858, "num_tokens": 20141651.0, "step": 3040 }, { "entropy": 0.4786074675619602, "epoch": 0.04695025389696728, "grad_norm": 0.4786185622215271, "learning_rate": 3.128944532813382e-06, "loss": 0.4585, "mean_token_accuracy": 0.8607216760516166, "num_tokens": 20201922.0, "step": 3050 }, { "entropy": 0.47565300650894643, "epoch": 0.04710418915564586, "grad_norm": 0.5178991556167603, "learning_rate": 3.139206732002669e-06, "loss": 0.469, "mean_token_accuracy": 0.8594755798578262, "num_tokens": 20261086.0, "step": 3060 }, { "entropy": 0.4837412329390645, "epoch": 0.04725812441432445, "grad_norm": 0.4048655927181244, "learning_rate": 3.1494689311919547e-06, "loss": 0.4785, "mean_token_accuracy": 0.8581827118992805, "num_tokens": 20322852.0, "step": 3070 }, { "entropy": 0.4644537676125765, "epoch": 0.04741205967300303, "grad_norm": 0.4093380272388458, "learning_rate": 3.159731130381241e-06, "loss": 0.4445, "mean_token_accuracy": 0.8651019655168056, "num_tokens": 20398994.0, "step": 3080 }, { "entropy": 0.49016749709844587, "epoch": 0.04756599493168161, "grad_norm": 0.46361011266708374, "learning_rate": 3.169993329570527e-06, "loss": 0.481, "mean_token_accuracy": 0.8538561865687371, "num_tokens": 20464897.0, "step": 3090 }, { "entropy": 0.46319445557892325, "epoch": 0.04771993019036019, "grad_norm": 0.39804714918136597, "learning_rate": 3.1802555287598134e-06, "loss": 0.4581, "mean_token_accuracy": 0.8656220644712448, "num_tokens": 20534372.0, "step": 3100 }, { "entropy": 0.4658484049141407, "epoch": 0.04787386544903877, "grad_norm": 0.39583876729011536, "learning_rate": 3.1905177279490994e-06, "loss": 0.4581, "mean_token_accuracy": 0.8628982864320278, "num_tokens": 20603941.0, "step": 3110 }, { "entropy": 0.46748054809868334, "epoch": 0.04802780070771735, "grad_norm": 0.569844126701355, "learning_rate": 3.200779927138386e-06, "loss": 0.4521, "mean_token_accuracy": 0.8625423818826675, "num_tokens": 20669276.0, "step": 3120 }, { "entropy": 0.46999922804534433, "epoch": 0.04818173596639593, "grad_norm": 0.34531617164611816, "learning_rate": 3.2110421263276725e-06, "loss": 0.4759, "mean_token_accuracy": 0.8625832088291645, "num_tokens": 20738347.0, "step": 3130 }, { "entropy": 0.4595083013176918, "epoch": 0.04833567122507451, "grad_norm": 0.4226189851760864, "learning_rate": 3.2213043255169585e-06, "loss": 0.4509, "mean_token_accuracy": 0.8634210824966431, "num_tokens": 20810222.0, "step": 3140 }, { "entropy": 0.47402725778520105, "epoch": 0.0484896064837531, "grad_norm": 0.5318478941917419, "learning_rate": 3.231566524706245e-06, "loss": 0.4535, "mean_token_accuracy": 0.8618110284209252, "num_tokens": 20881739.0, "step": 3150 }, { "entropy": 0.47862141728401186, "epoch": 0.04864354174243168, "grad_norm": 0.44921329617500305, "learning_rate": 3.241828723895531e-06, "loss": 0.4633, "mean_token_accuracy": 0.8585563831031322, "num_tokens": 20948062.0, "step": 3160 }, { "entropy": 0.4785586640238762, "epoch": 0.04879747700111026, "grad_norm": 0.45010489225387573, "learning_rate": 3.2520909230848176e-06, "loss": 0.4735, "mean_token_accuracy": 0.8595190353691577, "num_tokens": 21011157.0, "step": 3170 }, { "entropy": 0.5033937424421311, "epoch": 0.048951412259788836, "grad_norm": 0.5086638331413269, "learning_rate": 3.2623531222741036e-06, "loss": 0.4878, "mean_token_accuracy": 0.8513988673686981, "num_tokens": 21071588.0, "step": 3180 }, { "entropy": 0.48296874538064005, "epoch": 0.04910534751846742, "grad_norm": 0.49625444412231445, "learning_rate": 3.27261532146339e-06, "loss": 0.4643, "mean_token_accuracy": 0.8581536114215851, "num_tokens": 21135702.0, "step": 3190 }, { "entropy": 0.4852360276505351, "epoch": 0.049259282777146, "grad_norm": 0.3567318916320801, "learning_rate": 3.2828775206526763e-06, "loss": 0.4611, "mean_token_accuracy": 0.8582854352891445, "num_tokens": 21202360.0, "step": 3200 }, { "entropy": 0.49142739176750183, "epoch": 0.04941321803582458, "grad_norm": 0.4172487258911133, "learning_rate": 3.2931397198419622e-06, "loss": 0.473, "mean_token_accuracy": 0.8558745965361595, "num_tokens": 21274671.0, "step": 3210 }, { "entropy": 0.4629239164292812, "epoch": 0.04956715329450317, "grad_norm": 0.40355199575424194, "learning_rate": 3.303401919031249e-06, "loss": 0.4608, "mean_token_accuracy": 0.861729059368372, "num_tokens": 21336345.0, "step": 3220 }, { "entropy": 0.45941671580076215, "epoch": 0.04972108855318175, "grad_norm": 0.4382309019565582, "learning_rate": 3.313664118220535e-06, "loss": 0.4548, "mean_token_accuracy": 0.8678113736212254, "num_tokens": 21406575.0, "step": 3230 }, { "entropy": 0.4668697815388441, "epoch": 0.049875023811860326, "grad_norm": 0.5009704232215881, "learning_rate": 3.3239263174098213e-06, "loss": 0.4581, "mean_token_accuracy": 0.8609800226986408, "num_tokens": 21470203.0, "step": 3240 }, { "entropy": 0.4880560081452131, "epoch": 0.050028959070538906, "grad_norm": 0.5825926065444946, "learning_rate": 3.3341885165991073e-06, "loss": 0.4892, "mean_token_accuracy": 0.8533754147589206, "num_tokens": 21540496.0, "step": 3250 }, { "entropy": 0.47189716398715975, "epoch": 0.05018289432921749, "grad_norm": 0.3982759714126587, "learning_rate": 3.3444507157883937e-06, "loss": 0.463, "mean_token_accuracy": 0.8608555220067501, "num_tokens": 21602426.0, "step": 3260 }, { "entropy": 0.4664815416559577, "epoch": 0.05033682958789607, "grad_norm": 0.41411706805229187, "learning_rate": 3.3547129149776805e-06, "loss": 0.4491, "mean_token_accuracy": 0.8640334777534008, "num_tokens": 21669264.0, "step": 3270 }, { "entropy": 0.4766206920146942, "epoch": 0.05049076484657465, "grad_norm": 0.35560521483421326, "learning_rate": 3.3649751141669664e-06, "loss": 0.4559, "mean_token_accuracy": 0.8593937613070011, "num_tokens": 21744055.0, "step": 3280 }, { "entropy": 0.47926376238465307, "epoch": 0.05064470010525323, "grad_norm": 0.5023449063301086, "learning_rate": 3.3752373133562528e-06, "loss": 0.4675, "mean_token_accuracy": 0.8568825788795948, "num_tokens": 21808634.0, "step": 3290 }, { "entropy": 0.49185780212283137, "epoch": 0.05079863536393182, "grad_norm": 0.46794310212135315, "learning_rate": 3.3854995125455387e-06, "loss": 0.4888, "mean_token_accuracy": 0.8554835021495819, "num_tokens": 21875260.0, "step": 3300 }, { "entropy": 0.472229178622365, "epoch": 0.050952570622610396, "grad_norm": 0.4785502254962921, "learning_rate": 3.395761711734825e-06, "loss": 0.4559, "mean_token_accuracy": 0.863686703145504, "num_tokens": 21935679.0, "step": 3310 }, { "entropy": 0.4691951235756278, "epoch": 0.051106505881288976, "grad_norm": 0.4103107750415802, "learning_rate": 3.406023910924111e-06, "loss": 0.4553, "mean_token_accuracy": 0.8622137896716595, "num_tokens": 22001804.0, "step": 3320 }, { "entropy": 0.4552751164883375, "epoch": 0.051260441139967555, "grad_norm": 0.37202244997024536, "learning_rate": 3.4162861101133974e-06, "loss": 0.446, "mean_token_accuracy": 0.867671524733305, "num_tokens": 22068890.0, "step": 3330 }, { "entropy": 0.4667286228388548, "epoch": 0.05141437639864614, "grad_norm": 0.52146977186203, "learning_rate": 3.426548309302684e-06, "loss": 0.4584, "mean_token_accuracy": 0.8638569332659245, "num_tokens": 22137499.0, "step": 3340 }, { "entropy": 0.48068500459194186, "epoch": 0.05156831165732472, "grad_norm": 0.6329039931297302, "learning_rate": 3.43681050849197e-06, "loss": 0.4603, "mean_token_accuracy": 0.8568483039736747, "num_tokens": 22201211.0, "step": 3350 }, { "entropy": 0.4544039942324162, "epoch": 0.0517222469160033, "grad_norm": 0.4709509015083313, "learning_rate": 3.4470727076812565e-06, "loss": 0.4354, "mean_token_accuracy": 0.8671738006174564, "num_tokens": 22273598.0, "step": 3360 }, { "entropy": 0.4683177687227726, "epoch": 0.05187618217468188, "grad_norm": 0.5625371336936951, "learning_rate": 3.4573349068705425e-06, "loss": 0.4614, "mean_token_accuracy": 0.8640124201774597, "num_tokens": 22329848.0, "step": 3370 }, { "entropy": 0.4671619884669781, "epoch": 0.052030117433360466, "grad_norm": 0.396567702293396, "learning_rate": 3.467597106059829e-06, "loss": 0.4526, "mean_token_accuracy": 0.8640213899314404, "num_tokens": 22394972.0, "step": 3380 }, { "entropy": 0.4896993700414896, "epoch": 0.052184052692039046, "grad_norm": 0.3867156207561493, "learning_rate": 3.4778593052491148e-06, "loss": 0.4771, "mean_token_accuracy": 0.8519129119813442, "num_tokens": 22464547.0, "step": 3390 }, { "entropy": 0.4594454549252987, "epoch": 0.052337987950717625, "grad_norm": 0.3562692403793335, "learning_rate": 3.4881215044384016e-06, "loss": 0.4427, "mean_token_accuracy": 0.8637728683650494, "num_tokens": 22538134.0, "step": 3400 }, { "entropy": 0.4618957854807377, "epoch": 0.05249192320939621, "grad_norm": 0.5350344777107239, "learning_rate": 3.498383703627688e-06, "loss": 0.4519, "mean_token_accuracy": 0.8653904445469379, "num_tokens": 22595221.0, "step": 3410 }, { "entropy": 0.4716145057231188, "epoch": 0.05264585846807479, "grad_norm": 0.42258119583129883, "learning_rate": 3.508645902816974e-06, "loss": 0.4537, "mean_token_accuracy": 0.8639770649373532, "num_tokens": 22661423.0, "step": 3420 }, { "entropy": 0.4660936377942562, "epoch": 0.05279979372675337, "grad_norm": 0.38081157207489014, "learning_rate": 3.5189081020062603e-06, "loss": 0.4512, "mean_token_accuracy": 0.8627615235745907, "num_tokens": 22724609.0, "step": 3430 }, { "entropy": 0.46550977751612665, "epoch": 0.05295372898543195, "grad_norm": 0.3994530439376831, "learning_rate": 3.529170301195546e-06, "loss": 0.45, "mean_token_accuracy": 0.8652859427034855, "num_tokens": 22789214.0, "step": 3440 }, { "entropy": 0.47242205049842595, "epoch": 0.053107664244110536, "grad_norm": 0.4704059362411499, "learning_rate": 3.539432500384833e-06, "loss": 0.4631, "mean_token_accuracy": 0.858353117108345, "num_tokens": 22848697.0, "step": 3450 }, { "entropy": 0.45999151915311814, "epoch": 0.053261599502789116, "grad_norm": 0.41963985562324524, "learning_rate": 3.549694699574119e-06, "loss": 0.4422, "mean_token_accuracy": 0.8648492239415646, "num_tokens": 22911205.0, "step": 3460 }, { "entropy": 0.46102733314037325, "epoch": 0.053415534761467695, "grad_norm": 0.5025171637535095, "learning_rate": 3.5599568987634053e-06, "loss": 0.4378, "mean_token_accuracy": 0.8654144816100597, "num_tokens": 22976564.0, "step": 3470 }, { "entropy": 0.46762806363403797, "epoch": 0.053569470020146275, "grad_norm": 0.37729594111442566, "learning_rate": 3.5702190979526913e-06, "loss": 0.454, "mean_token_accuracy": 0.8640327297151089, "num_tokens": 23034244.0, "step": 3480 }, { "entropy": 0.463880468532443, "epoch": 0.05372340527882486, "grad_norm": 0.4917917251586914, "learning_rate": 3.5804812971419776e-06, "loss": 0.4439, "mean_token_accuracy": 0.8637275911867619, "num_tokens": 23107178.0, "step": 3490 }, { "entropy": 0.45713591761887074, "epoch": 0.05387734053750344, "grad_norm": 0.503111720085144, "learning_rate": 3.5907434963312644e-06, "loss": 0.464, "mean_token_accuracy": 0.8620456017553806, "num_tokens": 23178607.0, "step": 3500 }, { "entropy": 0.4649581465870142, "epoch": 0.05403127579618202, "grad_norm": 0.548831045627594, "learning_rate": 3.6010056955205504e-06, "loss": 0.4529, "mean_token_accuracy": 0.8614737786352634, "num_tokens": 23238131.0, "step": 3510 }, { "entropy": 0.4638475481420755, "epoch": 0.0541852110548606, "grad_norm": 0.4035952091217041, "learning_rate": 3.6112678947098367e-06, "loss": 0.4565, "mean_token_accuracy": 0.8643191017210483, "num_tokens": 23312203.0, "step": 3520 }, { "entropy": 0.4574541479349136, "epoch": 0.054339146313539186, "grad_norm": 0.40222880244255066, "learning_rate": 3.6215300938991227e-06, "loss": 0.4451, "mean_token_accuracy": 0.8640716910362244, "num_tokens": 23379234.0, "step": 3530 }, { "entropy": 0.4548620041459799, "epoch": 0.054493081572217765, "grad_norm": 0.4786234498023987, "learning_rate": 3.631792293088409e-06, "loss": 0.4436, "mean_token_accuracy": 0.8615356490015984, "num_tokens": 23442454.0, "step": 3540 }, { "entropy": 0.46325308848172425, "epoch": 0.054647016830896344, "grad_norm": 0.5192655324935913, "learning_rate": 3.642054492277695e-06, "loss": 0.4547, "mean_token_accuracy": 0.865481149405241, "num_tokens": 23499263.0, "step": 3550 }, { "entropy": 0.45313067696988585, "epoch": 0.054800952089574924, "grad_norm": 0.5692862272262573, "learning_rate": 3.652316691466982e-06, "loss": 0.4394, "mean_token_accuracy": 0.8679087318480014, "num_tokens": 23562913.0, "step": 3560 }, { "entropy": 0.4712716117501259, "epoch": 0.05495488734825351, "grad_norm": 0.44528335332870483, "learning_rate": 3.662578890656268e-06, "loss": 0.4537, "mean_token_accuracy": 0.8601473607122898, "num_tokens": 23625355.0, "step": 3570 }, { "entropy": 0.4627768792212009, "epoch": 0.05510882260693209, "grad_norm": 0.5715892314910889, "learning_rate": 3.672841089845554e-06, "loss": 0.4498, "mean_token_accuracy": 0.862949462980032, "num_tokens": 23686680.0, "step": 3580 }, { "entropy": 0.4693655613809824, "epoch": 0.05526275786561067, "grad_norm": 0.436560720205307, "learning_rate": 3.6831032890348405e-06, "loss": 0.4611, "mean_token_accuracy": 0.8598898634314537, "num_tokens": 23752749.0, "step": 3590 }, { "entropy": 0.4543895564973354, "epoch": 0.055416693124289255, "grad_norm": 0.48591554164886475, "learning_rate": 3.6933654882241264e-06, "loss": 0.4397, "mean_token_accuracy": 0.8664734154939652, "num_tokens": 23818283.0, "step": 3600 }, { "entropy": 0.4766705486923456, "epoch": 0.055570628382967835, "grad_norm": 0.5066477060317993, "learning_rate": 3.7036276874134132e-06, "loss": 0.4768, "mean_token_accuracy": 0.8593741722404957, "num_tokens": 23883245.0, "step": 3610 }, { "entropy": 0.45425005443394184, "epoch": 0.055724563641646414, "grad_norm": 0.4093894958496094, "learning_rate": 3.713889886602699e-06, "loss": 0.4522, "mean_token_accuracy": 0.8680253006517887, "num_tokens": 23942319.0, "step": 3620 }, { "entropy": 0.4625934336334467, "epoch": 0.055878498900324994, "grad_norm": 0.5482757687568665, "learning_rate": 3.7241520857919855e-06, "loss": 0.4463, "mean_token_accuracy": 0.8648286245763301, "num_tokens": 24011916.0, "step": 3630 }, { "entropy": 0.4531638570129871, "epoch": 0.05603243415900358, "grad_norm": 0.4277053773403168, "learning_rate": 3.734414284981272e-06, "loss": 0.4506, "mean_token_accuracy": 0.8678131386637687, "num_tokens": 24075719.0, "step": 3640 }, { "entropy": 0.45438520386815073, "epoch": 0.05618636941768216, "grad_norm": 0.6514148116111755, "learning_rate": 3.744676484170558e-06, "loss": 0.4554, "mean_token_accuracy": 0.8677970394492149, "num_tokens": 24139900.0, "step": 3650 }, { "entropy": 0.4539454039186239, "epoch": 0.05634030467636074, "grad_norm": 0.5358089208602905, "learning_rate": 3.7549386833598447e-06, "loss": 0.4403, "mean_token_accuracy": 0.8668252117931843, "num_tokens": 24201776.0, "step": 3660 }, { "entropy": 0.45166930221021173, "epoch": 0.05649423993503932, "grad_norm": 0.4707483947277069, "learning_rate": 3.7652008825491306e-06, "loss": 0.4308, "mean_token_accuracy": 0.8684119544923306, "num_tokens": 24273202.0, "step": 3670 }, { "entropy": 0.45630919486284255, "epoch": 0.056648175193717905, "grad_norm": 0.4719012975692749, "learning_rate": 3.775463081738417e-06, "loss": 0.4568, "mean_token_accuracy": 0.8636429160833359, "num_tokens": 24343079.0, "step": 3680 }, { "entropy": 0.45550831053406, "epoch": 0.056802110452396484, "grad_norm": 0.5095070600509644, "learning_rate": 3.785725280927703e-06, "loss": 0.445, "mean_token_accuracy": 0.8675057232379914, "num_tokens": 24407933.0, "step": 3690 }, { "entropy": 0.46896865628659723, "epoch": 0.056956045711075064, "grad_norm": 0.4101592004299164, "learning_rate": 3.7959874801169893e-06, "loss": 0.4547, "mean_token_accuracy": 0.8613490931689739, "num_tokens": 24472449.0, "step": 3700 }, { "entropy": 0.46954872496426103, "epoch": 0.05710998096975364, "grad_norm": 0.4343108534812927, "learning_rate": 3.8062496793062757e-06, "loss": 0.4592, "mean_token_accuracy": 0.8610514752566815, "num_tokens": 24535235.0, "step": 3710 }, { "entropy": 0.4818595461547375, "epoch": 0.05726391622843223, "grad_norm": 0.4091334044933319, "learning_rate": 3.816511878495562e-06, "loss": 0.4827, "mean_token_accuracy": 0.8548646107316017, "num_tokens": 24599787.0, "step": 3720 }, { "entropy": 0.4582658663392067, "epoch": 0.05741785148711081, "grad_norm": 0.45793822407722473, "learning_rate": 3.826774077684848e-06, "loss": 0.4412, "mean_token_accuracy": 0.8679271958768368, "num_tokens": 24665860.0, "step": 3730 }, { "entropy": 0.48782130852341654, "epoch": 0.05757178674578939, "grad_norm": 0.48937752842903137, "learning_rate": 3.837036276874134e-06, "loss": 0.4755, "mean_token_accuracy": 0.8556538030505181, "num_tokens": 24733311.0, "step": 3740 }, { "entropy": 0.4652296878397465, "epoch": 0.05772572200446797, "grad_norm": 0.4727736711502075, "learning_rate": 3.847298476063421e-06, "loss": 0.4453, "mean_token_accuracy": 0.8627233736217021, "num_tokens": 24803066.0, "step": 3750 }, { "entropy": 0.45185140185058115, "epoch": 0.057879657263146554, "grad_norm": 0.5595993399620056, "learning_rate": 3.857560675252707e-06, "loss": 0.4338, "mean_token_accuracy": 0.8654114171862602, "num_tokens": 24867820.0, "step": 3760 }, { "entropy": 0.4563142366707325, "epoch": 0.058033592521825134, "grad_norm": 0.5490583777427673, "learning_rate": 3.867822874441993e-06, "loss": 0.4425, "mean_token_accuracy": 0.8659687809646129, "num_tokens": 24930914.0, "step": 3770 }, { "entropy": 0.47089482136070726, "epoch": 0.05818752778050371, "grad_norm": 0.656970739364624, "learning_rate": 3.87808507363128e-06, "loss": 0.46, "mean_token_accuracy": 0.8610228218138218, "num_tokens": 25000787.0, "step": 3780 }, { "entropy": 0.4559638751670718, "epoch": 0.05834146303918229, "grad_norm": 0.4767877757549286, "learning_rate": 3.888347272820566e-06, "loss": 0.4441, "mean_token_accuracy": 0.8634882546961308, "num_tokens": 25063600.0, "step": 3790 }, { "entropy": 0.4621624920517206, "epoch": 0.05849539829786088, "grad_norm": 0.5292560458183289, "learning_rate": 3.8986094720098526e-06, "loss": 0.4385, "mean_token_accuracy": 0.8621891669929027, "num_tokens": 25125521.0, "step": 3800 }, { "entropy": 0.45969152115285394, "epoch": 0.05864933355653946, "grad_norm": 0.47115787863731384, "learning_rate": 3.9088716711991385e-06, "loss": 0.4545, "mean_token_accuracy": 0.8619041584432126, "num_tokens": 25190474.0, "step": 3810 }, { "entropy": 0.4500957690179348, "epoch": 0.05880326881521804, "grad_norm": 0.3932344913482666, "learning_rate": 3.9191338703884245e-06, "loss": 0.4479, "mean_token_accuracy": 0.8689141541719436, "num_tokens": 25264641.0, "step": 3820 }, { "entropy": 0.4580691482871771, "epoch": 0.058957204073896624, "grad_norm": 0.44312307238578796, "learning_rate": 3.92939606957771e-06, "loss": 0.4471, "mean_token_accuracy": 0.8671365737915039, "num_tokens": 25327853.0, "step": 3830 }, { "entropy": 0.4430386945605278, "epoch": 0.059111139332575204, "grad_norm": 0.43189939856529236, "learning_rate": 3.939658268766997e-06, "loss": 0.4313, "mean_token_accuracy": 0.8711311154067516, "num_tokens": 25402960.0, "step": 3840 }, { "entropy": 0.44096125159412625, "epoch": 0.05926507459125378, "grad_norm": 0.42959046363830566, "learning_rate": 3.949920467956284e-06, "loss": 0.4431, "mean_token_accuracy": 0.8676520347595215, "num_tokens": 25474747.0, "step": 3850 }, { "entropy": 0.45981105901300906, "epoch": 0.05941900984993236, "grad_norm": 0.6110933423042297, "learning_rate": 3.96018266714557e-06, "loss": 0.454, "mean_token_accuracy": 0.8609394557774067, "num_tokens": 25534746.0, "step": 3860 }, { "entropy": 0.46140354685485363, "epoch": 0.05957294510861095, "grad_norm": 0.5715627074241638, "learning_rate": 3.970444866334856e-06, "loss": 0.4554, "mean_token_accuracy": 0.8632232055068016, "num_tokens": 25602161.0, "step": 3870 }, { "entropy": 0.4679777786135674, "epoch": 0.05972688036728953, "grad_norm": 0.44153058528900146, "learning_rate": 3.980707065524142e-06, "loss": 0.4594, "mean_token_accuracy": 0.8650828637182713, "num_tokens": 25671214.0, "step": 3880 }, { "entropy": 0.45815389305353166, "epoch": 0.05988081562596811, "grad_norm": 0.42036888003349304, "learning_rate": 3.990969264713429e-06, "loss": 0.4415, "mean_token_accuracy": 0.8670969031751156, "num_tokens": 25744314.0, "step": 3890 }, { "entropy": 0.45237275809049604, "epoch": 0.06003475088464669, "grad_norm": 0.5284343361854553, "learning_rate": 4.0012314639027146e-06, "loss": 0.4506, "mean_token_accuracy": 0.8662144504487514, "num_tokens": 25805267.0, "step": 3900 }, { "entropy": 0.4468564860522747, "epoch": 0.06018868614332527, "grad_norm": 0.5036371350288391, "learning_rate": 4.011493663092001e-06, "loss": 0.4406, "mean_token_accuracy": 0.8671295337378979, "num_tokens": 25861999.0, "step": 3910 }, { "entropy": 0.456574796512723, "epoch": 0.06034262140200385, "grad_norm": 0.4504340589046478, "learning_rate": 4.021755862281287e-06, "loss": 0.4375, "mean_token_accuracy": 0.8641104765236378, "num_tokens": 25931934.0, "step": 3920 }, { "entropy": 0.4526135422289371, "epoch": 0.06049655666068243, "grad_norm": 0.47584018111228943, "learning_rate": 4.032018061470573e-06, "loss": 0.4327, "mean_token_accuracy": 0.8684007465839386, "num_tokens": 26002224.0, "step": 3930 }, { "entropy": 0.454237574338913, "epoch": 0.06065049191936101, "grad_norm": 0.6877853274345398, "learning_rate": 4.04228026065986e-06, "loss": 0.4419, "mean_token_accuracy": 0.8658649109303951, "num_tokens": 26064773.0, "step": 3940 }, { "entropy": 0.45691534169018266, "epoch": 0.0608044271780396, "grad_norm": 0.4646106958389282, "learning_rate": 4.052542459849146e-06, "loss": 0.4501, "mean_token_accuracy": 0.8634351752698421, "num_tokens": 26127795.0, "step": 3950 }, { "entropy": 0.4434625133872032, "epoch": 0.06095836243671818, "grad_norm": 0.48959413170814514, "learning_rate": 4.062804659038432e-06, "loss": 0.4265, "mean_token_accuracy": 0.8716785810887814, "num_tokens": 26196671.0, "step": 3960 }, { "entropy": 0.4686501011252403, "epoch": 0.06111229769539676, "grad_norm": 0.5496765971183777, "learning_rate": 4.073066858227718e-06, "loss": 0.4433, "mean_token_accuracy": 0.8617699176073075, "num_tokens": 26266550.0, "step": 3970 }, { "entropy": 0.46825481168925764, "epoch": 0.061266232954075336, "grad_norm": 0.4274046719074249, "learning_rate": 4.083329057417005e-06, "loss": 0.4568, "mean_token_accuracy": 0.8612445332109928, "num_tokens": 26346781.0, "step": 3980 }, { "entropy": 0.44655272252857686, "epoch": 0.06142016821275392, "grad_norm": 0.6424245238304138, "learning_rate": 4.0935912566062915e-06, "loss": 0.4414, "mean_token_accuracy": 0.8691511929035187, "num_tokens": 26417019.0, "step": 3990 }, { "entropy": 0.45706156082451344, "epoch": 0.0615741034714325, "grad_norm": 0.5096727013587952, "learning_rate": 4.1038534557955774e-06, "loss": 0.447, "mean_token_accuracy": 0.8651815250515937, "num_tokens": 26486371.0, "step": 4000 }, { "entropy": 0.4502101169899106, "epoch": 0.06172803873011108, "grad_norm": 0.47437334060668945, "learning_rate": 4.114115654984863e-06, "loss": 0.4434, "mean_token_accuracy": 0.8679667346179485, "num_tokens": 26552993.0, "step": 4010 }, { "entropy": 0.4551722928881645, "epoch": 0.06188197398878967, "grad_norm": 0.46824219822883606, "learning_rate": 4.124377854174149e-06, "loss": 0.4595, "mean_token_accuracy": 0.8641784347593784, "num_tokens": 26618030.0, "step": 4020 }, { "entropy": 0.4524669874459505, "epoch": 0.06203590924746825, "grad_norm": 0.4672582745552063, "learning_rate": 4.134640053363436e-06, "loss": 0.4423, "mean_token_accuracy": 0.8666078172624111, "num_tokens": 26686551.0, "step": 4030 }, { "entropy": 0.43626517429947853, "epoch": 0.06218984450614683, "grad_norm": 0.4878115952014923, "learning_rate": 4.144902252552722e-06, "loss": 0.4302, "mean_token_accuracy": 0.8703767336905003, "num_tokens": 26752403.0, "step": 4040 }, { "entropy": 0.457973039150238, "epoch": 0.062343779764825406, "grad_norm": 0.49351686239242554, "learning_rate": 4.155164451742009e-06, "loss": 0.4429, "mean_token_accuracy": 0.8655408278107644, "num_tokens": 26808854.0, "step": 4050 }, { "entropy": 0.45130028314888476, "epoch": 0.06249771502350399, "grad_norm": 0.44747889041900635, "learning_rate": 4.165426650931295e-06, "loss": 0.4417, "mean_token_accuracy": 0.8685718968510627, "num_tokens": 26888044.0, "step": 4060 }, { "entropy": 0.453334741666913, "epoch": 0.06265165028218257, "grad_norm": 0.39464664459228516, "learning_rate": 4.175688850120581e-06, "loss": 0.4489, "mean_token_accuracy": 0.866968820989132, "num_tokens": 26966006.0, "step": 4070 }, { "entropy": 0.4550411984324455, "epoch": 0.06280558554086116, "grad_norm": 0.45517972111701965, "learning_rate": 4.1859510493098675e-06, "loss": 0.4355, "mean_token_accuracy": 0.8677096910774708, "num_tokens": 27036594.0, "step": 4080 }, { "entropy": 0.45575973521918056, "epoch": 0.06295952079953973, "grad_norm": 0.4842463731765747, "learning_rate": 4.1962132484991535e-06, "loss": 0.4427, "mean_token_accuracy": 0.8672869026660919, "num_tokens": 27097812.0, "step": 4090 }, { "entropy": 0.46496043130755427, "epoch": 0.06311345605821832, "grad_norm": 0.49534234404563904, "learning_rate": 4.20647544768844e-06, "loss": 0.4645, "mean_token_accuracy": 0.8644502736628056, "num_tokens": 27162614.0, "step": 4100 }, { "entropy": 0.440515648201108, "epoch": 0.06326739131689689, "grad_norm": 0.5684093236923218, "learning_rate": 4.216737646877726e-06, "loss": 0.4458, "mean_token_accuracy": 0.8730449408292771, "num_tokens": 27233698.0, "step": 4110 }, { "entropy": 0.4743261933326721, "epoch": 0.06342132657557548, "grad_norm": 0.41935038566589355, "learning_rate": 4.226999846067012e-06, "loss": 0.4565, "mean_token_accuracy": 0.8620549410581588, "num_tokens": 27299837.0, "step": 4120 }, { "entropy": 0.46844785250723364, "epoch": 0.06357526183425406, "grad_norm": 0.48469024896621704, "learning_rate": 4.237262045256299e-06, "loss": 0.4599, "mean_token_accuracy": 0.864503525942564, "num_tokens": 27372293.0, "step": 4130 }, { "entropy": 0.46538444310426713, "epoch": 0.06372919709293264, "grad_norm": 0.6437295079231262, "learning_rate": 4.247524244445585e-06, "loss": 0.4365, "mean_token_accuracy": 0.8625970833003521, "num_tokens": 27438114.0, "step": 4140 }, { "entropy": 0.4499321598559618, "epoch": 0.06388313235161122, "grad_norm": 0.38929426670074463, "learning_rate": 4.257786443634872e-06, "loss": 0.4323, "mean_token_accuracy": 0.8665673434734344, "num_tokens": 27502042.0, "step": 4150 }, { "entropy": 0.44898705668747424, "epoch": 0.06403706761028981, "grad_norm": 0.5321436524391174, "learning_rate": 4.268048642824158e-06, "loss": 0.4469, "mean_token_accuracy": 0.8670206174254418, "num_tokens": 27565814.0, "step": 4160 }, { "entropy": 0.46878557093441486, "epoch": 0.06419100286896838, "grad_norm": 0.5273202657699585, "learning_rate": 4.278310842013444e-06, "loss": 0.4384, "mean_token_accuracy": 0.8603363029658795, "num_tokens": 27627287.0, "step": 4170 }, { "entropy": 0.43385438099503515, "epoch": 0.06434493812764697, "grad_norm": 0.4667953848838806, "learning_rate": 4.2885730412027295e-06, "loss": 0.4248, "mean_token_accuracy": 0.8715814180672169, "num_tokens": 27695607.0, "step": 4180 }, { "entropy": 0.44315362758934496, "epoch": 0.06449887338632554, "grad_norm": 0.6750472187995911, "learning_rate": 4.298835240392016e-06, "loss": 0.4341, "mean_token_accuracy": 0.8693481974303723, "num_tokens": 27765801.0, "step": 4190 }, { "entropy": 0.4542911022901535, "epoch": 0.06465280864500413, "grad_norm": 0.4916030168533325, "learning_rate": 4.309097439581303e-06, "loss": 0.4416, "mean_token_accuracy": 0.8651349946856499, "num_tokens": 27834680.0, "step": 4200 }, { "entropy": 0.4493482533842325, "epoch": 0.06480674390368271, "grad_norm": 0.48829343914985657, "learning_rate": 4.319359638770589e-06, "loss": 0.4235, "mean_token_accuracy": 0.8643908113241195, "num_tokens": 27914145.0, "step": 4210 }, { "entropy": 0.44622364863753317, "epoch": 0.06496067916236128, "grad_norm": 0.444275826215744, "learning_rate": 4.329621837959875e-06, "loss": 0.4433, "mean_token_accuracy": 0.8669401101768017, "num_tokens": 27981112.0, "step": 4220 }, { "entropy": 0.453578008711338, "epoch": 0.06511461442103987, "grad_norm": 0.4628658890724182, "learning_rate": 4.339884037149161e-06, "loss": 0.4416, "mean_token_accuracy": 0.8651710949838162, "num_tokens": 28048778.0, "step": 4230 }, { "entropy": 0.453409119322896, "epoch": 0.06526854967971846, "grad_norm": 0.537559986114502, "learning_rate": 4.350146236338448e-06, "loss": 0.4394, "mean_token_accuracy": 0.8647850267589092, "num_tokens": 28116744.0, "step": 4240 }, { "entropy": 0.4378142818808556, "epoch": 0.06542248493839703, "grad_norm": 0.4816446900367737, "learning_rate": 4.360408435527734e-06, "loss": 0.433, "mean_token_accuracy": 0.8711689800024033, "num_tokens": 28180320.0, "step": 4250 }, { "entropy": 0.4435060642659664, "epoch": 0.06557642019707562, "grad_norm": 0.5464059710502625, "learning_rate": 4.3706706347170205e-06, "loss": 0.429, "mean_token_accuracy": 0.8690860703587532, "num_tokens": 28252392.0, "step": 4260 }, { "entropy": 0.4525720851495862, "epoch": 0.0657303554557542, "grad_norm": 0.44838976860046387, "learning_rate": 4.3809328339063065e-06, "loss": 0.4319, "mean_token_accuracy": 0.8652340486645699, "num_tokens": 28309739.0, "step": 4270 }, { "entropy": 0.45215503200888635, "epoch": 0.06588429071443277, "grad_norm": 0.547401487827301, "learning_rate": 4.391195033095592e-06, "loss": 0.4452, "mean_token_accuracy": 0.866407036036253, "num_tokens": 28378471.0, "step": 4280 }, { "entropy": 0.44314270466566086, "epoch": 0.06603822597311136, "grad_norm": 0.5353981852531433, "learning_rate": 4.401457232284879e-06, "loss": 0.4362, "mean_token_accuracy": 0.869679831713438, "num_tokens": 28445923.0, "step": 4290 }, { "entropy": 0.45549696274101736, "epoch": 0.06619216123178993, "grad_norm": 0.5248070955276489, "learning_rate": 4.411719431474165e-06, "loss": 0.4446, "mean_token_accuracy": 0.8661917731165886, "num_tokens": 28512162.0, "step": 4300 }, { "entropy": 0.4601427044719458, "epoch": 0.06634609649046852, "grad_norm": 0.5303754210472107, "learning_rate": 4.421981630663452e-06, "loss": 0.4301, "mean_token_accuracy": 0.8641643099486828, "num_tokens": 28575875.0, "step": 4310 }, { "entropy": 0.4542066641151905, "epoch": 0.0665000317491471, "grad_norm": 0.647817850112915, "learning_rate": 4.432243829852738e-06, "loss": 0.4337, "mean_token_accuracy": 0.8671061284840107, "num_tokens": 28643747.0, "step": 4320 }, { "entropy": 0.4267870720475912, "epoch": 0.06665396700782568, "grad_norm": 0.4259481430053711, "learning_rate": 4.442506029042024e-06, "loss": 0.4212, "mean_token_accuracy": 0.8739156730473041, "num_tokens": 28709554.0, "step": 4330 }, { "entropy": 0.43938430286943914, "epoch": 0.06680790226650427, "grad_norm": 0.6124649047851562, "learning_rate": 4.452768228231311e-06, "loss": 0.4292, "mean_token_accuracy": 0.8708281181752682, "num_tokens": 28773642.0, "step": 4340 }, { "entropy": 0.46001375280320644, "epoch": 0.06696183752518285, "grad_norm": 0.5957135558128357, "learning_rate": 4.4630304274205966e-06, "loss": 0.4344, "mean_token_accuracy": 0.868109331279993, "num_tokens": 28836243.0, "step": 4350 }, { "entropy": 0.4407184224575758, "epoch": 0.06711577278386142, "grad_norm": 0.4433448910713196, "learning_rate": 4.473292626609883e-06, "loss": 0.4229, "mean_token_accuracy": 0.8693583555519581, "num_tokens": 28900298.0, "step": 4360 }, { "entropy": 0.45080235954374076, "epoch": 0.06726970804254001, "grad_norm": 0.455030232667923, "learning_rate": 4.483554825799169e-06, "loss": 0.4515, "mean_token_accuracy": 0.864966518431902, "num_tokens": 28961844.0, "step": 4370 }, { "entropy": 0.42721987422555685, "epoch": 0.06742364330121858, "grad_norm": 0.5360413789749146, "learning_rate": 4.493817024988455e-06, "loss": 0.424, "mean_token_accuracy": 0.8728285782039166, "num_tokens": 29024521.0, "step": 4380 }, { "entropy": 0.43830730896443126, "epoch": 0.06757757855989717, "grad_norm": 0.48367202281951904, "learning_rate": 4.504079224177741e-06, "loss": 0.4428, "mean_token_accuracy": 0.8689893327653408, "num_tokens": 29086725.0, "step": 4390 }, { "entropy": 0.43234930112957953, "epoch": 0.06773151381857576, "grad_norm": 0.5413707494735718, "learning_rate": 4.514341423367028e-06, "loss": 0.4212, "mean_token_accuracy": 0.87171261459589, "num_tokens": 29153943.0, "step": 4400 }, { "entropy": 0.43048507198691366, "epoch": 0.06788544907725433, "grad_norm": 0.5127657651901245, "learning_rate": 4.524603622556314e-06, "loss": 0.4168, "mean_token_accuracy": 0.8742999926209449, "num_tokens": 29222882.0, "step": 4410 }, { "entropy": 0.4381256137043238, "epoch": 0.06803938433593291, "grad_norm": 0.5586366057395935, "learning_rate": 4.534865821745601e-06, "loss": 0.4253, "mean_token_accuracy": 0.8727688893675805, "num_tokens": 29287286.0, "step": 4420 }, { "entropy": 0.4454494144767523, "epoch": 0.0681933195946115, "grad_norm": 0.5252095460891724, "learning_rate": 4.545128020934887e-06, "loss": 0.4349, "mean_token_accuracy": 0.8692857176065445, "num_tokens": 29362805.0, "step": 4430 }, { "entropy": 0.4518742311745882, "epoch": 0.06834725485329007, "grad_norm": 0.45533856749534607, "learning_rate": 4.555390220124173e-06, "loss": 0.4482, "mean_token_accuracy": 0.8660914018750191, "num_tokens": 29433082.0, "step": 4440 }, { "entropy": 0.44817618988454344, "epoch": 0.06850119011196866, "grad_norm": 0.40252354741096497, "learning_rate": 4.565652419313459e-06, "loss": 0.4472, "mean_token_accuracy": 0.8683816350996494, "num_tokens": 29499394.0, "step": 4450 }, { "entropy": 0.45081495977938174, "epoch": 0.06865512537064725, "grad_norm": 0.6492361426353455, "learning_rate": 4.575914618502745e-06, "loss": 0.4366, "mean_token_accuracy": 0.8674503922462463, "num_tokens": 29567183.0, "step": 4460 }, { "entropy": 0.45052420608699323, "epoch": 0.06880906062932582, "grad_norm": 0.5836458802223206, "learning_rate": 4.586176817692032e-06, "loss": 0.4241, "mean_token_accuracy": 0.8673846818506717, "num_tokens": 29631777.0, "step": 4470 }, { "entropy": 0.43540140707045794, "epoch": 0.0689629958880044, "grad_norm": 0.49901899695396423, "learning_rate": 4.596439016881318e-06, "loss": 0.4302, "mean_token_accuracy": 0.8720945864915848, "num_tokens": 29693430.0, "step": 4480 }, { "entropy": 0.4393024999648333, "epoch": 0.06911693114668298, "grad_norm": 0.8070120811462402, "learning_rate": 4.606701216070604e-06, "loss": 0.4253, "mean_token_accuracy": 0.8705707952380181, "num_tokens": 29754551.0, "step": 4490 }, { "entropy": 0.43125647492706776, "epoch": 0.06927086640536156, "grad_norm": 0.6778170466423035, "learning_rate": 4.616963415259891e-06, "loss": 0.4402, "mean_token_accuracy": 0.8693878464400768, "num_tokens": 29818513.0, "step": 4500 }, { "entropy": 0.44107995219528673, "epoch": 0.06942480166404015, "grad_norm": 0.4139500856399536, "learning_rate": 4.627225614449177e-06, "loss": 0.4479, "mean_token_accuracy": 0.867721838504076, "num_tokens": 29883534.0, "step": 4510 }, { "entropy": 0.4471041072160006, "epoch": 0.06957873692271872, "grad_norm": 0.6659610271453857, "learning_rate": 4.637487813638464e-06, "loss": 0.446, "mean_token_accuracy": 0.8664744555950165, "num_tokens": 29943032.0, "step": 4520 }, { "entropy": 0.4435852263122797, "epoch": 0.06973267218139731, "grad_norm": 0.6100077033042908, "learning_rate": 4.6477500128277495e-06, "loss": 0.4474, "mean_token_accuracy": 0.8720152266323566, "num_tokens": 30006453.0, "step": 4530 }, { "entropy": 0.43452307395637035, "epoch": 0.0698866074400759, "grad_norm": 0.7998954057693481, "learning_rate": 4.6580122120170355e-06, "loss": 0.4271, "mean_token_accuracy": 0.8727165512740612, "num_tokens": 30064890.0, "step": 4540 }, { "entropy": 0.4268382865935564, "epoch": 0.07004054269875447, "grad_norm": 0.6435014009475708, "learning_rate": 4.6682744112063214e-06, "loss": 0.4107, "mean_token_accuracy": 0.8748654343187809, "num_tokens": 30125375.0, "step": 4550 }, { "entropy": 0.45047276839613914, "epoch": 0.07019447795743305, "grad_norm": 0.6302629709243774, "learning_rate": 4.678536610395608e-06, "loss": 0.4493, "mean_token_accuracy": 0.8643552705645561, "num_tokens": 30195275.0, "step": 4560 }, { "entropy": 0.4360350679606199, "epoch": 0.07034841321611163, "grad_norm": 0.5626946687698364, "learning_rate": 4.688798809584895e-06, "loss": 0.4295, "mean_token_accuracy": 0.8718291468918323, "num_tokens": 30256433.0, "step": 4570 }, { "entropy": 0.45359312407672403, "epoch": 0.07050234847479021, "grad_norm": 0.5781683325767517, "learning_rate": 4.699061008774181e-06, "loss": 0.4365, "mean_token_accuracy": 0.865373981744051, "num_tokens": 30332555.0, "step": 4580 }, { "entropy": 0.4215102206915617, "epoch": 0.0706562837334688, "grad_norm": 0.6096848249435425, "learning_rate": 4.709323207963467e-06, "loss": 0.4114, "mean_token_accuracy": 0.8764167673885822, "num_tokens": 30395296.0, "step": 4590 }, { "entropy": 0.45378452707082034, "epoch": 0.07081021899214737, "grad_norm": 0.5584619045257568, "learning_rate": 4.719585407152753e-06, "loss": 0.4357, "mean_token_accuracy": 0.8682520747184753, "num_tokens": 30467374.0, "step": 4600 }, { "entropy": 0.42796165198087693, "epoch": 0.07096415425082596, "grad_norm": 0.585083544254303, "learning_rate": 4.72984760634204e-06, "loss": 0.4176, "mean_token_accuracy": 0.8747455298900604, "num_tokens": 30530666.0, "step": 4610 }, { "entropy": 0.43023137710988524, "epoch": 0.07111808950950455, "grad_norm": 0.7139826416969299, "learning_rate": 4.740109805531326e-06, "loss": 0.4318, "mean_token_accuracy": 0.8717255994677544, "num_tokens": 30594362.0, "step": 4620 }, { "entropy": 0.42966807074844837, "epoch": 0.07127202476818312, "grad_norm": 0.4290238916873932, "learning_rate": 4.750372004720612e-06, "loss": 0.4181, "mean_token_accuracy": 0.8754311814904213, "num_tokens": 30650878.0, "step": 4630 }, { "entropy": 0.4368949789553881, "epoch": 0.0714259600268617, "grad_norm": 0.6265106201171875, "learning_rate": 4.760634203909898e-06, "loss": 0.4258, "mean_token_accuracy": 0.8712414830923081, "num_tokens": 30715230.0, "step": 4640 }, { "entropy": 0.43554726913571357, "epoch": 0.07157989528554029, "grad_norm": 0.5762355327606201, "learning_rate": 4.770896403099184e-06, "loss": 0.4277, "mean_token_accuracy": 0.8727897234261036, "num_tokens": 30786269.0, "step": 4650 }, { "entropy": 0.4292267492040992, "epoch": 0.07173383054421886, "grad_norm": 0.6520154476165771, "learning_rate": 4.781158602288471e-06, "loss": 0.4384, "mean_token_accuracy": 0.8728045299649239, "num_tokens": 30843447.0, "step": 4660 }, { "entropy": 0.4244242705404758, "epoch": 0.07188776580289745, "grad_norm": 0.5313965082168579, "learning_rate": 4.791420801477757e-06, "loss": 0.4206, "mean_token_accuracy": 0.8736548468470573, "num_tokens": 30917501.0, "step": 4670 }, { "entropy": 0.43692820984870195, "epoch": 0.07204170106157602, "grad_norm": 0.5275073647499084, "learning_rate": 4.801683000667044e-06, "loss": 0.4336, "mean_token_accuracy": 0.8714191421866417, "num_tokens": 30987718.0, "step": 4680 }, { "entropy": 0.44872372411191463, "epoch": 0.07219563632025461, "grad_norm": 0.6203837394714355, "learning_rate": 4.81194519985633e-06, "loss": 0.4319, "mean_token_accuracy": 0.8673177860677243, "num_tokens": 31054957.0, "step": 4690 }, { "entropy": 0.43778529558330775, "epoch": 0.0723495715789332, "grad_norm": 0.6044971942901611, "learning_rate": 4.822207399045616e-06, "loss": 0.444, "mean_token_accuracy": 0.8716990239918232, "num_tokens": 31112788.0, "step": 4700 }, { "entropy": 0.4348543733358383, "epoch": 0.07250350683761177, "grad_norm": 0.48114073276519775, "learning_rate": 4.8324695982349025e-06, "loss": 0.4202, "mean_token_accuracy": 0.8719729818403721, "num_tokens": 31187017.0, "step": 4710 }, { "entropy": 0.4409152863547206, "epoch": 0.07265744209629035, "grad_norm": 0.4970141053199768, "learning_rate": 4.8427317974241884e-06, "loss": 0.4257, "mean_token_accuracy": 0.8704994782805443, "num_tokens": 31251013.0, "step": 4720 }, { "entropy": 0.43109920918941497, "epoch": 0.07281137735496894, "grad_norm": 0.5437235236167908, "learning_rate": 4.852993996613474e-06, "loss": 0.4139, "mean_token_accuracy": 0.8728999182581901, "num_tokens": 31327507.0, "step": 4730 }, { "entropy": 0.43550004325807096, "epoch": 0.07296531261364751, "grad_norm": 0.4968566596508026, "learning_rate": 4.86325619580276e-06, "loss": 0.4273, "mean_token_accuracy": 0.8716769888997078, "num_tokens": 31398619.0, "step": 4740 }, { "entropy": 0.438331962376833, "epoch": 0.0731192478723261, "grad_norm": 0.6142215132713318, "learning_rate": 4.873518394992047e-06, "loss": 0.4355, "mean_token_accuracy": 0.8680566608905792, "num_tokens": 31467269.0, "step": 4750 }, { "entropy": 0.4609987366944551, "epoch": 0.07327318313100467, "grad_norm": 0.6624812483787537, "learning_rate": 4.883780594181333e-06, "loss": 0.4391, "mean_token_accuracy": 0.8635055311024189, "num_tokens": 31526073.0, "step": 4760 }, { "entropy": 0.4477278683334589, "epoch": 0.07342711838968326, "grad_norm": 0.5468263626098633, "learning_rate": 4.89404279337062e-06, "loss": 0.4306, "mean_token_accuracy": 0.8688179835677147, "num_tokens": 31593770.0, "step": 4770 }, { "entropy": 0.43328834921121595, "epoch": 0.07358105364836184, "grad_norm": 0.6477828621864319, "learning_rate": 4.904304992559906e-06, "loss": 0.4205, "mean_token_accuracy": 0.8743316382169724, "num_tokens": 31662034.0, "step": 4780 }, { "entropy": 0.45155464205890894, "epoch": 0.07373498890704042, "grad_norm": 0.5492165088653564, "learning_rate": 4.914567191749192e-06, "loss": 0.4482, "mean_token_accuracy": 0.8662343196570873, "num_tokens": 31727910.0, "step": 4790 }, { "entropy": 0.4234725644811988, "epoch": 0.073888924165719, "grad_norm": 0.5141232013702393, "learning_rate": 4.9248293909384786e-06, "loss": 0.4114, "mean_token_accuracy": 0.8739719919860363, "num_tokens": 31794814.0, "step": 4800 }, { "entropy": 0.42531051449477675, "epoch": 0.07404285942439759, "grad_norm": 0.5651070475578308, "learning_rate": 4.9350915901277645e-06, "loss": 0.4251, "mean_token_accuracy": 0.8747605383396149, "num_tokens": 31858450.0, "step": 4810 }, { "entropy": 0.42737324088811873, "epoch": 0.07419679468307616, "grad_norm": 0.6124818921089172, "learning_rate": 4.945353789317051e-06, "loss": 0.4204, "mean_token_accuracy": 0.874111083894968, "num_tokens": 31922295.0, "step": 4820 }, { "entropy": 0.4235507473349571, "epoch": 0.07435072994175475, "grad_norm": 0.5936418175697327, "learning_rate": 4.955615988506337e-06, "loss": 0.4337, "mean_token_accuracy": 0.873522213101387, "num_tokens": 31982982.0, "step": 4830 }, { "entropy": 0.4503814272582531, "epoch": 0.07450466520043333, "grad_norm": 0.7092617750167847, "learning_rate": 4.965878187695623e-06, "loss": 0.4335, "mean_token_accuracy": 0.8669056862592697, "num_tokens": 32051485.0, "step": 4840 }, { "entropy": 0.436300403624773, "epoch": 0.0746586004591119, "grad_norm": 0.7018095850944519, "learning_rate": 4.97614038688491e-06, "loss": 0.4186, "mean_token_accuracy": 0.8717990070581436, "num_tokens": 32121114.0, "step": 4850 }, { "entropy": 0.44688520170748236, "epoch": 0.0748125357177905, "grad_norm": 0.7257376909255981, "learning_rate": 4.986402586074196e-06, "loss": 0.4384, "mean_token_accuracy": 0.8689869448542595, "num_tokens": 32180920.0, "step": 4860 }, { "entropy": 0.44128737822175024, "epoch": 0.07496647097646907, "grad_norm": 0.5909163355827332, "learning_rate": 4.996664785263483e-06, "loss": 0.4341, "mean_token_accuracy": 0.8702662937343121, "num_tokens": 32241576.0, "step": 4870 }, { "entropy": 0.43739975392818453, "epoch": 0.07512040623514765, "grad_norm": 0.5158711671829224, "learning_rate": 5.006926984452769e-06, "loss": 0.4277, "mean_token_accuracy": 0.8736055098474026, "num_tokens": 32312249.0, "step": 4880 }, { "entropy": 0.42668365892022847, "epoch": 0.07527434149382624, "grad_norm": 0.7749167084693909, "learning_rate": 5.017189183642055e-06, "loss": 0.413, "mean_token_accuracy": 0.8760628707706928, "num_tokens": 32383326.0, "step": 4890 }, { "entropy": 0.438033302128315, "epoch": 0.07542827675250481, "grad_norm": 0.5635640025138855, "learning_rate": 5.0274513828313406e-06, "loss": 0.4362, "mean_token_accuracy": 0.869855822622776, "num_tokens": 32444559.0, "step": 4900 }, { "entropy": 0.434188605286181, "epoch": 0.0755822120111834, "grad_norm": 0.7389861345291138, "learning_rate": 5.0377135820206265e-06, "loss": 0.427, "mean_token_accuracy": 0.872494001686573, "num_tokens": 32511174.0, "step": 4910 }, { "entropy": 0.41789351813495157, "epoch": 0.07573614726986198, "grad_norm": 0.6728223562240601, "learning_rate": 5.047975781209914e-06, "loss": 0.4213, "mean_token_accuracy": 0.8762855052947998, "num_tokens": 32573103.0, "step": 4920 }, { "entropy": 0.43080891147255895, "epoch": 0.07589008252854056, "grad_norm": 0.6046070456504822, "learning_rate": 5.0582379803992e-06, "loss": 0.4115, "mean_token_accuracy": 0.8733631186187267, "num_tokens": 32634250.0, "step": 4930 }, { "entropy": 0.4267462085932493, "epoch": 0.07604401778721914, "grad_norm": 0.5924621224403381, "learning_rate": 5.068500179588486e-06, "loss": 0.4128, "mean_token_accuracy": 0.8743566997349262, "num_tokens": 32694572.0, "step": 4940 }, { "entropy": 0.4351202379912138, "epoch": 0.07619795304589772, "grad_norm": 0.5197636485099792, "learning_rate": 5.078762378777773e-06, "loss": 0.4271, "mean_token_accuracy": 0.8709302566945553, "num_tokens": 32750490.0, "step": 4950 }, { "entropy": 0.4142552940174937, "epoch": 0.0763518883045763, "grad_norm": 0.7984163165092468, "learning_rate": 5.089024577967059e-06, "loss": 0.4201, "mean_token_accuracy": 0.875827606022358, "num_tokens": 32816864.0, "step": 4960 }, { "entropy": 0.44386029094457624, "epoch": 0.07650582356325489, "grad_norm": 0.6631568074226379, "learning_rate": 5.099286777156345e-06, "loss": 0.425, "mean_token_accuracy": 0.8703337624669075, "num_tokens": 32881242.0, "step": 4970 }, { "entropy": 0.4420074883848429, "epoch": 0.07665975882193346, "grad_norm": 0.6967163681983948, "learning_rate": 5.109548976345631e-06, "loss": 0.4369, "mean_token_accuracy": 0.8711039267480374, "num_tokens": 32951891.0, "step": 4980 }, { "entropy": 0.43923610504716637, "epoch": 0.07681369408061205, "grad_norm": 0.6958683729171753, "learning_rate": 5.1198111755349175e-06, "loss": 0.4225, "mean_token_accuracy": 0.8715112514793872, "num_tokens": 33020353.0, "step": 4990 }, { "entropy": 0.41183395087718966, "epoch": 0.07696762933929063, "grad_norm": 0.7061434388160706, "learning_rate": 5.1300733747242034e-06, "loss": 0.4133, "mean_token_accuracy": 0.8790297605097294, "num_tokens": 33083710.0, "step": 5000 }, { "entropy": 0.44321772567927836, "epoch": 0.0771215645979692, "grad_norm": 0.7578907608985901, "learning_rate": 5.140335573913489e-06, "loss": 0.4368, "mean_token_accuracy": 0.8647960893809795, "num_tokens": 33139606.0, "step": 5010 }, { "entropy": 0.426203272305429, "epoch": 0.07727549985664779, "grad_norm": 0.5662795305252075, "learning_rate": 5.150597773102777e-06, "loss": 0.4286, "mean_token_accuracy": 0.8726430989801883, "num_tokens": 33202973.0, "step": 5020 }, { "entropy": 0.4395048229023814, "epoch": 0.07742943511532638, "grad_norm": 0.6590364575386047, "learning_rate": 5.160859972292063e-06, "loss": 0.4202, "mean_token_accuracy": 0.8712683215737342, "num_tokens": 33268226.0, "step": 5030 }, { "entropy": 0.4307605214416981, "epoch": 0.07758337037400495, "grad_norm": 0.6200886368751526, "learning_rate": 5.171122171481349e-06, "loss": 0.4221, "mean_token_accuracy": 0.8752574756741524, "num_tokens": 33337867.0, "step": 5040 }, { "entropy": 0.4225597377866507, "epoch": 0.07773730563268354, "grad_norm": 0.6719914674758911, "learning_rate": 5.181384370670635e-06, "loss": 0.424, "mean_token_accuracy": 0.8748431585729122, "num_tokens": 33401444.0, "step": 5050 }, { "entropy": 0.4142552735283971, "epoch": 0.07789124089136211, "grad_norm": 0.5960219502449036, "learning_rate": 5.191646569859922e-06, "loss": 0.4133, "mean_token_accuracy": 0.8783732444047928, "num_tokens": 33467434.0, "step": 5060 }, { "entropy": 0.4282940816134214, "epoch": 0.0780451761500407, "grad_norm": 0.5027746558189392, "learning_rate": 5.201908769049208e-06, "loss": 0.4232, "mean_token_accuracy": 0.8714878387749195, "num_tokens": 33540014.0, "step": 5070 }, { "entropy": 0.4207949053496122, "epoch": 0.07819911140871928, "grad_norm": 0.7483528852462769, "learning_rate": 5.2121709682384935e-06, "loss": 0.4192, "mean_token_accuracy": 0.8759927079081535, "num_tokens": 33602283.0, "step": 5080 }, { "entropy": 0.44280373938381673, "epoch": 0.07835304666739785, "grad_norm": 0.6777769327163696, "learning_rate": 5.22243316742778e-06, "loss": 0.4389, "mean_token_accuracy": 0.8696865729987622, "num_tokens": 33666546.0, "step": 5090 }, { "entropy": 0.434335564263165, "epoch": 0.07850698192607644, "grad_norm": 0.6038931608200073, "learning_rate": 5.232695366617066e-06, "loss": 0.4222, "mean_token_accuracy": 0.8719284042716027, "num_tokens": 33732073.0, "step": 5100 }, { "entropy": 0.4324218679219484, "epoch": 0.07866091718475503, "grad_norm": 0.7179152965545654, "learning_rate": 5.242957565806352e-06, "loss": 0.4057, "mean_token_accuracy": 0.8738261327147484, "num_tokens": 33796108.0, "step": 5110 }, { "entropy": 0.42774378079921005, "epoch": 0.0788148524434336, "grad_norm": 0.7869688272476196, "learning_rate": 5.253219764995638e-06, "loss": 0.4319, "mean_token_accuracy": 0.8696112684905529, "num_tokens": 33854146.0, "step": 5120 }, { "entropy": 0.4163206540048122, "epoch": 0.07896878770211219, "grad_norm": 0.5846735239028931, "learning_rate": 5.263481964184926e-06, "loss": 0.4024, "mean_token_accuracy": 0.876974007487297, "num_tokens": 33919336.0, "step": 5130 }, { "entropy": 0.4462639458477497, "epoch": 0.07912272296079076, "grad_norm": 0.6638777852058411, "learning_rate": 5.273744163374212e-06, "loss": 0.432, "mean_token_accuracy": 0.8678876765072345, "num_tokens": 33981135.0, "step": 5140 }, { "entropy": 0.43107565939426423, "epoch": 0.07927665821946935, "grad_norm": 0.6271690130233765, "learning_rate": 5.284006362563498e-06, "loss": 0.4147, "mean_token_accuracy": 0.8733716629445553, "num_tokens": 34042886.0, "step": 5150 }, { "entropy": 0.42411140762269495, "epoch": 0.07943059347814793, "grad_norm": 0.6102625131607056, "learning_rate": 5.2942685617527845e-06, "loss": 0.4202, "mean_token_accuracy": 0.8735177330672741, "num_tokens": 34103775.0, "step": 5160 }, { "entropy": 0.43352266158908603, "epoch": 0.0795845287368265, "grad_norm": 0.524846613407135, "learning_rate": 5.3045307609420704e-06, "loss": 0.4279, "mean_token_accuracy": 0.8698651477694511, "num_tokens": 34178284.0, "step": 5170 }, { "entropy": 0.429770103469491, "epoch": 0.07973846399550509, "grad_norm": 0.6829613447189331, "learning_rate": 5.314792960131356e-06, "loss": 0.417, "mean_token_accuracy": 0.8731008902192116, "num_tokens": 34242809.0, "step": 5180 }, { "entropy": 0.4041440483182669, "epoch": 0.07989239925418368, "grad_norm": 0.5456902980804443, "learning_rate": 5.325055159320642e-06, "loss": 0.3889, "mean_token_accuracy": 0.8813743181526661, "num_tokens": 34310075.0, "step": 5190 }, { "entropy": 0.44433347024023534, "epoch": 0.08004633451286225, "grad_norm": 0.7528950572013855, "learning_rate": 5.335317358509929e-06, "loss": 0.4279, "mean_token_accuracy": 0.8670542031526566, "num_tokens": 34370406.0, "step": 5200 }, { "entropy": 0.4279072808101773, "epoch": 0.08020026977154084, "grad_norm": 0.6122167706489563, "learning_rate": 5.345579557699215e-06, "loss": 0.4343, "mean_token_accuracy": 0.8734210044145584, "num_tokens": 34430855.0, "step": 5210 }, { "entropy": 0.42270097732543943, "epoch": 0.08035420503021942, "grad_norm": 0.6118198037147522, "learning_rate": 5.355841756888501e-06, "loss": 0.4152, "mean_token_accuracy": 0.8769620075821877, "num_tokens": 34490768.0, "step": 5220 }, { "entropy": 0.4214580971747637, "epoch": 0.080508140288898, "grad_norm": 0.631098210811615, "learning_rate": 5.366103956077789e-06, "loss": 0.4061, "mean_token_accuracy": 0.8777850709855557, "num_tokens": 34552954.0, "step": 5230 }, { "entropy": 0.4187882348895073, "epoch": 0.08066207554757658, "grad_norm": 0.6333276629447937, "learning_rate": 5.376366155267075e-06, "loss": 0.4136, "mean_token_accuracy": 0.8745335608720779, "num_tokens": 34614927.0, "step": 5240 }, { "entropy": 0.4389587864279747, "epoch": 0.08081601080625515, "grad_norm": 0.5010234117507935, "learning_rate": 5.3866283544563606e-06, "loss": 0.4276, "mean_token_accuracy": 0.8707822062075138, "num_tokens": 34682117.0, "step": 5250 }, { "entropy": 0.43640714809298514, "epoch": 0.08096994606493374, "grad_norm": 0.6409255862236023, "learning_rate": 5.3968905536456465e-06, "loss": 0.4168, "mean_token_accuracy": 0.8741114400327206, "num_tokens": 34749463.0, "step": 5260 }, { "entropy": 0.4094121715053916, "epoch": 0.08112388132361233, "grad_norm": 0.6909043192863464, "learning_rate": 5.407152752834933e-06, "loss": 0.4174, "mean_token_accuracy": 0.8793571993708611, "num_tokens": 34810881.0, "step": 5270 }, { "entropy": 0.4202055620029569, "epoch": 0.0812778165822909, "grad_norm": 0.6904728412628174, "learning_rate": 5.417414952024219e-06, "loss": 0.4158, "mean_token_accuracy": 0.8772929340600968, "num_tokens": 34876380.0, "step": 5280 }, { "entropy": 0.4350327538326383, "epoch": 0.08143175184096949, "grad_norm": 0.657232403755188, "learning_rate": 5.427677151213505e-06, "loss": 0.4333, "mean_token_accuracy": 0.8705995284020901, "num_tokens": 34938433.0, "step": 5290 }, { "entropy": 0.42657493278384206, "epoch": 0.08158568709964807, "grad_norm": 0.7711935639381409, "learning_rate": 5.437939350402792e-06, "loss": 0.4201, "mean_token_accuracy": 0.8744896501302719, "num_tokens": 35009054.0, "step": 5300 }, { "entropy": 0.42691188398748636, "epoch": 0.08173962235832664, "grad_norm": 0.5022244453430176, "learning_rate": 5.448201549592078e-06, "loss": 0.4173, "mean_token_accuracy": 0.874696784466505, "num_tokens": 35079966.0, "step": 5310 }, { "entropy": 0.4196954511106014, "epoch": 0.08189355761700523, "grad_norm": 0.6195477843284607, "learning_rate": 5.458463748781364e-06, "loss": 0.4135, "mean_token_accuracy": 0.8764049679040908, "num_tokens": 35150478.0, "step": 5320 }, { "entropy": 0.44586904626339674, "epoch": 0.0820474928756838, "grad_norm": 0.6803871393203735, "learning_rate": 5.46872594797065e-06, "loss": 0.4252, "mean_token_accuracy": 0.8678563430905342, "num_tokens": 35221609.0, "step": 5330 }, { "entropy": 0.4423530142754316, "epoch": 0.08220142813436239, "grad_norm": 0.5858097076416016, "learning_rate": 5.4789881471599375e-06, "loss": 0.4304, "mean_token_accuracy": 0.8683094881474972, "num_tokens": 35294865.0, "step": 5340 }, { "entropy": 0.42301856242120267, "epoch": 0.08235536339304098, "grad_norm": 0.8084450960159302, "learning_rate": 5.489250346349223e-06, "loss": 0.414, "mean_token_accuracy": 0.874556140601635, "num_tokens": 35367882.0, "step": 5350 }, { "entropy": 0.43246834445744753, "epoch": 0.08250929865171955, "grad_norm": 0.8120408654212952, "learning_rate": 5.499512545538509e-06, "loss": 0.4206, "mean_token_accuracy": 0.872473056614399, "num_tokens": 35425239.0, "step": 5360 }, { "entropy": 0.41288772951811553, "epoch": 0.08266323391039813, "grad_norm": 0.7603704333305359, "learning_rate": 5.509774744727796e-06, "loss": 0.4011, "mean_token_accuracy": 0.8785846725106239, "num_tokens": 35483989.0, "step": 5370 }, { "entropy": 0.41172635518014433, "epoch": 0.08281716916907672, "grad_norm": 0.7297714948654175, "learning_rate": 5.520036943917082e-06, "loss": 0.4071, "mean_token_accuracy": 0.8770224466919899, "num_tokens": 35543667.0, "step": 5380 }, { "entropy": 0.4331138519570231, "epoch": 0.0829711044277553, "grad_norm": 0.5840741991996765, "learning_rate": 5.530299143106368e-06, "loss": 0.4237, "mean_token_accuracy": 0.8704054646193982, "num_tokens": 35612091.0, "step": 5390 }, { "entropy": 0.4246971346437931, "epoch": 0.08312503968643388, "grad_norm": 0.6356149315834045, "learning_rate": 5.540561342295654e-06, "loss": 0.4255, "mean_token_accuracy": 0.8740020252764225, "num_tokens": 35682738.0, "step": 5400 }, { "entropy": 0.39963344763964415, "epoch": 0.08327897494511247, "grad_norm": 0.5635823607444763, "learning_rate": 5.550823541484941e-06, "loss": 0.4001, "mean_token_accuracy": 0.8816719308495522, "num_tokens": 35754264.0, "step": 5410 }, { "entropy": 0.4454112177714705, "epoch": 0.08343291020379104, "grad_norm": 0.7974903583526611, "learning_rate": 5.561085740674227e-06, "loss": 0.4356, "mean_token_accuracy": 0.8701792776584625, "num_tokens": 35820293.0, "step": 5420 }, { "entropy": 0.41953193992376325, "epoch": 0.08358684546246962, "grad_norm": 0.6903355121612549, "learning_rate": 5.571347939863513e-06, "loss": 0.4082, "mean_token_accuracy": 0.8775039598345756, "num_tokens": 35886092.0, "step": 5430 }, { "entropy": 0.4140274036675692, "epoch": 0.0837407807211482, "grad_norm": 0.6216182708740234, "learning_rate": 5.5816101390528e-06, "loss": 0.4116, "mean_token_accuracy": 0.8744678661227226, "num_tokens": 35955352.0, "step": 5440 }, { "entropy": 0.4185045186430216, "epoch": 0.08389471597982678, "grad_norm": 0.6931182146072388, "learning_rate": 5.591872338242086e-06, "loss": 0.4146, "mean_token_accuracy": 0.8761678047478199, "num_tokens": 36021037.0, "step": 5450 }, { "entropy": 0.4106154549866915, "epoch": 0.08404865123850537, "grad_norm": 0.5358290672302246, "learning_rate": 5.602134537431372e-06, "loss": 0.4063, "mean_token_accuracy": 0.8773495331406593, "num_tokens": 36086125.0, "step": 5460 }, { "entropy": 0.43716970067471267, "epoch": 0.08420258649718394, "grad_norm": 0.7022265195846558, "learning_rate": 5.612396736620658e-06, "loss": 0.4164, "mean_token_accuracy": 0.8731329597532749, "num_tokens": 36149706.0, "step": 5470 }, { "entropy": 0.4130382280796766, "epoch": 0.08435652175586253, "grad_norm": 0.6345503926277161, "learning_rate": 5.622658935809945e-06, "loss": 0.4109, "mean_token_accuracy": 0.8775826044380665, "num_tokens": 36220330.0, "step": 5480 }, { "entropy": 0.40830725338310003, "epoch": 0.08451045701454112, "grad_norm": 0.5789692401885986, "learning_rate": 5.632921134999231e-06, "loss": 0.3997, "mean_token_accuracy": 0.8794941529631615, "num_tokens": 36288346.0, "step": 5490 }, { "entropy": 0.39877272471785546, "epoch": 0.08466439227321969, "grad_norm": 0.6392132043838501, "learning_rate": 5.643183334188517e-06, "loss": 0.4, "mean_token_accuracy": 0.8805644512176514, "num_tokens": 36349834.0, "step": 5500 }, { "entropy": 0.41946568675339224, "epoch": 0.08481832753189827, "grad_norm": 0.7111280560493469, "learning_rate": 5.653445533377804e-06, "loss": 0.4081, "mean_token_accuracy": 0.8769002579152584, "num_tokens": 36415553.0, "step": 5510 }, { "entropy": 0.4269743964076042, "epoch": 0.08497226279057685, "grad_norm": 0.5952219367027283, "learning_rate": 5.66370773256709e-06, "loss": 0.4233, "mean_token_accuracy": 0.8717615015804767, "num_tokens": 36485307.0, "step": 5520 }, { "entropy": 0.4083184314891696, "epoch": 0.08512619804925543, "grad_norm": 0.6839407086372375, "learning_rate": 5.6739699317563755e-06, "loss": 0.4034, "mean_token_accuracy": 0.8819707810878754, "num_tokens": 36547792.0, "step": 5530 }, { "entropy": 0.4356100518256426, "epoch": 0.08528013330793402, "grad_norm": 0.7519106864929199, "learning_rate": 5.6842321309456615e-06, "loss": 0.4324, "mean_token_accuracy": 0.8678693562746048, "num_tokens": 36618444.0, "step": 5540 }, { "entropy": 0.4265044052153826, "epoch": 0.08543406856661259, "grad_norm": 0.810945451259613, "learning_rate": 5.694494330134948e-06, "loss": 0.4176, "mean_token_accuracy": 0.8740845955908298, "num_tokens": 36682668.0, "step": 5550 }, { "entropy": 0.43693528175354, "epoch": 0.08558800382529118, "grad_norm": 0.6631878614425659, "learning_rate": 5.704756529324234e-06, "loss": 0.4257, "mean_token_accuracy": 0.8699273519217968, "num_tokens": 36752725.0, "step": 5560 }, { "entropy": 0.41072532460093497, "epoch": 0.08574193908396976, "grad_norm": 0.7244552373886108, "learning_rate": 5.71501872851352e-06, "loss": 0.3912, "mean_token_accuracy": 0.8808658972382546, "num_tokens": 36830650.0, "step": 5570 }, { "entropy": 0.4373810842633247, "epoch": 0.08589587434264834, "grad_norm": 0.6513342261314392, "learning_rate": 5.725280927702808e-06, "loss": 0.4225, "mean_token_accuracy": 0.8695768132805825, "num_tokens": 36890495.0, "step": 5580 }, { "entropy": 0.39551035929471257, "epoch": 0.08604980960132692, "grad_norm": 0.6546845436096191, "learning_rate": 5.735543126892094e-06, "loss": 0.3888, "mean_token_accuracy": 0.885033306479454, "num_tokens": 36964511.0, "step": 5590 }, { "entropy": 0.4240613792091608, "epoch": 0.08620374486000551, "grad_norm": 0.649697482585907, "learning_rate": 5.74580532608138e-06, "loss": 0.4128, "mean_token_accuracy": 0.8734390340745449, "num_tokens": 37031077.0, "step": 5600 }, { "entropy": 0.4119853347539902, "epoch": 0.08635768011868408, "grad_norm": 0.8512964248657227, "learning_rate": 5.756067525270666e-06, "loss": 0.3935, "mean_token_accuracy": 0.879948103427887, "num_tokens": 37103694.0, "step": 5610 }, { "entropy": 0.42411659844219685, "epoch": 0.08651161537736267, "grad_norm": 0.5616170763969421, "learning_rate": 5.7663297244599524e-06, "loss": 0.4114, "mean_token_accuracy": 0.8733312614262104, "num_tokens": 37170214.0, "step": 5620 }, { "entropy": 0.4178475245833397, "epoch": 0.08666555063604124, "grad_norm": 0.7268560528755188, "learning_rate": 5.776591923649238e-06, "loss": 0.4172, "mean_token_accuracy": 0.875276991724968, "num_tokens": 37235384.0, "step": 5630 }, { "entropy": 0.4104923168197274, "epoch": 0.08681948589471983, "grad_norm": 0.5699918866157532, "learning_rate": 5.786854122838524e-06, "loss": 0.4102, "mean_token_accuracy": 0.879251991212368, "num_tokens": 37305045.0, "step": 5640 }, { "entropy": 0.4219876116141677, "epoch": 0.08697342115339841, "grad_norm": 0.6278819441795349, "learning_rate": 5.797116322027811e-06, "loss": 0.4174, "mean_token_accuracy": 0.8769207403063775, "num_tokens": 37370666.0, "step": 5650 }, { "entropy": 0.4324507249519229, "epoch": 0.08712735641207699, "grad_norm": 0.723122775554657, "learning_rate": 5.807378521217097e-06, "loss": 0.4181, "mean_token_accuracy": 0.8707829311490058, "num_tokens": 37439906.0, "step": 5660 }, { "entropy": 0.435960528254509, "epoch": 0.08728129167075557, "grad_norm": 0.8484935164451599, "learning_rate": 5.817640720406383e-06, "loss": 0.4263, "mean_token_accuracy": 0.8687728695571423, "num_tokens": 37503418.0, "step": 5670 }, { "entropy": 0.41654552444815635, "epoch": 0.08743522692943416, "grad_norm": 0.5964093208312988, "learning_rate": 5.827902919595669e-06, "loss": 0.4109, "mean_token_accuracy": 0.8779280230402946, "num_tokens": 37568800.0, "step": 5680 }, { "entropy": 0.4272800890728831, "epoch": 0.08758916218811273, "grad_norm": 0.7183772325515747, "learning_rate": 5.838165118784957e-06, "loss": 0.4083, "mean_token_accuracy": 0.8754680089652538, "num_tokens": 37633822.0, "step": 5690 }, { "entropy": 0.4168532855808735, "epoch": 0.08774309744679132, "grad_norm": 0.9626280069351196, "learning_rate": 5.8484273179742426e-06, "loss": 0.4073, "mean_token_accuracy": 0.8770849764347076, "num_tokens": 37693259.0, "step": 5700 }, { "entropy": 0.41631900519132614, "epoch": 0.08789703270546989, "grad_norm": 0.8608735799789429, "learning_rate": 5.8586895171635285e-06, "loss": 0.4138, "mean_token_accuracy": 0.8764878816902637, "num_tokens": 37755276.0, "step": 5710 }, { "entropy": 0.40312777627259494, "epoch": 0.08805096796414848, "grad_norm": 0.9884957075119019, "learning_rate": 5.868951716352815e-06, "loss": 0.3926, "mean_token_accuracy": 0.8807962737977505, "num_tokens": 37825803.0, "step": 5720 }, { "entropy": 0.4211206899955869, "epoch": 0.08820490322282706, "grad_norm": 0.7559930086135864, "learning_rate": 5.879213915542101e-06, "loss": 0.4181, "mean_token_accuracy": 0.8750810459256172, "num_tokens": 37888516.0, "step": 5730 }, { "entropy": 0.42948139030486343, "epoch": 0.08835883848150564, "grad_norm": 0.7411385774612427, "learning_rate": 5.889476114731387e-06, "loss": 0.4168, "mean_token_accuracy": 0.8710540935397149, "num_tokens": 37956427.0, "step": 5740 }, { "entropy": 0.40811120718717575, "epoch": 0.08851277374018422, "grad_norm": 0.6016805171966553, "learning_rate": 5.899738313920673e-06, "loss": 0.4106, "mean_token_accuracy": 0.8785745009779931, "num_tokens": 38032359.0, "step": 5750 }, { "entropy": 0.42178536765277386, "epoch": 0.08866670899886281, "grad_norm": 0.5719447731971741, "learning_rate": 5.91000051310996e-06, "loss": 0.4076, "mean_token_accuracy": 0.8757497057318687, "num_tokens": 38103981.0, "step": 5760 }, { "entropy": 0.42275034859776495, "epoch": 0.08882064425754138, "grad_norm": 0.6736098527908325, "learning_rate": 5.920262712299246e-06, "loss": 0.4146, "mean_token_accuracy": 0.8746068559587001, "num_tokens": 38178143.0, "step": 5770 }, { "entropy": 0.43437793105840683, "epoch": 0.08897457951621997, "grad_norm": 0.7492460012435913, "learning_rate": 5.930524911488532e-06, "loss": 0.4408, "mean_token_accuracy": 0.871581806242466, "num_tokens": 38241440.0, "step": 5780 }, { "entropy": 0.42332875560969113, "epoch": 0.08912851477489854, "grad_norm": 0.8469909429550171, "learning_rate": 5.9407871106778195e-06, "loss": 0.4218, "mean_token_accuracy": 0.8764850310981274, "num_tokens": 38305133.0, "step": 5790 }, { "entropy": 0.4275753781199455, "epoch": 0.08928245003357713, "grad_norm": 0.5799296498298645, "learning_rate": 5.951049309867105e-06, "loss": 0.4169, "mean_token_accuracy": 0.8750816740095615, "num_tokens": 38375806.0, "step": 5800 }, { "entropy": 0.4191420054063201, "epoch": 0.08943638529225571, "grad_norm": 0.9175649881362915, "learning_rate": 5.961311509056391e-06, "loss": 0.4227, "mean_token_accuracy": 0.8749246820807457, "num_tokens": 38425793.0, "step": 5810 }, { "entropy": 0.4219985757023096, "epoch": 0.08959032055093429, "grad_norm": 0.7411928772926331, "learning_rate": 5.971573708245677e-06, "loss": 0.4064, "mean_token_accuracy": 0.8740438498556614, "num_tokens": 38487948.0, "step": 5820 }, { "entropy": 0.4302801571786404, "epoch": 0.08974425580961287, "grad_norm": 0.7042123675346375, "learning_rate": 5.981835907434964e-06, "loss": 0.4242, "mean_token_accuracy": 0.8722670547664165, "num_tokens": 38554834.0, "step": 5830 }, { "entropy": 0.4154212325811386, "epoch": 0.08989819106829146, "grad_norm": 0.6977325081825256, "learning_rate": 5.99209810662425e-06, "loss": 0.3964, "mean_token_accuracy": 0.8797412365674973, "num_tokens": 38618460.0, "step": 5840 }, { "entropy": 0.40928043238818645, "epoch": 0.09005212632697003, "grad_norm": 0.6588506698608398, "learning_rate": 6.002360305813536e-06, "loss": 0.3971, "mean_token_accuracy": 0.8789309732615948, "num_tokens": 38680438.0, "step": 5850 }, { "entropy": 0.4114106811583042, "epoch": 0.09020606158564862, "grad_norm": 0.6796452403068542, "learning_rate": 6.012622505002823e-06, "loss": 0.4072, "mean_token_accuracy": 0.8787621252238751, "num_tokens": 38755959.0, "step": 5860 }, { "entropy": 0.41352033205330374, "epoch": 0.0903599968443272, "grad_norm": 0.7241241335868835, "learning_rate": 6.022884704192109e-06, "loss": 0.3949, "mean_token_accuracy": 0.8756845869123936, "num_tokens": 38825745.0, "step": 5870 }, { "entropy": 0.4147080108523369, "epoch": 0.09051393210300578, "grad_norm": 0.809306800365448, "learning_rate": 6.033146903381395e-06, "loss": 0.4113, "mean_token_accuracy": 0.8769565336406231, "num_tokens": 38883686.0, "step": 5880 }, { "entropy": 0.4311453001573682, "epoch": 0.09066786736168436, "grad_norm": 0.6028603315353394, "learning_rate": 6.043409102570681e-06, "loss": 0.4189, "mean_token_accuracy": 0.873661158978939, "num_tokens": 38953048.0, "step": 5890 }, { "entropy": 0.41795442178845404, "epoch": 0.09082180262036293, "grad_norm": 0.7126477360725403, "learning_rate": 6.053671301759968e-06, "loss": 0.4091, "mean_token_accuracy": 0.8783990189433097, "num_tokens": 39021291.0, "step": 5900 }, { "entropy": 0.4142689574509859, "epoch": 0.09097573787904152, "grad_norm": 0.8038569688796997, "learning_rate": 6.063933500949254e-06, "loss": 0.4025, "mean_token_accuracy": 0.8785617902874947, "num_tokens": 39085576.0, "step": 5910 }, { "entropy": 0.4178519045934081, "epoch": 0.09112967313772011, "grad_norm": 0.621347188949585, "learning_rate": 6.07419570013854e-06, "loss": 0.4101, "mean_token_accuracy": 0.8727268479764462, "num_tokens": 39154750.0, "step": 5920 }, { "entropy": 0.43542108945548536, "epoch": 0.09128360839639868, "grad_norm": 1.068782091140747, "learning_rate": 6.084457899327827e-06, "loss": 0.4197, "mean_token_accuracy": 0.8727223508059978, "num_tokens": 39220489.0, "step": 5930 }, { "entropy": 0.43467424921691417, "epoch": 0.09143754365507727, "grad_norm": 0.6476715803146362, "learning_rate": 6.094720098517113e-06, "loss": 0.4347, "mean_token_accuracy": 0.8714924544095993, "num_tokens": 39287022.0, "step": 5940 }, { "entropy": 0.42675085347145797, "epoch": 0.09159147891375585, "grad_norm": 0.7642800211906433, "learning_rate": 6.104982297706399e-06, "loss": 0.4063, "mean_token_accuracy": 0.8729378141462802, "num_tokens": 39354963.0, "step": 5950 }, { "entropy": 0.3944542396813631, "epoch": 0.09174541417243443, "grad_norm": 0.7542684078216553, "learning_rate": 6.115244496895685e-06, "loss": 0.3881, "mean_token_accuracy": 0.8816908605396747, "num_tokens": 39417761.0, "step": 5960 }, { "entropy": 0.40615622643381355, "epoch": 0.09189934943111301, "grad_norm": 0.778256893157959, "learning_rate": 6.125506696084972e-06, "loss": 0.3947, "mean_token_accuracy": 0.8784826546907425, "num_tokens": 39484220.0, "step": 5970 }, { "entropy": 0.4164897739887238, "epoch": 0.09205328468979158, "grad_norm": 0.843711793422699, "learning_rate": 6.1357688952742575e-06, "loss": 0.4081, "mean_token_accuracy": 0.8771915636956692, "num_tokens": 39546573.0, "step": 5980 }, { "entropy": 0.4049221627414227, "epoch": 0.09220721994847017, "grad_norm": 0.8146544098854065, "learning_rate": 6.1460310944635435e-06, "loss": 0.3928, "mean_token_accuracy": 0.8790832415223122, "num_tokens": 39606120.0, "step": 5990 }, { "entropy": 0.40704064499586823, "epoch": 0.09236115520714876, "grad_norm": 0.8512804508209229, "learning_rate": 6.156293293652831e-06, "loss": 0.4058, "mean_token_accuracy": 0.8771355882287025, "num_tokens": 39667561.0, "step": 6000 }, { "entropy": 0.414562363922596, "epoch": 0.09251509046582733, "grad_norm": 0.6880080103874207, "learning_rate": 6.166555492842117e-06, "loss": 0.3989, "mean_token_accuracy": 0.8791386432945728, "num_tokens": 39735411.0, "step": 6010 }, { "entropy": 0.4214152108877897, "epoch": 0.09266902572450592, "grad_norm": 0.6307430863380432, "learning_rate": 6.176817692031403e-06, "loss": 0.4131, "mean_token_accuracy": 0.8763658292591572, "num_tokens": 39805225.0, "step": 6020 }, { "entropy": 0.3991399185732007, "epoch": 0.0928229609831845, "grad_norm": 0.8601220846176147, "learning_rate": 6.187079891220689e-06, "loss": 0.3956, "mean_token_accuracy": 0.8805144377052784, "num_tokens": 39879260.0, "step": 6030 }, { "entropy": 0.42077887803316116, "epoch": 0.09297689624186307, "grad_norm": 0.8958874344825745, "learning_rate": 6.197342090409976e-06, "loss": 0.4144, "mean_token_accuracy": 0.8754172049462795, "num_tokens": 39944142.0, "step": 6040 }, { "entropy": 0.4209673432633281, "epoch": 0.09313083150054166, "grad_norm": 0.892876923084259, "learning_rate": 6.207604289599262e-06, "loss": 0.4172, "mean_token_accuracy": 0.8767972119152546, "num_tokens": 39999029.0, "step": 6050 }, { "entropy": 0.4071037542074919, "epoch": 0.09328476675922025, "grad_norm": 0.8168721795082092, "learning_rate": 6.217866488788548e-06, "loss": 0.4044, "mean_token_accuracy": 0.8772570721805095, "num_tokens": 40064895.0, "step": 6060 }, { "entropy": 0.4143290439620614, "epoch": 0.09343870201789882, "grad_norm": 0.6679834127426147, "learning_rate": 6.2281286879778344e-06, "loss": 0.4077, "mean_token_accuracy": 0.8789192266762257, "num_tokens": 40129115.0, "step": 6070 }, { "entropy": 0.41838721334934237, "epoch": 0.0935926372765774, "grad_norm": 0.8226512670516968, "learning_rate": 6.23839088716712e-06, "loss": 0.4026, "mean_token_accuracy": 0.8763012439012527, "num_tokens": 40193225.0, "step": 6080 }, { "entropy": 0.410373205691576, "epoch": 0.09374657253525598, "grad_norm": 0.8420726656913757, "learning_rate": 6.248653086356406e-06, "loss": 0.408, "mean_token_accuracy": 0.8780744791030883, "num_tokens": 40262410.0, "step": 6090 }, { "entropy": 0.4321902824565768, "epoch": 0.09390050779393457, "grad_norm": 0.721820056438446, "learning_rate": 6.258915285545692e-06, "loss": 0.4183, "mean_token_accuracy": 0.8735571645200253, "num_tokens": 40329567.0, "step": 6100 }, { "entropy": 0.4020687719807029, "epoch": 0.09405444305261315, "grad_norm": 0.7756867408752441, "learning_rate": 6.26917748473498e-06, "loss": 0.3943, "mean_token_accuracy": 0.8827378273010253, "num_tokens": 40394619.0, "step": 6110 }, { "entropy": 0.43172606099396943, "epoch": 0.09420837831129172, "grad_norm": 0.7559576630592346, "learning_rate": 6.279439683924266e-06, "loss": 0.4242, "mean_token_accuracy": 0.8701939187943936, "num_tokens": 40463098.0, "step": 6120 }, { "entropy": 0.436732823215425, "epoch": 0.09436231356997031, "grad_norm": 1.2893855571746826, "learning_rate": 6.289701883113552e-06, "loss": 0.4171, "mean_token_accuracy": 0.8722435459494591, "num_tokens": 40526879.0, "step": 6130 }, { "entropy": 0.42198566496372225, "epoch": 0.0945162488286489, "grad_norm": 0.718706488609314, "learning_rate": 6.299964082302839e-06, "loss": 0.4141, "mean_token_accuracy": 0.8766583122313023, "num_tokens": 40591297.0, "step": 6140 }, { "entropy": 0.4046117525547743, "epoch": 0.09467018408732747, "grad_norm": 0.8595797419548035, "learning_rate": 6.3102262814921245e-06, "loss": 0.391, "mean_token_accuracy": 0.8806111216545105, "num_tokens": 40655494.0, "step": 6150 }, { "entropy": 0.4059289461001754, "epoch": 0.09482411934600606, "grad_norm": 1.0237679481506348, "learning_rate": 6.3204884806814105e-06, "loss": 0.4166, "mean_token_accuracy": 0.8792765863239765, "num_tokens": 40724412.0, "step": 6160 }, { "entropy": 0.38973231315612794, "epoch": 0.09497805460468463, "grad_norm": 0.8217259645462036, "learning_rate": 6.3307506798706964e-06, "loss": 0.3858, "mean_token_accuracy": 0.8863075539469719, "num_tokens": 40786383.0, "step": 6170 }, { "entropy": 0.41072600353509187, "epoch": 0.09513198986336321, "grad_norm": 0.6721541881561279, "learning_rate": 6.341012879059983e-06, "loss": 0.4024, "mean_token_accuracy": 0.8767092496156692, "num_tokens": 40858357.0, "step": 6180 }, { "entropy": 0.4077396659180522, "epoch": 0.0952859251220418, "grad_norm": 0.8294060826301575, "learning_rate": 6.351275078249269e-06, "loss": 0.4034, "mean_token_accuracy": 0.8818958438932896, "num_tokens": 40929874.0, "step": 6190 }, { "entropy": 0.4035213768482208, "epoch": 0.09543986038072037, "grad_norm": 0.9872422218322754, "learning_rate": 6.361537277438555e-06, "loss": 0.3959, "mean_token_accuracy": 0.8833466954529285, "num_tokens": 40995487.0, "step": 6200 }, { "entropy": 0.39125555865466594, "epoch": 0.09559379563939896, "grad_norm": 0.7421615123748779, "learning_rate": 6.371799476627843e-06, "loss": 0.3871, "mean_token_accuracy": 0.885584868490696, "num_tokens": 41067321.0, "step": 6210 }, { "entropy": 0.4060887536033988, "epoch": 0.09574773089807755, "grad_norm": 0.8095366954803467, "learning_rate": 6.382061675817129e-06, "loss": 0.4029, "mean_token_accuracy": 0.8793494515120983, "num_tokens": 41131515.0, "step": 6220 }, { "entropy": 0.40365253333002327, "epoch": 0.09590166615675612, "grad_norm": 0.9372423887252808, "learning_rate": 6.392323875006415e-06, "loss": 0.4053, "mean_token_accuracy": 0.8795587457716465, "num_tokens": 41190180.0, "step": 6230 }, { "entropy": 0.42292281296104195, "epoch": 0.0960556014154347, "grad_norm": 0.8081592917442322, "learning_rate": 6.402586074195701e-06, "loss": 0.4042, "mean_token_accuracy": 0.8757644027471543, "num_tokens": 41254497.0, "step": 6240 }, { "entropy": 0.4100620301440358, "epoch": 0.09620953667411329, "grad_norm": 0.7257930636405945, "learning_rate": 6.412848273384987e-06, "loss": 0.3877, "mean_token_accuracy": 0.8815142661333084, "num_tokens": 41317947.0, "step": 6250 }, { "entropy": 0.39924417026340964, "epoch": 0.09636347193279186, "grad_norm": 0.7273569703102112, "learning_rate": 6.423110472574273e-06, "loss": 0.3918, "mean_token_accuracy": 0.8819228291511536, "num_tokens": 41387206.0, "step": 6260 }, { "entropy": 0.3904265871271491, "epoch": 0.09651740719147045, "grad_norm": 0.8949925303459167, "learning_rate": 6.433372671763559e-06, "loss": 0.3841, "mean_token_accuracy": 0.8788401119410991, "num_tokens": 41461467.0, "step": 6270 }, { "entropy": 0.4220112729817629, "epoch": 0.09667134245014902, "grad_norm": 0.8731507658958435, "learning_rate": 6.443634870952846e-06, "loss": 0.4106, "mean_token_accuracy": 0.8756696194410324, "num_tokens": 41519225.0, "step": 6280 }, { "entropy": 0.4149504560977221, "epoch": 0.09682527770882761, "grad_norm": 0.5827104449272156, "learning_rate": 6.453897070142132e-06, "loss": 0.4103, "mean_token_accuracy": 0.8796297594904899, "num_tokens": 41594795.0, "step": 6290 }, { "entropy": 0.41385520845651624, "epoch": 0.0969792129675062, "grad_norm": 1.0263270139694214, "learning_rate": 6.464159269331418e-06, "loss": 0.4067, "mean_token_accuracy": 0.8774956218898297, "num_tokens": 41658688.0, "step": 6300 }, { "entropy": 0.3906195817515254, "epoch": 0.09713314822618477, "grad_norm": 1.013946533203125, "learning_rate": 6.474421468520704e-06, "loss": 0.3895, "mean_token_accuracy": 0.8813171707093715, "num_tokens": 41726999.0, "step": 6310 }, { "entropy": 0.40177844613790514, "epoch": 0.09728708348486335, "grad_norm": 0.7679173350334167, "learning_rate": 6.484683667709991e-06, "loss": 0.3902, "mean_token_accuracy": 0.8796385176479816, "num_tokens": 41798309.0, "step": 6320 }, { "entropy": 0.4235487373545766, "epoch": 0.09744101874354194, "grad_norm": 0.860097348690033, "learning_rate": 6.494945866899277e-06, "loss": 0.4126, "mean_token_accuracy": 0.874699542671442, "num_tokens": 41868014.0, "step": 6330 }, { "entropy": 0.41539999153465035, "epoch": 0.09759495400222051, "grad_norm": 0.8836818337440491, "learning_rate": 6.505208066088563e-06, "loss": 0.4188, "mean_token_accuracy": 0.8750540271401406, "num_tokens": 41926221.0, "step": 6340 }, { "entropy": 0.4218715768307447, "epoch": 0.0977488892608991, "grad_norm": 0.7872618436813354, "learning_rate": 6.51547026527785e-06, "loss": 0.4207, "mean_token_accuracy": 0.8737487263977528, "num_tokens": 41993305.0, "step": 6350 }, { "entropy": 0.4086828786879778, "epoch": 0.09790282451957767, "grad_norm": 0.7650004625320435, "learning_rate": 6.525732464467136e-06, "loss": 0.3825, "mean_token_accuracy": 0.8803218767046929, "num_tokens": 42065488.0, "step": 6360 }, { "entropy": 0.3929923724383116, "epoch": 0.09805675977825626, "grad_norm": 0.8192272186279297, "learning_rate": 6.535994663656422e-06, "loss": 0.3825, "mean_token_accuracy": 0.8866658769547939, "num_tokens": 42130263.0, "step": 6370 }, { "entropy": 0.3930084504187107, "epoch": 0.09821069503693484, "grad_norm": 0.8400616645812988, "learning_rate": 6.546256862845708e-06, "loss": 0.3944, "mean_token_accuracy": 0.8835932359099388, "num_tokens": 42207385.0, "step": 6380 }, { "entropy": 0.41093120966106655, "epoch": 0.09836463029561342, "grad_norm": 1.0407720804214478, "learning_rate": 6.556519062034995e-06, "loss": 0.4191, "mean_token_accuracy": 0.8746047407388687, "num_tokens": 42266487.0, "step": 6390 }, { "entropy": 0.41667022369802, "epoch": 0.098518565554292, "grad_norm": 0.6880308985710144, "learning_rate": 6.566781261224281e-06, "loss": 0.396, "mean_token_accuracy": 0.8793096080422401, "num_tokens": 42340904.0, "step": 6400 }, { "entropy": 0.44296816550195217, "epoch": 0.09867250081297059, "grad_norm": 0.8429402112960815, "learning_rate": 6.577043460413567e-06, "loss": 0.4234, "mean_token_accuracy": 0.8695221461355687, "num_tokens": 42402373.0, "step": 6410 }, { "entropy": 0.403323407843709, "epoch": 0.09882643607164916, "grad_norm": 0.7002152800559998, "learning_rate": 6.587305659602854e-06, "loss": 0.4032, "mean_token_accuracy": 0.8805265940725804, "num_tokens": 42472141.0, "step": 6420 }, { "entropy": 0.4173213990405202, "epoch": 0.09898037133032775, "grad_norm": 0.8329616785049438, "learning_rate": 6.5975678587921395e-06, "loss": 0.4065, "mean_token_accuracy": 0.8772598892450333, "num_tokens": 42538149.0, "step": 6430 }, { "entropy": 0.42845604475587606, "epoch": 0.09913430658900634, "grad_norm": 0.7051017880439758, "learning_rate": 6.6078300579814255e-06, "loss": 0.4259, "mean_token_accuracy": 0.8725374132394791, "num_tokens": 42601327.0, "step": 6440 }, { "entropy": 0.4134164800867438, "epoch": 0.09928824184768491, "grad_norm": 0.8752163052558899, "learning_rate": 6.618092257170711e-06, "loss": 0.4102, "mean_token_accuracy": 0.8797780841588974, "num_tokens": 42671912.0, "step": 6450 }, { "entropy": 0.4072555746883154, "epoch": 0.0994421771063635, "grad_norm": 0.7870827317237854, "learning_rate": 6.628354456359999e-06, "loss": 0.3934, "mean_token_accuracy": 0.8787299253046512, "num_tokens": 42743893.0, "step": 6460 }, { "entropy": 0.41108001098036767, "epoch": 0.09959611236504207, "grad_norm": 0.7322207689285278, "learning_rate": 6.638616655549285e-06, "loss": 0.4184, "mean_token_accuracy": 0.8794822186231613, "num_tokens": 42804286.0, "step": 6470 }, { "entropy": 0.4353123489767313, "epoch": 0.09975004762372065, "grad_norm": 1.049981951713562, "learning_rate": 6.648878854738571e-06, "loss": 0.4161, "mean_token_accuracy": 0.8725879587233066, "num_tokens": 42868503.0, "step": 6480 }, { "entropy": 0.40310263335704805, "epoch": 0.09990398288239924, "grad_norm": 0.7993954420089722, "learning_rate": 6.659141053927858e-06, "loss": 0.4032, "mean_token_accuracy": 0.8808480203151703, "num_tokens": 42935074.0, "step": 6490 }, { "entropy": 0.4086808621883392, "epoch": 0.10005791814107781, "grad_norm": 0.7004649043083191, "learning_rate": 6.669403253117144e-06, "loss": 0.4016, "mean_token_accuracy": 0.8777862660586834, "num_tokens": 43011273.0, "step": 6500 }, { "entropy": 0.39941146578639747, "epoch": 0.1002118533997564, "grad_norm": 0.8489904403686523, "learning_rate": 6.67966545230643e-06, "loss": 0.395, "mean_token_accuracy": 0.8818038702011108, "num_tokens": 43069095.0, "step": 6510 }, { "entropy": 0.4129484832286835, "epoch": 0.10036578865843498, "grad_norm": 0.725077748298645, "learning_rate": 6.689927651495716e-06, "loss": 0.4136, "mean_token_accuracy": 0.8784377858042717, "num_tokens": 43129978.0, "step": 6520 }, { "entropy": 0.40179549995809793, "epoch": 0.10051972391711356, "grad_norm": 0.7075647711753845, "learning_rate": 6.700189850685002e-06, "loss": 0.4038, "mean_token_accuracy": 0.8819667443633079, "num_tokens": 43194780.0, "step": 6530 }, { "entropy": 0.42899221926927567, "epoch": 0.10067365917579214, "grad_norm": 0.7426197528839111, "learning_rate": 6.710452049874288e-06, "loss": 0.417, "mean_token_accuracy": 0.8735409148037434, "num_tokens": 43257606.0, "step": 6540 }, { "entropy": 0.4055026303976774, "epoch": 0.10082759443447072, "grad_norm": 0.8767962455749512, "learning_rate": 6.720714249063574e-06, "loss": 0.3888, "mean_token_accuracy": 0.8825754299759865, "num_tokens": 43324096.0, "step": 6550 }, { "entropy": 0.4204454481601715, "epoch": 0.1009815296931493, "grad_norm": 0.8622724413871765, "learning_rate": 6.730976448252862e-06, "loss": 0.4117, "mean_token_accuracy": 0.8750358000397682, "num_tokens": 43391975.0, "step": 6560 }, { "entropy": 0.41198216937482357, "epoch": 0.10113546495182789, "grad_norm": 0.7742594480514526, "learning_rate": 6.741238647442148e-06, "loss": 0.3946, "mean_token_accuracy": 0.8792687974870205, "num_tokens": 43458647.0, "step": 6570 }, { "entropy": 0.41033698320388795, "epoch": 0.10128940021050646, "grad_norm": 0.9607150554656982, "learning_rate": 6.751500846631434e-06, "loss": 0.4084, "mean_token_accuracy": 0.8772461257874966, "num_tokens": 43527268.0, "step": 6580 }, { "entropy": 0.39983655512332916, "epoch": 0.10144333546918505, "grad_norm": 0.7678231000900269, "learning_rate": 6.76176304582072e-06, "loss": 0.3888, "mean_token_accuracy": 0.8802364133298397, "num_tokens": 43605585.0, "step": 6590 }, { "entropy": 0.4109651414677501, "epoch": 0.10159727072786363, "grad_norm": 0.7335399389266968, "learning_rate": 6.7720252450100065e-06, "loss": 0.3982, "mean_token_accuracy": 0.880258559435606, "num_tokens": 43668150.0, "step": 6600 }, { "entropy": 0.3963170114904642, "epoch": 0.1017512059865422, "grad_norm": 0.726838231086731, "learning_rate": 6.7822874441992925e-06, "loss": 0.3903, "mean_token_accuracy": 0.8837692275643348, "num_tokens": 43737784.0, "step": 6610 }, { "entropy": 0.42030272595584395, "epoch": 0.10190514124522079, "grad_norm": 0.689132034778595, "learning_rate": 6.7925496433885784e-06, "loss": 0.4091, "mean_token_accuracy": 0.8745331957936286, "num_tokens": 43806204.0, "step": 6620 }, { "entropy": 0.3983965259045362, "epoch": 0.10205907650389938, "grad_norm": 0.9415562152862549, "learning_rate": 6.802811842577865e-06, "loss": 0.3812, "mean_token_accuracy": 0.8817177444696427, "num_tokens": 43870226.0, "step": 6630 }, { "entropy": 0.3902996936812997, "epoch": 0.10221301176257795, "grad_norm": 0.9219557046890259, "learning_rate": 6.813074041767151e-06, "loss": 0.4005, "mean_token_accuracy": 0.8829255707561969, "num_tokens": 43933673.0, "step": 6640 }, { "entropy": 0.40215321369469165, "epoch": 0.10236694702125654, "grad_norm": 0.9268127083778381, "learning_rate": 6.823336240956437e-06, "loss": 0.3876, "mean_token_accuracy": 0.8831090591847897, "num_tokens": 43993669.0, "step": 6650 }, { "entropy": 0.4009618565440178, "epoch": 0.10252088227993511, "grad_norm": 0.6567865014076233, "learning_rate": 6.833598440145723e-06, "loss": 0.4037, "mean_token_accuracy": 0.8783262476325036, "num_tokens": 44059577.0, "step": 6660 }, { "entropy": 0.419619844481349, "epoch": 0.1026748175386137, "grad_norm": 0.9269498586654663, "learning_rate": 6.843860639335011e-06, "loss": 0.4118, "mean_token_accuracy": 0.8771808467805385, "num_tokens": 44129096.0, "step": 6670 }, { "entropy": 0.3987704012542963, "epoch": 0.10282875279729228, "grad_norm": 0.8475388884544373, "learning_rate": 6.854122838524297e-06, "loss": 0.3942, "mean_token_accuracy": 0.8837143443524837, "num_tokens": 44196242.0, "step": 6680 }, { "entropy": 0.3949459666386247, "epoch": 0.10298268805597086, "grad_norm": 1.1003563404083252, "learning_rate": 6.864385037713583e-06, "loss": 0.3877, "mean_token_accuracy": 0.8850385703146457, "num_tokens": 44264471.0, "step": 6690 }, { "entropy": 0.4087157526984811, "epoch": 0.10313662331464944, "grad_norm": 0.7422076463699341, "learning_rate": 6.874647236902869e-06, "loss": 0.4145, "mean_token_accuracy": 0.8766322053968907, "num_tokens": 44329360.0, "step": 6700 }, { "entropy": 0.39431228395551443, "epoch": 0.10329055857332803, "grad_norm": 0.9230190515518188, "learning_rate": 6.884909436092155e-06, "loss": 0.3886, "mean_token_accuracy": 0.8857167303562165, "num_tokens": 44402624.0, "step": 6710 }, { "entropy": 0.39347777888178825, "epoch": 0.1034444938320066, "grad_norm": 0.7047861218452454, "learning_rate": 6.895171635281441e-06, "loss": 0.3873, "mean_token_accuracy": 0.8829744778573513, "num_tokens": 44469194.0, "step": 6720 }, { "entropy": 0.39541865028440953, "epoch": 0.10359842909068519, "grad_norm": 1.0062466859817505, "learning_rate": 6.905433834470727e-06, "loss": 0.3941, "mean_token_accuracy": 0.8806367866694927, "num_tokens": 44533137.0, "step": 6730 }, { "entropy": 0.40589327551424503, "epoch": 0.10375236434936376, "grad_norm": 0.9106683135032654, "learning_rate": 6.915696033660014e-06, "loss": 0.4012, "mean_token_accuracy": 0.8795268438756466, "num_tokens": 44598303.0, "step": 6740 }, { "entropy": 0.3980288114398718, "epoch": 0.10390629960804235, "grad_norm": 0.6746912598609924, "learning_rate": 6.9259582328493e-06, "loss": 0.3795, "mean_token_accuracy": 0.8865584127604962, "num_tokens": 44659834.0, "step": 6750 }, { "entropy": 0.3972005950286984, "epoch": 0.10406023486672093, "grad_norm": 0.6883527040481567, "learning_rate": 6.936220432038586e-06, "loss": 0.3932, "mean_token_accuracy": 0.8798867650330067, "num_tokens": 44730667.0, "step": 6760 }, { "entropy": 0.4049756120890379, "epoch": 0.1042141701253995, "grad_norm": 1.2310893535614014, "learning_rate": 6.9464826312278736e-06, "loss": 0.395, "mean_token_accuracy": 0.87807806879282, "num_tokens": 44794667.0, "step": 6770 }, { "entropy": 0.40046725925058124, "epoch": 0.10436810538407809, "grad_norm": 0.7639310956001282, "learning_rate": 6.9567448304171595e-06, "loss": 0.3886, "mean_token_accuracy": 0.8807132013142109, "num_tokens": 44855473.0, "step": 6780 }, { "entropy": 0.3859627699479461, "epoch": 0.10452204064275668, "grad_norm": 0.9937034845352173, "learning_rate": 6.9670070296064455e-06, "loss": 0.3835, "mean_token_accuracy": 0.8865384891629219, "num_tokens": 44919186.0, "step": 6790 }, { "entropy": 0.3947236372157931, "epoch": 0.10467597590143525, "grad_norm": 0.7613213062286377, "learning_rate": 6.977269228795731e-06, "loss": 0.3848, "mean_token_accuracy": 0.8829286321997643, "num_tokens": 44994846.0, "step": 6800 }, { "entropy": 0.41461104694753886, "epoch": 0.10482991116011384, "grad_norm": 0.8668520450592041, "learning_rate": 6.987531427985018e-06, "loss": 0.4075, "mean_token_accuracy": 0.8780760072171688, "num_tokens": 45051748.0, "step": 6810 }, { "entropy": 0.4047233249992132, "epoch": 0.10498384641879242, "grad_norm": 0.8456770777702332, "learning_rate": 6.997793627174304e-06, "loss": 0.3961, "mean_token_accuracy": 0.879888617247343, "num_tokens": 45120553.0, "step": 6820 }, { "entropy": 0.4005092794075608, "epoch": 0.105137781677471, "grad_norm": 1.1273472309112549, "learning_rate": 7.00805582636359e-06, "loss": 0.4024, "mean_token_accuracy": 0.8805606156587601, "num_tokens": 45181276.0, "step": 6830 }, { "entropy": 0.4019557597115636, "epoch": 0.10529171693614958, "grad_norm": 0.8517630696296692, "learning_rate": 7.018318025552876e-06, "loss": 0.399, "mean_token_accuracy": 0.8823310054838658, "num_tokens": 45252114.0, "step": 6840 }, { "entropy": 0.41165378168225286, "epoch": 0.10544565219482815, "grad_norm": 0.8318567276000977, "learning_rate": 7.028580224742163e-06, "loss": 0.3951, "mean_token_accuracy": 0.8794574089348316, "num_tokens": 45325139.0, "step": 6850 }, { "entropy": 0.4026630360633135, "epoch": 0.10559958745350674, "grad_norm": 0.7371814846992493, "learning_rate": 7.038842423931449e-06, "loss": 0.3942, "mean_token_accuracy": 0.8825903370976448, "num_tokens": 45390581.0, "step": 6860 }, { "entropy": 0.398336973041296, "epoch": 0.10575352271218533, "grad_norm": 0.8797497749328613, "learning_rate": 7.049104623120735e-06, "loss": 0.3955, "mean_token_accuracy": 0.8806413948535919, "num_tokens": 45459864.0, "step": 6870 }, { "entropy": 0.41360377091914413, "epoch": 0.1059074579708639, "grad_norm": 0.7167338132858276, "learning_rate": 7.059366822310022e-06, "loss": 0.4015, "mean_token_accuracy": 0.8772066496312618, "num_tokens": 45526747.0, "step": 6880 }, { "entropy": 0.4060611831024289, "epoch": 0.10606139322954249, "grad_norm": 0.8804991245269775, "learning_rate": 7.069629021499308e-06, "loss": 0.4018, "mean_token_accuracy": 0.8797616332769393, "num_tokens": 45590509.0, "step": 6890 }, { "entropy": 0.41314102672040465, "epoch": 0.10621532848822107, "grad_norm": 0.7604706287384033, "learning_rate": 7.079891220688594e-06, "loss": 0.4101, "mean_token_accuracy": 0.8763804130256176, "num_tokens": 45663121.0, "step": 6900 }, { "entropy": 0.39266312066465614, "epoch": 0.10636926374689964, "grad_norm": 0.8810570240020752, "learning_rate": 7.09015341987788e-06, "loss": 0.3895, "mean_token_accuracy": 0.8827128127217293, "num_tokens": 45729347.0, "step": 6910 }, { "entropy": 0.4140281070023775, "epoch": 0.10652319900557823, "grad_norm": 1.0305598974227905, "learning_rate": 7.100415619067167e-06, "loss": 0.3938, "mean_token_accuracy": 0.8766757532954216, "num_tokens": 45802060.0, "step": 6920 }, { "entropy": 0.39304708745330574, "epoch": 0.1066771342642568, "grad_norm": 1.0025267601013184, "learning_rate": 7.110677818256453e-06, "loss": 0.3805, "mean_token_accuracy": 0.8840374484658241, "num_tokens": 45864877.0, "step": 6930 }, { "entropy": 0.3861002193763852, "epoch": 0.10683106952293539, "grad_norm": 1.7211312055587769, "learning_rate": 7.120940017445739e-06, "loss": 0.3802, "mean_token_accuracy": 0.8873412698507309, "num_tokens": 45928065.0, "step": 6940 }, { "entropy": 0.3970777178183198, "epoch": 0.10698500478161398, "grad_norm": 0.8095009326934814, "learning_rate": 7.131202216635026e-06, "loss": 0.3847, "mean_token_accuracy": 0.8805326089262963, "num_tokens": 45996487.0, "step": 6950 }, { "entropy": 0.390126727335155, "epoch": 0.10713894004029255, "grad_norm": 1.0113595724105835, "learning_rate": 7.141464415824312e-06, "loss": 0.3853, "mean_token_accuracy": 0.8833760134875774, "num_tokens": 46063423.0, "step": 6960 }, { "entropy": 0.39638145323842766, "epoch": 0.10729287529897114, "grad_norm": 1.0751299858093262, "learning_rate": 7.151726615013598e-06, "loss": 0.3811, "mean_token_accuracy": 0.8827173084020614, "num_tokens": 46124520.0, "step": 6970 }, { "entropy": 0.3981941301375628, "epoch": 0.10744681055764972, "grad_norm": 0.8968340754508972, "learning_rate": 7.1619888142028835e-06, "loss": 0.3792, "mean_token_accuracy": 0.8813779003918171, "num_tokens": 46191015.0, "step": 6980 }, { "entropy": 0.39315024930983783, "epoch": 0.1076007458163283, "grad_norm": 0.9844537973403931, "learning_rate": 7.172251013392171e-06, "loss": 0.397, "mean_token_accuracy": 0.8835346758365631, "num_tokens": 46256860.0, "step": 6990 }, { "entropy": 0.4041078334674239, "epoch": 0.10775468107500688, "grad_norm": 0.8463955521583557, "learning_rate": 7.182513212581457e-06, "loss": 0.3934, "mean_token_accuracy": 0.8798059120774269, "num_tokens": 46320275.0, "step": 7000 }, { "entropy": 0.4154280886054039, "epoch": 0.10790861633368547, "grad_norm": 0.8167122602462769, "learning_rate": 7.192775411770743e-06, "loss": 0.4123, "mean_token_accuracy": 0.8764241963624955, "num_tokens": 46384015.0, "step": 7010 }, { "entropy": 0.4035636156797409, "epoch": 0.10806255159236404, "grad_norm": 0.8513615131378174, "learning_rate": 7.20303761096003e-06, "loss": 0.3919, "mean_token_accuracy": 0.8830088943243026, "num_tokens": 46447753.0, "step": 7020 }, { "entropy": 0.40846620239317416, "epoch": 0.10821648685104263, "grad_norm": 0.6715316772460938, "learning_rate": 7.213299810149316e-06, "loss": 0.4006, "mean_token_accuracy": 0.8783602863550186, "num_tokens": 46512681.0, "step": 7030 }, { "entropy": 0.39658583197742703, "epoch": 0.1083704221097212, "grad_norm": 0.8957178592681885, "learning_rate": 7.223562009338602e-06, "loss": 0.3845, "mean_token_accuracy": 0.8820250883698464, "num_tokens": 46582556.0, "step": 7040 }, { "entropy": 0.4136719899252057, "epoch": 0.10852435736839978, "grad_norm": 0.7435616850852966, "learning_rate": 7.233824208527888e-06, "loss": 0.4152, "mean_token_accuracy": 0.8769947342574597, "num_tokens": 46645665.0, "step": 7050 }, { "entropy": 0.4132521042600274, "epoch": 0.10867829262707837, "grad_norm": 0.7048901319503784, "learning_rate": 7.2440864077171745e-06, "loss": 0.4016, "mean_token_accuracy": 0.878590964525938, "num_tokens": 46710920.0, "step": 7060 }, { "entropy": 0.38822707198560236, "epoch": 0.10883222788575694, "grad_norm": 0.8561174869537354, "learning_rate": 7.2543486069064604e-06, "loss": 0.3819, "mean_token_accuracy": 0.8851060263812542, "num_tokens": 46772264.0, "step": 7070 }, { "entropy": 0.3944748228415847, "epoch": 0.10898616314443553, "grad_norm": 0.9179787635803223, "learning_rate": 7.264610806095746e-06, "loss": 0.3883, "mean_token_accuracy": 0.8838487058877945, "num_tokens": 46834519.0, "step": 7080 }, { "entropy": 0.41002411060035227, "epoch": 0.10914009840311412, "grad_norm": 0.9043139219284058, "learning_rate": 7.274873005285033e-06, "loss": 0.4091, "mean_token_accuracy": 0.8785881474614143, "num_tokens": 46896585.0, "step": 7090 }, { "entropy": 0.3938509691506624, "epoch": 0.10929403366179269, "grad_norm": 0.806222140789032, "learning_rate": 7.28513520447432e-06, "loss": 0.3837, "mean_token_accuracy": 0.8844780065119267, "num_tokens": 46957220.0, "step": 7100 }, { "entropy": 0.4034655228257179, "epoch": 0.10944796892047128, "grad_norm": 0.941551148891449, "learning_rate": 7.295397403663606e-06, "loss": 0.3915, "mean_token_accuracy": 0.88055065497756, "num_tokens": 47024428.0, "step": 7110 }, { "entropy": 0.4003958612680435, "epoch": 0.10960190417914985, "grad_norm": 0.9681385159492493, "learning_rate": 7.305659602852892e-06, "loss": 0.39, "mean_token_accuracy": 0.8813659206032753, "num_tokens": 47090512.0, "step": 7120 }, { "entropy": 0.4185586957260966, "epoch": 0.10975583943782843, "grad_norm": 0.9303019642829895, "learning_rate": 7.315921802042179e-06, "loss": 0.421, "mean_token_accuracy": 0.8758418276906014, "num_tokens": 47153763.0, "step": 7130 }, { "entropy": 0.39962144289165735, "epoch": 0.10990977469650702, "grad_norm": 0.8481465578079224, "learning_rate": 7.326184001231465e-06, "loss": 0.3889, "mean_token_accuracy": 0.8820036977529526, "num_tokens": 47218917.0, "step": 7140 }, { "entropy": 0.3988111075013876, "epoch": 0.11006370995518559, "grad_norm": 0.8950425982475281, "learning_rate": 7.3364462004207505e-06, "loss": 0.3941, "mean_token_accuracy": 0.8815625481307506, "num_tokens": 47285396.0, "step": 7150 }, { "entropy": 0.38581124842166903, "epoch": 0.11021764521386418, "grad_norm": 0.9896876215934753, "learning_rate": 7.346708399610037e-06, "loss": 0.3882, "mean_token_accuracy": 0.8856437139213085, "num_tokens": 47350820.0, "step": 7160 }, { "entropy": 0.39604289680719373, "epoch": 0.11037158047254277, "grad_norm": 0.8681496381759644, "learning_rate": 7.356970598799323e-06, "loss": 0.3847, "mean_token_accuracy": 0.8820672355592251, "num_tokens": 47419818.0, "step": 7170 }, { "entropy": 0.39551221001893283, "epoch": 0.11052551573122134, "grad_norm": 0.9882600903511047, "learning_rate": 7.367232797988609e-06, "loss": 0.3854, "mean_token_accuracy": 0.8847740426659584, "num_tokens": 47491436.0, "step": 7180 }, { "entropy": 0.4017511229962111, "epoch": 0.11067945098989992, "grad_norm": 1.040644884109497, "learning_rate": 7.377494997177895e-06, "loss": 0.4064, "mean_token_accuracy": 0.8779905922710896, "num_tokens": 47559942.0, "step": 7190 }, { "entropy": 0.3964296955615282, "epoch": 0.11083338624857851, "grad_norm": 0.8185475468635559, "learning_rate": 7.387757196367182e-06, "loss": 0.387, "mean_token_accuracy": 0.8846265852451325, "num_tokens": 47629951.0, "step": 7200 }, { "entropy": 0.3943544581532478, "epoch": 0.11098732150725708, "grad_norm": 0.8798453211784363, "learning_rate": 7.398019395556468e-06, "loss": 0.3834, "mean_token_accuracy": 0.8826319195330143, "num_tokens": 47696514.0, "step": 7210 }, { "entropy": 0.39021538719534876, "epoch": 0.11114125676593567, "grad_norm": 0.9619843363761902, "learning_rate": 7.408281594745754e-06, "loss": 0.4004, "mean_token_accuracy": 0.8808719933032989, "num_tokens": 47763225.0, "step": 7220 }, { "entropy": 0.39308298099786043, "epoch": 0.11129519202461424, "grad_norm": 0.8148038387298584, "learning_rate": 7.4185437939350415e-06, "loss": 0.3861, "mean_token_accuracy": 0.8844554193317891, "num_tokens": 47831754.0, "step": 7230 }, { "entropy": 0.39782262556254866, "epoch": 0.11144912728329283, "grad_norm": 0.953629195690155, "learning_rate": 7.4288059931243275e-06, "loss": 0.3877, "mean_token_accuracy": 0.8802815146744252, "num_tokens": 47894172.0, "step": 7240 }, { "entropy": 0.3925269182771444, "epoch": 0.11160306254197142, "grad_norm": 1.0230320692062378, "learning_rate": 7.439068192313613e-06, "loss": 0.3909, "mean_token_accuracy": 0.8809458099305629, "num_tokens": 47968851.0, "step": 7250 }, { "entropy": 0.3989115238189697, "epoch": 0.11175699780064999, "grad_norm": 1.0343152284622192, "learning_rate": 7.449330391502899e-06, "loss": 0.3916, "mean_token_accuracy": 0.8839238688349724, "num_tokens": 48027704.0, "step": 7260 }, { "entropy": 0.4049607951194048, "epoch": 0.11191093305932857, "grad_norm": 1.1204216480255127, "learning_rate": 7.459592590692186e-06, "loss": 0.389, "mean_token_accuracy": 0.8828478343784809, "num_tokens": 48089875.0, "step": 7270 }, { "entropy": 0.36857661604881287, "epoch": 0.11206486831800716, "grad_norm": 0.8281176686286926, "learning_rate": 7.469854789881472e-06, "loss": 0.374, "mean_token_accuracy": 0.888515992462635, "num_tokens": 48161167.0, "step": 7280 }, { "entropy": 0.4056619530543685, "epoch": 0.11221880357668573, "grad_norm": 0.816843569278717, "learning_rate": 7.480116989070758e-06, "loss": 0.3882, "mean_token_accuracy": 0.8824722737073898, "num_tokens": 48229579.0, "step": 7290 }, { "entropy": 0.40279102977365255, "epoch": 0.11237273883536432, "grad_norm": 0.9018985033035278, "learning_rate": 7.490379188260045e-06, "loss": 0.4003, "mean_token_accuracy": 0.8803386755287648, "num_tokens": 48300103.0, "step": 7300 }, { "entropy": 0.3982949238270521, "epoch": 0.11252667409404289, "grad_norm": 0.7007762789726257, "learning_rate": 7.500641387449331e-06, "loss": 0.3841, "mean_token_accuracy": 0.8811415523290634, "num_tokens": 48373010.0, "step": 7310 }, { "entropy": 0.40271374247968195, "epoch": 0.11268060935272148, "grad_norm": 0.8151914477348328, "learning_rate": 7.510903586638617e-06, "loss": 0.394, "mean_token_accuracy": 0.8805105693638324, "num_tokens": 48431276.0, "step": 7320 }, { "entropy": 0.37930153366178276, "epoch": 0.11283454461140006, "grad_norm": 0.9014715552330017, "learning_rate": 7.521165785827903e-06, "loss": 0.3749, "mean_token_accuracy": 0.8885110959410667, "num_tokens": 48496857.0, "step": 7330 }, { "entropy": 0.39732321575284, "epoch": 0.11298847987007864, "grad_norm": 0.8457902073860168, "learning_rate": 7.53142798501719e-06, "loss": 0.3847, "mean_token_accuracy": 0.8804906204342842, "num_tokens": 48558795.0, "step": 7340 }, { "entropy": 0.3931488314643502, "epoch": 0.11314241512875722, "grad_norm": 0.8413862586021423, "learning_rate": 7.541690184206476e-06, "loss": 0.3892, "mean_token_accuracy": 0.8839531272649765, "num_tokens": 48632511.0, "step": 7350 }, { "entropy": 0.3901769742369652, "epoch": 0.11329635038743581, "grad_norm": 0.8481228351593018, "learning_rate": 7.551952383395762e-06, "loss": 0.3818, "mean_token_accuracy": 0.8853270024061203, "num_tokens": 48696136.0, "step": 7360 }, { "entropy": 0.3913082992658019, "epoch": 0.11345028564611438, "grad_norm": 0.9680777192115784, "learning_rate": 7.562214582585049e-06, "loss": 0.3858, "mean_token_accuracy": 0.8872882023453712, "num_tokens": 48755211.0, "step": 7370 }, { "entropy": 0.3957627644762397, "epoch": 0.11360422090479297, "grad_norm": 0.9582895040512085, "learning_rate": 7.572476781774335e-06, "loss": 0.3821, "mean_token_accuracy": 0.8822747528553009, "num_tokens": 48826008.0, "step": 7380 }, { "entropy": 0.4060609083622694, "epoch": 0.11375815616347154, "grad_norm": 1.097449541091919, "learning_rate": 7.582738980963621e-06, "loss": 0.3935, "mean_token_accuracy": 0.8783149190247059, "num_tokens": 48888317.0, "step": 7390 }, { "entropy": 0.3958776222541928, "epoch": 0.11391209142215013, "grad_norm": 1.1092424392700195, "learning_rate": 7.593001180152907e-06, "loss": 0.3883, "mean_token_accuracy": 0.8830806463956833, "num_tokens": 48952489.0, "step": 7400 }, { "entropy": 0.39209698624908923, "epoch": 0.11406602668082871, "grad_norm": 1.1179596185684204, "learning_rate": 7.603263379342194e-06, "loss": 0.3864, "mean_token_accuracy": 0.8849010966718197, "num_tokens": 49022028.0, "step": 7410 }, { "entropy": 0.39544045329093935, "epoch": 0.11421996193950729, "grad_norm": 0.860373854637146, "learning_rate": 7.6135255785314796e-06, "loss": 0.3933, "mean_token_accuracy": 0.88176054880023, "num_tokens": 49091058.0, "step": 7420 }, { "entropy": 0.40698101501911876, "epoch": 0.11437389719818587, "grad_norm": 0.9307304620742798, "learning_rate": 7.6237877777207655e-06, "loss": 0.3985, "mean_token_accuracy": 0.8801010996103287, "num_tokens": 49161306.0, "step": 7430 }, { "entropy": 0.39587601460516453, "epoch": 0.11452783245686446, "grad_norm": 1.1771053075790405, "learning_rate": 7.634049976910053e-06, "loss": 0.3941, "mean_token_accuracy": 0.8838434986770153, "num_tokens": 49220446.0, "step": 7440 }, { "entropy": 0.39960651062428953, "epoch": 0.11468176771554303, "grad_norm": 1.1653066873550415, "learning_rate": 7.644312176099339e-06, "loss": 0.4079, "mean_token_accuracy": 0.8788885608315468, "num_tokens": 49284486.0, "step": 7450 }, { "entropy": 0.3966827826574445, "epoch": 0.11483570297422162, "grad_norm": 1.1807506084442139, "learning_rate": 7.654574375288625e-06, "loss": 0.3924, "mean_token_accuracy": 0.8824269123375416, "num_tokens": 49345018.0, "step": 7460 }, { "entropy": 0.4079535922035575, "epoch": 0.1149896382329002, "grad_norm": 0.8276416659355164, "learning_rate": 7.664836574477911e-06, "loss": 0.4121, "mean_token_accuracy": 0.8788645885884762, "num_tokens": 49404666.0, "step": 7470 }, { "entropy": 0.3771595565602183, "epoch": 0.11514357349157878, "grad_norm": 0.8742098212242126, "learning_rate": 7.675098773667199e-06, "loss": 0.3653, "mean_token_accuracy": 0.8892890758812427, "num_tokens": 49467977.0, "step": 7480 }, { "entropy": 0.3890165759250522, "epoch": 0.11529750875025736, "grad_norm": 0.8427861332893372, "learning_rate": 7.685360972856485e-06, "loss": 0.393, "mean_token_accuracy": 0.881065996736288, "num_tokens": 49530846.0, "step": 7490 }, { "entropy": 0.417880149371922, "epoch": 0.11545144400893594, "grad_norm": 0.9130853414535522, "learning_rate": 7.69562317204577e-06, "loss": 0.3926, "mean_token_accuracy": 0.8778949394822121, "num_tokens": 49596492.0, "step": 7500 }, { "entropy": 0.38725020214915273, "epoch": 0.11560537926761452, "grad_norm": 0.856160044670105, "learning_rate": 7.705885371235056e-06, "loss": 0.3884, "mean_token_accuracy": 0.8847163759171963, "num_tokens": 49664769.0, "step": 7510 }, { "entropy": 0.393495243601501, "epoch": 0.11575931452629311, "grad_norm": 0.8271752595901489, "learning_rate": 7.716147570424342e-06, "loss": 0.3885, "mean_token_accuracy": 0.8832922972738743, "num_tokens": 49728544.0, "step": 7520 }, { "entropy": 0.3971280476078391, "epoch": 0.11591324978497168, "grad_norm": 1.0562469959259033, "learning_rate": 7.726409769613628e-06, "loss": 0.3941, "mean_token_accuracy": 0.8819942273199558, "num_tokens": 49787932.0, "step": 7530 }, { "entropy": 0.39849797636270523, "epoch": 0.11606718504365027, "grad_norm": 0.9349098205566406, "learning_rate": 7.736671968802914e-06, "loss": 0.3939, "mean_token_accuracy": 0.8834865190088749, "num_tokens": 49854523.0, "step": 7540 }, { "entropy": 0.3893200742080808, "epoch": 0.11622112030232885, "grad_norm": 0.8193073272705078, "learning_rate": 7.746934167992202e-06, "loss": 0.3825, "mean_token_accuracy": 0.8833098173141479, "num_tokens": 49924723.0, "step": 7550 }, { "entropy": 0.3899458462372422, "epoch": 0.11637505556100743, "grad_norm": 0.8201577067375183, "learning_rate": 7.757196367181488e-06, "loss": 0.3997, "mean_token_accuracy": 0.8811960786581039, "num_tokens": 49990961.0, "step": 7560 }, { "entropy": 0.39966467302292585, "epoch": 0.11652899081968601, "grad_norm": 0.6757298111915588, "learning_rate": 7.767458566370774e-06, "loss": 0.3781, "mean_token_accuracy": 0.8842616900801659, "num_tokens": 50067484.0, "step": 7570 }, { "entropy": 0.38005825541913507, "epoch": 0.11668292607836458, "grad_norm": 0.90175461769104, "learning_rate": 7.77772076556006e-06, "loss": 0.3794, "mean_token_accuracy": 0.8857890486717224, "num_tokens": 50129583.0, "step": 7580 }, { "entropy": 0.3828873407095671, "epoch": 0.11683686133704317, "grad_norm": 0.7917709946632385, "learning_rate": 7.787982964749346e-06, "loss": 0.387, "mean_token_accuracy": 0.8839100897312164, "num_tokens": 50201165.0, "step": 7590 }, { "entropy": 0.3972518404945731, "epoch": 0.11699079659572176, "grad_norm": 1.0276069641113281, "learning_rate": 7.798245163938632e-06, "loss": 0.3776, "mean_token_accuracy": 0.8831652827560902, "num_tokens": 50265741.0, "step": 7600 }, { "entropy": 0.39127344321459534, "epoch": 0.11714473185440033, "grad_norm": 0.8973371982574463, "learning_rate": 7.808507363127918e-06, "loss": 0.3714, "mean_token_accuracy": 0.88309490904212, "num_tokens": 50327219.0, "step": 7610 }, { "entropy": 0.3846073366701603, "epoch": 0.11729866711307892, "grad_norm": 1.1052109003067017, "learning_rate": 7.818769562317205e-06, "loss": 0.3999, "mean_token_accuracy": 0.8848465695977211, "num_tokens": 50389338.0, "step": 7620 }, { "entropy": 0.3917363416403532, "epoch": 0.1174526023717575, "grad_norm": 0.8910684585571289, "learning_rate": 7.829031761506491e-06, "loss": 0.3947, "mean_token_accuracy": 0.8793053552508354, "num_tokens": 50461926.0, "step": 7630 }, { "entropy": 0.38857480604201555, "epoch": 0.11760653763043608, "grad_norm": 0.8137053847312927, "learning_rate": 7.839293960695777e-06, "loss": 0.3723, "mean_token_accuracy": 0.8879310473799705, "num_tokens": 50527634.0, "step": 7640 }, { "entropy": 0.4010484293103218, "epoch": 0.11776047288911466, "grad_norm": 0.9937020540237427, "learning_rate": 7.849556159885065e-06, "loss": 0.4014, "mean_token_accuracy": 0.8795229211449623, "num_tokens": 50600019.0, "step": 7650 }, { "entropy": 0.39621988907456396, "epoch": 0.11791440814779325, "grad_norm": 0.7291847467422485, "learning_rate": 7.85981835907435e-06, "loss": 0.3871, "mean_token_accuracy": 0.8827448599040508, "num_tokens": 50668790.0, "step": 7660 }, { "entropy": 0.39504702277481557, "epoch": 0.11806834340647182, "grad_norm": 0.8953614830970764, "learning_rate": 7.870080558263637e-06, "loss": 0.3832, "mean_token_accuracy": 0.8866313934326172, "num_tokens": 50735526.0, "step": 7670 }, { "entropy": 0.39538113623857496, "epoch": 0.11822227866515041, "grad_norm": 0.8995855450630188, "learning_rate": 7.880342757452923e-06, "loss": 0.3849, "mean_token_accuracy": 0.8814667411148548, "num_tokens": 50806826.0, "step": 7680 }, { "entropy": 0.3857993038371205, "epoch": 0.11837621392382898, "grad_norm": 1.3187960386276245, "learning_rate": 7.890604956642209e-06, "loss": 0.3863, "mean_token_accuracy": 0.8846948929131031, "num_tokens": 50867111.0, "step": 7690 }, { "entropy": 0.39694024007767437, "epoch": 0.11853014918250757, "grad_norm": 0.9483128786087036, "learning_rate": 7.900867155831495e-06, "loss": 0.3928, "mean_token_accuracy": 0.883855152130127, "num_tokens": 50933405.0, "step": 7700 }, { "entropy": 0.37589157540351154, "epoch": 0.11868408444118615, "grad_norm": 0.9549035429954529, "learning_rate": 7.91112935502078e-06, "loss": 0.3677, "mean_token_accuracy": 0.88824462890625, "num_tokens": 50991639.0, "step": 7710 }, { "entropy": 0.39136400669813154, "epoch": 0.11883801969986472, "grad_norm": 0.8230884075164795, "learning_rate": 7.921391554210068e-06, "loss": 0.3912, "mean_token_accuracy": 0.88299914970994, "num_tokens": 51061321.0, "step": 7720 }, { "entropy": 0.3869258012622595, "epoch": 0.11899195495854331, "grad_norm": 0.7837406396865845, "learning_rate": 7.931653753399354e-06, "loss": 0.3817, "mean_token_accuracy": 0.8836988814175128, "num_tokens": 51128888.0, "step": 7730 }, { "entropy": 0.37920168712735175, "epoch": 0.1191458902172219, "grad_norm": 0.8167626857757568, "learning_rate": 7.94191595258864e-06, "loss": 0.3684, "mean_token_accuracy": 0.889821219444275, "num_tokens": 51198806.0, "step": 7740 }, { "entropy": 0.3791661435738206, "epoch": 0.11929982547590047, "grad_norm": 0.8339083194732666, "learning_rate": 7.952178151777926e-06, "loss": 0.3734, "mean_token_accuracy": 0.8873614929616451, "num_tokens": 51271658.0, "step": 7750 }, { "entropy": 0.3804197799414396, "epoch": 0.11945376073457906, "grad_norm": 1.0118334293365479, "learning_rate": 7.962440350967214e-06, "loss": 0.3726, "mean_token_accuracy": 0.8854521170258522, "num_tokens": 51333640.0, "step": 7760 }, { "entropy": 0.39584023524075745, "epoch": 0.11960769599325763, "grad_norm": 0.9817074537277222, "learning_rate": 7.9727025501565e-06, "loss": 0.3926, "mean_token_accuracy": 0.8822452619671821, "num_tokens": 51389669.0, "step": 7770 }, { "entropy": 0.3898228001780808, "epoch": 0.11976163125193622, "grad_norm": 0.9822894930839539, "learning_rate": 7.982964749345786e-06, "loss": 0.3814, "mean_token_accuracy": 0.8844502709805966, "num_tokens": 51455602.0, "step": 7780 }, { "entropy": 0.3845732048153877, "epoch": 0.1199155665106148, "grad_norm": 0.8597120642662048, "learning_rate": 7.993226948535071e-06, "loss": 0.3744, "mean_token_accuracy": 0.8863092936575413, "num_tokens": 51528389.0, "step": 7790 }, { "entropy": 0.40402434337884185, "epoch": 0.12006950176929337, "grad_norm": 1.2541532516479492, "learning_rate": 8.003489147724357e-06, "loss": 0.3949, "mean_token_accuracy": 0.8819908864796162, "num_tokens": 51594009.0, "step": 7800 }, { "entropy": 0.38790595848113296, "epoch": 0.12022343702797196, "grad_norm": 0.7649032473564148, "learning_rate": 8.013751346913643e-06, "loss": 0.3845, "mean_token_accuracy": 0.8844651490449905, "num_tokens": 51659273.0, "step": 7810 }, { "entropy": 0.37748203370720146, "epoch": 0.12037737228665055, "grad_norm": 1.0317325592041016, "learning_rate": 8.02401354610293e-06, "loss": 0.383, "mean_token_accuracy": 0.8888876266777516, "num_tokens": 51722693.0, "step": 7820 }, { "entropy": 0.38931059148162606, "epoch": 0.12053130754532912, "grad_norm": 0.987404465675354, "learning_rate": 8.034275745292217e-06, "loss": 0.3755, "mean_token_accuracy": 0.8860176384449006, "num_tokens": 51777137.0, "step": 7830 }, { "entropy": 0.38972168751060965, "epoch": 0.1206852428040077, "grad_norm": 0.8156319856643677, "learning_rate": 8.044537944481503e-06, "loss": 0.3857, "mean_token_accuracy": 0.8833921656012536, "num_tokens": 51841995.0, "step": 7840 }, { "entropy": 0.4058391977101564, "epoch": 0.12083917806268629, "grad_norm": 0.7583590745925903, "learning_rate": 8.054800143670789e-06, "loss": 0.4012, "mean_token_accuracy": 0.879642803966999, "num_tokens": 51904428.0, "step": 7850 }, { "entropy": 0.3893846580758691, "epoch": 0.12099311332136486, "grad_norm": 1.089469075202942, "learning_rate": 8.065062342860076e-06, "loss": 0.3851, "mean_token_accuracy": 0.8828784674406052, "num_tokens": 51965552.0, "step": 7860 }, { "entropy": 0.3865307565778494, "epoch": 0.12114704858004345, "grad_norm": 1.0769461393356323, "learning_rate": 8.075324542049362e-06, "loss": 0.3883, "mean_token_accuracy": 0.8842757679522038, "num_tokens": 52022318.0, "step": 7870 }, { "entropy": 0.3913748200982809, "epoch": 0.12130098383872202, "grad_norm": 0.9166527986526489, "learning_rate": 8.085586741238648e-06, "loss": 0.3754, "mean_token_accuracy": 0.8838211618363857, "num_tokens": 52096761.0, "step": 7880 }, { "entropy": 0.36882957983762027, "epoch": 0.12145491909740061, "grad_norm": 0.8018612265586853, "learning_rate": 8.095848940427934e-06, "loss": 0.3646, "mean_token_accuracy": 0.8871994189918041, "num_tokens": 52161523.0, "step": 7890 }, { "entropy": 0.3780492400750518, "epoch": 0.1216088543560792, "grad_norm": 0.9500919580459595, "learning_rate": 8.10611113961722e-06, "loss": 0.3666, "mean_token_accuracy": 0.8854998424649239, "num_tokens": 52223164.0, "step": 7900 }, { "entropy": 0.37804587818682195, "epoch": 0.12176278961475777, "grad_norm": 0.7850148677825928, "learning_rate": 8.116373338806506e-06, "loss": 0.3733, "mean_token_accuracy": 0.8871286697685719, "num_tokens": 52291979.0, "step": 7910 }, { "entropy": 0.3690214574337006, "epoch": 0.12191672487343636, "grad_norm": 0.7977561354637146, "learning_rate": 8.126635537995792e-06, "loss": 0.3614, "mean_token_accuracy": 0.8910660445690155, "num_tokens": 52366620.0, "step": 7920 }, { "entropy": 0.38490340020507574, "epoch": 0.12207066013211494, "grad_norm": 0.8544886708259583, "learning_rate": 8.13689773718508e-06, "loss": 0.3874, "mean_token_accuracy": 0.8832869179546833, "num_tokens": 52427829.0, "step": 7930 }, { "entropy": 0.4116355596110225, "epoch": 0.12222459539079351, "grad_norm": 0.8447369933128357, "learning_rate": 8.147159936374366e-06, "loss": 0.4007, "mean_token_accuracy": 0.8796799726784229, "num_tokens": 52493454.0, "step": 7940 }, { "entropy": 0.38714114408940076, "epoch": 0.1223785306494721, "grad_norm": 0.9092228412628174, "learning_rate": 8.157422135563652e-06, "loss": 0.3823, "mean_token_accuracy": 0.886280857026577, "num_tokens": 52568179.0, "step": 7950 }, { "entropy": 0.39115470219403503, "epoch": 0.12253246590815067, "grad_norm": 1.0572621822357178, "learning_rate": 8.167684334752938e-06, "loss": 0.3946, "mean_token_accuracy": 0.8836760327219964, "num_tokens": 52629947.0, "step": 7960 }, { "entropy": 0.38701794743537904, "epoch": 0.12268640116682926, "grad_norm": 1.0283362865447998, "learning_rate": 8.177946533942225e-06, "loss": 0.3793, "mean_token_accuracy": 0.8845600388944149, "num_tokens": 52693923.0, "step": 7970 }, { "entropy": 0.38885440658777953, "epoch": 0.12284033642550785, "grad_norm": 0.9278860688209534, "learning_rate": 8.188208733131511e-06, "loss": 0.3825, "mean_token_accuracy": 0.8854039832949638, "num_tokens": 52759372.0, "step": 7980 }, { "entropy": 0.3994645942002535, "epoch": 0.12299427168418642, "grad_norm": 0.9673884510993958, "learning_rate": 8.198470932320797e-06, "loss": 0.3818, "mean_token_accuracy": 0.8818763867020607, "num_tokens": 52825289.0, "step": 7990 }, { "entropy": 0.3961532488465309, "epoch": 0.123148206942865, "grad_norm": 1.0786172151565552, "learning_rate": 8.208733131510083e-06, "loss": 0.3933, "mean_token_accuracy": 0.8817623794078827, "num_tokens": 52884017.0, "step": 8000 }, { "entropy": 0.38500903230160477, "epoch": 0.12330214220154359, "grad_norm": 0.9115105867385864, "learning_rate": 8.218995330699369e-06, "loss": 0.3846, "mean_token_accuracy": 0.8860148161649704, "num_tokens": 52949772.0, "step": 8010 }, { "entropy": 0.3840710446238518, "epoch": 0.12345607746022216, "grad_norm": 1.1731107234954834, "learning_rate": 8.229257529888655e-06, "loss": 0.3715, "mean_token_accuracy": 0.8871754080057144, "num_tokens": 53023464.0, "step": 8020 }, { "entropy": 0.39028403870761397, "epoch": 0.12361001271890075, "grad_norm": 0.9555650949478149, "learning_rate": 8.239519729077941e-06, "loss": 0.3796, "mean_token_accuracy": 0.8798096723854542, "num_tokens": 53085436.0, "step": 8030 }, { "entropy": 0.3868252597749233, "epoch": 0.12376394797757934, "grad_norm": 0.8481394648551941, "learning_rate": 8.249781928267229e-06, "loss": 0.3813, "mean_token_accuracy": 0.8875099629163742, "num_tokens": 53153060.0, "step": 8040 }, { "entropy": 0.4001662597060204, "epoch": 0.12391788323625791, "grad_norm": 0.9158478379249573, "learning_rate": 8.260044127456515e-06, "loss": 0.3942, "mean_token_accuracy": 0.8824917733669281, "num_tokens": 53220629.0, "step": 8050 }, { "entropy": 0.3882638091221452, "epoch": 0.1240718184949365, "grad_norm": 0.7519555687904358, "learning_rate": 8.2703063266458e-06, "loss": 0.3772, "mean_token_accuracy": 0.8816925257444381, "num_tokens": 53282016.0, "step": 8060 }, { "entropy": 0.4040209949016571, "epoch": 0.12422575375361507, "grad_norm": 0.9412928223609924, "learning_rate": 8.280568525835088e-06, "loss": 0.3953, "mean_token_accuracy": 0.879036970436573, "num_tokens": 53343464.0, "step": 8070 }, { "entropy": 0.38917973916977644, "epoch": 0.12437968901229365, "grad_norm": 0.9077893495559692, "learning_rate": 8.290830725024374e-06, "loss": 0.387, "mean_token_accuracy": 0.8847793459892273, "num_tokens": 53417341.0, "step": 8080 }, { "entropy": 0.3921839350834489, "epoch": 0.12453362427097224, "grad_norm": 0.9756255745887756, "learning_rate": 8.30109292421366e-06, "loss": 0.3968, "mean_token_accuracy": 0.8857490673661232, "num_tokens": 53482266.0, "step": 8090 }, { "entropy": 0.38762753568589686, "epoch": 0.12468755952965081, "grad_norm": 1.013256549835205, "learning_rate": 8.311355123402946e-06, "loss": 0.3674, "mean_token_accuracy": 0.8859986081719399, "num_tokens": 53544267.0, "step": 8100 }, { "entropy": 0.39533355589956043, "epoch": 0.1248414947883294, "grad_norm": 0.9442536234855652, "learning_rate": 8.321617322592232e-06, "loss": 0.3854, "mean_token_accuracy": 0.8831515543162822, "num_tokens": 53609191.0, "step": 8110 }, { "entropy": 0.39356588795781133, "epoch": 0.12499543004700799, "grad_norm": 0.9516169428825378, "learning_rate": 8.331879521781518e-06, "loss": 0.3741, "mean_token_accuracy": 0.8823643490672112, "num_tokens": 53683411.0, "step": 8120 }, { "entropy": 0.3878414237871766, "epoch": 0.12514936530568657, "grad_norm": 1.3057329654693604, "learning_rate": 8.342141720970804e-06, "loss": 0.384, "mean_token_accuracy": 0.8825326152145863, "num_tokens": 53739880.0, "step": 8130 }, { "entropy": 0.3765022659674287, "epoch": 0.12530330056436514, "grad_norm": 1.1537734270095825, "learning_rate": 8.352403920160091e-06, "loss": 0.3623, "mean_token_accuracy": 0.8900561109185219, "num_tokens": 53815206.0, "step": 8140 }, { "entropy": 0.37828837800771, "epoch": 0.12545723582304372, "grad_norm": 1.1475242376327515, "learning_rate": 8.362666119349377e-06, "loss": 0.3681, "mean_token_accuracy": 0.8856085725128651, "num_tokens": 53881085.0, "step": 8150 }, { "entropy": 0.38308486677706244, "epoch": 0.12561117108172232, "grad_norm": 0.9958640336990356, "learning_rate": 8.372928318538663e-06, "loss": 0.3803, "mean_token_accuracy": 0.8858601681888103, "num_tokens": 53942766.0, "step": 8160 }, { "entropy": 0.38374856412410735, "epoch": 0.1257651063404009, "grad_norm": 1.0108689069747925, "learning_rate": 8.38319051772795e-06, "loss": 0.3823, "mean_token_accuracy": 0.8866935387253762, "num_tokens": 54004017.0, "step": 8170 }, { "entropy": 0.3923009864985943, "epoch": 0.12591904159907946, "grad_norm": 0.9821131229400635, "learning_rate": 8.393452716917237e-06, "loss": 0.3891, "mean_token_accuracy": 0.8828725919127465, "num_tokens": 54068796.0, "step": 8180 }, { "entropy": 0.3984433325007558, "epoch": 0.12607297685775806, "grad_norm": 1.0014069080352783, "learning_rate": 8.403714916106523e-06, "loss": 0.3881, "mean_token_accuracy": 0.8818629011511803, "num_tokens": 54139197.0, "step": 8190 }, { "entropy": 0.3783513555303216, "epoch": 0.12622691211643663, "grad_norm": 0.9490594863891602, "learning_rate": 8.413977115295809e-06, "loss": 0.3728, "mean_token_accuracy": 0.889243832975626, "num_tokens": 54209521.0, "step": 8200 }, { "entropy": 0.38250621389597655, "epoch": 0.1263808473751152, "grad_norm": 0.8292544484138489, "learning_rate": 8.424239314485095e-06, "loss": 0.3828, "mean_token_accuracy": 0.885168407857418, "num_tokens": 54275091.0, "step": 8210 }, { "entropy": 0.385955517552793, "epoch": 0.12653478263379378, "grad_norm": 0.9726831316947937, "learning_rate": 8.43450151367438e-06, "loss": 0.3688, "mean_token_accuracy": 0.8847323872148991, "num_tokens": 54341332.0, "step": 8220 }, { "entropy": 0.4046483436599374, "epoch": 0.12668871789247238, "grad_norm": 1.279548168182373, "learning_rate": 8.444763712863667e-06, "loss": 0.3925, "mean_token_accuracy": 0.8801962092518807, "num_tokens": 54398436.0, "step": 8230 }, { "entropy": 0.37528606709092854, "epoch": 0.12684265315115095, "grad_norm": 1.1118497848510742, "learning_rate": 8.455025912052953e-06, "loss": 0.3734, "mean_token_accuracy": 0.8915409997105599, "num_tokens": 54468287.0, "step": 8240 }, { "entropy": 0.38929990846663715, "epoch": 0.12699658840982953, "grad_norm": 0.8869342803955078, "learning_rate": 8.46528811124224e-06, "loss": 0.3919, "mean_token_accuracy": 0.8839195109903812, "num_tokens": 54537464.0, "step": 8250 }, { "entropy": 0.3879651974886656, "epoch": 0.12715052366850813, "grad_norm": 1.1725494861602783, "learning_rate": 8.475550310431526e-06, "loss": 0.3927, "mean_token_accuracy": 0.8851934336125851, "num_tokens": 54607410.0, "step": 8260 }, { "entropy": 0.3831555496901274, "epoch": 0.1273044589271867, "grad_norm": 0.737170934677124, "learning_rate": 8.485812509620812e-06, "loss": 0.3815, "mean_token_accuracy": 0.887173806130886, "num_tokens": 54672272.0, "step": 8270 }, { "entropy": 0.3977357080206275, "epoch": 0.12745839418586527, "grad_norm": 0.905200719833374, "learning_rate": 8.4960747088101e-06, "loss": 0.3998, "mean_token_accuracy": 0.8813491068780422, "num_tokens": 54742793.0, "step": 8280 }, { "entropy": 0.37593100741505625, "epoch": 0.12761232944454387, "grad_norm": 0.8528915047645569, "learning_rate": 8.506336907999386e-06, "loss": 0.3835, "mean_token_accuracy": 0.8867595963180065, "num_tokens": 54799493.0, "step": 8290 }, { "entropy": 0.3868357952684164, "epoch": 0.12776626470322244, "grad_norm": 1.1954389810562134, "learning_rate": 8.516599107188672e-06, "loss": 0.3731, "mean_token_accuracy": 0.8879633657634258, "num_tokens": 54865458.0, "step": 8300 }, { "entropy": 0.381146052479744, "epoch": 0.12792019996190102, "grad_norm": 0.830925464630127, "learning_rate": 8.526861306377958e-06, "loss": 0.3751, "mean_token_accuracy": 0.8869059279561042, "num_tokens": 54924212.0, "step": 8310 }, { "entropy": 0.38014077115803957, "epoch": 0.12807413522057962, "grad_norm": 1.120048999786377, "learning_rate": 8.537123505567244e-06, "loss": 0.3724, "mean_token_accuracy": 0.8868149243295192, "num_tokens": 54994310.0, "step": 8320 }, { "entropy": 0.37059691157191993, "epoch": 0.1282280704792582, "grad_norm": 0.8519971370697021, "learning_rate": 8.54738570475653e-06, "loss": 0.3676, "mean_token_accuracy": 0.8909349709749221, "num_tokens": 55061087.0, "step": 8330 }, { "entropy": 0.3595030026510358, "epoch": 0.12838200573793676, "grad_norm": 0.8418389558792114, "learning_rate": 8.557647903945815e-06, "loss": 0.362, "mean_token_accuracy": 0.8901072457432747, "num_tokens": 55124981.0, "step": 8340 }, { "entropy": 0.36959905810654164, "epoch": 0.12853594099661536, "grad_norm": 1.0278793573379517, "learning_rate": 8.567910103135103e-06, "loss": 0.3619, "mean_token_accuracy": 0.8893049828708172, "num_tokens": 55187881.0, "step": 8350 }, { "entropy": 0.390495184995234, "epoch": 0.12868987625529393, "grad_norm": 1.0987550020217896, "learning_rate": 8.578172302324389e-06, "loss": 0.3723, "mean_token_accuracy": 0.8848682887852192, "num_tokens": 55257101.0, "step": 8360 }, { "entropy": 0.3792550653219223, "epoch": 0.1288438115139725, "grad_norm": 0.9619755744934082, "learning_rate": 8.588434501513675e-06, "loss": 0.3703, "mean_token_accuracy": 0.8883831456303597, "num_tokens": 55328336.0, "step": 8370 }, { "entropy": 0.3819209774956107, "epoch": 0.12899774677265108, "grad_norm": 0.883810818195343, "learning_rate": 8.598696700702961e-06, "loss": 0.3698, "mean_token_accuracy": 0.8845030046999455, "num_tokens": 55388416.0, "step": 8380 }, { "entropy": 0.41712566949427127, "epoch": 0.12915168203132968, "grad_norm": 1.029217004776001, "learning_rate": 8.608958899892249e-06, "loss": 0.4187, "mean_token_accuracy": 0.8760782979428768, "num_tokens": 55455475.0, "step": 8390 }, { "entropy": 0.3744277484714985, "epoch": 0.12930561729000825, "grad_norm": 1.1835001707077026, "learning_rate": 8.619221099081535e-06, "loss": 0.3611, "mean_token_accuracy": 0.8916149221360683, "num_tokens": 55531182.0, "step": 8400 }, { "entropy": 0.36570582892745734, "epoch": 0.12945955254868682, "grad_norm": 1.3647798299789429, "learning_rate": 8.62948329827082e-06, "loss": 0.3654, "mean_token_accuracy": 0.8917721942067146, "num_tokens": 55596905.0, "step": 8410 }, { "entropy": 0.38675917163491247, "epoch": 0.12961348780736542, "grad_norm": 0.9408262968063354, "learning_rate": 8.639745497460106e-06, "loss": 0.3882, "mean_token_accuracy": 0.8843123033642769, "num_tokens": 55656283.0, "step": 8420 }, { "entropy": 0.40712751988321544, "epoch": 0.129767423066044, "grad_norm": 0.9578071236610413, "learning_rate": 8.650007696649392e-06, "loss": 0.3887, "mean_token_accuracy": 0.8798348195850849, "num_tokens": 55719868.0, "step": 8430 }, { "entropy": 0.38545413985848426, "epoch": 0.12992135832472257, "grad_norm": 1.0123414993286133, "learning_rate": 8.660269895838678e-06, "loss": 0.3764, "mean_token_accuracy": 0.885742112249136, "num_tokens": 55786621.0, "step": 8440 }, { "entropy": 0.39197013322263957, "epoch": 0.13007529358340117, "grad_norm": 1.2784596681594849, "learning_rate": 8.670532095027964e-06, "loss": 0.3941, "mean_token_accuracy": 0.8853843040764332, "num_tokens": 55847899.0, "step": 8450 }, { "entropy": 0.3881336748600006, "epoch": 0.13022922884207974, "grad_norm": 1.027031660079956, "learning_rate": 8.680794294217252e-06, "loss": 0.3756, "mean_token_accuracy": 0.8863748870790005, "num_tokens": 55926473.0, "step": 8460 }, { "entropy": 0.3758434684947133, "epoch": 0.13038316410075831, "grad_norm": 0.8461241126060486, "learning_rate": 8.691056493406538e-06, "loss": 0.3653, "mean_token_accuracy": 0.8879210643470288, "num_tokens": 55996000.0, "step": 8470 }, { "entropy": 0.40396627727895973, "epoch": 0.13053709935943691, "grad_norm": 1.0031622648239136, "learning_rate": 8.701318692595824e-06, "loss": 0.4037, "mean_token_accuracy": 0.8802567288279534, "num_tokens": 56061161.0, "step": 8480 }, { "entropy": 0.37754711974412203, "epoch": 0.1306910346181155, "grad_norm": 0.8415380120277405, "learning_rate": 8.711580891785111e-06, "loss": 0.3727, "mean_token_accuracy": 0.8870553106069565, "num_tokens": 56131657.0, "step": 8490 }, { "entropy": 0.39163973182439804, "epoch": 0.13084496987679406, "grad_norm": 1.2081959247589111, "learning_rate": 8.721843090974397e-06, "loss": 0.3956, "mean_token_accuracy": 0.8806260921061039, "num_tokens": 56193784.0, "step": 8500 }, { "entropy": 0.4008811067789793, "epoch": 0.13099890513547266, "grad_norm": 0.9441126585006714, "learning_rate": 8.732105290163683e-06, "loss": 0.3988, "mean_token_accuracy": 0.8817123390734196, "num_tokens": 56258436.0, "step": 8510 }, { "entropy": 0.37262743152678013, "epoch": 0.13115284039415123, "grad_norm": 1.1339831352233887, "learning_rate": 8.74236748935297e-06, "loss": 0.3595, "mean_token_accuracy": 0.8888380005955696, "num_tokens": 56322901.0, "step": 8520 }, { "entropy": 0.3929012013599277, "epoch": 0.1313067756528298, "grad_norm": 1.0990926027297974, "learning_rate": 8.752629688542255e-06, "loss": 0.3938, "mean_token_accuracy": 0.882765419781208, "num_tokens": 56395121.0, "step": 8530 }, { "entropy": 0.3861417492851615, "epoch": 0.1314607109115084, "grad_norm": 1.1674091815948486, "learning_rate": 8.762891887731541e-06, "loss": 0.3647, "mean_token_accuracy": 0.8890951283276081, "num_tokens": 56458404.0, "step": 8540 }, { "entropy": 0.37827637139707804, "epoch": 0.13161464617018698, "grad_norm": 0.7927403450012207, "learning_rate": 8.773154086920827e-06, "loss": 0.3709, "mean_token_accuracy": 0.8888309873640537, "num_tokens": 56521538.0, "step": 8550 }, { "entropy": 0.3947906117886305, "epoch": 0.13176858142886555, "grad_norm": 1.2214082479476929, "learning_rate": 8.783416286110115e-06, "loss": 0.3926, "mean_token_accuracy": 0.8838553339242935, "num_tokens": 56583165.0, "step": 8560 }, { "entropy": 0.3738367848098278, "epoch": 0.13192251668754412, "grad_norm": 0.9511138796806335, "learning_rate": 8.7936784852994e-06, "loss": 0.3731, "mean_token_accuracy": 0.8894836217164993, "num_tokens": 56644674.0, "step": 8570 }, { "entropy": 0.3949241453781724, "epoch": 0.13207645194622272, "grad_norm": 0.9919796586036682, "learning_rate": 8.803940684488687e-06, "loss": 0.3961, "mean_token_accuracy": 0.8836066029965878, "num_tokens": 56702641.0, "step": 8580 }, { "entropy": 0.38667487166821957, "epoch": 0.1322303872049013, "grad_norm": 1.0693305730819702, "learning_rate": 8.814202883677973e-06, "loss": 0.385, "mean_token_accuracy": 0.8847244873642921, "num_tokens": 56771065.0, "step": 8590 }, { "entropy": 0.394650131650269, "epoch": 0.13238432246357987, "grad_norm": 0.7815119624137878, "learning_rate": 8.82446508286726e-06, "loss": 0.383, "mean_token_accuracy": 0.8792438231408596, "num_tokens": 56846548.0, "step": 8600 }, { "entropy": 0.3726951666176319, "epoch": 0.13253825772225847, "grad_norm": 0.9911210536956787, "learning_rate": 8.834727282056546e-06, "loss": 0.363, "mean_token_accuracy": 0.8894088119268417, "num_tokens": 56909353.0, "step": 8610 }, { "entropy": 0.37975149378180506, "epoch": 0.13269219298093704, "grad_norm": 0.7932310104370117, "learning_rate": 8.844989481245832e-06, "loss": 0.377, "mean_token_accuracy": 0.8872944734990597, "num_tokens": 56979307.0, "step": 8620 }, { "entropy": 0.37905315291136504, "epoch": 0.1328461282396156, "grad_norm": 0.9836629033088684, "learning_rate": 8.855251680435118e-06, "loss": 0.3675, "mean_token_accuracy": 0.8879952609539032, "num_tokens": 57047411.0, "step": 8630 }, { "entropy": 0.39384991619735954, "epoch": 0.1330000634982942, "grad_norm": 1.1098779439926147, "learning_rate": 8.865513879624404e-06, "loss": 0.3897, "mean_token_accuracy": 0.8808375008404254, "num_tokens": 57103188.0, "step": 8640 }, { "entropy": 0.36884569432586434, "epoch": 0.13315399875697279, "grad_norm": 0.9600271582603455, "learning_rate": 8.87577607881369e-06, "loss": 0.364, "mean_token_accuracy": 0.8889295764267444, "num_tokens": 57163935.0, "step": 8650 }, { "entropy": 0.3892528910189867, "epoch": 0.13330793401565136, "grad_norm": 0.9866056442260742, "learning_rate": 8.886038278002976e-06, "loss": 0.3853, "mean_token_accuracy": 0.8838459923863411, "num_tokens": 57230339.0, "step": 8660 }, { "entropy": 0.39450476337224244, "epoch": 0.13346186927432996, "grad_norm": 1.2298544645309448, "learning_rate": 8.896300477192264e-06, "loss": 0.3894, "mean_token_accuracy": 0.8837019622325897, "num_tokens": 57296906.0, "step": 8670 }, { "entropy": 0.39201121032238007, "epoch": 0.13361580453300853, "grad_norm": 0.9556811451911926, "learning_rate": 8.90656267638155e-06, "loss": 0.3942, "mean_token_accuracy": 0.8830985344946385, "num_tokens": 57360107.0, "step": 8680 }, { "entropy": 0.37911541778594254, "epoch": 0.1337697397916871, "grad_norm": 0.8685864210128784, "learning_rate": 8.916824875570835e-06, "loss": 0.3816, "mean_token_accuracy": 0.8890303768217563, "num_tokens": 57433566.0, "step": 8690 }, { "entropy": 0.3745739547535777, "epoch": 0.1339236750503657, "grad_norm": 0.9401716589927673, "learning_rate": 8.927087074760123e-06, "loss": 0.3703, "mean_token_accuracy": 0.8903084963560104, "num_tokens": 57502023.0, "step": 8700 }, { "entropy": 0.37328403238207103, "epoch": 0.13407761030904428, "grad_norm": 1.0238693952560425, "learning_rate": 8.937349273949409e-06, "loss": 0.3757, "mean_token_accuracy": 0.8871846057474613, "num_tokens": 57567605.0, "step": 8710 }, { "entropy": 0.3989089599810541, "epoch": 0.13423154556772285, "grad_norm": 0.9999030232429504, "learning_rate": 8.947611473138695e-06, "loss": 0.3996, "mean_token_accuracy": 0.8817990742623806, "num_tokens": 57639755.0, "step": 8720 }, { "entropy": 0.3825955675914884, "epoch": 0.13438548082640145, "grad_norm": 1.0139081478118896, "learning_rate": 8.957873672327981e-06, "loss": 0.3774, "mean_token_accuracy": 0.8854189328849316, "num_tokens": 57704975.0, "step": 8730 }, { "entropy": 0.37971726786345245, "epoch": 0.13453941608508002, "grad_norm": 0.9168292284011841, "learning_rate": 8.968135871517267e-06, "loss": 0.3618, "mean_token_accuracy": 0.8909018874168396, "num_tokens": 57761272.0, "step": 8740 }, { "entropy": 0.37735572215169666, "epoch": 0.1346933513437586, "grad_norm": 1.2129549980163574, "learning_rate": 8.978398070706553e-06, "loss": 0.3608, "mean_token_accuracy": 0.8886507332324982, "num_tokens": 57823797.0, "step": 8750 }, { "entropy": 0.35337164960801604, "epoch": 0.13484728660243717, "grad_norm": 1.0050450563430786, "learning_rate": 8.988660269895839e-06, "loss": 0.3589, "mean_token_accuracy": 0.8951895870268345, "num_tokens": 57887530.0, "step": 8760 }, { "entropy": 0.3800753802061081, "epoch": 0.13500122186111577, "grad_norm": 1.4161221981048584, "learning_rate": 8.998922469085126e-06, "loss": 0.3835, "mean_token_accuracy": 0.8868881642818451, "num_tokens": 57952082.0, "step": 8770 }, { "entropy": 0.3870715469121933, "epoch": 0.13515515711979434, "grad_norm": 1.1189618110656738, "learning_rate": 9.009184668274412e-06, "loss": 0.3808, "mean_token_accuracy": 0.8880681656301022, "num_tokens": 58013042.0, "step": 8780 }, { "entropy": 0.3862118501216173, "epoch": 0.1353090923784729, "grad_norm": 0.8559407591819763, "learning_rate": 9.019446867463698e-06, "loss": 0.3869, "mean_token_accuracy": 0.8863866478204727, "num_tokens": 58075927.0, "step": 8790 }, { "entropy": 0.37133047692477705, "epoch": 0.1354630276371515, "grad_norm": 0.8582996726036072, "learning_rate": 9.029709066652984e-06, "loss": 0.3666, "mean_token_accuracy": 0.8886181302368641, "num_tokens": 58149050.0, "step": 8800 }, { "entropy": 0.37651703171432016, "epoch": 0.13561696289583008, "grad_norm": 0.8975586295127869, "learning_rate": 9.039971265842272e-06, "loss": 0.3676, "mean_token_accuracy": 0.8906302087008953, "num_tokens": 58219813.0, "step": 8810 }, { "entropy": 0.3737834494560957, "epoch": 0.13577089815450866, "grad_norm": 0.9329116940498352, "learning_rate": 9.050233465031558e-06, "loss": 0.3672, "mean_token_accuracy": 0.8875796355307102, "num_tokens": 58290254.0, "step": 8820 }, { "entropy": 0.3634373350068927, "epoch": 0.13592483341318726, "grad_norm": 1.4213063716888428, "learning_rate": 9.060495664220844e-06, "loss": 0.3689, "mean_token_accuracy": 0.8920297801494599, "num_tokens": 58350837.0, "step": 8830 }, { "entropy": 0.38723152615129947, "epoch": 0.13607876867186583, "grad_norm": 0.9427770972251892, "learning_rate": 9.07075786341013e-06, "loss": 0.3811, "mean_token_accuracy": 0.8826982401311397, "num_tokens": 58407353.0, "step": 8840 }, { "entropy": 0.3769544014707208, "epoch": 0.1362327039305444, "grad_norm": 1.0082849264144897, "learning_rate": 9.081020062599416e-06, "loss": 0.3697, "mean_token_accuracy": 0.88860400095582, "num_tokens": 58477883.0, "step": 8850 }, { "entropy": 0.38419189527630804, "epoch": 0.136386639189223, "grad_norm": 0.9859758019447327, "learning_rate": 9.091282261788702e-06, "loss": 0.3771, "mean_token_accuracy": 0.8866801485419273, "num_tokens": 58546720.0, "step": 8860 }, { "entropy": 0.38270653542131183, "epoch": 0.13654057444790157, "grad_norm": 1.1020630598068237, "learning_rate": 9.101544460977988e-06, "loss": 0.3723, "mean_token_accuracy": 0.8874691620469093, "num_tokens": 58610850.0, "step": 8870 }, { "entropy": 0.3893430970609188, "epoch": 0.13669450970658015, "grad_norm": 1.1083502769470215, "learning_rate": 9.111806660167275e-06, "loss": 0.3889, "mean_token_accuracy": 0.8823237873613834, "num_tokens": 58672234.0, "step": 8880 }, { "entropy": 0.37143487483263016, "epoch": 0.13684844496525875, "grad_norm": 1.0747451782226562, "learning_rate": 9.122068859356561e-06, "loss": 0.3717, "mean_token_accuracy": 0.8900997772812843, "num_tokens": 58728643.0, "step": 8890 }, { "entropy": 0.3773067731410265, "epoch": 0.13700238022393732, "grad_norm": 0.9704793095588684, "learning_rate": 9.132331058545847e-06, "loss": 0.3814, "mean_token_accuracy": 0.8865525186061859, "num_tokens": 58791513.0, "step": 8900 }, { "entropy": 0.3572006935253739, "epoch": 0.1371563154826159, "grad_norm": 1.0483635663986206, "learning_rate": 9.142593257735133e-06, "loss": 0.3649, "mean_token_accuracy": 0.8962068401277066, "num_tokens": 58853084.0, "step": 8910 }, { "entropy": 0.38332744054496287, "epoch": 0.1373102507412945, "grad_norm": 0.857992947101593, "learning_rate": 9.15285545692442e-06, "loss": 0.385, "mean_token_accuracy": 0.886269212514162, "num_tokens": 58925843.0, "step": 8920 }, { "entropy": 0.3814038896933198, "epoch": 0.13746418599997307, "grad_norm": 0.9443476796150208, "learning_rate": 9.163117656113707e-06, "loss": 0.3705, "mean_token_accuracy": 0.8881099104881287, "num_tokens": 58993630.0, "step": 8930 }, { "entropy": 0.3710115119814873, "epoch": 0.13761812125865164, "grad_norm": 1.0264142751693726, "learning_rate": 9.173379855302993e-06, "loss": 0.3801, "mean_token_accuracy": 0.8876972824335099, "num_tokens": 59054305.0, "step": 8940 }, { "entropy": 0.38770154789090155, "epoch": 0.1377720565173302, "grad_norm": 0.9584836363792419, "learning_rate": 9.183642054492279e-06, "loss": 0.3717, "mean_token_accuracy": 0.8872930653393268, "num_tokens": 59117484.0, "step": 8950 }, { "entropy": 0.35991688165813684, "epoch": 0.1379259917760088, "grad_norm": 0.916239857673645, "learning_rate": 9.193904253681564e-06, "loss": 0.3679, "mean_token_accuracy": 0.8913791537284851, "num_tokens": 59183767.0, "step": 8960 }, { "entropy": 0.36869072299450634, "epoch": 0.13807992703468738, "grad_norm": 1.3912293910980225, "learning_rate": 9.20416645287085e-06, "loss": 0.3688, "mean_token_accuracy": 0.889674398303032, "num_tokens": 59252904.0, "step": 8970 }, { "entropy": 0.3862820381298661, "epoch": 0.13823386229336596, "grad_norm": 1.2056713104248047, "learning_rate": 9.214428652060136e-06, "loss": 0.368, "mean_token_accuracy": 0.8854621827602387, "num_tokens": 59322139.0, "step": 8980 }, { "entropy": 0.38373652212321757, "epoch": 0.13838779755204456, "grad_norm": 0.9523072242736816, "learning_rate": 9.224690851249424e-06, "loss": 0.3802, "mean_token_accuracy": 0.8857626982033253, "num_tokens": 59387131.0, "step": 8990 }, { "entropy": 0.38003709986805917, "epoch": 0.13854173281072313, "grad_norm": 0.9325840473175049, "learning_rate": 9.23495305043871e-06, "loss": 0.3754, "mean_token_accuracy": 0.8880851492285728, "num_tokens": 59456040.0, "step": 9000 }, { "entropy": 0.3827074025757611, "epoch": 0.1386956680694017, "grad_norm": 0.9606167674064636, "learning_rate": 9.245215249627996e-06, "loss": 0.3837, "mean_token_accuracy": 0.8824609629809856, "num_tokens": 59516596.0, "step": 9010 }, { "entropy": 0.3860724503174424, "epoch": 0.1388496033280803, "grad_norm": 1.0173367261886597, "learning_rate": 9.255477448817284e-06, "loss": 0.3819, "mean_token_accuracy": 0.8864633306860924, "num_tokens": 59581919.0, "step": 9020 }, { "entropy": 0.3881792418658733, "epoch": 0.13900353858675887, "grad_norm": 0.9705384373664856, "learning_rate": 9.26573964800657e-06, "loss": 0.3784, "mean_token_accuracy": 0.8831011287868022, "num_tokens": 59641122.0, "step": 9030 }, { "entropy": 0.3834510467946529, "epoch": 0.13915747384543745, "grad_norm": 0.8840473890304565, "learning_rate": 9.276001847195855e-06, "loss": 0.384, "mean_token_accuracy": 0.8854764096438885, "num_tokens": 59710587.0, "step": 9040 }, { "entropy": 0.3883678724989295, "epoch": 0.13931140910411605, "grad_norm": 1.0438703298568726, "learning_rate": 9.286264046385141e-06, "loss": 0.3757, "mean_token_accuracy": 0.8841064594686031, "num_tokens": 59769461.0, "step": 9050 }, { "entropy": 0.37505215834826233, "epoch": 0.13946534436279462, "grad_norm": 1.1692461967468262, "learning_rate": 9.296526245574427e-06, "loss": 0.3647, "mean_token_accuracy": 0.8915792450308799, "num_tokens": 59826254.0, "step": 9060 }, { "entropy": 0.3614616906270385, "epoch": 0.1396192796214732, "grad_norm": 0.9619891047477722, "learning_rate": 9.306788444763713e-06, "loss": 0.3572, "mean_token_accuracy": 0.8930794410407543, "num_tokens": 59900578.0, "step": 9070 }, { "entropy": 0.3875246774405241, "epoch": 0.1397732148801518, "grad_norm": 0.9402825236320496, "learning_rate": 9.317050643953e-06, "loss": 0.3947, "mean_token_accuracy": 0.8864976927638054, "num_tokens": 59971962.0, "step": 9080 }, { "entropy": 0.35660703107714653, "epoch": 0.13992715013883036, "grad_norm": 0.8829073309898376, "learning_rate": 9.327312843142287e-06, "loss": 0.36, "mean_token_accuracy": 0.8961240090429783, "num_tokens": 60043115.0, "step": 9090 }, { "entropy": 0.3668832708150148, "epoch": 0.14008108539750894, "grad_norm": 0.6951959729194641, "learning_rate": 9.337575042331573e-06, "loss": 0.3746, "mean_token_accuracy": 0.8900481320917606, "num_tokens": 60115572.0, "step": 9100 }, { "entropy": 0.37924629356712103, "epoch": 0.14023502065618754, "grad_norm": 0.9377932548522949, "learning_rate": 9.347837241520859e-06, "loss": 0.3855, "mean_token_accuracy": 0.8883099496364594, "num_tokens": 60179041.0, "step": 9110 }, { "entropy": 0.3633079992607236, "epoch": 0.1403889559148661, "grad_norm": 0.7811972498893738, "learning_rate": 9.358099440710145e-06, "loss": 0.356, "mean_token_accuracy": 0.8916290335357189, "num_tokens": 60249020.0, "step": 9120 }, { "entropy": 0.3906168209388852, "epoch": 0.14054289117354468, "grad_norm": 0.8953498005867004, "learning_rate": 9.36836163989943e-06, "loss": 0.3832, "mean_token_accuracy": 0.8854277491569519, "num_tokens": 60318641.0, "step": 9130 }, { "entropy": 0.38442315366119145, "epoch": 0.14069682643222325, "grad_norm": 0.7869237661361694, "learning_rate": 9.378623839088717e-06, "loss": 0.3718, "mean_token_accuracy": 0.8869470752775669, "num_tokens": 60384889.0, "step": 9140 }, { "entropy": 0.37252229182049634, "epoch": 0.14085076169090185, "grad_norm": 1.39883291721344, "learning_rate": 9.388886038278003e-06, "loss": 0.373, "mean_token_accuracy": 0.8894499905407429, "num_tokens": 60454184.0, "step": 9150 }, { "entropy": 0.4028792141005397, "epoch": 0.14100469694958043, "grad_norm": 1.196859359741211, "learning_rate": 9.39914823746729e-06, "loss": 0.3937, "mean_token_accuracy": 0.8796214871108532, "num_tokens": 60522439.0, "step": 9160 }, { "entropy": 0.38629184383898973, "epoch": 0.141158632208259, "grad_norm": 1.0528005361557007, "learning_rate": 9.409410436656576e-06, "loss": 0.3746, "mean_token_accuracy": 0.8868168279528618, "num_tokens": 60581135.0, "step": 9170 }, { "entropy": 0.3645790319889784, "epoch": 0.1413125674669376, "grad_norm": 1.0220832824707031, "learning_rate": 9.419672635845862e-06, "loss": 0.3687, "mean_token_accuracy": 0.892681135982275, "num_tokens": 60655699.0, "step": 9180 }, { "entropy": 0.3816587008535862, "epoch": 0.14146650272561617, "grad_norm": 0.9884149432182312, "learning_rate": 9.429934835035148e-06, "loss": 0.3765, "mean_token_accuracy": 0.8851926386356354, "num_tokens": 60716150.0, "step": 9190 }, { "entropy": 0.37300890814512966, "epoch": 0.14162043798429474, "grad_norm": 1.171911597251892, "learning_rate": 9.440197034224436e-06, "loss": 0.3754, "mean_token_accuracy": 0.8891427412629127, "num_tokens": 60786589.0, "step": 9200 }, { "entropy": 0.38393719140440224, "epoch": 0.14177437324297334, "grad_norm": 1.4081896543502808, "learning_rate": 9.450459233413722e-06, "loss": 0.3762, "mean_token_accuracy": 0.8866244576871395, "num_tokens": 60846210.0, "step": 9210 }, { "entropy": 0.35560316052287816, "epoch": 0.14192830850165192, "grad_norm": 0.994995653629303, "learning_rate": 9.460721432603008e-06, "loss": 0.3476, "mean_token_accuracy": 0.8942395403981209, "num_tokens": 60918284.0, "step": 9220 }, { "entropy": 0.35030888821929695, "epoch": 0.1420822437603305, "grad_norm": 0.8153771758079529, "learning_rate": 9.470983631792294e-06, "loss": 0.3556, "mean_token_accuracy": 0.8939210586249828, "num_tokens": 60983705.0, "step": 9230 }, { "entropy": 0.37313160188496114, "epoch": 0.1422361790190091, "grad_norm": 1.3992575407028198, "learning_rate": 9.48124583098158e-06, "loss": 0.3686, "mean_token_accuracy": 0.8879819087684154, "num_tokens": 61052009.0, "step": 9240 }, { "entropy": 0.368733817897737, "epoch": 0.14239011427768766, "grad_norm": 1.1791915893554688, "learning_rate": 9.491508030170865e-06, "loss": 0.3633, "mean_token_accuracy": 0.8890954084694386, "num_tokens": 61113002.0, "step": 9250 }, { "entropy": 0.3679061349481344, "epoch": 0.14254404953636624, "grad_norm": 1.5149726867675781, "learning_rate": 9.501770229360151e-06, "loss": 0.3538, "mean_token_accuracy": 0.8930689737200737, "num_tokens": 61175707.0, "step": 9260 }, { "entropy": 0.36747241504490374, "epoch": 0.14269798479504484, "grad_norm": 0.9460855722427368, "learning_rate": 9.512032428549439e-06, "loss": 0.3776, "mean_token_accuracy": 0.8862187400460243, "num_tokens": 61247559.0, "step": 9270 }, { "entropy": 0.3750160880386829, "epoch": 0.1428519200537234, "grad_norm": 1.1028491258621216, "learning_rate": 9.522294627738725e-06, "loss": 0.376, "mean_token_accuracy": 0.8882162049412727, "num_tokens": 61316683.0, "step": 9280 }, { "entropy": 0.3718897713348269, "epoch": 0.14300585531240198, "grad_norm": 1.0871278047561646, "learning_rate": 9.532556826928011e-06, "loss": 0.3601, "mean_token_accuracy": 0.889789055287838, "num_tokens": 61388636.0, "step": 9290 }, { "entropy": 0.3606926975771785, "epoch": 0.14315979057108058, "grad_norm": 1.0704143047332764, "learning_rate": 9.542819026117299e-06, "loss": 0.3637, "mean_token_accuracy": 0.8917102493345738, "num_tokens": 61453566.0, "step": 9300 }, { "entropy": 0.3747862946242094, "epoch": 0.14331372582975915, "grad_norm": 0.9245670437812805, "learning_rate": 9.553081225306584e-06, "loss": 0.3696, "mean_token_accuracy": 0.8885299161076545, "num_tokens": 61522822.0, "step": 9310 }, { "entropy": 0.35725069846957924, "epoch": 0.14346766108843773, "grad_norm": 1.230002522468567, "learning_rate": 9.56334342449587e-06, "loss": 0.3651, "mean_token_accuracy": 0.8953065402805805, "num_tokens": 61588770.0, "step": 9320 }, { "entropy": 0.38150390312075616, "epoch": 0.1436215963471163, "grad_norm": 1.3839865922927856, "learning_rate": 9.573605623685156e-06, "loss": 0.3759, "mean_token_accuracy": 0.8853817380964756, "num_tokens": 61649861.0, "step": 9330 }, { "entropy": 0.3789200929924846, "epoch": 0.1437755316057949, "grad_norm": 0.8917683362960815, "learning_rate": 9.583867822874442e-06, "loss": 0.3709, "mean_token_accuracy": 0.8868048988282681, "num_tokens": 61724814.0, "step": 9340 }, { "entropy": 0.35949106700718403, "epoch": 0.14392946686447347, "grad_norm": 1.0501710176467896, "learning_rate": 9.594130022063728e-06, "loss": 0.3627, "mean_token_accuracy": 0.8928941294550896, "num_tokens": 61795378.0, "step": 9350 }, { "entropy": 0.3700631694868207, "epoch": 0.14408340212315204, "grad_norm": 0.9308406114578247, "learning_rate": 9.604392221253014e-06, "loss": 0.3587, "mean_token_accuracy": 0.8907234974205493, "num_tokens": 61862926.0, "step": 9360 }, { "entropy": 0.37425701450556514, "epoch": 0.14423733738183064, "grad_norm": 1.0760427713394165, "learning_rate": 9.614654420442302e-06, "loss": 0.3722, "mean_token_accuracy": 0.8887610211968422, "num_tokens": 61935237.0, "step": 9370 }, { "entropy": 0.3780505442991853, "epoch": 0.14439127264050922, "grad_norm": 0.837759256362915, "learning_rate": 9.624916619631588e-06, "loss": 0.3668, "mean_token_accuracy": 0.8864322818815709, "num_tokens": 62005112.0, "step": 9380 }, { "entropy": 0.3819732736796141, "epoch": 0.1445452078991878, "grad_norm": 1.1731915473937988, "learning_rate": 9.635178818820874e-06, "loss": 0.3691, "mean_token_accuracy": 0.8890457332134247, "num_tokens": 62063734.0, "step": 9390 }, { "entropy": 0.35882311798632144, "epoch": 0.1446991431578664, "grad_norm": 0.8147122859954834, "learning_rate": 9.64544101801016e-06, "loss": 0.3621, "mean_token_accuracy": 0.890620281547308, "num_tokens": 62138030.0, "step": 9400 }, { "entropy": 0.37506419867277146, "epoch": 0.14485307841654496, "grad_norm": 0.93534916639328, "learning_rate": 9.655703217199447e-06, "loss": 0.3659, "mean_token_accuracy": 0.8885995902121067, "num_tokens": 62205961.0, "step": 9410 }, { "entropy": 0.36465648021548985, "epoch": 0.14500701367522353, "grad_norm": 1.2030808925628662, "learning_rate": 9.665965416388733e-06, "loss": 0.3614, "mean_token_accuracy": 0.8932661339640617, "num_tokens": 62269760.0, "step": 9420 }, { "entropy": 0.3827360400930047, "epoch": 0.14516094893390213, "grad_norm": 1.0467240810394287, "learning_rate": 9.67622761557802e-06, "loss": 0.3924, "mean_token_accuracy": 0.8866322420537471, "num_tokens": 62333233.0, "step": 9430 }, { "entropy": 0.37486645579338074, "epoch": 0.1453148841925807, "grad_norm": 1.0949591398239136, "learning_rate": 9.686489814767305e-06, "loss": 0.3735, "mean_token_accuracy": 0.8883974619209767, "num_tokens": 62392815.0, "step": 9440 }, { "entropy": 0.3639534987509251, "epoch": 0.14546881945125928, "grad_norm": 1.0686287879943848, "learning_rate": 9.696752013956591e-06, "loss": 0.3562, "mean_token_accuracy": 0.8911957576870918, "num_tokens": 62456648.0, "step": 9450 }, { "entropy": 0.39884469993412497, "epoch": 0.14562275470993788, "grad_norm": 1.412200927734375, "learning_rate": 9.707014213145877e-06, "loss": 0.3997, "mean_token_accuracy": 0.8812993787229061, "num_tokens": 62517779.0, "step": 9460 }, { "entropy": 0.3545834569260478, "epoch": 0.14577668996861645, "grad_norm": 1.2375297546386719, "learning_rate": 9.717276412335163e-06, "loss": 0.3474, "mean_token_accuracy": 0.8941337116062641, "num_tokens": 62571525.0, "step": 9470 }, { "entropy": 0.37581885065883397, "epoch": 0.14593062522729502, "grad_norm": 1.078931212425232, "learning_rate": 9.72753861152445e-06, "loss": 0.3843, "mean_token_accuracy": 0.8856226250529289, "num_tokens": 62645275.0, "step": 9480 }, { "entropy": 0.399120905995369, "epoch": 0.14608456048597362, "grad_norm": 0.9117498397827148, "learning_rate": 9.737800810713737e-06, "loss": 0.3805, "mean_token_accuracy": 0.8842652820050716, "num_tokens": 62708343.0, "step": 9490 }, { "entropy": 0.3602043965831399, "epoch": 0.1462384957446522, "grad_norm": 0.8388940691947937, "learning_rate": 9.748063009903023e-06, "loss": 0.3533, "mean_token_accuracy": 0.8918748177587986, "num_tokens": 62785451.0, "step": 9500 }, { "entropy": 0.3676484663039446, "epoch": 0.14639243100333077, "grad_norm": 1.3716073036193848, "learning_rate": 9.75832520909231e-06, "loss": 0.3736, "mean_token_accuracy": 0.889635269343853, "num_tokens": 62847574.0, "step": 9510 }, { "entropy": 0.3839932333678007, "epoch": 0.14654636626200934, "grad_norm": 1.1039843559265137, "learning_rate": 9.768587408281596e-06, "loss": 0.381, "mean_token_accuracy": 0.8881366066634655, "num_tokens": 62907498.0, "step": 9520 }, { "entropy": 0.3680999452248216, "epoch": 0.14670030152068794, "grad_norm": 1.0679049491882324, "learning_rate": 9.778849607470882e-06, "loss": 0.3687, "mean_token_accuracy": 0.8895623795688152, "num_tokens": 62977145.0, "step": 9530 }, { "entropy": 0.3721592552959919, "epoch": 0.14685423677936651, "grad_norm": 1.0660983324050903, "learning_rate": 9.789111806660168e-06, "loss": 0.3778, "mean_token_accuracy": 0.889016904681921, "num_tokens": 63044516.0, "step": 9540 }, { "entropy": 0.3983332395553589, "epoch": 0.1470081720380451, "grad_norm": 1.0599030256271362, "learning_rate": 9.799374005849454e-06, "loss": 0.4057, "mean_token_accuracy": 0.8835399180650712, "num_tokens": 63102657.0, "step": 9550 }, { "entropy": 0.3692759586498141, "epoch": 0.1471621072967237, "grad_norm": 0.9140305519104004, "learning_rate": 9.80963620503874e-06, "loss": 0.3543, "mean_token_accuracy": 0.8904049903154373, "num_tokens": 63169872.0, "step": 9560 }, { "entropy": 0.36171836126595736, "epoch": 0.14731604255540226, "grad_norm": 1.1623233556747437, "learning_rate": 9.819898404228026e-06, "loss": 0.3519, "mean_token_accuracy": 0.8916416928172112, "num_tokens": 63236944.0, "step": 9570 }, { "entropy": 0.39079474937170744, "epoch": 0.14746997781408083, "grad_norm": 0.8739739060401917, "learning_rate": 9.830160603417313e-06, "loss": 0.3862, "mean_token_accuracy": 0.8835752815008163, "num_tokens": 63307338.0, "step": 9580 }, { "entropy": 0.35524075888097284, "epoch": 0.14762391307275943, "grad_norm": 1.2925055027008057, "learning_rate": 9.8404228026066e-06, "loss": 0.3544, "mean_token_accuracy": 0.8933298282325268, "num_tokens": 63371494.0, "step": 9590 }, { "entropy": 0.3640078632161021, "epoch": 0.147777848331438, "grad_norm": 1.1060504913330078, "learning_rate": 9.850685001795885e-06, "loss": 0.3617, "mean_token_accuracy": 0.8904783055186272, "num_tokens": 63430295.0, "step": 9600 }, { "entropy": 0.37833972182124853, "epoch": 0.14793178359011658, "grad_norm": 1.4882596731185913, "learning_rate": 9.860947200985171e-06, "loss": 0.3676, "mean_token_accuracy": 0.8895161189138889, "num_tokens": 63499265.0, "step": 9610 }, { "entropy": 0.3595001820474863, "epoch": 0.14808571884879518, "grad_norm": 0.9977539777755737, "learning_rate": 9.871209400174459e-06, "loss": 0.3558, "mean_token_accuracy": 0.8948916025459767, "num_tokens": 63566506.0, "step": 9620 }, { "entropy": 0.3615765988826752, "epoch": 0.14823965410747375, "grad_norm": 1.1279009580612183, "learning_rate": 9.881471599363745e-06, "loss": 0.368, "mean_token_accuracy": 0.892397803068161, "num_tokens": 63630326.0, "step": 9630 }, { "entropy": 0.3765588365495205, "epoch": 0.14839358936615232, "grad_norm": 1.0868343114852905, "learning_rate": 9.891733798553031e-06, "loss": 0.3663, "mean_token_accuracy": 0.8885284826159477, "num_tokens": 63704796.0, "step": 9640 }, { "entropy": 0.3837457563728094, "epoch": 0.14854752462483092, "grad_norm": 1.2708063125610352, "learning_rate": 9.901995997742317e-06, "loss": 0.3829, "mean_token_accuracy": 0.8858843423426151, "num_tokens": 63767147.0, "step": 9650 }, { "entropy": 0.37671289052814244, "epoch": 0.1487014598835095, "grad_norm": 1.0633951425552368, "learning_rate": 9.912258196931603e-06, "loss": 0.3613, "mean_token_accuracy": 0.8887148521840572, "num_tokens": 63841660.0, "step": 9660 }, { "entropy": 0.37020720578730104, "epoch": 0.14885539514218807, "grad_norm": 0.9558635354042053, "learning_rate": 9.922520396120889e-06, "loss": 0.3727, "mean_token_accuracy": 0.8900530055165291, "num_tokens": 63908336.0, "step": 9670 }, { "entropy": 0.36232567569240925, "epoch": 0.14900933040086667, "grad_norm": 0.9704034924507141, "learning_rate": 9.932782595310175e-06, "loss": 0.3514, "mean_token_accuracy": 0.8922076933085918, "num_tokens": 63980884.0, "step": 9680 }, { "entropy": 0.3852179450914264, "epoch": 0.14916326565954524, "grad_norm": 1.0196561813354492, "learning_rate": 9.943044794499462e-06, "loss": 0.3638, "mean_token_accuracy": 0.8871132008731365, "num_tokens": 64044312.0, "step": 9690 }, { "entropy": 0.36628071069717405, "epoch": 0.1493172009182238, "grad_norm": 1.0460721254348755, "learning_rate": 9.953306993688748e-06, "loss": 0.3588, "mean_token_accuracy": 0.8916154101490974, "num_tokens": 64111224.0, "step": 9700 }, { "entropy": 0.38333977553993465, "epoch": 0.14947113617690239, "grad_norm": 1.163880705833435, "learning_rate": 9.963569192878034e-06, "loss": 0.3757, "mean_token_accuracy": 0.8871649883687496, "num_tokens": 64175396.0, "step": 9710 }, { "entropy": 0.378684419952333, "epoch": 0.149625071435581, "grad_norm": 1.019608736038208, "learning_rate": 9.973831392067322e-06, "loss": 0.3715, "mean_token_accuracy": 0.8862099409103393, "num_tokens": 64240462.0, "step": 9720 }, { "entropy": 0.3782960711047053, "epoch": 0.14977900669425956, "grad_norm": 0.9675107002258301, "learning_rate": 9.984093591256608e-06, "loss": 0.3734, "mean_token_accuracy": 0.8865332037210465, "num_tokens": 64308986.0, "step": 9730 }, { "entropy": 0.38071581479161976, "epoch": 0.14993294195293813, "grad_norm": 0.8809739947319031, "learning_rate": 9.994355790445894e-06, "loss": 0.3777, "mean_token_accuracy": 0.8863679841160774, "num_tokens": 64370970.0, "step": 9740 }, { "entropy": 0.3617125736549497, "epoch": 0.15008687721161673, "grad_norm": 1.0472217798233032, "learning_rate": 1.000461798963518e-05, "loss": 0.3578, "mean_token_accuracy": 0.893860848993063, "num_tokens": 64437825.0, "step": 9750 }, { "entropy": 0.37736012060195206, "epoch": 0.1502408124702953, "grad_norm": 1.1496306657791138, "learning_rate": 1.0014880188824466e-05, "loss": 0.3733, "mean_token_accuracy": 0.8854293718934059, "num_tokens": 64503847.0, "step": 9760 }, { "entropy": 0.3819476844742894, "epoch": 0.15039474772897388, "grad_norm": 0.9523701667785645, "learning_rate": 1.0025142388013752e-05, "loss": 0.36, "mean_token_accuracy": 0.8878491871058941, "num_tokens": 64566941.0, "step": 9770 }, { "entropy": 0.3842069996520877, "epoch": 0.15054868298765248, "grad_norm": 1.0156503915786743, "learning_rate": 1.003540458720304e-05, "loss": 0.3882, "mean_token_accuracy": 0.8833870530128479, "num_tokens": 64625908.0, "step": 9780 }, { "entropy": 0.36424654703587295, "epoch": 0.15070261824633105, "grad_norm": 0.8401058316230774, "learning_rate": 1.0045666786392325e-05, "loss": 0.3571, "mean_token_accuracy": 0.8932690136134624, "num_tokens": 64692816.0, "step": 9790 }, { "entropy": 0.37057048063725234, "epoch": 0.15085655350500962, "grad_norm": 1.0286133289337158, "learning_rate": 1.0055928985581611e-05, "loss": 0.3776, "mean_token_accuracy": 0.8886066064238548, "num_tokens": 64760446.0, "step": 9800 }, { "entropy": 0.39353278987109663, "epoch": 0.15101048876368822, "grad_norm": 1.3912450075149536, "learning_rate": 1.0066191184770897e-05, "loss": 0.3742, "mean_token_accuracy": 0.8855526179075242, "num_tokens": 64820700.0, "step": 9810 }, { "entropy": 0.37083251606673, "epoch": 0.1511644240223668, "grad_norm": 1.3110016584396362, "learning_rate": 1.0076453383960183e-05, "loss": 0.3551, "mean_token_accuracy": 0.8883546724915504, "num_tokens": 64889676.0, "step": 9820 }, { "entropy": 0.37800302244722844, "epoch": 0.15131835928104537, "grad_norm": 1.7148500680923462, "learning_rate": 1.0086715583149469e-05, "loss": 0.3774, "mean_token_accuracy": 0.8862802468240261, "num_tokens": 64946121.0, "step": 9830 }, { "entropy": 0.3756085250526667, "epoch": 0.15147229453972397, "grad_norm": 1.3293439149856567, "learning_rate": 1.0096977782338755e-05, "loss": 0.3576, "mean_token_accuracy": 0.8891270942986012, "num_tokens": 65011199.0, "step": 9840 }, { "entropy": 0.3683837400749326, "epoch": 0.15162622979840254, "grad_norm": 0.8841857314109802, "learning_rate": 1.0107239981528043e-05, "loss": 0.3653, "mean_token_accuracy": 0.8885490983724594, "num_tokens": 65080341.0, "step": 9850 }, { "entropy": 0.3704158840700984, "epoch": 0.1517801650570811, "grad_norm": 1.1424461603164673, "learning_rate": 1.0117502180717328e-05, "loss": 0.3604, "mean_token_accuracy": 0.8903406083583831, "num_tokens": 65150102.0, "step": 9860 }, { "entropy": 0.38488377202302215, "epoch": 0.1519341003157597, "grad_norm": 1.1835922002792358, "learning_rate": 1.0127764379906614e-05, "loss": 0.3746, "mean_token_accuracy": 0.8868098445236683, "num_tokens": 65217877.0, "step": 9870 }, { "entropy": 0.3768758397549391, "epoch": 0.15208803557443829, "grad_norm": 0.9409542679786682, "learning_rate": 1.01380265790959e-05, "loss": 0.3662, "mean_token_accuracy": 0.8871283769607544, "num_tokens": 65287855.0, "step": 9880 }, { "entropy": 0.37756369542330503, "epoch": 0.15224197083311686, "grad_norm": 1.1315701007843018, "learning_rate": 1.0148288778285186e-05, "loss": 0.3565, "mean_token_accuracy": 0.8893378943204879, "num_tokens": 65357919.0, "step": 9890 }, { "entropy": 0.36762614343315364, "epoch": 0.15239590609179543, "grad_norm": 1.1041256189346313, "learning_rate": 1.0158550977474472e-05, "loss": 0.3648, "mean_token_accuracy": 0.8890646770596504, "num_tokens": 65430108.0, "step": 9900 }, { "entropy": 0.3798810152336955, "epoch": 0.15254984135047403, "grad_norm": 0.867255449295044, "learning_rate": 1.0168813176663758e-05, "loss": 0.3721, "mean_token_accuracy": 0.8880518555641175, "num_tokens": 65500095.0, "step": 9910 }, { "entropy": 0.3844749653711915, "epoch": 0.1527037766091526, "grad_norm": 1.0161409378051758, "learning_rate": 1.0179075375853048e-05, "loss": 0.3688, "mean_token_accuracy": 0.883291169255972, "num_tokens": 65565308.0, "step": 9920 }, { "entropy": 0.3793401584029198, "epoch": 0.15285771186783118, "grad_norm": 0.8650683164596558, "learning_rate": 1.0189337575042333e-05, "loss": 0.3841, "mean_token_accuracy": 0.8876499757170677, "num_tokens": 65637908.0, "step": 9930 }, { "entropy": 0.3835520476102829, "epoch": 0.15301164712650978, "grad_norm": 0.9564667344093323, "learning_rate": 1.019959977423162e-05, "loss": 0.3734, "mean_token_accuracy": 0.8884454756975174, "num_tokens": 65706412.0, "step": 9940 }, { "entropy": 0.36027509327977897, "epoch": 0.15316558238518835, "grad_norm": 1.0025168657302856, "learning_rate": 1.0209861973420905e-05, "loss": 0.3548, "mean_token_accuracy": 0.8916240088641644, "num_tokens": 65771663.0, "step": 9950 }, { "entropy": 0.37857271265238523, "epoch": 0.15331951764386692, "grad_norm": 1.4759347438812256, "learning_rate": 1.0220124172610191e-05, "loss": 0.3724, "mean_token_accuracy": 0.8869003139436245, "num_tokens": 65834095.0, "step": 9960 }, { "entropy": 0.36646109223365786, "epoch": 0.15347345290254552, "grad_norm": 1.0493839979171753, "learning_rate": 1.0230386371799477e-05, "loss": 0.3522, "mean_token_accuracy": 0.8921338364481926, "num_tokens": 65902632.0, "step": 9970 }, { "entropy": 0.34495412167161704, "epoch": 0.1536273881612241, "grad_norm": 1.08441960811615, "learning_rate": 1.0240648570988763e-05, "loss": 0.3447, "mean_token_accuracy": 0.8971198447048664, "num_tokens": 65976199.0, "step": 9980 }, { "entropy": 0.36031927578151224, "epoch": 0.15378132341990267, "grad_norm": 1.1840256452560425, "learning_rate": 1.0250910770178051e-05, "loss": 0.3629, "mean_token_accuracy": 0.890544169396162, "num_tokens": 66041564.0, "step": 9990 }, { "entropy": 0.40189000852406026, "epoch": 0.15393525867858127, "grad_norm": 0.9730496406555176, "learning_rate": 1.0261172969367337e-05, "loss": 0.4005, "mean_token_accuracy": 0.8804517105221749, "num_tokens": 66106728.0, "step": 10000 }, { "epoch": 0.15393525867858127, "eval_entropy": 0.38363094091989763, "eval_loss": 0.36727336049079895, "eval_mean_token_accuracy": 0.8900757077314108, "eval_num_tokens": 66106728.0, "eval_runtime": 7677.9697, "eval_samples_per_second": 4.231, "eval_steps_per_second": 4.231, "step": 10000 }, { "entropy": 0.382415085285902, "epoch": 0.15408919393725984, "grad_norm": 0.8757123947143555, "learning_rate": 1.0271435168556623e-05, "loss": 0.3755, "mean_token_accuracy": 0.8873049363493919, "num_tokens": 66178143.0, "step": 10010 }, { "entropy": 0.3822949226945639, "epoch": 0.1542431291959384, "grad_norm": 1.1520872116088867, "learning_rate": 1.0281697367745909e-05, "loss": 0.367, "mean_token_accuracy": 0.887526348233223, "num_tokens": 66239384.0, "step": 10020 }, { "entropy": 0.364031994715333, "epoch": 0.154397064454617, "grad_norm": 1.0313588380813599, "learning_rate": 1.0291959566935195e-05, "loss": 0.3652, "mean_token_accuracy": 0.8905894704163074, "num_tokens": 66303296.0, "step": 10030 }, { "entropy": 0.3632042586803436, "epoch": 0.15455099971329558, "grad_norm": 1.026885986328125, "learning_rate": 1.030222176612448e-05, "loss": 0.351, "mean_token_accuracy": 0.8919341348111629, "num_tokens": 66369104.0, "step": 10040 }, { "entropy": 0.3627515258267522, "epoch": 0.15470493497197416, "grad_norm": 1.0750664472579956, "learning_rate": 1.0312483965313767e-05, "loss": 0.3668, "mean_token_accuracy": 0.8917936027050019, "num_tokens": 66427001.0, "step": 10050 }, { "entropy": 0.36310987211763857, "epoch": 0.15485887023065276, "grad_norm": 1.10531485080719, "learning_rate": 1.0322746164503054e-05, "loss": 0.3586, "mean_token_accuracy": 0.8922353930771351, "num_tokens": 66494230.0, "step": 10060 }, { "entropy": 0.3852401077747345, "epoch": 0.15501280548933133, "grad_norm": 0.9016704559326172, "learning_rate": 1.033300836369234e-05, "loss": 0.3817, "mean_token_accuracy": 0.8865027725696564, "num_tokens": 66564101.0, "step": 10070 }, { "entropy": 0.3701156251132488, "epoch": 0.1551667407480099, "grad_norm": 1.4698355197906494, "learning_rate": 1.0343270562881626e-05, "loss": 0.366, "mean_token_accuracy": 0.8894424512982368, "num_tokens": 66629203.0, "step": 10080 }, { "entropy": 0.3740495277568698, "epoch": 0.15532067600668847, "grad_norm": 1.0526795387268066, "learning_rate": 1.0353532762070912e-05, "loss": 0.3755, "mean_token_accuracy": 0.8888017863035202, "num_tokens": 66696989.0, "step": 10090 }, { "entropy": 0.35276208743453025, "epoch": 0.15547461126536707, "grad_norm": 0.8621221780776978, "learning_rate": 1.0363794961260198e-05, "loss": 0.347, "mean_token_accuracy": 0.8969864718616009, "num_tokens": 66761775.0, "step": 10100 }, { "entropy": 0.3567109966650605, "epoch": 0.15562854652404565, "grad_norm": 0.9741185307502747, "learning_rate": 1.0374057160449484e-05, "loss": 0.3547, "mean_token_accuracy": 0.8935653276741504, "num_tokens": 66829266.0, "step": 10110 }, { "entropy": 0.3655007597059011, "epoch": 0.15578248178272422, "grad_norm": 1.0064667463302612, "learning_rate": 1.038431935963877e-05, "loss": 0.3551, "mean_token_accuracy": 0.8921849861741066, "num_tokens": 66899245.0, "step": 10120 }, { "entropy": 0.3633456023409963, "epoch": 0.15593641704140282, "grad_norm": 0.9745544791221619, "learning_rate": 1.039458155882806e-05, "loss": 0.358, "mean_token_accuracy": 0.8899225071072578, "num_tokens": 66970614.0, "step": 10130 }, { "entropy": 0.3902129026129842, "epoch": 0.1560903523000814, "grad_norm": 0.9418654441833496, "learning_rate": 1.0404843758017345e-05, "loss": 0.3893, "mean_token_accuracy": 0.8848986007273197, "num_tokens": 67032548.0, "step": 10140 }, { "entropy": 0.36253134310245516, "epoch": 0.15624428755875996, "grad_norm": 1.1880676746368408, "learning_rate": 1.0415105957206631e-05, "loss": 0.3502, "mean_token_accuracy": 0.8945998832583427, "num_tokens": 67094974.0, "step": 10150 }, { "entropy": 0.36292612794786694, "epoch": 0.15639822281743856, "grad_norm": 1.1811809539794922, "learning_rate": 1.0425368156395917e-05, "loss": 0.3588, "mean_token_accuracy": 0.8906000383198261, "num_tokens": 67158472.0, "step": 10160 }, { "entropy": 0.38062991965562104, "epoch": 0.15655215807611714, "grad_norm": 1.1069482564926147, "learning_rate": 1.0435630355585203e-05, "loss": 0.3684, "mean_token_accuracy": 0.8883603930473327, "num_tokens": 67219351.0, "step": 10170 }, { "entropy": 0.37174352817237377, "epoch": 0.1567060933347957, "grad_norm": 1.1414361000061035, "learning_rate": 1.0445892554774489e-05, "loss": 0.3605, "mean_token_accuracy": 0.891122005134821, "num_tokens": 67285674.0, "step": 10180 }, { "entropy": 0.3423094689846039, "epoch": 0.1568600285934743, "grad_norm": 0.971333384513855, "learning_rate": 1.0456154753963775e-05, "loss": 0.3486, "mean_token_accuracy": 0.8982788264751435, "num_tokens": 67350934.0, "step": 10190 }, { "entropy": 0.37597265392541884, "epoch": 0.15701396385215288, "grad_norm": 1.0701488256454468, "learning_rate": 1.0466416953153063e-05, "loss": 0.3754, "mean_token_accuracy": 0.8892924815416337, "num_tokens": 67416527.0, "step": 10200 }, { "entropy": 0.3533797912299633, "epoch": 0.15716789911083145, "grad_norm": 1.0065929889678955, "learning_rate": 1.0476679152342348e-05, "loss": 0.3542, "mean_token_accuracy": 0.8923704698681831, "num_tokens": 67481604.0, "step": 10210 }, { "entropy": 0.3793426727876067, "epoch": 0.15732183436951006, "grad_norm": 1.2936853170394897, "learning_rate": 1.0486941351531634e-05, "loss": 0.3731, "mean_token_accuracy": 0.889318035542965, "num_tokens": 67542230.0, "step": 10220 }, { "entropy": 0.3838397070765495, "epoch": 0.15747576962818863, "grad_norm": 1.1168878078460693, "learning_rate": 1.049720355072092e-05, "loss": 0.3784, "mean_token_accuracy": 0.8874218456447125, "num_tokens": 67604368.0, "step": 10230 }, { "entropy": 0.37669097185134887, "epoch": 0.1576297048868672, "grad_norm": 1.2057017087936401, "learning_rate": 1.0507465749910206e-05, "loss": 0.3667, "mean_token_accuracy": 0.8873655021190643, "num_tokens": 67666777.0, "step": 10240 }, { "entropy": 0.3678618956357241, "epoch": 0.1577836401455458, "grad_norm": 0.8742278814315796, "learning_rate": 1.0517727949099492e-05, "loss": 0.3731, "mean_token_accuracy": 0.8892256058752537, "num_tokens": 67727434.0, "step": 10250 }, { "entropy": 0.36147007942199705, "epoch": 0.15793757540422437, "grad_norm": 0.949871838092804, "learning_rate": 1.0527990148288778e-05, "loss": 0.3647, "mean_token_accuracy": 0.8924664452672004, "num_tokens": 67799953.0, "step": 10260 }, { "entropy": 0.3718326030299067, "epoch": 0.15809151066290295, "grad_norm": 0.8623730540275574, "learning_rate": 1.0538252347478066e-05, "loss": 0.3795, "mean_token_accuracy": 0.8898218773305416, "num_tokens": 67873016.0, "step": 10270 }, { "entropy": 0.3688068469054997, "epoch": 0.15824544592158152, "grad_norm": 1.311579704284668, "learning_rate": 1.0548514546667352e-05, "loss": 0.3668, "mean_token_accuracy": 0.8921973332762718, "num_tokens": 67945722.0, "step": 10280 }, { "entropy": 0.35715268533676864, "epoch": 0.15839938118026012, "grad_norm": 1.3735698461532593, "learning_rate": 1.0558776745856638e-05, "loss": 0.3605, "mean_token_accuracy": 0.8899977631866932, "num_tokens": 68016289.0, "step": 10290 }, { "entropy": 0.3700808335095644, "epoch": 0.1585533164389387, "grad_norm": 0.9749259948730469, "learning_rate": 1.0569038945045924e-05, "loss": 0.3678, "mean_token_accuracy": 0.8880540937185287, "num_tokens": 68079240.0, "step": 10300 }, { "entropy": 0.3653397412970662, "epoch": 0.15870725169761726, "grad_norm": 0.8795186281204224, "learning_rate": 1.057930114423521e-05, "loss": 0.3476, "mean_token_accuracy": 0.8913263112306595, "num_tokens": 68144365.0, "step": 10310 }, { "entropy": 0.3694413617253304, "epoch": 0.15886118695629586, "grad_norm": 1.3340898752212524, "learning_rate": 1.0589563343424496e-05, "loss": 0.3814, "mean_token_accuracy": 0.8873363800346852, "num_tokens": 68215877.0, "step": 10320 }, { "entropy": 0.3807395942509174, "epoch": 0.15901512221497444, "grad_norm": 0.8809978365898132, "learning_rate": 1.0599825542613782e-05, "loss": 0.3753, "mean_token_accuracy": 0.8882056809961796, "num_tokens": 68284769.0, "step": 10330 }, { "entropy": 0.3571710869669914, "epoch": 0.159169057473653, "grad_norm": 1.0748190879821777, "learning_rate": 1.061008774180307e-05, "loss": 0.3585, "mean_token_accuracy": 0.8938510693609715, "num_tokens": 68356199.0, "step": 10340 }, { "entropy": 0.3622090512886643, "epoch": 0.1593229927323316, "grad_norm": 1.072919249534607, "learning_rate": 1.0620349940992357e-05, "loss": 0.3461, "mean_token_accuracy": 0.8918087676167488, "num_tokens": 68425850.0, "step": 10350 }, { "entropy": 0.38010069597512486, "epoch": 0.15947692799101018, "grad_norm": 1.1575322151184082, "learning_rate": 1.0630612140181643e-05, "loss": 0.3771, "mean_token_accuracy": 0.8869552329182625, "num_tokens": 68490530.0, "step": 10360 }, { "entropy": 0.3583028377965093, "epoch": 0.15963086324968875, "grad_norm": 1.0719696283340454, "learning_rate": 1.0640874339370929e-05, "loss": 0.3645, "mean_token_accuracy": 0.8911060407757759, "num_tokens": 68559118.0, "step": 10370 }, { "entropy": 0.3887250415980816, "epoch": 0.15978479850836735, "grad_norm": 0.9859808087348938, "learning_rate": 1.0651136538560215e-05, "loss": 0.3787, "mean_token_accuracy": 0.8846508964896203, "num_tokens": 68626907.0, "step": 10380 }, { "entropy": 0.38225669115781785, "epoch": 0.15993873376704593, "grad_norm": 1.0623259544372559, "learning_rate": 1.06613987377495e-05, "loss": 0.3615, "mean_token_accuracy": 0.8853197813034057, "num_tokens": 68696005.0, "step": 10390 }, { "entropy": 0.3632244667038321, "epoch": 0.1600926690257245, "grad_norm": 0.9468759894371033, "learning_rate": 1.0671660936938787e-05, "loss": 0.3611, "mean_token_accuracy": 0.8938015311956405, "num_tokens": 68769035.0, "step": 10400 }, { "entropy": 0.36657728888094426, "epoch": 0.1602466042844031, "grad_norm": 0.9760169386863708, "learning_rate": 1.0681923136128074e-05, "loss": 0.3577, "mean_token_accuracy": 0.89152412712574, "num_tokens": 68837910.0, "step": 10410 }, { "entropy": 0.3707921175286174, "epoch": 0.16040053954308167, "grad_norm": 0.9478567838668823, "learning_rate": 1.069218533531736e-05, "loss": 0.3579, "mean_token_accuracy": 0.8876467004418374, "num_tokens": 68903293.0, "step": 10420 }, { "entropy": 0.3603986954316497, "epoch": 0.16055447480176024, "grad_norm": 1.0007624626159668, "learning_rate": 1.0702447534506646e-05, "loss": 0.354, "mean_token_accuracy": 0.891101585328579, "num_tokens": 68970017.0, "step": 10430 }, { "entropy": 0.3753379609435797, "epoch": 0.16070841006043884, "grad_norm": 1.2794084548950195, "learning_rate": 1.0712709733695932e-05, "loss": 0.365, "mean_token_accuracy": 0.8887268804013729, "num_tokens": 69037772.0, "step": 10440 }, { "entropy": 0.35934779439121484, "epoch": 0.16086234531911742, "grad_norm": 1.1469686031341553, "learning_rate": 1.0722971932885218e-05, "loss": 0.3632, "mean_token_accuracy": 0.893193930387497, "num_tokens": 69098083.0, "step": 10450 }, { "entropy": 0.36223105182871224, "epoch": 0.161016280577796, "grad_norm": 1.0016130208969116, "learning_rate": 1.0733234132074504e-05, "loss": 0.3671, "mean_token_accuracy": 0.8905196659266948, "num_tokens": 69157406.0, "step": 10460 }, { "entropy": 0.35597382597625254, "epoch": 0.16117021583647456, "grad_norm": 0.8814265727996826, "learning_rate": 1.074349633126379e-05, "loss": 0.3545, "mean_token_accuracy": 0.8949585281312465, "num_tokens": 69226109.0, "step": 10470 }, { "entropy": 0.396559807099402, "epoch": 0.16132415109515316, "grad_norm": 0.9141523241996765, "learning_rate": 1.0753758530453077e-05, "loss": 0.3847, "mean_token_accuracy": 0.8820580244064331, "num_tokens": 69292933.0, "step": 10480 }, { "entropy": 0.35285013634711504, "epoch": 0.16147808635383173, "grad_norm": 1.3368574380874634, "learning_rate": 1.0764020729642363e-05, "loss": 0.3409, "mean_token_accuracy": 0.8924467109143734, "num_tokens": 69361445.0, "step": 10490 }, { "entropy": 0.37493516635149715, "epoch": 0.1616320216125103, "grad_norm": 1.2028579711914062, "learning_rate": 1.077428292883165e-05, "loss": 0.3825, "mean_token_accuracy": 0.8880997613072396, "num_tokens": 69425194.0, "step": 10500 }, { "entropy": 0.3897503925487399, "epoch": 0.1617859568711889, "grad_norm": 1.1605091094970703, "learning_rate": 1.0784545128020935e-05, "loss": 0.3816, "mean_token_accuracy": 0.8855008065700531, "num_tokens": 69481438.0, "step": 10510 }, { "entropy": 0.3675442495383322, "epoch": 0.16193989212986748, "grad_norm": 1.0920733213424683, "learning_rate": 1.0794807327210221e-05, "loss": 0.369, "mean_token_accuracy": 0.8914564065635204, "num_tokens": 69546326.0, "step": 10520 }, { "entropy": 0.3626975230872631, "epoch": 0.16209382738854605, "grad_norm": 1.4608960151672363, "learning_rate": 1.0805069526399507e-05, "loss": 0.3443, "mean_token_accuracy": 0.8937839038670063, "num_tokens": 69612836.0, "step": 10530 }, { "entropy": 0.38009179662913084, "epoch": 0.16224776264722465, "grad_norm": 0.8470841646194458, "learning_rate": 1.0815331725588793e-05, "loss": 0.377, "mean_token_accuracy": 0.8888774015009403, "num_tokens": 69678815.0, "step": 10540 }, { "entropy": 0.3558527658693492, "epoch": 0.16240169790590323, "grad_norm": 1.1682864427566528, "learning_rate": 1.0825593924778082e-05, "loss": 0.3533, "mean_token_accuracy": 0.8938690245151519, "num_tokens": 69745069.0, "step": 10550 }, { "entropy": 0.36717404145747423, "epoch": 0.1625556331645818, "grad_norm": 1.062397837638855, "learning_rate": 1.0835856123967368e-05, "loss": 0.3632, "mean_token_accuracy": 0.8907847620546818, "num_tokens": 69803796.0, "step": 10560 }, { "entropy": 0.36510087475180625, "epoch": 0.1627095684232604, "grad_norm": 1.007825255393982, "learning_rate": 1.0846118323156654e-05, "loss": 0.3584, "mean_token_accuracy": 0.8912466712296009, "num_tokens": 69874950.0, "step": 10570 }, { "entropy": 0.36882289350032804, "epoch": 0.16286350368193897, "grad_norm": 0.8991548418998718, "learning_rate": 1.085638052234594e-05, "loss": 0.36, "mean_token_accuracy": 0.8882355369627476, "num_tokens": 69944661.0, "step": 10580 }, { "entropy": 0.36504187118262055, "epoch": 0.16301743894061754, "grad_norm": 1.2302838563919067, "learning_rate": 1.0866642721535226e-05, "loss": 0.3538, "mean_token_accuracy": 0.8919628866016864, "num_tokens": 70010353.0, "step": 10590 }, { "entropy": 0.3687148792669177, "epoch": 0.16317137419929614, "grad_norm": 0.8783401250839233, "learning_rate": 1.0876904920724512e-05, "loss": 0.3705, "mean_token_accuracy": 0.8879882499575615, "num_tokens": 70081695.0, "step": 10600 }, { "entropy": 0.3573172532021999, "epoch": 0.16332530945797472, "grad_norm": 1.4582313299179077, "learning_rate": 1.0887167119913798e-05, "loss": 0.3623, "mean_token_accuracy": 0.8950750544667244, "num_tokens": 70145032.0, "step": 10610 }, { "entropy": 0.3654877331107855, "epoch": 0.1634792447166533, "grad_norm": 0.9867797493934631, "learning_rate": 1.0897429319103086e-05, "loss": 0.3631, "mean_token_accuracy": 0.8894522964954377, "num_tokens": 70213807.0, "step": 10620 }, { "entropy": 0.38560765273869035, "epoch": 0.1636331799753319, "grad_norm": 0.9297165274620056, "learning_rate": 1.0907691518292372e-05, "loss": 0.3671, "mean_token_accuracy": 0.8861762836575509, "num_tokens": 70285225.0, "step": 10630 }, { "entropy": 0.35958816166967156, "epoch": 0.16378711523401046, "grad_norm": 1.1129131317138672, "learning_rate": 1.0917953717481658e-05, "loss": 0.3668, "mean_token_accuracy": 0.8880079165101051, "num_tokens": 70354499.0, "step": 10640 }, { "entropy": 0.3748537825420499, "epoch": 0.16394105049268903, "grad_norm": 1.3786412477493286, "learning_rate": 1.0928215916670944e-05, "loss": 0.3711, "mean_token_accuracy": 0.8909391619265079, "num_tokens": 70419962.0, "step": 10650 }, { "entropy": 0.3572362121194601, "epoch": 0.1640949857513676, "grad_norm": 1.1869874000549316, "learning_rate": 1.093847811586023e-05, "loss": 0.3682, "mean_token_accuracy": 0.8951291866600514, "num_tokens": 70493826.0, "step": 10660 }, { "entropy": 0.3485226223245263, "epoch": 0.1642489210100462, "grad_norm": 0.8306623697280884, "learning_rate": 1.0948740315049516e-05, "loss": 0.351, "mean_token_accuracy": 0.8965601928532123, "num_tokens": 70560500.0, "step": 10670 }, { "entropy": 0.34973593391478064, "epoch": 0.16440285626872478, "grad_norm": 0.9659526944160461, "learning_rate": 1.0959002514238801e-05, "loss": 0.366, "mean_token_accuracy": 0.8960307441651821, "num_tokens": 70623189.0, "step": 10680 }, { "entropy": 0.3795007038861513, "epoch": 0.16455679152740335, "grad_norm": 0.909063994884491, "learning_rate": 1.0969264713428089e-05, "loss": 0.373, "mean_token_accuracy": 0.886588717252016, "num_tokens": 70689634.0, "step": 10690 }, { "entropy": 0.37708673998713493, "epoch": 0.16471072678608195, "grad_norm": 0.8974459767341614, "learning_rate": 1.0979526912617375e-05, "loss": 0.3717, "mean_token_accuracy": 0.8894701212644577, "num_tokens": 70769022.0, "step": 10700 }, { "entropy": 0.35642101541161536, "epoch": 0.16486466204476052, "grad_norm": 1.0445367097854614, "learning_rate": 1.0989789111806661e-05, "loss": 0.35, "mean_token_accuracy": 0.8929494939744472, "num_tokens": 70841066.0, "step": 10710 }, { "entropy": 0.37198880072683094, "epoch": 0.1650185973034391, "grad_norm": 1.1077598333358765, "learning_rate": 1.1000051310995947e-05, "loss": 0.3632, "mean_token_accuracy": 0.8882399685680866, "num_tokens": 70905660.0, "step": 10720 }, { "entropy": 0.36980657279491425, "epoch": 0.1651725325621177, "grad_norm": 0.9263826012611389, "learning_rate": 1.1010313510185233e-05, "loss": 0.3564, "mean_token_accuracy": 0.8896138094365597, "num_tokens": 70973049.0, "step": 10730 }, { "entropy": 0.3702884713187814, "epoch": 0.16532646782079627, "grad_norm": 1.1688071489334106, "learning_rate": 1.1020575709374519e-05, "loss": 0.3615, "mean_token_accuracy": 0.8868311822414399, "num_tokens": 71039686.0, "step": 10740 }, { "entropy": 0.37103391662240026, "epoch": 0.16548040307947484, "grad_norm": 0.9353637099266052, "learning_rate": 1.1030837908563805e-05, "loss": 0.357, "mean_token_accuracy": 0.8907366521656513, "num_tokens": 71118341.0, "step": 10750 }, { "entropy": 0.359919885545969, "epoch": 0.16563433833815344, "grad_norm": 1.1045199632644653, "learning_rate": 1.1041100107753092e-05, "loss": 0.3536, "mean_token_accuracy": 0.8938797652721405, "num_tokens": 71180397.0, "step": 10760 }, { "entropy": 0.3810721319168806, "epoch": 0.16578827359683201, "grad_norm": 1.0454723834991455, "learning_rate": 1.1051362306942378e-05, "loss": 0.3845, "mean_token_accuracy": 0.8849816508591175, "num_tokens": 71244506.0, "step": 10770 }, { "entropy": 0.3532800318673253, "epoch": 0.1659422088555106, "grad_norm": 1.04214608669281, "learning_rate": 1.1061624506131664e-05, "loss": 0.3366, "mean_token_accuracy": 0.8970113761723042, "num_tokens": 71310362.0, "step": 10780 }, { "entropy": 0.3647288382053375, "epoch": 0.1660961441141892, "grad_norm": 1.1720908880233765, "learning_rate": 1.107188670532095e-05, "loss": 0.3529, "mean_token_accuracy": 0.8911557406187057, "num_tokens": 71382598.0, "step": 10790 }, { "entropy": 0.37489817291498184, "epoch": 0.16625007937286776, "grad_norm": 1.0223392248153687, "learning_rate": 1.1082148904510236e-05, "loss": 0.3633, "mean_token_accuracy": 0.8864688672125339, "num_tokens": 71446102.0, "step": 10800 }, { "entropy": 0.37869728822261095, "epoch": 0.16640401463154633, "grad_norm": 1.134292483329773, "learning_rate": 1.1092411103699522e-05, "loss": 0.3662, "mean_token_accuracy": 0.8881227627396584, "num_tokens": 71520829.0, "step": 10810 }, { "entropy": 0.35437496062368157, "epoch": 0.16655794989022493, "grad_norm": 1.1578501462936401, "learning_rate": 1.1102673302888808e-05, "loss": 0.3557, "mean_token_accuracy": 0.8920820720493794, "num_tokens": 71589729.0, "step": 10820 }, { "entropy": 0.3764654850587249, "epoch": 0.1667118851489035, "grad_norm": 1.2339365482330322, "learning_rate": 1.1112935502078097e-05, "loss": 0.3755, "mean_token_accuracy": 0.8886642456054688, "num_tokens": 71658868.0, "step": 10830 }, { "entropy": 0.3671187641099095, "epoch": 0.16686582040758208, "grad_norm": 1.1273040771484375, "learning_rate": 1.1123197701267383e-05, "loss": 0.3675, "mean_token_accuracy": 0.890870001912117, "num_tokens": 71725332.0, "step": 10840 }, { "entropy": 0.3639064282178879, "epoch": 0.16701975566626065, "grad_norm": 0.9500343203544617, "learning_rate": 1.113345990045667e-05, "loss": 0.3665, "mean_token_accuracy": 0.8911407865583897, "num_tokens": 71793700.0, "step": 10850 }, { "entropy": 0.3815064262598753, "epoch": 0.16717369092493925, "grad_norm": 1.1124459505081177, "learning_rate": 1.1143722099645955e-05, "loss": 0.3936, "mean_token_accuracy": 0.8873532555997372, "num_tokens": 71854332.0, "step": 10860 }, { "entropy": 0.39492093604058026, "epoch": 0.16732762618361782, "grad_norm": 1.1400431394577026, "learning_rate": 1.1153984298835241e-05, "loss": 0.3737, "mean_token_accuracy": 0.8839095383882523, "num_tokens": 71935665.0, "step": 10870 }, { "entropy": 0.35395246706902983, "epoch": 0.1674815614422964, "grad_norm": 1.194289207458496, "learning_rate": 1.1164246498024527e-05, "loss": 0.3641, "mean_token_accuracy": 0.8930094510316848, "num_tokens": 71999407.0, "step": 10880 }, { "entropy": 0.346639065630734, "epoch": 0.167635496700975, "grad_norm": 1.307450532913208, "learning_rate": 1.1174508697213813e-05, "loss": 0.3515, "mean_token_accuracy": 0.8951026268303395, "num_tokens": 72061608.0, "step": 10890 }, { "entropy": 0.3950799200683832, "epoch": 0.16778943195965357, "grad_norm": 1.0118749141693115, "learning_rate": 1.11847708964031e-05, "loss": 0.3812, "mean_token_accuracy": 0.882252000272274, "num_tokens": 72122644.0, "step": 10900 }, { "entropy": 0.3810609744861722, "epoch": 0.16794336721833214, "grad_norm": 1.4135937690734863, "learning_rate": 1.1195033095592387e-05, "loss": 0.3666, "mean_token_accuracy": 0.888980895280838, "num_tokens": 72189296.0, "step": 10910 }, { "entropy": 0.35915468335151673, "epoch": 0.16809730247701074, "grad_norm": 1.2816193103790283, "learning_rate": 1.1205295294781673e-05, "loss": 0.3587, "mean_token_accuracy": 0.890455074608326, "num_tokens": 72257688.0, "step": 10920 }, { "entropy": 0.3654381912201643, "epoch": 0.1682512377356893, "grad_norm": 1.4621587991714478, "learning_rate": 1.1215557493970959e-05, "loss": 0.3704, "mean_token_accuracy": 0.8914733193814754, "num_tokens": 72320989.0, "step": 10930 }, { "entropy": 0.3524819567799568, "epoch": 0.16840517299436789, "grad_norm": 1.5644153356552124, "learning_rate": 1.1225819693160245e-05, "loss": 0.3565, "mean_token_accuracy": 0.8958624824881554, "num_tokens": 72387159.0, "step": 10940 }, { "entropy": 0.3752221858128905, "epoch": 0.16855910825304649, "grad_norm": 0.9265833497047424, "learning_rate": 1.123608189234953e-05, "loss": 0.3612, "mean_token_accuracy": 0.8869732089340687, "num_tokens": 72460181.0, "step": 10950 }, { "entropy": 0.34957735426723957, "epoch": 0.16871304351172506, "grad_norm": 1.0917813777923584, "learning_rate": 1.1246344091538816e-05, "loss": 0.3434, "mean_token_accuracy": 0.8961347796022892, "num_tokens": 72533148.0, "step": 10960 }, { "entropy": 0.36815793439745903, "epoch": 0.16886697877040363, "grad_norm": 0.9173478484153748, "learning_rate": 1.1256606290728104e-05, "loss": 0.3626, "mean_token_accuracy": 0.890468118339777, "num_tokens": 72597448.0, "step": 10970 }, { "entropy": 0.3678385313600302, "epoch": 0.16902091402908223, "grad_norm": 1.2373212575912476, "learning_rate": 1.126686848991739e-05, "loss": 0.3609, "mean_token_accuracy": 0.8900888189673424, "num_tokens": 72667630.0, "step": 10980 }, { "entropy": 0.3442217745818198, "epoch": 0.1691748492877608, "grad_norm": 1.0282894372940063, "learning_rate": 1.1277130689106676e-05, "loss": 0.3465, "mean_token_accuracy": 0.8950027093291283, "num_tokens": 72736855.0, "step": 10990 }, { "entropy": 0.36576870921999216, "epoch": 0.16932878454643938, "grad_norm": 0.867874264717102, "learning_rate": 1.1287392888295962e-05, "loss": 0.358, "mean_token_accuracy": 0.8925472415983677, "num_tokens": 72807373.0, "step": 11000 }, { "entropy": 0.3443356351926923, "epoch": 0.16948271980511798, "grad_norm": 1.1101855039596558, "learning_rate": 1.1297655087485248e-05, "loss": 0.3513, "mean_token_accuracy": 0.8973018504679203, "num_tokens": 72881108.0, "step": 11010 }, { "entropy": 0.36280818209052085, "epoch": 0.16963665506379655, "grad_norm": 1.0717730522155762, "learning_rate": 1.1307917286674534e-05, "loss": 0.3554, "mean_token_accuracy": 0.8879412159323692, "num_tokens": 72944823.0, "step": 11020 }, { "entropy": 0.3657096056267619, "epoch": 0.16979059032247512, "grad_norm": 0.8116201162338257, "learning_rate": 1.131817948586382e-05, "loss": 0.3603, "mean_token_accuracy": 0.8928468383848667, "num_tokens": 73013076.0, "step": 11030 }, { "entropy": 0.3650772899389267, "epoch": 0.1699445255811537, "grad_norm": 1.0305383205413818, "learning_rate": 1.1328441685053109e-05, "loss": 0.3565, "mean_token_accuracy": 0.8905553027987481, "num_tokens": 73087387.0, "step": 11040 }, { "entropy": 0.3675177074968815, "epoch": 0.1700984608398323, "grad_norm": 1.1571955680847168, "learning_rate": 1.1338703884242395e-05, "loss": 0.3684, "mean_token_accuracy": 0.889612103253603, "num_tokens": 73154624.0, "step": 11050 }, { "entropy": 0.35993093997240067, "epoch": 0.17025239609851087, "grad_norm": 0.9971462488174438, "learning_rate": 1.1348966083431681e-05, "loss": 0.3575, "mean_token_accuracy": 0.8948220923542977, "num_tokens": 73222795.0, "step": 11060 }, { "entropy": 0.35243437606841327, "epoch": 0.17040633135718944, "grad_norm": 1.2678110599517822, "learning_rate": 1.1359228282620967e-05, "loss": 0.3363, "mean_token_accuracy": 0.8945498906075955, "num_tokens": 73289498.0, "step": 11070 }, { "entropy": 0.3499010674655437, "epoch": 0.17056026661586804, "grad_norm": 1.0235035419464111, "learning_rate": 1.1369490481810253e-05, "loss": 0.3478, "mean_token_accuracy": 0.8960938587784767, "num_tokens": 73352290.0, "step": 11080 }, { "entropy": 0.35724786203354597, "epoch": 0.1707142018745466, "grad_norm": 1.3945056200027466, "learning_rate": 1.1379752680999539e-05, "loss": 0.3492, "mean_token_accuracy": 0.8914260916411877, "num_tokens": 73418509.0, "step": 11090 }, { "entropy": 0.3708932615816593, "epoch": 0.17086813713322518, "grad_norm": 0.9325275421142578, "learning_rate": 1.1390014880188825e-05, "loss": 0.3666, "mean_token_accuracy": 0.88999824821949, "num_tokens": 73485441.0, "step": 11100 }, { "entropy": 0.35862792674452065, "epoch": 0.17102207239190378, "grad_norm": 1.0436254739761353, "learning_rate": 1.1400277079378112e-05, "loss": 0.3566, "mean_token_accuracy": 0.8939824491739273, "num_tokens": 73549561.0, "step": 11110 }, { "entropy": 0.3661471953615546, "epoch": 0.17117600765058236, "grad_norm": 0.9528864026069641, "learning_rate": 1.1410539278567398e-05, "loss": 0.3624, "mean_token_accuracy": 0.8907121613621711, "num_tokens": 73615797.0, "step": 11120 }, { "entropy": 0.3693573135882616, "epoch": 0.17132994290926093, "grad_norm": 1.0293779373168945, "learning_rate": 1.1420801477756684e-05, "loss": 0.3691, "mean_token_accuracy": 0.8903206795454025, "num_tokens": 73683620.0, "step": 11130 }, { "entropy": 0.37297889199107886, "epoch": 0.17148387816793953, "grad_norm": 1.1785625219345093, "learning_rate": 1.143106367694597e-05, "loss": 0.3556, "mean_token_accuracy": 0.8911580890417099, "num_tokens": 73750314.0, "step": 11140 }, { "entropy": 0.366870036162436, "epoch": 0.1716378134266181, "grad_norm": 1.1074758768081665, "learning_rate": 1.1441325876135256e-05, "loss": 0.362, "mean_token_accuracy": 0.8906842522323132, "num_tokens": 73823338.0, "step": 11150 }, { "entropy": 0.34488157387822865, "epoch": 0.17179174868529667, "grad_norm": 1.1654596328735352, "learning_rate": 1.1451588075324542e-05, "loss": 0.3401, "mean_token_accuracy": 0.8965043090283871, "num_tokens": 73893855.0, "step": 11160 }, { "entropy": 0.3707609100267291, "epoch": 0.17194568394397527, "grad_norm": 0.7683007121086121, "learning_rate": 1.1461850274513828e-05, "loss": 0.3756, "mean_token_accuracy": 0.889462323486805, "num_tokens": 73963621.0, "step": 11170 }, { "entropy": 0.36703203711658716, "epoch": 0.17209961920265385, "grad_norm": 1.2197864055633545, "learning_rate": 1.1472112473703116e-05, "loss": 0.3674, "mean_token_accuracy": 0.8923780798912049, "num_tokens": 74018105.0, "step": 11180 }, { "entropy": 0.3752311462536454, "epoch": 0.17225355446133242, "grad_norm": 0.8429661989212036, "learning_rate": 1.1482374672892402e-05, "loss": 0.3712, "mean_token_accuracy": 0.8897179864346981, "num_tokens": 74080537.0, "step": 11190 }, { "entropy": 0.36353593934327366, "epoch": 0.17240748972001102, "grad_norm": 1.0628674030303955, "learning_rate": 1.1492636872081688e-05, "loss": 0.3656, "mean_token_accuracy": 0.8924637921154499, "num_tokens": 74141933.0, "step": 11200 }, { "entropy": 0.36649113334715366, "epoch": 0.1725614249786896, "grad_norm": 1.133853793144226, "learning_rate": 1.1502899071270974e-05, "loss": 0.3618, "mean_token_accuracy": 0.8899900451302528, "num_tokens": 74213199.0, "step": 11210 }, { "entropy": 0.3618604289367795, "epoch": 0.17271536023736817, "grad_norm": 1.3968298435211182, "learning_rate": 1.151316127046026e-05, "loss": 0.3513, "mean_token_accuracy": 0.8948212526738644, "num_tokens": 74282137.0, "step": 11220 }, { "entropy": 0.3556383879855275, "epoch": 0.17286929549604674, "grad_norm": 1.1131987571716309, "learning_rate": 1.1523423469649545e-05, "loss": 0.3627, "mean_token_accuracy": 0.8920914746820927, "num_tokens": 74350648.0, "step": 11230 }, { "entropy": 0.36849641613662243, "epoch": 0.17302323075472534, "grad_norm": 1.120004653930664, "learning_rate": 1.1533685668838831e-05, "loss": 0.36, "mean_token_accuracy": 0.8895770497620106, "num_tokens": 74422179.0, "step": 11240 }, { "entropy": 0.370517772436142, "epoch": 0.1731771660134039, "grad_norm": 0.9872550368309021, "learning_rate": 1.154394786802812e-05, "loss": 0.3615, "mean_token_accuracy": 0.8894947275519371, "num_tokens": 74481750.0, "step": 11250 }, { "entropy": 0.37605318110436203, "epoch": 0.17333110127208248, "grad_norm": 0.988788902759552, "learning_rate": 1.1554210067217407e-05, "loss": 0.3719, "mean_token_accuracy": 0.8885431334376335, "num_tokens": 74546034.0, "step": 11260 }, { "entropy": 0.36679733032360673, "epoch": 0.17348503653076108, "grad_norm": 1.1372240781784058, "learning_rate": 1.1564472266406693e-05, "loss": 0.3659, "mean_token_accuracy": 0.8891807816922664, "num_tokens": 74610725.0, "step": 11270 }, { "entropy": 0.3712363686412573, "epoch": 0.17363897178943966, "grad_norm": 1.3739112615585327, "learning_rate": 1.1574734465595979e-05, "loss": 0.3696, "mean_token_accuracy": 0.8889528259634971, "num_tokens": 74678124.0, "step": 11280 }, { "entropy": 0.34832353349775075, "epoch": 0.17379290704811823, "grad_norm": 0.9466639161109924, "learning_rate": 1.1584996664785265e-05, "loss": 0.3393, "mean_token_accuracy": 0.896711677312851, "num_tokens": 74752578.0, "step": 11290 }, { "entropy": 0.3616205113008618, "epoch": 0.17394684230679683, "grad_norm": 1.369436502456665, "learning_rate": 1.159525886397455e-05, "loss": 0.3668, "mean_token_accuracy": 0.8894391544163227, "num_tokens": 74808713.0, "step": 11300 }, { "entropy": 0.35920630767941475, "epoch": 0.1741007775654754, "grad_norm": 1.02012038230896, "learning_rate": 1.1605521063163836e-05, "loss": 0.3448, "mean_token_accuracy": 0.8926343604922294, "num_tokens": 74870525.0, "step": 11310 }, { "entropy": 0.3631144633516669, "epoch": 0.17425471282415397, "grad_norm": 1.0874216556549072, "learning_rate": 1.1615783262353124e-05, "loss": 0.3543, "mean_token_accuracy": 0.8923752784729004, "num_tokens": 74939874.0, "step": 11320 }, { "entropy": 0.37109795212745667, "epoch": 0.17440864808283257, "grad_norm": 0.9172027111053467, "learning_rate": 1.162604546154241e-05, "loss": 0.3722, "mean_token_accuracy": 0.8910910919308662, "num_tokens": 75004964.0, "step": 11330 }, { "entropy": 0.33920574486255645, "epoch": 0.17456258334151115, "grad_norm": 1.2008745670318604, "learning_rate": 1.1636307660731696e-05, "loss": 0.3416, "mean_token_accuracy": 0.901589635014534, "num_tokens": 75061105.0, "step": 11340 }, { "entropy": 0.35463760420680046, "epoch": 0.17471651860018972, "grad_norm": 0.9704163074493408, "learning_rate": 1.1646569859920982e-05, "loss": 0.3625, "mean_token_accuracy": 0.8931730180978775, "num_tokens": 75120676.0, "step": 11350 }, { "entropy": 0.3814159628003836, "epoch": 0.17487045385886832, "grad_norm": 0.8748956918716431, "learning_rate": 1.1656832059110268e-05, "loss": 0.3662, "mean_token_accuracy": 0.8886724606156349, "num_tokens": 75189803.0, "step": 11360 }, { "entropy": 0.37744920402765275, "epoch": 0.1750243891175469, "grad_norm": 1.1003925800323486, "learning_rate": 1.1667094258299554e-05, "loss": 0.3539, "mean_token_accuracy": 0.8879048228263855, "num_tokens": 75265928.0, "step": 11370 }, { "entropy": 0.3698181141167879, "epoch": 0.17517832437622546, "grad_norm": 1.1004750728607178, "learning_rate": 1.167735645748884e-05, "loss": 0.3693, "mean_token_accuracy": 0.8896317414939403, "num_tokens": 75339750.0, "step": 11380 }, { "entropy": 0.34389972481876613, "epoch": 0.17533225963490406, "grad_norm": 1.1678074598312378, "learning_rate": 1.1687618656678127e-05, "loss": 0.3496, "mean_token_accuracy": 0.8974109947681427, "num_tokens": 75411766.0, "step": 11390 }, { "entropy": 0.33962323172017933, "epoch": 0.17548619489358264, "grad_norm": 1.0843145847320557, "learning_rate": 1.1697880855867413e-05, "loss": 0.349, "mean_token_accuracy": 0.8969196975231171, "num_tokens": 75472196.0, "step": 11400 }, { "entropy": 0.34310347847640515, "epoch": 0.1756401301522612, "grad_norm": 1.1918199062347412, "learning_rate": 1.17081430550567e-05, "loss": 0.3509, "mean_token_accuracy": 0.897018413990736, "num_tokens": 75541684.0, "step": 11410 }, { "entropy": 0.37728849351406096, "epoch": 0.17579406541093978, "grad_norm": 1.155470609664917, "learning_rate": 1.1718405254245985e-05, "loss": 0.3559, "mean_token_accuracy": 0.8889025293290616, "num_tokens": 75610198.0, "step": 11420 }, { "entropy": 0.36960522569715976, "epoch": 0.17594800066961838, "grad_norm": 1.1649020910263062, "learning_rate": 1.1728667453435271e-05, "loss": 0.3671, "mean_token_accuracy": 0.8907266698777676, "num_tokens": 75670791.0, "step": 11430 }, { "entropy": 0.3585035800933838, "epoch": 0.17610193592829695, "grad_norm": 0.8958064317703247, "learning_rate": 1.1738929652624557e-05, "loss": 0.3613, "mean_token_accuracy": 0.8921600796282292, "num_tokens": 75738131.0, "step": 11440 }, { "entropy": 0.3646376380696893, "epoch": 0.17625587118697553, "grad_norm": 1.086273193359375, "learning_rate": 1.1749191851813843e-05, "loss": 0.364, "mean_token_accuracy": 0.893036013841629, "num_tokens": 75807054.0, "step": 11450 }, { "entropy": 0.36033814307302237, "epoch": 0.17640980644565413, "grad_norm": 1.4513825178146362, "learning_rate": 1.1759454051003132e-05, "loss": 0.3484, "mean_token_accuracy": 0.8932399690151215, "num_tokens": 75869491.0, "step": 11460 }, { "entropy": 0.3637367507442832, "epoch": 0.1765637417043327, "grad_norm": 1.0419394969940186, "learning_rate": 1.1769716250192418e-05, "loss": 0.3513, "mean_token_accuracy": 0.8909502677619457, "num_tokens": 75930081.0, "step": 11470 }, { "entropy": 0.346670382283628, "epoch": 0.17671767696301127, "grad_norm": 0.9156445860862732, "learning_rate": 1.1779978449381704e-05, "loss": 0.3508, "mean_token_accuracy": 0.8944143950939178, "num_tokens": 75998990.0, "step": 11480 }, { "entropy": 0.38323199450969697, "epoch": 0.17687161222168987, "grad_norm": 1.0800260305404663, "learning_rate": 1.179024064857099e-05, "loss": 0.3705, "mean_token_accuracy": 0.8855953179299831, "num_tokens": 76058859.0, "step": 11490 }, { "entropy": 0.34631476635113356, "epoch": 0.17702554748036844, "grad_norm": 0.9651276469230652, "learning_rate": 1.1800502847760276e-05, "loss": 0.3506, "mean_token_accuracy": 0.8962479025125504, "num_tokens": 76127406.0, "step": 11500 }, { "entropy": 0.3746170559898019, "epoch": 0.17717948273904702, "grad_norm": 0.8826376795768738, "learning_rate": 1.1810765046949562e-05, "loss": 0.3639, "mean_token_accuracy": 0.8873319305479527, "num_tokens": 76189644.0, "step": 11510 }, { "entropy": 0.3780833661556244, "epoch": 0.17733341799772562, "grad_norm": 1.2389671802520752, "learning_rate": 1.1821027246138848e-05, "loss": 0.368, "mean_token_accuracy": 0.886408019810915, "num_tokens": 76249589.0, "step": 11520 }, { "entropy": 0.38660849779844286, "epoch": 0.1774873532564042, "grad_norm": 0.8616106510162354, "learning_rate": 1.1831289445328136e-05, "loss": 0.3684, "mean_token_accuracy": 0.8844117693603039, "num_tokens": 76325386.0, "step": 11530 }, { "entropy": 0.3647430343553424, "epoch": 0.17764128851508276, "grad_norm": 0.9665796756744385, "learning_rate": 1.1841551644517422e-05, "loss": 0.3517, "mean_token_accuracy": 0.8909231014549732, "num_tokens": 76396825.0, "step": 11540 }, { "entropy": 0.3602033512666821, "epoch": 0.17779522377376136, "grad_norm": 1.0970573425292969, "learning_rate": 1.1851813843706708e-05, "loss": 0.3566, "mean_token_accuracy": 0.8901985324919224, "num_tokens": 76467119.0, "step": 11550 }, { "entropy": 0.38886030167341235, "epoch": 0.17794915903243994, "grad_norm": 0.9971334338188171, "learning_rate": 1.1862076042895994e-05, "loss": 0.3739, "mean_token_accuracy": 0.8881716027855873, "num_tokens": 76536696.0, "step": 11560 }, { "entropy": 0.3565880352631211, "epoch": 0.1781030942911185, "grad_norm": 0.9471988081932068, "learning_rate": 1.187233824208528e-05, "loss": 0.3538, "mean_token_accuracy": 0.8918481968343258, "num_tokens": 76605282.0, "step": 11570 }, { "entropy": 0.4017040733247995, "epoch": 0.17825702954979708, "grad_norm": 1.0693265199661255, "learning_rate": 1.1882600441274565e-05, "loss": 0.4003, "mean_token_accuracy": 0.8767531581223011, "num_tokens": 76675542.0, "step": 11580 }, { "entropy": 0.3508052956312895, "epoch": 0.17841096480847568, "grad_norm": 0.9552015066146851, "learning_rate": 1.1892862640463851e-05, "loss": 0.3357, "mean_token_accuracy": 0.8975337900221347, "num_tokens": 76739591.0, "step": 11590 }, { "entropy": 0.3569879673421383, "epoch": 0.17856490006715425, "grad_norm": 1.4250537157058716, "learning_rate": 1.1903124839653139e-05, "loss": 0.3595, "mean_token_accuracy": 0.8952900677919388, "num_tokens": 76799122.0, "step": 11600 }, { "entropy": 0.3828628171235323, "epoch": 0.17871883532583283, "grad_norm": 1.1470942497253418, "learning_rate": 1.1913387038842425e-05, "loss": 0.3836, "mean_token_accuracy": 0.8834895834326744, "num_tokens": 76863706.0, "step": 11610 }, { "entropy": 0.3656041666865349, "epoch": 0.17887277058451143, "grad_norm": 0.995202362537384, "learning_rate": 1.1923649238031711e-05, "loss": 0.3487, "mean_token_accuracy": 0.8939320847392083, "num_tokens": 76930221.0, "step": 11620 }, { "entropy": 0.3610787719488144, "epoch": 0.17902670584319, "grad_norm": 1.2366081476211548, "learning_rate": 1.1933911437220997e-05, "loss": 0.3596, "mean_token_accuracy": 0.8902872160077095, "num_tokens": 77000457.0, "step": 11630 }, { "entropy": 0.3636155209504068, "epoch": 0.17918064110186857, "grad_norm": 1.2518510818481445, "learning_rate": 1.1944173636410283e-05, "loss": 0.3582, "mean_token_accuracy": 0.8896710559725761, "num_tokens": 77073477.0, "step": 11640 }, { "entropy": 0.3389192149043083, "epoch": 0.17933457636054717, "grad_norm": 0.9271234273910522, "learning_rate": 1.1954435835599569e-05, "loss": 0.3368, "mean_token_accuracy": 0.8972251273691654, "num_tokens": 77142413.0, "step": 11650 }, { "entropy": 0.3747572500258684, "epoch": 0.17948851161922574, "grad_norm": 1.053276777267456, "learning_rate": 1.1964698034788855e-05, "loss": 0.3593, "mean_token_accuracy": 0.8878202229738236, "num_tokens": 77214492.0, "step": 11660 }, { "entropy": 0.3560306800529361, "epoch": 0.17964244687790432, "grad_norm": 1.299925446510315, "learning_rate": 1.1974960233978144e-05, "loss": 0.3494, "mean_token_accuracy": 0.895259128510952, "num_tokens": 77281463.0, "step": 11670 }, { "entropy": 0.3509239017963409, "epoch": 0.17979638213658292, "grad_norm": 1.074188470840454, "learning_rate": 1.198522243316743e-05, "loss": 0.3506, "mean_token_accuracy": 0.8951381616294384, "num_tokens": 77353460.0, "step": 11680 }, { "entropy": 0.3628412250429392, "epoch": 0.1799503173952615, "grad_norm": 1.0142645835876465, "learning_rate": 1.1995484632356716e-05, "loss": 0.3563, "mean_token_accuracy": 0.8905898302793502, "num_tokens": 77421460.0, "step": 11690 }, { "entropy": 0.3629735173657537, "epoch": 0.18010425265394006, "grad_norm": 1.0778164863586426, "learning_rate": 1.2005746831546002e-05, "loss": 0.3591, "mean_token_accuracy": 0.8909591600298882, "num_tokens": 77488700.0, "step": 11700 }, { "entropy": 0.36947824340313673, "epoch": 0.18025818791261866, "grad_norm": 1.0490225553512573, "learning_rate": 1.2016009030735288e-05, "loss": 0.3631, "mean_token_accuracy": 0.8899081528186799, "num_tokens": 77562828.0, "step": 11710 }, { "entropy": 0.3563668975606561, "epoch": 0.18041212317129723, "grad_norm": 1.3872160911560059, "learning_rate": 1.2026271229924574e-05, "loss": 0.3522, "mean_token_accuracy": 0.8933009155094623, "num_tokens": 77630389.0, "step": 11720 }, { "entropy": 0.3547056593000889, "epoch": 0.1805660584299758, "grad_norm": 1.1138989925384521, "learning_rate": 1.203653342911386e-05, "loss": 0.3537, "mean_token_accuracy": 0.8954100854694843, "num_tokens": 77693301.0, "step": 11730 }, { "entropy": 0.33836898151785133, "epoch": 0.1807199936886544, "grad_norm": 0.8093135952949524, "learning_rate": 1.2046795628303147e-05, "loss": 0.3372, "mean_token_accuracy": 0.8972642533481121, "num_tokens": 77761339.0, "step": 11740 }, { "entropy": 0.37456215675920246, "epoch": 0.18087392894733298, "grad_norm": 1.0511068105697632, "learning_rate": 1.2057057827492433e-05, "loss": 0.3703, "mean_token_accuracy": 0.8892511814832688, "num_tokens": 77830967.0, "step": 11750 }, { "entropy": 0.3537282351404428, "epoch": 0.18102786420601155, "grad_norm": 0.7876558303833008, "learning_rate": 1.206732002668172e-05, "loss": 0.3516, "mean_token_accuracy": 0.8929909870028496, "num_tokens": 77896022.0, "step": 11760 }, { "entropy": 0.3620762155391276, "epoch": 0.18118179946469012, "grad_norm": 1.0025376081466675, "learning_rate": 1.2077582225871005e-05, "loss": 0.3616, "mean_token_accuracy": 0.8933670930564404, "num_tokens": 77956360.0, "step": 11770 }, { "entropy": 0.3556920537725091, "epoch": 0.18133573472336872, "grad_norm": 1.0369625091552734, "learning_rate": 1.2087844425060291e-05, "loss": 0.3498, "mean_token_accuracy": 0.8958395384252071, "num_tokens": 78020829.0, "step": 11780 }, { "entropy": 0.3402321269735694, "epoch": 0.1814896699820473, "grad_norm": 1.0545387268066406, "learning_rate": 1.2098106624249577e-05, "loss": 0.3501, "mean_token_accuracy": 0.8964074477553368, "num_tokens": 78092936.0, "step": 11790 }, { "entropy": 0.3607377752661705, "epoch": 0.18164360524072587, "grad_norm": 1.1746302843093872, "learning_rate": 1.2108368823438863e-05, "loss": 0.3438, "mean_token_accuracy": 0.8932857260107994, "num_tokens": 78159767.0, "step": 11800 }, { "entropy": 0.34800679571926596, "epoch": 0.18179754049940447, "grad_norm": 1.2693965435028076, "learning_rate": 1.211863102262815e-05, "loss": 0.3581, "mean_token_accuracy": 0.8969726406037808, "num_tokens": 78226747.0, "step": 11810 }, { "entropy": 0.3574408745393157, "epoch": 0.18195147575808304, "grad_norm": 1.058281421661377, "learning_rate": 1.2128893221817437e-05, "loss": 0.3505, "mean_token_accuracy": 0.8949975602328777, "num_tokens": 78294913.0, "step": 11820 }, { "entropy": 0.355748001486063, "epoch": 0.18210541101676161, "grad_norm": 1.213556170463562, "learning_rate": 1.2139155421006723e-05, "loss": 0.3607, "mean_token_accuracy": 0.8937791138887405, "num_tokens": 78362189.0, "step": 11830 }, { "entropy": 0.37023243177682164, "epoch": 0.18225934627544021, "grad_norm": 1.0194082260131836, "learning_rate": 1.2149417620196009e-05, "loss": 0.3685, "mean_token_accuracy": 0.889833864569664, "num_tokens": 78431054.0, "step": 11840 }, { "entropy": 0.3559973623603582, "epoch": 0.1824132815341188, "grad_norm": 1.1777937412261963, "learning_rate": 1.2159679819385295e-05, "loss": 0.3525, "mean_token_accuracy": 0.8933048918843269, "num_tokens": 78494782.0, "step": 11850 }, { "entropy": 0.3607227046042681, "epoch": 0.18256721679279736, "grad_norm": 1.2480800151824951, "learning_rate": 1.216994201857458e-05, "loss": 0.3583, "mean_token_accuracy": 0.8911955706775189, "num_tokens": 78561076.0, "step": 11860 }, { "entropy": 0.3570686148479581, "epoch": 0.18272115205147596, "grad_norm": 1.179555892944336, "learning_rate": 1.2180204217763866e-05, "loss": 0.3534, "mean_token_accuracy": 0.8943781808018685, "num_tokens": 78621222.0, "step": 11870 }, { "entropy": 0.3593306034803391, "epoch": 0.18287508731015453, "grad_norm": 1.17879319190979, "learning_rate": 1.2190466416953156e-05, "loss": 0.3515, "mean_token_accuracy": 0.8939934268593788, "num_tokens": 78682195.0, "step": 11880 }, { "entropy": 0.3707267511636019, "epoch": 0.1830290225688331, "grad_norm": 1.1648681163787842, "learning_rate": 1.2200728616142442e-05, "loss": 0.3639, "mean_token_accuracy": 0.8896999768912792, "num_tokens": 78751939.0, "step": 11890 }, { "entropy": 0.38085852935910225, "epoch": 0.1831829578275117, "grad_norm": 1.172619342803955, "learning_rate": 1.2210990815331728e-05, "loss": 0.3739, "mean_token_accuracy": 0.8849237650632858, "num_tokens": 78818810.0, "step": 11900 }, { "entropy": 0.3490477215498686, "epoch": 0.18333689308619028, "grad_norm": 0.9274274110794067, "learning_rate": 1.2221253014521014e-05, "loss": 0.3422, "mean_token_accuracy": 0.8969408638775349, "num_tokens": 78884435.0, "step": 11910 }, { "entropy": 0.37106885500252246, "epoch": 0.18349082834486885, "grad_norm": 0.9566990733146667, "learning_rate": 1.22315152137103e-05, "loss": 0.3735, "mean_token_accuracy": 0.8877666302025318, "num_tokens": 78944928.0, "step": 11920 }, { "entropy": 0.3577731425873935, "epoch": 0.18364476360354745, "grad_norm": 1.3477628231048584, "learning_rate": 1.2241777412899585e-05, "loss": 0.351, "mean_token_accuracy": 0.8952071309089661, "num_tokens": 79003555.0, "step": 11930 }, { "entropy": 0.36099823750555515, "epoch": 0.18379869886222602, "grad_norm": 1.1385979652404785, "learning_rate": 1.2252039612088871e-05, "loss": 0.3454, "mean_token_accuracy": 0.8945130214095116, "num_tokens": 79065474.0, "step": 11940 }, { "entropy": 0.3623945916071534, "epoch": 0.1839526341209046, "grad_norm": 1.1224446296691895, "learning_rate": 1.2262301811278159e-05, "loss": 0.3596, "mean_token_accuracy": 0.8896106123924256, "num_tokens": 79135501.0, "step": 11950 }, { "entropy": 0.3630253652110696, "epoch": 0.18410656937958317, "grad_norm": 1.115851640701294, "learning_rate": 1.2272564010467445e-05, "loss": 0.3502, "mean_token_accuracy": 0.8937408536672592, "num_tokens": 79199964.0, "step": 11960 }, { "entropy": 0.36427900213748216, "epoch": 0.18426050463826177, "grad_norm": 0.9164201021194458, "learning_rate": 1.2282826209656731e-05, "loss": 0.3582, "mean_token_accuracy": 0.8907139040529728, "num_tokens": 79269770.0, "step": 11970 }, { "entropy": 0.36459892317652703, "epoch": 0.18441443989694034, "grad_norm": 1.126297950744629, "learning_rate": 1.2293088408846017e-05, "loss": 0.3537, "mean_token_accuracy": 0.8945225425064564, "num_tokens": 79334079.0, "step": 11980 }, { "entropy": 0.3349937184713781, "epoch": 0.1845683751556189, "grad_norm": 1.4280519485473633, "learning_rate": 1.2303350608035303e-05, "loss": 0.3352, "mean_token_accuracy": 0.8975636444985866, "num_tokens": 79391286.0, "step": 11990 }, { "entropy": 0.3666781472042203, "epoch": 0.1847223104142975, "grad_norm": 1.0347983837127686, "learning_rate": 1.2313612807224589e-05, "loss": 0.3601, "mean_token_accuracy": 0.8932104952633381, "num_tokens": 79452850.0, "step": 12000 }, { "entropy": 0.3616097033023834, "epoch": 0.1848762456729761, "grad_norm": 0.9916002154350281, "learning_rate": 1.2323875006413875e-05, "loss": 0.3673, "mean_token_accuracy": 0.890849432349205, "num_tokens": 79516264.0, "step": 12010 }, { "entropy": 0.3498946316540241, "epoch": 0.18503018093165466, "grad_norm": 1.097764253616333, "learning_rate": 1.2334137205603162e-05, "loss": 0.3408, "mean_token_accuracy": 0.8954347051680088, "num_tokens": 79577668.0, "step": 12020 }, { "entropy": 0.3519274178892374, "epoch": 0.18518411619033326, "grad_norm": 0.9737581610679626, "learning_rate": 1.2344399404792448e-05, "loss": 0.3568, "mean_token_accuracy": 0.8964522793889046, "num_tokens": 79643652.0, "step": 12030 }, { "entropy": 0.3726873125880957, "epoch": 0.18533805144901183, "grad_norm": 1.3958265781402588, "learning_rate": 1.2354661603981734e-05, "loss": 0.36, "mean_token_accuracy": 0.889809612929821, "num_tokens": 79700293.0, "step": 12040 }, { "entropy": 0.3408668765798211, "epoch": 0.1854919867076904, "grad_norm": 1.0094479322433472, "learning_rate": 1.236492380317102e-05, "loss": 0.348, "mean_token_accuracy": 0.8982391357421875, "num_tokens": 79764392.0, "step": 12050 }, { "entropy": 0.3752965614199638, "epoch": 0.185645921966369, "grad_norm": 0.9488865733146667, "learning_rate": 1.2375186002360306e-05, "loss": 0.3638, "mean_token_accuracy": 0.8899731330573559, "num_tokens": 79832814.0, "step": 12060 }, { "entropy": 0.35972616355866194, "epoch": 0.18579985722504758, "grad_norm": 0.9228470325469971, "learning_rate": 1.2385448201549592e-05, "loss": 0.3496, "mean_token_accuracy": 0.8940783366560936, "num_tokens": 79900757.0, "step": 12070 }, { "entropy": 0.3612998565658927, "epoch": 0.18595379248372615, "grad_norm": 1.1129733324050903, "learning_rate": 1.2395710400738878e-05, "loss": 0.3576, "mean_token_accuracy": 0.8937497437000275, "num_tokens": 79973363.0, "step": 12080 }, { "entropy": 0.3651424886658788, "epoch": 0.18610772774240475, "grad_norm": 1.1129662990570068, "learning_rate": 1.2405972599928167e-05, "loss": 0.3511, "mean_token_accuracy": 0.8918917588889599, "num_tokens": 80039679.0, "step": 12090 }, { "entropy": 0.3546100541949272, "epoch": 0.18626166300108332, "grad_norm": 1.0001370906829834, "learning_rate": 1.2416234799117453e-05, "loss": 0.3548, "mean_token_accuracy": 0.8906474635004997, "num_tokens": 80105284.0, "step": 12100 }, { "entropy": 0.35750543773174287, "epoch": 0.1864155982597619, "grad_norm": 1.3386023044586182, "learning_rate": 1.242649699830674e-05, "loss": 0.3556, "mean_token_accuracy": 0.8975345343351364, "num_tokens": 80164621.0, "step": 12110 }, { "entropy": 0.36012827027589084, "epoch": 0.1865695335184405, "grad_norm": 1.6115515232086182, "learning_rate": 1.2436759197496025e-05, "loss": 0.3607, "mean_token_accuracy": 0.8921824343502521, "num_tokens": 80228426.0, "step": 12120 }, { "entropy": 0.34874184858053925, "epoch": 0.18672346877711907, "grad_norm": 1.0212427377700806, "learning_rate": 1.2447021396685311e-05, "loss": 0.3513, "mean_token_accuracy": 0.8938342243432998, "num_tokens": 80291633.0, "step": 12130 }, { "entropy": 0.3592286377213895, "epoch": 0.18687740403579764, "grad_norm": 1.3329455852508545, "learning_rate": 1.2457283595874597e-05, "loss": 0.354, "mean_token_accuracy": 0.8935637749731541, "num_tokens": 80357684.0, "step": 12140 }, { "entropy": 0.35332383029162884, "epoch": 0.1870313392944762, "grad_norm": 0.8923876881599426, "learning_rate": 1.2467545795063883e-05, "loss": 0.3487, "mean_token_accuracy": 0.8939933620393277, "num_tokens": 80430179.0, "step": 12150 }, { "entropy": 0.36384153421968224, "epoch": 0.1871852745531548, "grad_norm": 1.5553321838378906, "learning_rate": 1.247780799425317e-05, "loss": 0.3466, "mean_token_accuracy": 0.890943631529808, "num_tokens": 80501361.0, "step": 12160 }, { "entropy": 0.3584054773673415, "epoch": 0.18733920981183338, "grad_norm": 1.0227301120758057, "learning_rate": 1.2488070193442457e-05, "loss": 0.3472, "mean_token_accuracy": 0.8930776156485081, "num_tokens": 80568928.0, "step": 12170 }, { "entropy": 0.3408653698861599, "epoch": 0.18749314507051196, "grad_norm": 1.0405083894729614, "learning_rate": 1.2498332392631743e-05, "loss": 0.3308, "mean_token_accuracy": 0.8980809018015862, "num_tokens": 80641621.0, "step": 12180 }, { "entropy": 0.3476330881938338, "epoch": 0.18764708032919056, "grad_norm": 1.0678234100341797, "learning_rate": 1.2508594591821029e-05, "loss": 0.3484, "mean_token_accuracy": 0.8956910766661167, "num_tokens": 80708372.0, "step": 12190 }, { "entropy": 0.36569529455155136, "epoch": 0.18780101558786913, "grad_norm": 0.958949089050293, "learning_rate": 1.2518856791010314e-05, "loss": 0.3517, "mean_token_accuracy": 0.8909769050776959, "num_tokens": 80773141.0, "step": 12200 }, { "entropy": 0.3557133857160807, "epoch": 0.1879549508465477, "grad_norm": 1.022268295288086, "learning_rate": 1.25291189901996e-05, "loss": 0.3458, "mean_token_accuracy": 0.8948882929980755, "num_tokens": 80843036.0, "step": 12210 }, { "entropy": 0.35195997543632984, "epoch": 0.1881088861052263, "grad_norm": 0.9293181300163269, "learning_rate": 1.2539381189388886e-05, "loss": 0.3556, "mean_token_accuracy": 0.894617534428835, "num_tokens": 80914911.0, "step": 12220 }, { "entropy": 0.3693913768045604, "epoch": 0.18826282136390488, "grad_norm": 1.107330560684204, "learning_rate": 1.2549643388578174e-05, "loss": 0.3723, "mean_token_accuracy": 0.8896137118339539, "num_tokens": 80983243.0, "step": 12230 }, { "entropy": 0.351125000230968, "epoch": 0.18841675662258345, "grad_norm": 0.8950424194335938, "learning_rate": 1.255990558776746e-05, "loss": 0.3508, "mean_token_accuracy": 0.8977523908019066, "num_tokens": 81045875.0, "step": 12240 }, { "entropy": 0.3582336648367345, "epoch": 0.18857069188126205, "grad_norm": 1.0025907754898071, "learning_rate": 1.2570167786956746e-05, "loss": 0.3568, "mean_token_accuracy": 0.8932476244866848, "num_tokens": 81112118.0, "step": 12250 }, { "entropy": 0.35061614885926246, "epoch": 0.18872462713994062, "grad_norm": 0.9502792358398438, "learning_rate": 1.2580429986146032e-05, "loss": 0.345, "mean_token_accuracy": 0.8946667842566967, "num_tokens": 81177362.0, "step": 12260 }, { "entropy": 0.3611680497415364, "epoch": 0.1888785623986192, "grad_norm": 1.3258167505264282, "learning_rate": 1.2590692185335318e-05, "loss": 0.3584, "mean_token_accuracy": 0.8930979862809181, "num_tokens": 81231762.0, "step": 12270 }, { "entropy": 0.3707623466849327, "epoch": 0.1890324976572978, "grad_norm": 1.3809473514556885, "learning_rate": 1.2600954384524604e-05, "loss": 0.3712, "mean_token_accuracy": 0.8878168813884258, "num_tokens": 81298048.0, "step": 12280 }, { "entropy": 0.35853383094072344, "epoch": 0.18918643291597637, "grad_norm": 1.227759838104248, "learning_rate": 1.261121658371389e-05, "loss": 0.3436, "mean_token_accuracy": 0.8945878259837627, "num_tokens": 81362782.0, "step": 12290 }, { "entropy": 0.3525180272758007, "epoch": 0.18934036817465494, "grad_norm": 0.997398316860199, "learning_rate": 1.2621478782903177e-05, "loss": 0.3475, "mean_token_accuracy": 0.892761017382145, "num_tokens": 81433824.0, "step": 12300 }, { "entropy": 0.3504673629999161, "epoch": 0.18949430343333354, "grad_norm": 1.0685735940933228, "learning_rate": 1.2631740982092463e-05, "loss": 0.3453, "mean_token_accuracy": 0.8969720236957073, "num_tokens": 81501156.0, "step": 12310 }, { "entropy": 0.36293565863743427, "epoch": 0.1896482386920121, "grad_norm": 1.0304206609725952, "learning_rate": 1.264200318128175e-05, "loss": 0.3607, "mean_token_accuracy": 0.8926094762980938, "num_tokens": 81569917.0, "step": 12320 }, { "entropy": 0.3658025974407792, "epoch": 0.18980217395069068, "grad_norm": 0.9775126576423645, "learning_rate": 1.2652265380471035e-05, "loss": 0.3663, "mean_token_accuracy": 0.8916933476924896, "num_tokens": 81629012.0, "step": 12330 }, { "entropy": 0.3579639525152743, "epoch": 0.18995610920936926, "grad_norm": 1.1109633445739746, "learning_rate": 1.2662527579660321e-05, "loss": 0.3519, "mean_token_accuracy": 0.893869961053133, "num_tokens": 81699949.0, "step": 12340 }, { "entropy": 0.3565894543193281, "epoch": 0.19011004446804786, "grad_norm": 1.2810338735580444, "learning_rate": 1.2672789778849607e-05, "loss": 0.3586, "mean_token_accuracy": 0.8928021870553493, "num_tokens": 81766035.0, "step": 12350 }, { "entropy": 0.36371489707380533, "epoch": 0.19026397972672643, "grad_norm": 1.1607608795166016, "learning_rate": 1.2683051978038893e-05, "loss": 0.3539, "mean_token_accuracy": 0.8916282631456852, "num_tokens": 81826950.0, "step": 12360 }, { "entropy": 0.3593049097806215, "epoch": 0.190417914985405, "grad_norm": 0.9761979579925537, "learning_rate": 1.2693314177228182e-05, "loss": 0.3471, "mean_token_accuracy": 0.8923504441976547, "num_tokens": 81893331.0, "step": 12370 }, { "entropy": 0.3668552339076996, "epoch": 0.1905718502440836, "grad_norm": 1.010116457939148, "learning_rate": 1.2703576376417468e-05, "loss": 0.3481, "mean_token_accuracy": 0.8917888045310974, "num_tokens": 81961955.0, "step": 12380 }, { "entropy": 0.342813728377223, "epoch": 0.19072578550276217, "grad_norm": 1.2782572507858276, "learning_rate": 1.2713838575606754e-05, "loss": 0.3498, "mean_token_accuracy": 0.896675492823124, "num_tokens": 82024506.0, "step": 12390 }, { "entropy": 0.36618463043123484, "epoch": 0.19087972076144075, "grad_norm": 1.2062394618988037, "learning_rate": 1.272410077479604e-05, "loss": 0.3591, "mean_token_accuracy": 0.8896235466003418, "num_tokens": 82089957.0, "step": 12400 }, { "entropy": 0.3563377056270838, "epoch": 0.19103365602011935, "grad_norm": 0.856900691986084, "learning_rate": 1.2734362973985326e-05, "loss": 0.3505, "mean_token_accuracy": 0.8939288459718228, "num_tokens": 82151625.0, "step": 12410 }, { "entropy": 0.35161900287494063, "epoch": 0.19118759127879792, "grad_norm": 1.2402515411376953, "learning_rate": 1.2744625173174612e-05, "loss": 0.3484, "mean_token_accuracy": 0.8938475020229817, "num_tokens": 82218608.0, "step": 12420 }, { "entropy": 0.3481152568012476, "epoch": 0.1913415265374765, "grad_norm": 1.1392803192138672, "learning_rate": 1.2754887372363898e-05, "loss": 0.3432, "mean_token_accuracy": 0.8981491297483444, "num_tokens": 82290560.0, "step": 12430 }, { "entropy": 0.3434770403429866, "epoch": 0.1914954617961551, "grad_norm": 0.9474986791610718, "learning_rate": 1.2765149571553186e-05, "loss": 0.3306, "mean_token_accuracy": 0.8953828811645508, "num_tokens": 82352158.0, "step": 12440 }, { "entropy": 0.3515912627801299, "epoch": 0.19164939705483366, "grad_norm": 0.9429934024810791, "learning_rate": 1.2775411770742472e-05, "loss": 0.3534, "mean_token_accuracy": 0.8931252397596836, "num_tokens": 82419920.0, "step": 12450 }, { "entropy": 0.3460219241678715, "epoch": 0.19180333231351224, "grad_norm": 0.9787110686302185, "learning_rate": 1.2785673969931758e-05, "loss": 0.3351, "mean_token_accuracy": 0.8947556503117085, "num_tokens": 82481794.0, "step": 12460 }, { "entropy": 0.3532759821042418, "epoch": 0.19195726757219084, "grad_norm": 0.9770433902740479, "learning_rate": 1.2795936169121044e-05, "loss": 0.3536, "mean_token_accuracy": 0.8940710552036762, "num_tokens": 82549145.0, "step": 12470 }, { "entropy": 0.35719360820949075, "epoch": 0.1921112028308694, "grad_norm": 1.2455576658248901, "learning_rate": 1.280619836831033e-05, "loss": 0.3464, "mean_token_accuracy": 0.8950096622109414, "num_tokens": 82616459.0, "step": 12480 }, { "entropy": 0.3384915979579091, "epoch": 0.19226513808954798, "grad_norm": 1.0189845561981201, "learning_rate": 1.2816460567499615e-05, "loss": 0.3418, "mean_token_accuracy": 0.8975750207901001, "num_tokens": 82676139.0, "step": 12490 }, { "entropy": 0.35068309456110003, "epoch": 0.19241907334822658, "grad_norm": 1.1450045108795166, "learning_rate": 1.2826722766688901e-05, "loss": 0.351, "mean_token_accuracy": 0.8967172712087631, "num_tokens": 82735942.0, "step": 12500 }, { "entropy": 0.3467830466106534, "epoch": 0.19257300860690515, "grad_norm": 1.2884422540664673, "learning_rate": 1.2836984965878189e-05, "loss": 0.3443, "mean_token_accuracy": 0.8971650384366512, "num_tokens": 82799579.0, "step": 12510 }, { "entropy": 0.35078452453017234, "epoch": 0.19272694386558373, "grad_norm": 1.2165379524230957, "learning_rate": 1.2847247165067475e-05, "loss": 0.3486, "mean_token_accuracy": 0.8929484769701957, "num_tokens": 82859289.0, "step": 12520 }, { "entropy": 0.35820967368781564, "epoch": 0.1928808791242623, "grad_norm": 0.8912848830223083, "learning_rate": 1.2857509364256761e-05, "loss": 0.3511, "mean_token_accuracy": 0.8950020544230938, "num_tokens": 82930119.0, "step": 12530 }, { "entropy": 0.3471573643386364, "epoch": 0.1930348143829409, "grad_norm": 0.9596291780471802, "learning_rate": 1.2867771563446047e-05, "loss": 0.3435, "mean_token_accuracy": 0.8969451479613781, "num_tokens": 82991094.0, "step": 12540 }, { "entropy": 0.3876943189650774, "epoch": 0.19318874964161947, "grad_norm": 0.991664707660675, "learning_rate": 1.2878033762635333e-05, "loss": 0.3854, "mean_token_accuracy": 0.8844738006591797, "num_tokens": 83055550.0, "step": 12550 }, { "entropy": 0.3582869589328766, "epoch": 0.19334268490029805, "grad_norm": 1.0589607954025269, "learning_rate": 1.2888295961824619e-05, "loss": 0.3555, "mean_token_accuracy": 0.89596256762743, "num_tokens": 83124419.0, "step": 12560 }, { "entropy": 0.35366692263633015, "epoch": 0.19349662015897665, "grad_norm": 1.258882999420166, "learning_rate": 1.2898558161013905e-05, "loss": 0.3594, "mean_token_accuracy": 0.8937082283198834, "num_tokens": 83190774.0, "step": 12570 }, { "entropy": 0.3534739456139505, "epoch": 0.19365055541765522, "grad_norm": 1.4651248455047607, "learning_rate": 1.2908820360203194e-05, "loss": 0.3609, "mean_token_accuracy": 0.8969170399010181, "num_tokens": 83244742.0, "step": 12580 }, { "entropy": 0.35738892909139397, "epoch": 0.1938044906763338, "grad_norm": 1.3759630918502808, "learning_rate": 1.291908255939248e-05, "loss": 0.3641, "mean_token_accuracy": 0.8935420364141464, "num_tokens": 83317466.0, "step": 12590 }, { "entropy": 0.3550308601930737, "epoch": 0.1939584259350124, "grad_norm": 1.0511797666549683, "learning_rate": 1.2929344758581766e-05, "loss": 0.3604, "mean_token_accuracy": 0.8934253700077533, "num_tokens": 83384797.0, "step": 12600 }, { "entropy": 0.3873874446377158, "epoch": 0.19411236119369096, "grad_norm": 1.0936853885650635, "learning_rate": 1.2939606957771052e-05, "loss": 0.3734, "mean_token_accuracy": 0.8843059256672859, "num_tokens": 83454382.0, "step": 12610 }, { "entropy": 0.35722117219120264, "epoch": 0.19426629645236954, "grad_norm": 1.5210107564926147, "learning_rate": 1.2949869156960338e-05, "loss": 0.362, "mean_token_accuracy": 0.8917098768055439, "num_tokens": 83517821.0, "step": 12620 }, { "entropy": 0.354817177914083, "epoch": 0.19442023171104814, "grad_norm": 0.9353083372116089, "learning_rate": 1.2960131356149624e-05, "loss": 0.3455, "mean_token_accuracy": 0.8959952048957348, "num_tokens": 83581115.0, "step": 12630 }, { "entropy": 0.35566438660025596, "epoch": 0.1945741669697267, "grad_norm": 1.4151222705841064, "learning_rate": 1.297039355533891e-05, "loss": 0.3455, "mean_token_accuracy": 0.8944477364420891, "num_tokens": 83643334.0, "step": 12640 }, { "entropy": 0.35761970225721595, "epoch": 0.19472810222840528, "grad_norm": 0.9808882474899292, "learning_rate": 1.2980655754528197e-05, "loss": 0.3638, "mean_token_accuracy": 0.8932575181126594, "num_tokens": 83703916.0, "step": 12650 }, { "entropy": 0.34609224535524846, "epoch": 0.19488203748708388, "grad_norm": 0.9536427855491638, "learning_rate": 1.2990917953717483e-05, "loss": 0.3387, "mean_token_accuracy": 0.8968044832348824, "num_tokens": 83770863.0, "step": 12660 }, { "entropy": 0.35734721310436723, "epoch": 0.19503597274576245, "grad_norm": 1.2024794816970825, "learning_rate": 1.300118015290677e-05, "loss": 0.3588, "mean_token_accuracy": 0.8930225417017936, "num_tokens": 83832629.0, "step": 12670 }, { "entropy": 0.34835477024316785, "epoch": 0.19518990800444103, "grad_norm": 0.9768033623695374, "learning_rate": 1.3011442352096055e-05, "loss": 0.3418, "mean_token_accuracy": 0.8972929298877717, "num_tokens": 83894559.0, "step": 12680 }, { "entropy": 0.3507492411881685, "epoch": 0.19534384326311963, "grad_norm": 0.8008396625518799, "learning_rate": 1.3021704551285341e-05, "loss": 0.3466, "mean_token_accuracy": 0.8956995882093907, "num_tokens": 83955695.0, "step": 12690 }, { "entropy": 0.35532643999904395, "epoch": 0.1954977785217982, "grad_norm": 1.2672348022460938, "learning_rate": 1.3031966750474627e-05, "loss": 0.3536, "mean_token_accuracy": 0.8940373219549655, "num_tokens": 84014454.0, "step": 12700 }, { "entropy": 0.35289344731718303, "epoch": 0.19565171378047677, "grad_norm": 1.0806225538253784, "learning_rate": 1.3042228949663913e-05, "loss": 0.3537, "mean_token_accuracy": 0.8967005416750908, "num_tokens": 84083803.0, "step": 12710 }, { "entropy": 0.3493725646287203, "epoch": 0.19580564903915534, "grad_norm": 1.0623986721038818, "learning_rate": 1.30524911488532e-05, "loss": 0.3452, "mean_token_accuracy": 0.8971086785197258, "num_tokens": 84144818.0, "step": 12720 }, { "entropy": 0.3316182173788548, "epoch": 0.19595958429783394, "grad_norm": 0.9292863607406616, "learning_rate": 1.3062753348042487e-05, "loss": 0.3356, "mean_token_accuracy": 0.9022722832858563, "num_tokens": 84206818.0, "step": 12730 }, { "entropy": 0.3510355897247791, "epoch": 0.19611351955651252, "grad_norm": 1.0695397853851318, "learning_rate": 1.3073015547231773e-05, "loss": 0.3535, "mean_token_accuracy": 0.8953202545642853, "num_tokens": 84268122.0, "step": 12740 }, { "entropy": 0.3633683858439326, "epoch": 0.1962674548151911, "grad_norm": 1.0965555906295776, "learning_rate": 1.3083277746421059e-05, "loss": 0.3566, "mean_token_accuracy": 0.8916458450257778, "num_tokens": 84335865.0, "step": 12750 }, { "entropy": 0.3705788904801011, "epoch": 0.1964213900738697, "grad_norm": 1.1444776058197021, "learning_rate": 1.3093539945610344e-05, "loss": 0.3586, "mean_token_accuracy": 0.8893420599400997, "num_tokens": 84399078.0, "step": 12760 }, { "entropy": 0.35449990555644034, "epoch": 0.19657532533254826, "grad_norm": 1.4940191507339478, "learning_rate": 1.310380214479963e-05, "loss": 0.358, "mean_token_accuracy": 0.892243392765522, "num_tokens": 84460537.0, "step": 12770 }, { "entropy": 0.3528883261606097, "epoch": 0.19672926059122683, "grad_norm": 0.9156441688537598, "learning_rate": 1.3114064343988916e-05, "loss": 0.3389, "mean_token_accuracy": 0.8953651860356331, "num_tokens": 84526259.0, "step": 12780 }, { "entropy": 0.36038217041641474, "epoch": 0.19688319584990543, "grad_norm": 1.2191041707992554, "learning_rate": 1.3124326543178206e-05, "loss": 0.3665, "mean_token_accuracy": 0.8902619995176793, "num_tokens": 84594592.0, "step": 12790 }, { "entropy": 0.36618912871927023, "epoch": 0.197037131108584, "grad_norm": 1.002626657485962, "learning_rate": 1.3134588742367492e-05, "loss": 0.3552, "mean_token_accuracy": 0.8916862204670906, "num_tokens": 84672241.0, "step": 12800 }, { "entropy": 0.3497647061944008, "epoch": 0.19719106636726258, "grad_norm": 1.1995753049850464, "learning_rate": 1.3144850941556778e-05, "loss": 0.3433, "mean_token_accuracy": 0.8951816827058792, "num_tokens": 84742949.0, "step": 12810 }, { "entropy": 0.3601151240989566, "epoch": 0.19734500162594118, "grad_norm": 1.4600054025650024, "learning_rate": 1.3155113140746064e-05, "loss": 0.3535, "mean_token_accuracy": 0.891014450788498, "num_tokens": 84804551.0, "step": 12820 }, { "entropy": 0.3408388477750123, "epoch": 0.19749893688461975, "grad_norm": 1.0377914905548096, "learning_rate": 1.316537533993535e-05, "loss": 0.3548, "mean_token_accuracy": 0.8978255517780781, "num_tokens": 84871669.0, "step": 12830 }, { "entropy": 0.35422666324302554, "epoch": 0.19765287214329832, "grad_norm": 1.1909681558609009, "learning_rate": 1.3175637539124635e-05, "loss": 0.3565, "mean_token_accuracy": 0.8976750478148461, "num_tokens": 84930296.0, "step": 12840 }, { "entropy": 0.3552001962438226, "epoch": 0.19780680740197693, "grad_norm": 0.8720049262046814, "learning_rate": 1.3185899738313921e-05, "loss": 0.3501, "mean_token_accuracy": 0.8935939945280552, "num_tokens": 84998801.0, "step": 12850 }, { "entropy": 0.3592835795134306, "epoch": 0.1979607426606555, "grad_norm": 1.4167853593826294, "learning_rate": 1.3196161937503209e-05, "loss": 0.3566, "mean_token_accuracy": 0.8906046748161316, "num_tokens": 85067330.0, "step": 12860 }, { "entropy": 0.3419466454535723, "epoch": 0.19811467791933407, "grad_norm": 0.9748983383178711, "learning_rate": 1.3206424136692495e-05, "loss": 0.3534, "mean_token_accuracy": 0.8985917955636978, "num_tokens": 85140903.0, "step": 12870 }, { "entropy": 0.35611083786934616, "epoch": 0.19826861317801267, "grad_norm": 1.0322648286819458, "learning_rate": 1.3216686335881781e-05, "loss": 0.344, "mean_token_accuracy": 0.8953326724469661, "num_tokens": 85216675.0, "step": 12880 }, { "entropy": 0.3567270440980792, "epoch": 0.19842254843669124, "grad_norm": 0.9677907228469849, "learning_rate": 1.3226948535071067e-05, "loss": 0.3546, "mean_token_accuracy": 0.893481270968914, "num_tokens": 85292818.0, "step": 12890 }, { "entropy": 0.3499099567532539, "epoch": 0.19857648369536982, "grad_norm": 0.8850228190422058, "learning_rate": 1.3237210734260353e-05, "loss": 0.3378, "mean_token_accuracy": 0.8951471164822579, "num_tokens": 85354806.0, "step": 12900 }, { "entropy": 0.3336795738898218, "epoch": 0.1987304189540484, "grad_norm": 1.0377925634384155, "learning_rate": 1.3247472933449639e-05, "loss": 0.3398, "mean_token_accuracy": 0.8998399592936039, "num_tokens": 85429209.0, "step": 12910 }, { "entropy": 0.3372984562069178, "epoch": 0.198884354212727, "grad_norm": 1.1174752712249756, "learning_rate": 1.3257735132638925e-05, "loss": 0.3345, "mean_token_accuracy": 0.8992430336773396, "num_tokens": 85496917.0, "step": 12920 }, { "entropy": 0.35359586160629985, "epoch": 0.19903828947140556, "grad_norm": 1.0875686407089233, "learning_rate": 1.3267997331828212e-05, "loss": 0.3391, "mean_token_accuracy": 0.8936707668006421, "num_tokens": 85564429.0, "step": 12930 }, { "entropy": 0.3740382889285684, "epoch": 0.19919222473008413, "grad_norm": 1.5164424180984497, "learning_rate": 1.3278259531017498e-05, "loss": 0.3646, "mean_token_accuracy": 0.8880160771310329, "num_tokens": 85636158.0, "step": 12940 }, { "entropy": 0.35322183147072794, "epoch": 0.19934615998876273, "grad_norm": 0.8653030395507812, "learning_rate": 1.3288521730206784e-05, "loss": 0.3514, "mean_token_accuracy": 0.8944994278252125, "num_tokens": 85707725.0, "step": 12950 }, { "entropy": 0.33852391252294184, "epoch": 0.1995000952474413, "grad_norm": 0.9738702178001404, "learning_rate": 1.329878392939607e-05, "loss": 0.3424, "mean_token_accuracy": 0.8977271534502507, "num_tokens": 85765306.0, "step": 12960 }, { "entropy": 0.3548492131754756, "epoch": 0.19965403050611988, "grad_norm": 0.934413731098175, "learning_rate": 1.3309046128585356e-05, "loss": 0.343, "mean_token_accuracy": 0.8937967106699943, "num_tokens": 85830443.0, "step": 12970 }, { "entropy": 0.3449532238766551, "epoch": 0.19980796576479848, "grad_norm": 1.406815767288208, "learning_rate": 1.3319308327774642e-05, "loss": 0.3419, "mean_token_accuracy": 0.8975078754127026, "num_tokens": 85898969.0, "step": 12980 }, { "entropy": 0.3478503910824656, "epoch": 0.19996190102347705, "grad_norm": 1.306504726409912, "learning_rate": 1.3329570526963928e-05, "loss": 0.3517, "mean_token_accuracy": 0.8966536447405815, "num_tokens": 85963174.0, "step": 12990 }, { "entropy": 0.3445705872029066, "epoch": 0.20011583628215562, "grad_norm": 1.053175449371338, "learning_rate": 1.3339832726153217e-05, "loss": 0.3577, "mean_token_accuracy": 0.894159197062254, "num_tokens": 86026962.0, "step": 13000 }, { "entropy": 0.3423521677032113, "epoch": 0.20026977154083422, "grad_norm": 1.104081392288208, "learning_rate": 1.3350094925342503e-05, "loss": 0.3423, "mean_token_accuracy": 0.8988126151263713, "num_tokens": 86096342.0, "step": 13010 }, { "entropy": 0.3825072761625051, "epoch": 0.2004237067995128, "grad_norm": 1.3912255764007568, "learning_rate": 1.336035712453179e-05, "loss": 0.3641, "mean_token_accuracy": 0.8872999422252178, "num_tokens": 86162654.0, "step": 13020 }, { "entropy": 0.33640101701021197, "epoch": 0.20057764205819137, "grad_norm": 1.168756365776062, "learning_rate": 1.3370619323721075e-05, "loss": 0.3398, "mean_token_accuracy": 0.8983068615198135, "num_tokens": 86229009.0, "step": 13030 }, { "entropy": 0.3358041213825345, "epoch": 0.20073157731686997, "grad_norm": 0.838599443435669, "learning_rate": 1.3380881522910361e-05, "loss": 0.3373, "mean_token_accuracy": 0.8969876378774643, "num_tokens": 86300211.0, "step": 13040 }, { "entropy": 0.3712282093241811, "epoch": 0.20088551257554854, "grad_norm": 1.1351079940795898, "learning_rate": 1.3391143722099647e-05, "loss": 0.3541, "mean_token_accuracy": 0.89002346098423, "num_tokens": 86367375.0, "step": 13050 }, { "entropy": 0.34562111981213095, "epoch": 0.20103944783422711, "grad_norm": 1.0762157440185547, "learning_rate": 1.3401405921288933e-05, "loss": 0.3555, "mean_token_accuracy": 0.8967710562050343, "num_tokens": 86430007.0, "step": 13060 }, { "entropy": 0.3360803842544556, "epoch": 0.20119338309290571, "grad_norm": 0.8332509398460388, "learning_rate": 1.341166812047822e-05, "loss": 0.336, "mean_token_accuracy": 0.8985401146113873, "num_tokens": 86505563.0, "step": 13070 }, { "entropy": 0.3611773304641247, "epoch": 0.2013473183515843, "grad_norm": 1.138384461402893, "learning_rate": 1.3421930319667507e-05, "loss": 0.3543, "mean_token_accuracy": 0.8934557676315308, "num_tokens": 86570075.0, "step": 13080 }, { "entropy": 0.36002518236637115, "epoch": 0.20150125361026286, "grad_norm": 0.9904517531394958, "learning_rate": 1.3432192518856793e-05, "loss": 0.3515, "mean_token_accuracy": 0.8898184485733509, "num_tokens": 86638507.0, "step": 13090 }, { "entropy": 0.36078307032585144, "epoch": 0.20165518886894143, "grad_norm": 1.0600427389144897, "learning_rate": 1.3442454718046078e-05, "loss": 0.3492, "mean_token_accuracy": 0.8948367848992348, "num_tokens": 86704427.0, "step": 13100 }, { "entropy": 0.3501623447984457, "epoch": 0.20180912412762003, "grad_norm": 0.9069358110427856, "learning_rate": 1.3452716917235364e-05, "loss": 0.345, "mean_token_accuracy": 0.8944601975381374, "num_tokens": 86769574.0, "step": 13110 }, { "entropy": 0.36370391510426997, "epoch": 0.2019630593862986, "grad_norm": 0.9560631513595581, "learning_rate": 1.346297911642465e-05, "loss": 0.3511, "mean_token_accuracy": 0.8934513874351978, "num_tokens": 86833393.0, "step": 13120 }, { "entropy": 0.3589118029922247, "epoch": 0.20211699464497718, "grad_norm": 0.9907565712928772, "learning_rate": 1.3473241315613936e-05, "loss": 0.3514, "mean_token_accuracy": 0.8944809712469578, "num_tokens": 86900645.0, "step": 13130 }, { "entropy": 0.34971669744700196, "epoch": 0.20227092990365578, "grad_norm": 1.1794917583465576, "learning_rate": 1.3483503514803224e-05, "loss": 0.3514, "mean_token_accuracy": 0.8958851903676986, "num_tokens": 86978454.0, "step": 13140 }, { "entropy": 0.3291204024106264, "epoch": 0.20242486516233435, "grad_norm": 1.144350528717041, "learning_rate": 1.349376571399251e-05, "loss": 0.3392, "mean_token_accuracy": 0.9038756154477596, "num_tokens": 87036183.0, "step": 13150 }, { "entropy": 0.3337495280429721, "epoch": 0.20257880042101292, "grad_norm": 0.9443320035934448, "learning_rate": 1.3504027913181796e-05, "loss": 0.3424, "mean_token_accuracy": 0.8970521457493306, "num_tokens": 87102486.0, "step": 13160 }, { "entropy": 0.33897433783859016, "epoch": 0.20273273567969152, "grad_norm": 0.9509174227714539, "learning_rate": 1.3514290112371082e-05, "loss": 0.3366, "mean_token_accuracy": 0.8985123828053474, "num_tokens": 87171611.0, "step": 13170 }, { "entropy": 0.3544972507283092, "epoch": 0.2028866709383701, "grad_norm": 0.881351113319397, "learning_rate": 1.3524552311560368e-05, "loss": 0.3623, "mean_token_accuracy": 0.8930921815335751, "num_tokens": 87237256.0, "step": 13180 }, { "entropy": 0.34052872005850077, "epoch": 0.20304060619704867, "grad_norm": 0.995733380317688, "learning_rate": 1.3534814510749654e-05, "loss": 0.34, "mean_token_accuracy": 0.8982825331389904, "num_tokens": 87303803.0, "step": 13190 }, { "entropy": 0.3612168589606881, "epoch": 0.20319454145572727, "grad_norm": 1.1548861265182495, "learning_rate": 1.354507670993894e-05, "loss": 0.3531, "mean_token_accuracy": 0.8925371803343296, "num_tokens": 87367438.0, "step": 13200 }, { "entropy": 0.3456981783732772, "epoch": 0.20334847671440584, "grad_norm": 1.036792516708374, "learning_rate": 1.3555338909128229e-05, "loss": 0.3568, "mean_token_accuracy": 0.8962286174297333, "num_tokens": 87428879.0, "step": 13210 }, { "entropy": 0.34616745822131634, "epoch": 0.2035024119730844, "grad_norm": 1.0066267251968384, "learning_rate": 1.3565601108317515e-05, "loss": 0.3472, "mean_token_accuracy": 0.8981555327773094, "num_tokens": 87499370.0, "step": 13220 }, { "entropy": 0.34766814671456814, "epoch": 0.203656347231763, "grad_norm": 0.9056470990180969, "learning_rate": 1.3575863307506801e-05, "loss": 0.344, "mean_token_accuracy": 0.8955429792404175, "num_tokens": 87570095.0, "step": 13230 }, { "entropy": 0.36445282977074384, "epoch": 0.20381028249044159, "grad_norm": 0.9006561636924744, "learning_rate": 1.3586125506696087e-05, "loss": 0.3554, "mean_token_accuracy": 0.8931279852986336, "num_tokens": 87644986.0, "step": 13240 }, { "entropy": 0.33631072361022235, "epoch": 0.20396421774912016, "grad_norm": 1.0652306079864502, "learning_rate": 1.3596387705885373e-05, "loss": 0.3448, "mean_token_accuracy": 0.897956109046936, "num_tokens": 87715239.0, "step": 13250 }, { "entropy": 0.36287173721939325, "epoch": 0.20411815300779876, "grad_norm": 1.2555105686187744, "learning_rate": 1.3606649905074659e-05, "loss": 0.3601, "mean_token_accuracy": 0.8915054358541965, "num_tokens": 87780810.0, "step": 13260 }, { "entropy": 0.3470073949545622, "epoch": 0.20427208826647733, "grad_norm": 1.0546239614486694, "learning_rate": 1.3616912104263945e-05, "loss": 0.3508, "mean_token_accuracy": 0.8971715368330478, "num_tokens": 87849192.0, "step": 13270 }, { "entropy": 0.32508864589035513, "epoch": 0.2044260235251559, "grad_norm": 1.181350827217102, "learning_rate": 1.3627174303453232e-05, "loss": 0.3239, "mean_token_accuracy": 0.9030675046145916, "num_tokens": 87929089.0, "step": 13280 }, { "entropy": 0.3610581045970321, "epoch": 0.20457995878383448, "grad_norm": 1.0887575149536133, "learning_rate": 1.3637436502642518e-05, "loss": 0.3646, "mean_token_accuracy": 0.8908023796975613, "num_tokens": 88001943.0, "step": 13290 }, { "entropy": 0.3522009341046214, "epoch": 0.20473389404251308, "grad_norm": 0.9298438429832458, "learning_rate": 1.3647698701831804e-05, "loss": 0.3547, "mean_token_accuracy": 0.8948989018797875, "num_tokens": 88070072.0, "step": 13300 }, { "entropy": 0.3499462427571416, "epoch": 0.20488782930119165, "grad_norm": 0.9557731747627258, "learning_rate": 1.365796090102109e-05, "loss": 0.352, "mean_token_accuracy": 0.8920705884695053, "num_tokens": 88145822.0, "step": 13310 }, { "entropy": 0.3529243079945445, "epoch": 0.20504176455987022, "grad_norm": 1.2689249515533447, "learning_rate": 1.3668223100210376e-05, "loss": 0.3589, "mean_token_accuracy": 0.8957703992724418, "num_tokens": 88202948.0, "step": 13320 }, { "entropy": 0.33756988141685723, "epoch": 0.20519569981854882, "grad_norm": 1.0839815139770508, "learning_rate": 1.3678485299399662e-05, "loss": 0.342, "mean_token_accuracy": 0.8997690849006176, "num_tokens": 88270659.0, "step": 13330 }, { "entropy": 0.34337351396679877, "epoch": 0.2053496350772274, "grad_norm": 1.015210747718811, "learning_rate": 1.3688747498588948e-05, "loss": 0.3335, "mean_token_accuracy": 0.8961172923445702, "num_tokens": 88342025.0, "step": 13340 }, { "entropy": 0.3414792022667825, "epoch": 0.20550357033590597, "grad_norm": 1.3747962713241577, "learning_rate": 1.3699009697778236e-05, "loss": 0.3535, "mean_token_accuracy": 0.8975412927567958, "num_tokens": 88401570.0, "step": 13350 }, { "entropy": 0.3666770771145821, "epoch": 0.20565750559458457, "grad_norm": 0.9657528400421143, "learning_rate": 1.3709271896967522e-05, "loss": 0.3608, "mean_token_accuracy": 0.8927541233599185, "num_tokens": 88477614.0, "step": 13360 }, { "entropy": 0.3293705377727747, "epoch": 0.20581144085326314, "grad_norm": 0.8777787685394287, "learning_rate": 1.3719534096156808e-05, "loss": 0.329, "mean_token_accuracy": 0.9009486809372902, "num_tokens": 88547847.0, "step": 13370 }, { "entropy": 0.3643647413700819, "epoch": 0.2059653761119417, "grad_norm": 1.1200100183486938, "learning_rate": 1.3729796295346093e-05, "loss": 0.3585, "mean_token_accuracy": 0.89280409142375, "num_tokens": 88611564.0, "step": 13380 }, { "entropy": 0.3604135558009148, "epoch": 0.2061193113706203, "grad_norm": 0.94215327501297, "learning_rate": 1.374005849453538e-05, "loss": 0.3604, "mean_token_accuracy": 0.8914728775620461, "num_tokens": 88674459.0, "step": 13390 }, { "entropy": 0.34103977857157586, "epoch": 0.20627324662929888, "grad_norm": 0.9026941657066345, "learning_rate": 1.3750320693724665e-05, "loss": 0.3605, "mean_token_accuracy": 0.8963655643165112, "num_tokens": 88737459.0, "step": 13400 }, { "entropy": 0.37158016860485077, "epoch": 0.20642718188797746, "grad_norm": 1.4683005809783936, "learning_rate": 1.3760582892913951e-05, "loss": 0.3574, "mean_token_accuracy": 0.8897707134485244, "num_tokens": 88797159.0, "step": 13410 }, { "entropy": 0.3688820531591773, "epoch": 0.20658111714665606, "grad_norm": 1.4289737939834595, "learning_rate": 1.377084509210324e-05, "loss": 0.3609, "mean_token_accuracy": 0.8891170173883438, "num_tokens": 88860566.0, "step": 13420 }, { "entropy": 0.35987344579771163, "epoch": 0.20673505240533463, "grad_norm": 1.0661101341247559, "learning_rate": 1.3781107291292527e-05, "loss": 0.3486, "mean_token_accuracy": 0.8916387602686882, "num_tokens": 88926751.0, "step": 13430 }, { "entropy": 0.35062282513827087, "epoch": 0.2068889876640132, "grad_norm": 0.8668496608734131, "learning_rate": 1.3791369490481813e-05, "loss": 0.3409, "mean_token_accuracy": 0.8975114971399307, "num_tokens": 88997077.0, "step": 13440 }, { "entropy": 0.3599264150485396, "epoch": 0.2070429229226918, "grad_norm": 1.1413519382476807, "learning_rate": 1.3801631689671098e-05, "loss": 0.3721, "mean_token_accuracy": 0.8920949719846248, "num_tokens": 89055985.0, "step": 13450 }, { "entropy": 0.3473878752440214, "epoch": 0.20719685818137037, "grad_norm": 0.9287727475166321, "learning_rate": 1.3811893888860384e-05, "loss": 0.3517, "mean_token_accuracy": 0.8937331236898899, "num_tokens": 89125512.0, "step": 13460 }, { "entropy": 0.35813484601676465, "epoch": 0.20735079344004895, "grad_norm": 1.093798041343689, "learning_rate": 1.382215608804967e-05, "loss": 0.3421, "mean_token_accuracy": 0.8934362418949604, "num_tokens": 89198272.0, "step": 13470 }, { "entropy": 0.36269765831530093, "epoch": 0.20750472869872752, "grad_norm": 1.0925129652023315, "learning_rate": 1.3832418287238956e-05, "loss": 0.3544, "mean_token_accuracy": 0.892594138532877, "num_tokens": 89262448.0, "step": 13480 }, { "entropy": 0.36553083565086125, "epoch": 0.20765866395740612, "grad_norm": 0.9740934371948242, "learning_rate": 1.3842680486428244e-05, "loss": 0.3612, "mean_token_accuracy": 0.8901189751923084, "num_tokens": 89335686.0, "step": 13490 }, { "entropy": 0.35796478521078823, "epoch": 0.2078125992160847, "grad_norm": 1.0025303363800049, "learning_rate": 1.385294268561753e-05, "loss": 0.3604, "mean_token_accuracy": 0.8914570838212967, "num_tokens": 89399906.0, "step": 13500 }, { "entropy": 0.36174532435834406, "epoch": 0.20796653447476326, "grad_norm": 0.9027694463729858, "learning_rate": 1.3863204884806816e-05, "loss": 0.3425, "mean_token_accuracy": 0.8929674170911313, "num_tokens": 89461202.0, "step": 13510 }, { "entropy": 0.35340076982975005, "epoch": 0.20812046973344187, "grad_norm": 1.1556769609451294, "learning_rate": 1.3873467083996102e-05, "loss": 0.3438, "mean_token_accuracy": 0.8957247890532016, "num_tokens": 89522757.0, "step": 13520 }, { "entropy": 0.3528218276798725, "epoch": 0.20827440499212044, "grad_norm": 0.8955548405647278, "learning_rate": 1.3883729283185388e-05, "loss": 0.3515, "mean_token_accuracy": 0.8938475050032139, "num_tokens": 89597177.0, "step": 13530 }, { "entropy": 0.3364135568961501, "epoch": 0.208428340250799, "grad_norm": 1.0162755250930786, "learning_rate": 1.3893991482374674e-05, "loss": 0.3386, "mean_token_accuracy": 0.8998708225786686, "num_tokens": 89667178.0, "step": 13540 }, { "entropy": 0.33847698047757147, "epoch": 0.2085822755094776, "grad_norm": 1.281129240989685, "learning_rate": 1.390425368156396e-05, "loss": 0.346, "mean_token_accuracy": 0.8990820780396461, "num_tokens": 89734118.0, "step": 13550 }, { "entropy": 0.33770544864237306, "epoch": 0.20873621076815618, "grad_norm": 1.2579842805862427, "learning_rate": 1.3914515880753247e-05, "loss": 0.3442, "mean_token_accuracy": 0.8999221786856652, "num_tokens": 89797732.0, "step": 13560 }, { "entropy": 0.35285487547516825, "epoch": 0.20889014602683476, "grad_norm": 0.8897687792778015, "learning_rate": 1.3924778079942533e-05, "loss": 0.3419, "mean_token_accuracy": 0.8935743123292923, "num_tokens": 89869244.0, "step": 13570 }, { "entropy": 0.34058104902505876, "epoch": 0.20904408128551336, "grad_norm": 1.4443669319152832, "learning_rate": 1.393504027913182e-05, "loss": 0.342, "mean_token_accuracy": 0.8997912958264351, "num_tokens": 89926319.0, "step": 13580 }, { "entropy": 0.3396898239850998, "epoch": 0.20919801654419193, "grad_norm": 1.0090047121047974, "learning_rate": 1.3945302478321105e-05, "loss": 0.3431, "mean_token_accuracy": 0.897279366850853, "num_tokens": 89988375.0, "step": 13590 }, { "entropy": 0.3627443008124828, "epoch": 0.2093519518028705, "grad_norm": 0.983643651008606, "learning_rate": 1.3955564677510391e-05, "loss": 0.3574, "mean_token_accuracy": 0.8913164831697941, "num_tokens": 90051076.0, "step": 13600 }, { "entropy": 0.35939530432224276, "epoch": 0.2095058870615491, "grad_norm": 1.324865698814392, "learning_rate": 1.3965826876699677e-05, "loss": 0.3553, "mean_token_accuracy": 0.8925200402736664, "num_tokens": 90125125.0, "step": 13610 }, { "entropy": 0.32566974526271225, "epoch": 0.20965982232022767, "grad_norm": 0.9806482195854187, "learning_rate": 1.3976089075888963e-05, "loss": 0.3382, "mean_token_accuracy": 0.8994771189987659, "num_tokens": 90193322.0, "step": 13620 }, { "entropy": 0.3443180816248059, "epoch": 0.20981375757890625, "grad_norm": 0.9756690859794617, "learning_rate": 1.3986351275078252e-05, "loss": 0.3487, "mean_token_accuracy": 0.8980305515229702, "num_tokens": 90261909.0, "step": 13630 }, { "entropy": 0.35495741218328475, "epoch": 0.20996769283758485, "grad_norm": 1.3757035732269287, "learning_rate": 1.3996613474267538e-05, "loss": 0.3652, "mean_token_accuracy": 0.8946496322751045, "num_tokens": 90329561.0, "step": 13640 }, { "entropy": 0.345865186303854, "epoch": 0.21012162809626342, "grad_norm": 0.9943558573722839, "learning_rate": 1.4006875673456824e-05, "loss": 0.3418, "mean_token_accuracy": 0.8975316397845745, "num_tokens": 90392425.0, "step": 13650 }, { "entropy": 0.35199963562190534, "epoch": 0.210275563354942, "grad_norm": 1.005291223526001, "learning_rate": 1.401713787264611e-05, "loss": 0.3561, "mean_token_accuracy": 0.8950831383466721, "num_tokens": 90461862.0, "step": 13660 }, { "entropy": 0.3491489365696907, "epoch": 0.21042949861362056, "grad_norm": 0.8945630192756653, "learning_rate": 1.4027400071835396e-05, "loss": 0.3305, "mean_token_accuracy": 0.8978734008967877, "num_tokens": 90531225.0, "step": 13670 }, { "entropy": 0.343388085719198, "epoch": 0.21058343387229916, "grad_norm": 1.0870062112808228, "learning_rate": 1.4037662271024682e-05, "loss": 0.3329, "mean_token_accuracy": 0.8977461978793144, "num_tokens": 90593973.0, "step": 13680 }, { "entropy": 0.3459851242601871, "epoch": 0.21073736913097774, "grad_norm": 1.1542528867721558, "learning_rate": 1.4047924470213968e-05, "loss": 0.3444, "mean_token_accuracy": 0.8971845835447312, "num_tokens": 90653104.0, "step": 13690 }, { "entropy": 0.3575796462595463, "epoch": 0.2108913043896563, "grad_norm": 0.8677027225494385, "learning_rate": 1.4058186669403254e-05, "loss": 0.3564, "mean_token_accuracy": 0.8965449020266533, "num_tokens": 90710500.0, "step": 13700 }, { "entropy": 0.3477449856698513, "epoch": 0.2110452396483349, "grad_norm": 1.0705486536026, "learning_rate": 1.4068448868592542e-05, "loss": 0.3424, "mean_token_accuracy": 0.8962080873548984, "num_tokens": 90786564.0, "step": 13710 }, { "entropy": 0.34354918655008077, "epoch": 0.21119917490701348, "grad_norm": 0.941288948059082, "learning_rate": 1.4078711067781828e-05, "loss": 0.3442, "mean_token_accuracy": 0.8966618835926056, "num_tokens": 90850962.0, "step": 13720 }, { "entropy": 0.3506887532770634, "epoch": 0.21135311016569205, "grad_norm": 0.9664983749389648, "learning_rate": 1.4088973266971113e-05, "loss": 0.3569, "mean_token_accuracy": 0.8949453510344029, "num_tokens": 90917098.0, "step": 13730 }, { "entropy": 0.33653822597116234, "epoch": 0.21150704542437065, "grad_norm": 0.8690981864929199, "learning_rate": 1.40992354661604e-05, "loss": 0.3363, "mean_token_accuracy": 0.9015102289617062, "num_tokens": 90982477.0, "step": 13740 }, { "entropy": 0.32165888901799916, "epoch": 0.21166098068304923, "grad_norm": 1.2220581769943237, "learning_rate": 1.4109497665349685e-05, "loss": 0.3264, "mean_token_accuracy": 0.9040958099067211, "num_tokens": 91057866.0, "step": 13750 }, { "entropy": 0.3502052029594779, "epoch": 0.2118149159417278, "grad_norm": 1.166387677192688, "learning_rate": 1.4119759864538971e-05, "loss": 0.356, "mean_token_accuracy": 0.8947944924235344, "num_tokens": 91125761.0, "step": 13760 }, { "entropy": 0.35358840562403204, "epoch": 0.2119688512004064, "grad_norm": 1.1197131872177124, "learning_rate": 1.4130022063728257e-05, "loss": 0.3531, "mean_token_accuracy": 0.8942760467529297, "num_tokens": 91192447.0, "step": 13770 }, { "entropy": 0.33730539344251154, "epoch": 0.21212278645908497, "grad_norm": 1.4867254495620728, "learning_rate": 1.4140284262917545e-05, "loss": 0.3461, "mean_token_accuracy": 0.9019478492438793, "num_tokens": 91251818.0, "step": 13780 }, { "entropy": 0.35013288017362354, "epoch": 0.21227672171776354, "grad_norm": 1.0305131673812866, "learning_rate": 1.415054646210683e-05, "loss": 0.3497, "mean_token_accuracy": 0.8967849366366863, "num_tokens": 91322795.0, "step": 13790 }, { "entropy": 0.36794479805976155, "epoch": 0.21243065697644214, "grad_norm": 0.776488184928894, "learning_rate": 1.4160808661296117e-05, "loss": 0.3621, "mean_token_accuracy": 0.8873595841228962, "num_tokens": 91391511.0, "step": 13800 }, { "entropy": 0.35463980892673136, "epoch": 0.21258459223512072, "grad_norm": 1.4412134885787964, "learning_rate": 1.4171070860485403e-05, "loss": 0.3564, "mean_token_accuracy": 0.8949647277593613, "num_tokens": 91460875.0, "step": 13810 }, { "entropy": 0.3604133240878582, "epoch": 0.2127385274937993, "grad_norm": 1.2032661437988281, "learning_rate": 1.4181333059674689e-05, "loss": 0.3599, "mean_token_accuracy": 0.8936687402427197, "num_tokens": 91529743.0, "step": 13820 }, { "entropy": 0.3366190385073423, "epoch": 0.2128924627524779, "grad_norm": 1.076551079750061, "learning_rate": 1.4191595258863975e-05, "loss": 0.3448, "mean_token_accuracy": 0.8980883352458477, "num_tokens": 91592609.0, "step": 13830 }, { "entropy": 0.38283877484500406, "epoch": 0.21304639801115646, "grad_norm": 1.106215238571167, "learning_rate": 1.420185745805326e-05, "loss": 0.3791, "mean_token_accuracy": 0.8871507465839386, "num_tokens": 91655503.0, "step": 13840 }, { "entropy": 0.3498924020677805, "epoch": 0.21320033326983504, "grad_norm": 1.0779763460159302, "learning_rate": 1.421211965724255e-05, "loss": 0.3436, "mean_token_accuracy": 0.8961516380310058, "num_tokens": 91720992.0, "step": 13850 }, { "entropy": 0.3423221481963992, "epoch": 0.2133542685285136, "grad_norm": 0.9719846248626709, "learning_rate": 1.4222381856431836e-05, "loss": 0.3524, "mean_token_accuracy": 0.8959507040679455, "num_tokens": 91785466.0, "step": 13860 }, { "entropy": 0.3638999843969941, "epoch": 0.2135082037871922, "grad_norm": 1.1117138862609863, "learning_rate": 1.4232644055621122e-05, "loss": 0.3568, "mean_token_accuracy": 0.8912957787513733, "num_tokens": 91854926.0, "step": 13870 }, { "entropy": 0.3515742216259241, "epoch": 0.21366213904587078, "grad_norm": 1.3751273155212402, "learning_rate": 1.4242906254810408e-05, "loss": 0.3521, "mean_token_accuracy": 0.8954551473259926, "num_tokens": 91914457.0, "step": 13880 }, { "entropy": 0.3443468499928713, "epoch": 0.21381607430454935, "grad_norm": 0.9474574327468872, "learning_rate": 1.4253168453999694e-05, "loss": 0.3544, "mean_token_accuracy": 0.8963393643498421, "num_tokens": 91982550.0, "step": 13890 }, { "entropy": 0.34728756118565796, "epoch": 0.21397000956322795, "grad_norm": 1.2112963199615479, "learning_rate": 1.426343065318898e-05, "loss": 0.3473, "mean_token_accuracy": 0.8974163241684436, "num_tokens": 92049578.0, "step": 13900 }, { "entropy": 0.34830859042704104, "epoch": 0.21412394482190653, "grad_norm": 1.1574852466583252, "learning_rate": 1.4273692852378266e-05, "loss": 0.3562, "mean_token_accuracy": 0.8977142550051213, "num_tokens": 92107976.0, "step": 13910 }, { "entropy": 0.33897163681685927, "epoch": 0.2142778800805851, "grad_norm": 0.8403790593147278, "learning_rate": 1.4283955051567553e-05, "loss": 0.3409, "mean_token_accuracy": 0.8973698228597641, "num_tokens": 92170178.0, "step": 13920 }, { "entropy": 0.3362141041085124, "epoch": 0.2144318153392637, "grad_norm": 0.8024852871894836, "learning_rate": 1.429421725075684e-05, "loss": 0.3296, "mean_token_accuracy": 0.8993747889995575, "num_tokens": 92243930.0, "step": 13930 }, { "entropy": 0.3535531425848603, "epoch": 0.21458575059794227, "grad_norm": 1.0838823318481445, "learning_rate": 1.4304479449946125e-05, "loss": 0.3571, "mean_token_accuracy": 0.8964630246162415, "num_tokens": 92303145.0, "step": 13940 }, { "entropy": 0.35681166164577005, "epoch": 0.21473968585662084, "grad_norm": 1.0394943952560425, "learning_rate": 1.4314741649135411e-05, "loss": 0.3542, "mean_token_accuracy": 0.8939013585448266, "num_tokens": 92373795.0, "step": 13950 }, { "entropy": 0.33109614737331866, "epoch": 0.21489362111529944, "grad_norm": 1.2375112771987915, "learning_rate": 1.4325003848324697e-05, "loss": 0.328, "mean_token_accuracy": 0.9024527639150619, "num_tokens": 92435580.0, "step": 13960 }, { "entropy": 0.35213205246254803, "epoch": 0.21504755637397802, "grad_norm": 0.787100076675415, "learning_rate": 1.4335266047513983e-05, "loss": 0.3532, "mean_token_accuracy": 0.8941807255148888, "num_tokens": 92509352.0, "step": 13970 }, { "entropy": 0.3607875041663647, "epoch": 0.2152014916326566, "grad_norm": 1.1603630781173706, "learning_rate": 1.4345528246703269e-05, "loss": 0.354, "mean_token_accuracy": 0.8935132853686809, "num_tokens": 92574167.0, "step": 13980 }, { "entropy": 0.3521743414923549, "epoch": 0.2153554268913352, "grad_norm": 0.9127770662307739, "learning_rate": 1.4355790445892557e-05, "loss": 0.3553, "mean_token_accuracy": 0.8948088377714157, "num_tokens": 92640028.0, "step": 13990 }, { "entropy": 0.34276873897761106, "epoch": 0.21550936215001376, "grad_norm": 0.8151850700378418, "learning_rate": 1.4366052645081842e-05, "loss": 0.3434, "mean_token_accuracy": 0.8978072069585323, "num_tokens": 92708304.0, "step": 14000 }, { "entropy": 0.34842790495604276, "epoch": 0.21566329740869233, "grad_norm": 1.1069834232330322, "learning_rate": 1.4376314844271128e-05, "loss": 0.3449, "mean_token_accuracy": 0.8968148045241833, "num_tokens": 92769666.0, "step": 14010 }, { "entropy": 0.3361660405993462, "epoch": 0.21581723266737093, "grad_norm": 1.0134168863296509, "learning_rate": 1.4386577043460414e-05, "loss": 0.3335, "mean_token_accuracy": 0.8993557021021843, "num_tokens": 92826786.0, "step": 14020 }, { "entropy": 0.350286652892828, "epoch": 0.2159711679260495, "grad_norm": 1.0963908433914185, "learning_rate": 1.43968392426497e-05, "loss": 0.3417, "mean_token_accuracy": 0.8969309888780117, "num_tokens": 92887792.0, "step": 14030 }, { "entropy": 0.35112369135022165, "epoch": 0.21612510318472808, "grad_norm": 1.1427966356277466, "learning_rate": 1.4407101441838986e-05, "loss": 0.347, "mean_token_accuracy": 0.8971873603761196, "num_tokens": 92960181.0, "step": 14040 }, { "entropy": 0.34864537641406057, "epoch": 0.21627903844340665, "grad_norm": 1.0036447048187256, "learning_rate": 1.4417363641028272e-05, "loss": 0.3511, "mean_token_accuracy": 0.8945410206913949, "num_tokens": 93026058.0, "step": 14050 }, { "entropy": 0.3585334803909063, "epoch": 0.21643297370208525, "grad_norm": 1.085709571838379, "learning_rate": 1.442762584021756e-05, "loss": 0.3584, "mean_token_accuracy": 0.8940209455788135, "num_tokens": 93087986.0, "step": 14060 }, { "entropy": 0.3240797193720937, "epoch": 0.21658690896076382, "grad_norm": 1.1252182722091675, "learning_rate": 1.4437888039406846e-05, "loss": 0.3197, "mean_token_accuracy": 0.9055031806230545, "num_tokens": 93148581.0, "step": 14070 }, { "entropy": 0.35017966516315935, "epoch": 0.2167408442194424, "grad_norm": 1.187028169631958, "learning_rate": 1.4448150238596132e-05, "loss": 0.3421, "mean_token_accuracy": 0.8971598617732525, "num_tokens": 93207772.0, "step": 14080 }, { "entropy": 0.3272729835473001, "epoch": 0.216894779478121, "grad_norm": 0.9279325008392334, "learning_rate": 1.4458412437785418e-05, "loss": 0.344, "mean_token_accuracy": 0.9022970125079155, "num_tokens": 93275466.0, "step": 14090 }, { "entropy": 0.360987126454711, "epoch": 0.21704871473679957, "grad_norm": 0.8772956132888794, "learning_rate": 1.4468674636974704e-05, "loss": 0.3586, "mean_token_accuracy": 0.8926633842289448, "num_tokens": 93345129.0, "step": 14100 }, { "entropy": 0.34406961444765327, "epoch": 0.21720264999547814, "grad_norm": 1.070145845413208, "learning_rate": 1.447893683616399e-05, "loss": 0.3505, "mean_token_accuracy": 0.8947709031403065, "num_tokens": 93408881.0, "step": 14110 }, { "entropy": 0.32918982580304146, "epoch": 0.21735658525415674, "grad_norm": 1.3075984716415405, "learning_rate": 1.4489199035353276e-05, "loss": 0.3336, "mean_token_accuracy": 0.9027044504880906, "num_tokens": 93472472.0, "step": 14120 }, { "entropy": 0.3518409701064229, "epoch": 0.21751052051283531, "grad_norm": 1.0696684122085571, "learning_rate": 1.4499461234542565e-05, "loss": 0.3504, "mean_token_accuracy": 0.8963014148175716, "num_tokens": 93542455.0, "step": 14130 }, { "entropy": 0.3434735979884863, "epoch": 0.2176644557715139, "grad_norm": 0.9258744120597839, "learning_rate": 1.450972343373185e-05, "loss": 0.3417, "mean_token_accuracy": 0.896588996052742, "num_tokens": 93611382.0, "step": 14140 }, { "entropy": 0.34962210338562727, "epoch": 0.2178183910301925, "grad_norm": 0.8775164484977722, "learning_rate": 1.4519985632921137e-05, "loss": 0.3505, "mean_token_accuracy": 0.8957376383244992, "num_tokens": 93684698.0, "step": 14150 }, { "entropy": 0.3510172402486205, "epoch": 0.21797232628887106, "grad_norm": 0.9712697267532349, "learning_rate": 1.4530247832110423e-05, "loss": 0.3611, "mean_token_accuracy": 0.8930176183581352, "num_tokens": 93750052.0, "step": 14160 }, { "entropy": 0.3561602350324392, "epoch": 0.21812626154754963, "grad_norm": 0.8961439728736877, "learning_rate": 1.4540510031299709e-05, "loss": 0.3468, "mean_token_accuracy": 0.892551313340664, "num_tokens": 93805569.0, "step": 14170 }, { "entropy": 0.34974083714187143, "epoch": 0.21828019680622823, "grad_norm": 1.0188530683517456, "learning_rate": 1.4550772230488995e-05, "loss": 0.3538, "mean_token_accuracy": 0.8949953019618988, "num_tokens": 93863128.0, "step": 14180 }, { "entropy": 0.34916047714650633, "epoch": 0.2184341320649068, "grad_norm": 1.2443690299987793, "learning_rate": 1.456103442967828e-05, "loss": 0.3432, "mean_token_accuracy": 0.8970316417515278, "num_tokens": 93928010.0, "step": 14190 }, { "entropy": 0.3470428643748164, "epoch": 0.21858806732358538, "grad_norm": 1.3474335670471191, "learning_rate": 1.4571296628867568e-05, "loss": 0.3422, "mean_token_accuracy": 0.8949176482856274, "num_tokens": 93998904.0, "step": 14200 }, { "entropy": 0.34782584570348263, "epoch": 0.21874200258226398, "grad_norm": 1.1110724210739136, "learning_rate": 1.4581558828056854e-05, "loss": 0.3517, "mean_token_accuracy": 0.895660138130188, "num_tokens": 94062410.0, "step": 14210 }, { "entropy": 0.36056921295821664, "epoch": 0.21889593784094255, "grad_norm": 1.078192114830017, "learning_rate": 1.459182102724614e-05, "loss": 0.3618, "mean_token_accuracy": 0.8929273530840873, "num_tokens": 94132687.0, "step": 14220 }, { "entropy": 0.351908902451396, "epoch": 0.21904987309962112, "grad_norm": 0.8894240260124207, "learning_rate": 1.4602083226435426e-05, "loss": 0.3397, "mean_token_accuracy": 0.8959835089743138, "num_tokens": 94195762.0, "step": 14230 }, { "entropy": 0.3257636709138751, "epoch": 0.2192038083582997, "grad_norm": 1.13307523727417, "learning_rate": 1.4612345425624712e-05, "loss": 0.3361, "mean_token_accuracy": 0.9010414563119411, "num_tokens": 94264067.0, "step": 14240 }, { "entropy": 0.3513483306393027, "epoch": 0.2193577436169783, "grad_norm": 0.9361041188240051, "learning_rate": 1.4622607624813998e-05, "loss": 0.3513, "mean_token_accuracy": 0.8947654753923416, "num_tokens": 94325121.0, "step": 14250 }, { "entropy": 0.35917858090251686, "epoch": 0.21951167887565687, "grad_norm": 1.1478155851364136, "learning_rate": 1.4632869824003284e-05, "loss": 0.3513, "mean_token_accuracy": 0.893972584605217, "num_tokens": 94396296.0, "step": 14260 }, { "entropy": 0.33953505717217924, "epoch": 0.21966561413433544, "grad_norm": 1.1509572267532349, "learning_rate": 1.4643132023192572e-05, "loss": 0.3321, "mean_token_accuracy": 0.899994795024395, "num_tokens": 94462330.0, "step": 14270 }, { "entropy": 0.3729175429791212, "epoch": 0.21981954939301404, "grad_norm": 1.1264737844467163, "learning_rate": 1.4653394222381857e-05, "loss": 0.3668, "mean_token_accuracy": 0.8869711980223656, "num_tokens": 94519902.0, "step": 14280 }, { "entropy": 0.3453880837187171, "epoch": 0.2199734846516926, "grad_norm": 0.9922084808349609, "learning_rate": 1.4663656421571143e-05, "loss": 0.3469, "mean_token_accuracy": 0.8959919095039368, "num_tokens": 94581693.0, "step": 14290 }, { "entropy": 0.36645417045801876, "epoch": 0.22012741991037119, "grad_norm": 0.9239858984947205, "learning_rate": 1.467391862076043e-05, "loss": 0.3688, "mean_token_accuracy": 0.8929167084395886, "num_tokens": 94638327.0, "step": 14300 }, { "entropy": 0.36155565455555916, "epoch": 0.2202813551690498, "grad_norm": 1.0104480981826782, "learning_rate": 1.4684180819949715e-05, "loss": 0.3481, "mean_token_accuracy": 0.8911198481917382, "num_tokens": 94706786.0, "step": 14310 }, { "entropy": 0.3564252441748977, "epoch": 0.22043529042772836, "grad_norm": 0.9048095941543579, "learning_rate": 1.4694443019139001e-05, "loss": 0.3459, "mean_token_accuracy": 0.8919098049402236, "num_tokens": 94771313.0, "step": 14320 }, { "entropy": 0.34303579283878205, "epoch": 0.22058922568640693, "grad_norm": 0.9000144600868225, "learning_rate": 1.4704705218328287e-05, "loss": 0.3383, "mean_token_accuracy": 0.8991294652223587, "num_tokens": 94840816.0, "step": 14330 }, { "entropy": 0.3336765434592962, "epoch": 0.22074316094508553, "grad_norm": 1.0817244052886963, "learning_rate": 1.4714967417517577e-05, "loss": 0.3339, "mean_token_accuracy": 0.899217402935028, "num_tokens": 94898438.0, "step": 14340 }, { "entropy": 0.3473183669149876, "epoch": 0.2208970962037641, "grad_norm": 1.056075930595398, "learning_rate": 1.4725229616706862e-05, "loss": 0.3493, "mean_token_accuracy": 0.897345045208931, "num_tokens": 94965094.0, "step": 14350 }, { "entropy": 0.37250875318422916, "epoch": 0.22105103146244268, "grad_norm": 0.9352676272392273, "learning_rate": 1.4735491815896148e-05, "loss": 0.3648, "mean_token_accuracy": 0.8895127832889557, "num_tokens": 95033509.0, "step": 14360 }, { "entropy": 0.35329547598958017, "epoch": 0.22120496672112128, "grad_norm": 0.8039282560348511, "learning_rate": 1.4745754015085434e-05, "loss": 0.3519, "mean_token_accuracy": 0.8946187563240529, "num_tokens": 95096195.0, "step": 14370 }, { "entropy": 0.3343543831259012, "epoch": 0.22135890197979985, "grad_norm": 1.1692262887954712, "learning_rate": 1.475601621427472e-05, "loss": 0.3429, "mean_token_accuracy": 0.9022023193538189, "num_tokens": 95161173.0, "step": 14380 }, { "entropy": 0.33654433116316795, "epoch": 0.22151283723847842, "grad_norm": 0.7882617712020874, "learning_rate": 1.4766278413464006e-05, "loss": 0.3415, "mean_token_accuracy": 0.8991675473749637, "num_tokens": 95236688.0, "step": 14390 }, { "entropy": 0.3462183212861419, "epoch": 0.22166677249715702, "grad_norm": 0.9944173693656921, "learning_rate": 1.4776540612653292e-05, "loss": 0.3552, "mean_token_accuracy": 0.8967430539429188, "num_tokens": 95310309.0, "step": 14400 }, { "entropy": 0.34987576510757207, "epoch": 0.2218207077558356, "grad_norm": 0.9004371166229248, "learning_rate": 1.478680281184258e-05, "loss": 0.342, "mean_token_accuracy": 0.8959424026310444, "num_tokens": 95364479.0, "step": 14410 }, { "entropy": 0.33109207414090636, "epoch": 0.22197464301451417, "grad_norm": 1.021478533744812, "learning_rate": 1.4797065011031866e-05, "loss": 0.3425, "mean_token_accuracy": 0.8970656640827656, "num_tokens": 95435411.0, "step": 14420 }, { "entropy": 0.33824212551116944, "epoch": 0.22212857827319274, "grad_norm": 1.0539770126342773, "learning_rate": 1.4807327210221152e-05, "loss": 0.3389, "mean_token_accuracy": 0.9003555558621883, "num_tokens": 95509612.0, "step": 14430 }, { "entropy": 0.3559542873874307, "epoch": 0.22228251353187134, "grad_norm": 0.9484845995903015, "learning_rate": 1.4817589409410438e-05, "loss": 0.3488, "mean_token_accuracy": 0.8922474510967732, "num_tokens": 95584010.0, "step": 14440 }, { "entropy": 0.3477209068834782, "epoch": 0.2224364487905499, "grad_norm": 1.0014897584915161, "learning_rate": 1.4827851608599724e-05, "loss": 0.3485, "mean_token_accuracy": 0.8949301853775978, "num_tokens": 95646855.0, "step": 14450 }, { "entropy": 0.3620127037167549, "epoch": 0.22259038404922848, "grad_norm": 1.113832950592041, "learning_rate": 1.483811380778901e-05, "loss": 0.3606, "mean_token_accuracy": 0.892803567647934, "num_tokens": 95710897.0, "step": 14460 }, { "entropy": 0.3426787626929581, "epoch": 0.22274431930790708, "grad_norm": 0.9206414818763733, "learning_rate": 1.4848376006978296e-05, "loss": 0.332, "mean_token_accuracy": 0.8955044955015182, "num_tokens": 95770329.0, "step": 14470 }, { "entropy": 0.3528147917240858, "epoch": 0.22289825456658566, "grad_norm": 1.0122126340866089, "learning_rate": 1.4858638206167583e-05, "loss": 0.352, "mean_token_accuracy": 0.8949774496257306, "num_tokens": 95842743.0, "step": 14480 }, { "entropy": 0.34791748579591514, "epoch": 0.22305218982526423, "grad_norm": 1.1418280601501465, "learning_rate": 1.4868900405356869e-05, "loss": 0.3435, "mean_token_accuracy": 0.8959469325840473, "num_tokens": 95906132.0, "step": 14490 }, { "entropy": 0.3351908948272467, "epoch": 0.22320612508394283, "grad_norm": 0.9551706314086914, "learning_rate": 1.4879162604546155e-05, "loss": 0.3445, "mean_token_accuracy": 0.8987627096474171, "num_tokens": 95966686.0, "step": 14500 }, { "entropy": 0.34946579542011025, "epoch": 0.2233600603426214, "grad_norm": 0.7943546772003174, "learning_rate": 1.4889424803735441e-05, "loss": 0.3378, "mean_token_accuracy": 0.8964219771325588, "num_tokens": 96035920.0, "step": 14510 }, { "entropy": 0.33618608117103577, "epoch": 0.22351399560129998, "grad_norm": 1.1804412603378296, "learning_rate": 1.4899687002924727e-05, "loss": 0.3338, "mean_token_accuracy": 0.8986660368740559, "num_tokens": 96096125.0, "step": 14520 }, { "entropy": 0.34263528380542996, "epoch": 0.22366793085997858, "grad_norm": 1.0255722999572754, "learning_rate": 1.4909949202114013e-05, "loss": 0.3347, "mean_token_accuracy": 0.8982995770871639, "num_tokens": 96152659.0, "step": 14530 }, { "entropy": 0.3510633213445544, "epoch": 0.22382186611865715, "grad_norm": 1.1278952360153198, "learning_rate": 1.4920211401303299e-05, "loss": 0.3498, "mean_token_accuracy": 0.8953355394303799, "num_tokens": 96221546.0, "step": 14540 }, { "entropy": 0.3520375231280923, "epoch": 0.22397580137733572, "grad_norm": 0.7728859186172485, "learning_rate": 1.4930473600492588e-05, "loss": 0.3521, "mean_token_accuracy": 0.8935390777885914, "num_tokens": 96289075.0, "step": 14550 }, { "entropy": 0.3461946056224406, "epoch": 0.22412973663601432, "grad_norm": 1.0036213397979736, "learning_rate": 1.4940735799681874e-05, "loss": 0.3413, "mean_token_accuracy": 0.8984273284673691, "num_tokens": 96362423.0, "step": 14560 }, { "entropy": 0.36221667751669884, "epoch": 0.2242836718946929, "grad_norm": 0.8863373398780823, "learning_rate": 1.495099799887116e-05, "loss": 0.3637, "mean_token_accuracy": 0.8925872504711151, "num_tokens": 96426275.0, "step": 14570 }, { "entropy": 0.34798214994370935, "epoch": 0.22443760715337147, "grad_norm": 1.4458107948303223, "learning_rate": 1.4961260198060446e-05, "loss": 0.3562, "mean_token_accuracy": 0.8938911229372024, "num_tokens": 96491644.0, "step": 14580 }, { "entropy": 0.3257733477279544, "epoch": 0.22459154241205007, "grad_norm": 1.0054956674575806, "learning_rate": 1.4971522397249732e-05, "loss": 0.3317, "mean_token_accuracy": 0.9036768399178982, "num_tokens": 96566722.0, "step": 14590 }, { "entropy": 0.35923367999494077, "epoch": 0.22474547767072864, "grad_norm": 0.760492742061615, "learning_rate": 1.4981784596439018e-05, "loss": 0.3504, "mean_token_accuracy": 0.893144690990448, "num_tokens": 96640739.0, "step": 14600 }, { "entropy": 0.3548390123061836, "epoch": 0.2248994129294072, "grad_norm": 0.8728858828544617, "learning_rate": 1.4992046795628304e-05, "loss": 0.364, "mean_token_accuracy": 0.8954411178827286, "num_tokens": 96704851.0, "step": 14610 }, { "entropy": 0.34571100417524575, "epoch": 0.22505334818808578, "grad_norm": 0.940420925617218, "learning_rate": 1.5002308994817591e-05, "loss": 0.3452, "mean_token_accuracy": 0.8964842274785042, "num_tokens": 96776355.0, "step": 14620 }, { "entropy": 0.3487147057428956, "epoch": 0.22520728344676438, "grad_norm": 0.8956937193870544, "learning_rate": 1.5012571194006877e-05, "loss": 0.3352, "mean_token_accuracy": 0.8946352861821651, "num_tokens": 96840683.0, "step": 14630 }, { "entropy": 0.3505194324068725, "epoch": 0.22536121870544296, "grad_norm": 0.8892422914505005, "learning_rate": 1.5022833393196163e-05, "loss": 0.3436, "mean_token_accuracy": 0.8964009664952755, "num_tokens": 96907752.0, "step": 14640 }, { "entropy": 0.3416235143318772, "epoch": 0.22551515396412153, "grad_norm": 1.1315984725952148, "learning_rate": 1.503309559238545e-05, "loss": 0.349, "mean_token_accuracy": 0.897643043845892, "num_tokens": 96968324.0, "step": 14650 }, { "entropy": 0.34442323781549933, "epoch": 0.22566908922280013, "grad_norm": 0.995290994644165, "learning_rate": 1.5043357791574735e-05, "loss": 0.3356, "mean_token_accuracy": 0.8970693252980709, "num_tokens": 97040333.0, "step": 14660 }, { "entropy": 0.3589178932830691, "epoch": 0.2258230244814787, "grad_norm": 0.8707993030548096, "learning_rate": 1.5053619990764021e-05, "loss": 0.3461, "mean_token_accuracy": 0.8949706934392452, "num_tokens": 97104920.0, "step": 14670 }, { "entropy": 0.32961464412510394, "epoch": 0.22597695974015727, "grad_norm": 1.0280455350875854, "learning_rate": 1.5063882189953307e-05, "loss": 0.325, "mean_token_accuracy": 0.8981297165155411, "num_tokens": 97161688.0, "step": 14680 }, { "entropy": 0.352553116902709, "epoch": 0.22613089499883587, "grad_norm": 0.8121182322502136, "learning_rate": 1.5074144389142595e-05, "loss": 0.3455, "mean_token_accuracy": 0.894961591809988, "num_tokens": 97233349.0, "step": 14690 }, { "entropy": 0.3408072544261813, "epoch": 0.22628483025751445, "grad_norm": 1.1875507831573486, "learning_rate": 1.508440658833188e-05, "loss": 0.3368, "mean_token_accuracy": 0.8984782554209232, "num_tokens": 97307888.0, "step": 14700 }, { "entropy": 0.3646811155602336, "epoch": 0.22643876551619302, "grad_norm": 1.0042555332183838, "learning_rate": 1.5094668787521167e-05, "loss": 0.3711, "mean_token_accuracy": 0.8895587973296643, "num_tokens": 97365727.0, "step": 14710 }, { "entropy": 0.3429192954674363, "epoch": 0.22659270077487162, "grad_norm": 0.8813191056251526, "learning_rate": 1.5104930986710453e-05, "loss": 0.3414, "mean_token_accuracy": 0.9010369673371315, "num_tokens": 97428381.0, "step": 14720 }, { "entropy": 0.35440403055399655, "epoch": 0.2267466360335502, "grad_norm": 0.8731639385223389, "learning_rate": 1.5115193185899739e-05, "loss": 0.3479, "mean_token_accuracy": 0.8930274754762649, "num_tokens": 97496481.0, "step": 14730 }, { "entropy": 0.3676316190510988, "epoch": 0.22690057129222876, "grad_norm": 1.0398645401000977, "learning_rate": 1.5125455385089025e-05, "loss": 0.3721, "mean_token_accuracy": 0.8909104935824871, "num_tokens": 97563754.0, "step": 14740 }, { "entropy": 0.36425060369074347, "epoch": 0.22705450655090736, "grad_norm": 0.9618772268295288, "learning_rate": 1.513571758427831e-05, "loss": 0.3518, "mean_token_accuracy": 0.8920286938548088, "num_tokens": 97635600.0, "step": 14750 }, { "entropy": 0.3502788433805108, "epoch": 0.22720844180958594, "grad_norm": 0.902698278427124, "learning_rate": 1.51459797834676e-05, "loss": 0.3448, "mean_token_accuracy": 0.8948822103440761, "num_tokens": 97707373.0, "step": 14760 }, { "entropy": 0.3316922690719366, "epoch": 0.2273623770682645, "grad_norm": 0.9087525010108948, "learning_rate": 1.5156241982656886e-05, "loss": 0.3382, "mean_token_accuracy": 0.9013686515390873, "num_tokens": 97775456.0, "step": 14770 }, { "entropy": 0.3430524716153741, "epoch": 0.22751631232694308, "grad_norm": 1.1330407857894897, "learning_rate": 1.5166504181846172e-05, "loss": 0.3244, "mean_token_accuracy": 0.8969996467232704, "num_tokens": 97840114.0, "step": 14780 }, { "entropy": 0.3277584751136601, "epoch": 0.22767024758562168, "grad_norm": 0.9672917127609253, "learning_rate": 1.5176766381035458e-05, "loss": 0.3319, "mean_token_accuracy": 0.8998194701969624, "num_tokens": 97907238.0, "step": 14790 }, { "entropy": 0.3447975292801857, "epoch": 0.22782418284430025, "grad_norm": 1.1008206605911255, "learning_rate": 1.5187028580224744e-05, "loss": 0.3601, "mean_token_accuracy": 0.8983779266476631, "num_tokens": 97966636.0, "step": 14800 }, { "entropy": 0.34510106816887853, "epoch": 0.22797811810297883, "grad_norm": 0.9280365109443665, "learning_rate": 1.519729077941403e-05, "loss": 0.337, "mean_token_accuracy": 0.8976227901875973, "num_tokens": 98032877.0, "step": 14810 }, { "entropy": 0.3557486038655043, "epoch": 0.22813205336165743, "grad_norm": 1.0143213272094727, "learning_rate": 1.5207552978603316e-05, "loss": 0.3669, "mean_token_accuracy": 0.8918137706816196, "num_tokens": 98098666.0, "step": 14820 }, { "entropy": 0.3335181958973408, "epoch": 0.228285988620336, "grad_norm": 0.8909077644348145, "learning_rate": 1.5217815177792603e-05, "loss": 0.3191, "mean_token_accuracy": 0.8998711377382278, "num_tokens": 98168831.0, "step": 14830 }, { "entropy": 0.348327050358057, "epoch": 0.22843992387901457, "grad_norm": 0.8913992047309875, "learning_rate": 1.5228077376981889e-05, "loss": 0.3527, "mean_token_accuracy": 0.8960890963673591, "num_tokens": 98247843.0, "step": 14840 }, { "entropy": 0.38311309218406675, "epoch": 0.22859385913769317, "grad_norm": 0.8900420069694519, "learning_rate": 1.5238339576171175e-05, "loss": 0.3736, "mean_token_accuracy": 0.8848263584077358, "num_tokens": 98315829.0, "step": 14850 }, { "entropy": 0.35280655026435853, "epoch": 0.22874779439637175, "grad_norm": 0.9383358955383301, "learning_rate": 1.5248601775360461e-05, "loss": 0.352, "mean_token_accuracy": 0.8951925307512283, "num_tokens": 98387955.0, "step": 14860 }, { "entropy": 0.32612907886505127, "epoch": 0.22890172965505032, "grad_norm": 0.9584270715713501, "learning_rate": 1.5258863974549745e-05, "loss": 0.3232, "mean_token_accuracy": 0.9033199399709702, "num_tokens": 98452825.0, "step": 14870 }, { "entropy": 0.35166692920029163, "epoch": 0.22905566491372892, "grad_norm": 1.0886048078536987, "learning_rate": 1.526912617373903e-05, "loss": 0.3657, "mean_token_accuracy": 0.8928722567856312, "num_tokens": 98519791.0, "step": 14880 }, { "entropy": 0.32089286763221025, "epoch": 0.2292096001724075, "grad_norm": 0.950408935546875, "learning_rate": 1.5279388372928317e-05, "loss": 0.3294, "mean_token_accuracy": 0.9059212625026702, "num_tokens": 98588571.0, "step": 14890 }, { "entropy": 0.3661126185208559, "epoch": 0.22936353543108606, "grad_norm": 1.0991625785827637, "learning_rate": 1.5289650572117606e-05, "loss": 0.3778, "mean_token_accuracy": 0.8890106745064259, "num_tokens": 98657292.0, "step": 14900 }, { "entropy": 0.35127245225012305, "epoch": 0.22951747068976466, "grad_norm": 1.1011934280395508, "learning_rate": 1.5299912771306892e-05, "loss": 0.3451, "mean_token_accuracy": 0.8970822975039482, "num_tokens": 98726025.0, "step": 14910 }, { "entropy": 0.3447147013619542, "epoch": 0.22967140594844324, "grad_norm": 1.2308597564697266, "learning_rate": 1.531017497049618e-05, "loss": 0.3484, "mean_token_accuracy": 0.8971939884126187, "num_tokens": 98795012.0, "step": 14920 }, { "entropy": 0.33210302432999017, "epoch": 0.2298253412071218, "grad_norm": 1.003407597541809, "learning_rate": 1.5320437169685464e-05, "loss": 0.3294, "mean_token_accuracy": 0.900631732493639, "num_tokens": 98859880.0, "step": 14930 }, { "entropy": 0.3464490009471774, "epoch": 0.2299792764658004, "grad_norm": 0.9654202461242676, "learning_rate": 1.533069936887475e-05, "loss": 0.3472, "mean_token_accuracy": 0.8949533089995384, "num_tokens": 98926360.0, "step": 14940 }, { "entropy": 0.35193880181759596, "epoch": 0.23013321172447898, "grad_norm": 0.8580811619758606, "learning_rate": 1.5340961568064036e-05, "loss": 0.346, "mean_token_accuracy": 0.8948184214532375, "num_tokens": 98992478.0, "step": 14950 }, { "entropy": 0.3334259731695056, "epoch": 0.23028714698315755, "grad_norm": 0.843891441822052, "learning_rate": 1.5351223767253322e-05, "loss": 0.3281, "mean_token_accuracy": 0.9016995541751385, "num_tokens": 99051852.0, "step": 14960 }, { "entropy": 0.3388340823352337, "epoch": 0.23044108224183613, "grad_norm": 0.9432274103164673, "learning_rate": 1.536148596644261e-05, "loss": 0.3439, "mean_token_accuracy": 0.8966603145003319, "num_tokens": 99117138.0, "step": 14970 }, { "entropy": 0.3475486798211932, "epoch": 0.23059501750051473, "grad_norm": 0.9911577701568604, "learning_rate": 1.5371748165631897e-05, "loss": 0.3426, "mean_token_accuracy": 0.8969514064490796, "num_tokens": 99183191.0, "step": 14980 }, { "entropy": 0.3349669689312577, "epoch": 0.2307489527591933, "grad_norm": 0.9090306758880615, "learning_rate": 1.5382010364821183e-05, "loss": 0.3278, "mean_token_accuracy": 0.8989045016467572, "num_tokens": 99253576.0, "step": 14990 }, { "entropy": 0.33318557143211364, "epoch": 0.23090288801787187, "grad_norm": 1.1697083711624146, "learning_rate": 1.539227256401047e-05, "loss": 0.3314, "mean_token_accuracy": 0.9010598942637443, "num_tokens": 99315247.0, "step": 15000 }, { "entropy": 0.33319126404821875, "epoch": 0.23105682327655047, "grad_norm": 0.9806263446807861, "learning_rate": 1.5402534763199755e-05, "loss": 0.3444, "mean_token_accuracy": 0.8989610269665718, "num_tokens": 99378983.0, "step": 15010 }, { "entropy": 0.3469015141949058, "epoch": 0.23121075853522904, "grad_norm": 0.9266743063926697, "learning_rate": 1.541279696238904e-05, "loss": 0.3447, "mean_token_accuracy": 0.8970417559146882, "num_tokens": 99441680.0, "step": 15020 }, { "entropy": 0.338975235261023, "epoch": 0.23136469379390762, "grad_norm": 0.8643180131912231, "learning_rate": 1.5423059161578327e-05, "loss": 0.3368, "mean_token_accuracy": 0.8988993041217327, "num_tokens": 99519419.0, "step": 15030 }, { "entropy": 0.32866589464247226, "epoch": 0.23151862905258622, "grad_norm": 0.9493595957756042, "learning_rate": 1.5433321360767613e-05, "loss": 0.3291, "mean_token_accuracy": 0.9003329090774059, "num_tokens": 99589090.0, "step": 15040 }, { "entropy": 0.34474237356334925, "epoch": 0.2316725643112648, "grad_norm": 1.087359070777893, "learning_rate": 1.54435835599569e-05, "loss": 0.3578, "mean_token_accuracy": 0.8962682768702507, "num_tokens": 99645833.0, "step": 15050 }, { "entropy": 0.35223126634955404, "epoch": 0.23182649956994336, "grad_norm": 1.055171012878418, "learning_rate": 1.5453845759146185e-05, "loss": 0.3423, "mean_token_accuracy": 0.8952971659600735, "num_tokens": 99709938.0, "step": 15060 }, { "entropy": 0.3408489226363599, "epoch": 0.23198043482862196, "grad_norm": 0.8192004561424255, "learning_rate": 1.546410795833547e-05, "loss": 0.3301, "mean_token_accuracy": 0.8972349278628826, "num_tokens": 99779439.0, "step": 15070 }, { "entropy": 0.3338123564608395, "epoch": 0.23213437008730053, "grad_norm": 0.7084630131721497, "learning_rate": 1.5474370157524757e-05, "loss": 0.3308, "mean_token_accuracy": 0.9010661192238331, "num_tokens": 99852846.0, "step": 15080 }, { "entropy": 0.3449832107871771, "epoch": 0.2322883053459791, "grad_norm": 0.9957756400108337, "learning_rate": 1.5484632356714043e-05, "loss": 0.3383, "mean_token_accuracy": 0.8974331051111222, "num_tokens": 99928610.0, "step": 15090 }, { "entropy": 0.31438056044280527, "epoch": 0.2324422406046577, "grad_norm": 0.9073680639266968, "learning_rate": 1.549489455590333e-05, "loss": 0.3163, "mean_token_accuracy": 0.9038571372628212, "num_tokens": 99999205.0, "step": 15100 }, { "entropy": 0.3596272062510252, "epoch": 0.23259617586333628, "grad_norm": 1.0541187524795532, "learning_rate": 1.5505156755092618e-05, "loss": 0.3429, "mean_token_accuracy": 0.8931495293974876, "num_tokens": 100071514.0, "step": 15110 }, { "entropy": 0.3302731657400727, "epoch": 0.23275011112201485, "grad_norm": 0.8898420929908752, "learning_rate": 1.5515418954281904e-05, "loss": 0.3341, "mean_token_accuracy": 0.9015456929802894, "num_tokens": 100147306.0, "step": 15120 }, { "entropy": 0.34826031103730204, "epoch": 0.23290404638069345, "grad_norm": 1.0746705532073975, "learning_rate": 1.552568115347119e-05, "loss": 0.3457, "mean_token_accuracy": 0.8959790602326393, "num_tokens": 100211395.0, "step": 15130 }, { "entropy": 0.3482902547344565, "epoch": 0.23305798163937202, "grad_norm": 0.9440603852272034, "learning_rate": 1.5535943352660476e-05, "loss": 0.3429, "mean_token_accuracy": 0.8987072326242924, "num_tokens": 100268661.0, "step": 15140 }, { "entropy": 0.3068597562611103, "epoch": 0.2332119168980506, "grad_norm": 0.961638867855072, "learning_rate": 1.5546205551849762e-05, "loss": 0.3221, "mean_token_accuracy": 0.9063241206109524, "num_tokens": 100326524.0, "step": 15150 }, { "entropy": 0.34663974717259405, "epoch": 0.23336585215672917, "grad_norm": 0.7949711084365845, "learning_rate": 1.5556467751039048e-05, "loss": 0.3469, "mean_token_accuracy": 0.8965093091130256, "num_tokens": 100390388.0, "step": 15160 }, { "entropy": 0.3336160223931074, "epoch": 0.23351978741540777, "grad_norm": 0.7974973917007446, "learning_rate": 1.5566729950228334e-05, "loss": 0.3361, "mean_token_accuracy": 0.8998984105885028, "num_tokens": 100458046.0, "step": 15170 }, { "entropy": 0.338928615860641, "epoch": 0.23367372267408634, "grad_norm": 0.8459736108779907, "learning_rate": 1.5576992149417623e-05, "loss": 0.3392, "mean_token_accuracy": 0.8974551603198051, "num_tokens": 100534276.0, "step": 15180 }, { "entropy": 0.3272937316447496, "epoch": 0.23382765793276492, "grad_norm": 1.4114683866500854, "learning_rate": 1.558725434860691e-05, "loss": 0.3351, "mean_token_accuracy": 0.900442561507225, "num_tokens": 100601382.0, "step": 15190 }, { "entropy": 0.33767803506925703, "epoch": 0.23398159319144352, "grad_norm": 1.2034164667129517, "learning_rate": 1.5597516547796195e-05, "loss": 0.3346, "mean_token_accuracy": 0.8988872267305851, "num_tokens": 100661746.0, "step": 15200 }, { "entropy": 0.330252811126411, "epoch": 0.2341355284501221, "grad_norm": 0.7050971388816833, "learning_rate": 1.560777874698548e-05, "loss": 0.3386, "mean_token_accuracy": 0.9008645378053188, "num_tokens": 100731474.0, "step": 15210 }, { "entropy": 0.34219943508505823, "epoch": 0.23428946370880066, "grad_norm": 0.86821049451828, "learning_rate": 1.5618040946174767e-05, "loss": 0.3325, "mean_token_accuracy": 0.8970118187367916, "num_tokens": 100796528.0, "step": 15220 }, { "entropy": 0.33525931686162946, "epoch": 0.23444339896747926, "grad_norm": 0.8287476301193237, "learning_rate": 1.5628303145364053e-05, "loss": 0.3277, "mean_token_accuracy": 0.9014592781662941, "num_tokens": 100862841.0, "step": 15230 }, { "entropy": 0.3434631612151861, "epoch": 0.23459733422615783, "grad_norm": 0.9975884556770325, "learning_rate": 1.563856534455334e-05, "loss": 0.3367, "mean_token_accuracy": 0.8965497009456158, "num_tokens": 100927461.0, "step": 15240 }, { "entropy": 0.31947352569550275, "epoch": 0.2347512694848364, "grad_norm": 0.9336960315704346, "learning_rate": 1.5648827543742625e-05, "loss": 0.3232, "mean_token_accuracy": 0.9055870659649372, "num_tokens": 100986433.0, "step": 15250 }, { "entropy": 0.345785197429359, "epoch": 0.234905204743515, "grad_norm": 1.1503700017929077, "learning_rate": 1.565908974293191e-05, "loss": 0.3405, "mean_token_accuracy": 0.8960652559995651, "num_tokens": 101051844.0, "step": 15260 }, { "entropy": 0.33108685780316593, "epoch": 0.23505914000219358, "grad_norm": 1.2141090631484985, "learning_rate": 1.5669351942121197e-05, "loss": 0.3295, "mean_token_accuracy": 0.9016084246337414, "num_tokens": 101115052.0, "step": 15270 }, { "entropy": 0.36090607214719056, "epoch": 0.23521307526087215, "grad_norm": 0.8476120829582214, "learning_rate": 1.5679614141310483e-05, "loss": 0.3601, "mean_token_accuracy": 0.8950169034302234, "num_tokens": 101191522.0, "step": 15280 }, { "entropy": 0.3235401097685099, "epoch": 0.23536701051955075, "grad_norm": 0.89124995470047, "learning_rate": 1.568987634049977e-05, "loss": 0.3228, "mean_token_accuracy": 0.9016498163342476, "num_tokens": 101257192.0, "step": 15290 }, { "entropy": 0.35269466824829576, "epoch": 0.23552094577822932, "grad_norm": 0.884801983833313, "learning_rate": 1.5700138539689054e-05, "loss": 0.345, "mean_token_accuracy": 0.8950144454836846, "num_tokens": 101325161.0, "step": 15300 }, { "entropy": 0.337998459674418, "epoch": 0.2356748810369079, "grad_norm": 1.1640374660491943, "learning_rate": 1.571040073887834e-05, "loss": 0.339, "mean_token_accuracy": 0.8979968398809433, "num_tokens": 101392970.0, "step": 15310 }, { "entropy": 0.3428311372175813, "epoch": 0.2358288162955865, "grad_norm": 0.7604635953903198, "learning_rate": 1.572066293806763e-05, "loss": 0.3493, "mean_token_accuracy": 0.8965647846460343, "num_tokens": 101465664.0, "step": 15320 }, { "entropy": 0.33579759355634453, "epoch": 0.23598275155426507, "grad_norm": 0.8347768783569336, "learning_rate": 1.5730925137256916e-05, "loss": 0.3355, "mean_token_accuracy": 0.8992259301245212, "num_tokens": 101529007.0, "step": 15330 }, { "entropy": 0.3439326986670494, "epoch": 0.23613668681294364, "grad_norm": 1.201247215270996, "learning_rate": 1.57411873364462e-05, "loss": 0.3444, "mean_token_accuracy": 0.8964450784027577, "num_tokens": 101583342.0, "step": 15340 }, { "entropy": 0.34807492755353453, "epoch": 0.2362906220716222, "grad_norm": 0.7841680645942688, "learning_rate": 1.5751449535635488e-05, "loss": 0.3587, "mean_token_accuracy": 0.8925960108637809, "num_tokens": 101651759.0, "step": 15350 }, { "entropy": 0.352733226493001, "epoch": 0.23644455733030081, "grad_norm": 1.0254424810409546, "learning_rate": 1.5761711734824774e-05, "loss": 0.3475, "mean_token_accuracy": 0.8960646830499173, "num_tokens": 101715642.0, "step": 15360 }, { "entropy": 0.33929871637374165, "epoch": 0.2365984925889794, "grad_norm": 0.8573809862136841, "learning_rate": 1.577197393401406e-05, "loss": 0.3307, "mean_token_accuracy": 0.8976258046925067, "num_tokens": 101787951.0, "step": 15370 }, { "entropy": 0.3478397946804762, "epoch": 0.23675242784765796, "grad_norm": 0.9152396321296692, "learning_rate": 1.5782236133203345e-05, "loss": 0.3448, "mean_token_accuracy": 0.8942922987043858, "num_tokens": 101859691.0, "step": 15380 }, { "entropy": 0.34267724193632604, "epoch": 0.23690636310633656, "grad_norm": 0.8506231904029846, "learning_rate": 1.5792498332392635e-05, "loss": 0.3283, "mean_token_accuracy": 0.8973877899348736, "num_tokens": 101929123.0, "step": 15390 }, { "entropy": 0.3351287327706814, "epoch": 0.23706029836501513, "grad_norm": 1.2366130352020264, "learning_rate": 1.580276053158192e-05, "loss": 0.3352, "mean_token_accuracy": 0.8999427787959575, "num_tokens": 101994687.0, "step": 15400 }, { "entropy": 0.3602118710055947, "epoch": 0.2372142336236937, "grad_norm": 0.9026773571968079, "learning_rate": 1.5813022730771207e-05, "loss": 0.3685, "mean_token_accuracy": 0.8906588666141033, "num_tokens": 102057633.0, "step": 15410 }, { "entropy": 0.3308052159845829, "epoch": 0.2373681688823723, "grad_norm": 1.2530686855316162, "learning_rate": 1.5823284929960493e-05, "loss": 0.3373, "mean_token_accuracy": 0.9040094025433063, "num_tokens": 102116385.0, "step": 15420 }, { "entropy": 0.3481506179086864, "epoch": 0.23752210414105088, "grad_norm": 0.9808880090713501, "learning_rate": 1.583354712914978e-05, "loss": 0.3435, "mean_token_accuracy": 0.8959655314683914, "num_tokens": 102186649.0, "step": 15430 }, { "entropy": 0.34336204938590525, "epoch": 0.23767603939972945, "grad_norm": 1.1915559768676758, "learning_rate": 1.5843809328339065e-05, "loss": 0.3297, "mean_token_accuracy": 0.8966076128184796, "num_tokens": 102256542.0, "step": 15440 }, { "entropy": 0.3165686570107937, "epoch": 0.23782997465840805, "grad_norm": 0.908933699131012, "learning_rate": 1.585407152752835e-05, "loss": 0.3263, "mean_token_accuracy": 0.9049242459237575, "num_tokens": 102318479.0, "step": 15450 }, { "entropy": 0.3138377944007516, "epoch": 0.23798390991708662, "grad_norm": 1.0617449283599854, "learning_rate": 1.5864333726717636e-05, "loss": 0.3225, "mean_token_accuracy": 0.9046672753989696, "num_tokens": 102379340.0, "step": 15460 }, { "entropy": 0.34395489245653155, "epoch": 0.2381378451757652, "grad_norm": 0.9696217179298401, "learning_rate": 1.5874595925906922e-05, "loss": 0.3465, "mean_token_accuracy": 0.8973264768719673, "num_tokens": 102437967.0, "step": 15470 }, { "entropy": 0.3420148055069149, "epoch": 0.2382917804344438, "grad_norm": 0.8813077807426453, "learning_rate": 1.588485812509621e-05, "loss": 0.3417, "mean_token_accuracy": 0.8955949425697327, "num_tokens": 102502748.0, "step": 15480 }, { "entropy": 0.34522489123046396, "epoch": 0.23844571569312237, "grad_norm": 0.7721817493438721, "learning_rate": 1.5895120324285494e-05, "loss": 0.3362, "mean_token_accuracy": 0.8982422716915608, "num_tokens": 102564808.0, "step": 15490 }, { "entropy": 0.32663917895406486, "epoch": 0.23859965095180094, "grad_norm": 0.9175283312797546, "learning_rate": 1.590538252347478e-05, "loss": 0.3241, "mean_token_accuracy": 0.9024589903652668, "num_tokens": 102626326.0, "step": 15500 }, { "entropy": 0.3377850580960512, "epoch": 0.23875358621047954, "grad_norm": 1.013188362121582, "learning_rate": 1.5915644722664066e-05, "loss": 0.3349, "mean_token_accuracy": 0.8980246976017952, "num_tokens": 102695631.0, "step": 15510 }, { "entropy": 0.3424107702448964, "epoch": 0.2389075214691581, "grad_norm": 1.1015374660491943, "learning_rate": 1.5925906921853352e-05, "loss": 0.3481, "mean_token_accuracy": 0.8975272037088871, "num_tokens": 102750248.0, "step": 15520 }, { "entropy": 0.3370279397815466, "epoch": 0.23906145672783669, "grad_norm": 1.0124448537826538, "learning_rate": 1.593616912104264e-05, "loss": 0.336, "mean_token_accuracy": 0.897292310744524, "num_tokens": 102818451.0, "step": 15530 }, { "entropy": 0.3448965523391962, "epoch": 0.23921539198651526, "grad_norm": 0.7484856247901917, "learning_rate": 1.5946431320231927e-05, "loss": 0.3385, "mean_token_accuracy": 0.8988717965781688, "num_tokens": 102890325.0, "step": 15540 }, { "entropy": 0.3277913386002183, "epoch": 0.23936932724519386, "grad_norm": 1.1548471450805664, "learning_rate": 1.5956693519421213e-05, "loss": 0.3334, "mean_token_accuracy": 0.9000749185681343, "num_tokens": 102953126.0, "step": 15550 }, { "entropy": 0.3318356145173311, "epoch": 0.23952326250387243, "grad_norm": 1.3439841270446777, "learning_rate": 1.59669557186105e-05, "loss": 0.3347, "mean_token_accuracy": 0.9002738423645497, "num_tokens": 103016628.0, "step": 15560 }, { "entropy": 0.3309711031615734, "epoch": 0.239677197762551, "grad_norm": 1.1817004680633545, "learning_rate": 1.5977217917799785e-05, "loss": 0.3253, "mean_token_accuracy": 0.9026694975793361, "num_tokens": 103078557.0, "step": 15570 }, { "entropy": 0.3677051769569516, "epoch": 0.2398311330212296, "grad_norm": 0.9510855674743652, "learning_rate": 1.598748011698907e-05, "loss": 0.3579, "mean_token_accuracy": 0.8905341058969498, "num_tokens": 103149414.0, "step": 15580 }, { "entropy": 0.33973194751888514, "epoch": 0.23998506827990818, "grad_norm": 1.0597434043884277, "learning_rate": 1.5997742316178357e-05, "loss": 0.3444, "mean_token_accuracy": 0.9016252011060715, "num_tokens": 103215778.0, "step": 15590 }, { "entropy": 0.34859199579805134, "epoch": 0.24013900353858675, "grad_norm": 0.9721766114234924, "learning_rate": 1.6008004515367646e-05, "loss": 0.3385, "mean_token_accuracy": 0.8961347930133343, "num_tokens": 103276755.0, "step": 15600 }, { "entropy": 0.35030864384025334, "epoch": 0.24029293879726535, "grad_norm": 0.8834133744239807, "learning_rate": 1.6018266714556932e-05, "loss": 0.3542, "mean_token_accuracy": 0.8948439046740532, "num_tokens": 103345579.0, "step": 15610 }, { "entropy": 0.35384080819785596, "epoch": 0.24044687405594392, "grad_norm": 1.0056617259979248, "learning_rate": 1.602852891374622e-05, "loss": 0.3364, "mean_token_accuracy": 0.8933177858591079, "num_tokens": 103406563.0, "step": 15620 }, { "entropy": 0.35729899201542137, "epoch": 0.2406008093146225, "grad_norm": 0.9405458569526672, "learning_rate": 1.6038791112935504e-05, "loss": 0.3491, "mean_token_accuracy": 0.8889943659305573, "num_tokens": 103474883.0, "step": 15630 }, { "entropy": 0.35369075313210485, "epoch": 0.2407547445733011, "grad_norm": 1.2552529573440552, "learning_rate": 1.604905331212479e-05, "loss": 0.3527, "mean_token_accuracy": 0.8944333530962467, "num_tokens": 103540984.0, "step": 15640 }, { "entropy": 0.3397589709609747, "epoch": 0.24090867983197967, "grad_norm": 0.8585676550865173, "learning_rate": 1.6059315511314076e-05, "loss": 0.3427, "mean_token_accuracy": 0.898659086227417, "num_tokens": 103612519.0, "step": 15650 }, { "entropy": 0.32468686876818537, "epoch": 0.24106261509065824, "grad_norm": 1.1023786067962646, "learning_rate": 1.6069577710503362e-05, "loss": 0.3196, "mean_token_accuracy": 0.9023633383214473, "num_tokens": 103674746.0, "step": 15660 }, { "entropy": 0.34189087375998495, "epoch": 0.24121655034933684, "grad_norm": 0.8226726055145264, "learning_rate": 1.6079839909692648e-05, "loss": 0.3496, "mean_token_accuracy": 0.8959307745099068, "num_tokens": 103753880.0, "step": 15670 }, { "entropy": 0.33318022079765797, "epoch": 0.2413704856080154, "grad_norm": 0.8639942407608032, "learning_rate": 1.6090102108881934e-05, "loss": 0.3349, "mean_token_accuracy": 0.8995519891381264, "num_tokens": 103824051.0, "step": 15680 }, { "entropy": 0.33731548190116883, "epoch": 0.24152442086669398, "grad_norm": 1.0574461221694946, "learning_rate": 1.610036430807122e-05, "loss": 0.3327, "mean_token_accuracy": 0.8994092412292958, "num_tokens": 103888281.0, "step": 15690 }, { "entropy": 0.36226900909096005, "epoch": 0.24167835612537258, "grad_norm": 1.0353624820709229, "learning_rate": 1.6110626507260506e-05, "loss": 0.3451, "mean_token_accuracy": 0.8928975917398929, "num_tokens": 103953929.0, "step": 15700 }, { "entropy": 0.31779544372111557, "epoch": 0.24183229138405116, "grad_norm": 0.8613629341125488, "learning_rate": 1.6120888706449792e-05, "loss": 0.3131, "mean_token_accuracy": 0.9055507205426693, "num_tokens": 104020790.0, "step": 15710 }, { "entropy": 0.34271442592144014, "epoch": 0.24198622664272973, "grad_norm": 1.1082477569580078, "learning_rate": 1.6131150905639078e-05, "loss": 0.3419, "mean_token_accuracy": 0.8944230020046234, "num_tokens": 104086129.0, "step": 15720 }, { "entropy": 0.34084668988361955, "epoch": 0.2421401619014083, "grad_norm": 0.8586068153381348, "learning_rate": 1.6141413104828364e-05, "loss": 0.339, "mean_token_accuracy": 0.8986884370446205, "num_tokens": 104149699.0, "step": 15730 }, { "entropy": 0.33445830158889295, "epoch": 0.2422940971600869, "grad_norm": 1.021330714225769, "learning_rate": 1.6151675304017653e-05, "loss": 0.3322, "mean_token_accuracy": 0.8996772773563861, "num_tokens": 104211212.0, "step": 15740 }, { "entropy": 0.3516005137935281, "epoch": 0.24244803241876547, "grad_norm": 0.8170610070228577, "learning_rate": 1.616193750320694e-05, "loss": 0.354, "mean_token_accuracy": 0.8951497256755829, "num_tokens": 104275944.0, "step": 15750 }, { "entropy": 0.33008938245475294, "epoch": 0.24260196767744405, "grad_norm": 1.0137580633163452, "learning_rate": 1.6172199702396225e-05, "loss": 0.3238, "mean_token_accuracy": 0.9020410552620888, "num_tokens": 104344180.0, "step": 15760 }, { "entropy": 0.34401535904034974, "epoch": 0.24275590293612265, "grad_norm": 1.1129790544509888, "learning_rate": 1.618246190158551e-05, "loss": 0.3411, "mean_token_accuracy": 0.8974200077354908, "num_tokens": 104415018.0, "step": 15770 }, { "entropy": 0.3244214877486229, "epoch": 0.24290983819480122, "grad_norm": 1.18852698802948, "learning_rate": 1.6192724100774797e-05, "loss": 0.3211, "mean_token_accuracy": 0.9027756132185459, "num_tokens": 104490352.0, "step": 15780 }, { "entropy": 0.3102956837043166, "epoch": 0.2430637734534798, "grad_norm": 1.1742491722106934, "learning_rate": 1.6202986299964083e-05, "loss": 0.3075, "mean_token_accuracy": 0.9077977031469345, "num_tokens": 104553687.0, "step": 15790 }, { "entropy": 0.3221133131533861, "epoch": 0.2432177087121584, "grad_norm": 0.9717815518379211, "learning_rate": 1.621324849915337e-05, "loss": 0.3405, "mean_token_accuracy": 0.901460362225771, "num_tokens": 104621903.0, "step": 15800 }, { "entropy": 0.34749216996133325, "epoch": 0.24337164397083696, "grad_norm": 0.8710683584213257, "learning_rate": 1.6223510698342658e-05, "loss": 0.3485, "mean_token_accuracy": 0.8940451450645923, "num_tokens": 104694994.0, "step": 15810 }, { "entropy": 0.33780239559710024, "epoch": 0.24352557922951554, "grad_norm": 1.237435221672058, "learning_rate": 1.6233772897531944e-05, "loss": 0.3515, "mean_token_accuracy": 0.9010787986218929, "num_tokens": 104758067.0, "step": 15820 }, { "entropy": 0.33651818744838236, "epoch": 0.24367951448819414, "grad_norm": 1.280104160308838, "learning_rate": 1.624403509672123e-05, "loss": 0.3344, "mean_token_accuracy": 0.8979902684688568, "num_tokens": 104821213.0, "step": 15830 }, { "entropy": 0.337699119374156, "epoch": 0.2438334497468727, "grad_norm": 0.8600005507469177, "learning_rate": 1.6254297295910516e-05, "loss": 0.3368, "mean_token_accuracy": 0.9004607804119587, "num_tokens": 104885354.0, "step": 15840 }, { "entropy": 0.3264557690359652, "epoch": 0.24398738500555128, "grad_norm": 0.8867344260215759, "learning_rate": 1.6264559495099802e-05, "loss": 0.3261, "mean_token_accuracy": 0.9015897668898105, "num_tokens": 104946472.0, "step": 15850 }, { "entropy": 0.3590759489685297, "epoch": 0.24414132026422988, "grad_norm": 1.0105472803115845, "learning_rate": 1.6274821694289088e-05, "loss": 0.3607, "mean_token_accuracy": 0.891762088239193, "num_tokens": 105011028.0, "step": 15860 }, { "entropy": 0.3704089755192399, "epoch": 0.24429525552290846, "grad_norm": 1.1456336975097656, "learning_rate": 1.6285083893478374e-05, "loss": 0.3571, "mean_token_accuracy": 0.8894417703151702, "num_tokens": 105083052.0, "step": 15870 }, { "entropy": 0.33945568893104794, "epoch": 0.24444919078158703, "grad_norm": 1.0967501401901245, "learning_rate": 1.629534609266766e-05, "loss": 0.3294, "mean_token_accuracy": 0.8955955021083355, "num_tokens": 105152484.0, "step": 15880 }, { "entropy": 0.3051165063865483, "epoch": 0.24460312604026563, "grad_norm": 1.0276217460632324, "learning_rate": 1.6305608291856946e-05, "loss": 0.3166, "mean_token_accuracy": 0.9093525610864163, "num_tokens": 105213505.0, "step": 15890 }, { "entropy": 0.3402341345325112, "epoch": 0.2447570612989442, "grad_norm": 1.0253326892852783, "learning_rate": 1.631587049104623e-05, "loss": 0.3438, "mean_token_accuracy": 0.8976848848164082, "num_tokens": 105281768.0, "step": 15900 }, { "entropy": 0.3374909995123744, "epoch": 0.24491099655762277, "grad_norm": 1.32438325881958, "learning_rate": 1.6326132690235518e-05, "loss": 0.3534, "mean_token_accuracy": 0.8956445135176182, "num_tokens": 105341683.0, "step": 15910 }, { "entropy": 0.3352485133334994, "epoch": 0.24506493181630135, "grad_norm": 1.2741950750350952, "learning_rate": 1.6336394889424804e-05, "loss": 0.327, "mean_token_accuracy": 0.9002325616776943, "num_tokens": 105411184.0, "step": 15920 }, { "entropy": 0.3521236125379801, "epoch": 0.24521886707497995, "grad_norm": 1.1646066904067993, "learning_rate": 1.634665708861409e-05, "loss": 0.3462, "mean_token_accuracy": 0.892304091155529, "num_tokens": 105477793.0, "step": 15930 }, { "entropy": 0.3290470803156495, "epoch": 0.24537280233365852, "grad_norm": 1.1700540781021118, "learning_rate": 1.6356919287803375e-05, "loss": 0.3321, "mean_token_accuracy": 0.9017195902764797, "num_tokens": 105543519.0, "step": 15940 }, { "entropy": 0.3542687692679465, "epoch": 0.2455267375923371, "grad_norm": 1.0998985767364502, "learning_rate": 1.6367181486992665e-05, "loss": 0.3506, "mean_token_accuracy": 0.8938958287239075, "num_tokens": 105611174.0, "step": 15950 }, { "entropy": 0.34125550743192434, "epoch": 0.2456806728510157, "grad_norm": 1.0433602333068848, "learning_rate": 1.637744368618195e-05, "loss": 0.3388, "mean_token_accuracy": 0.8972013175487519, "num_tokens": 105680019.0, "step": 15960 }, { "entropy": 0.33227487541735173, "epoch": 0.24583460810969426, "grad_norm": 0.8467202186584473, "learning_rate": 1.6387705885371237e-05, "loss": 0.3391, "mean_token_accuracy": 0.8999508656561375, "num_tokens": 105745813.0, "step": 15970 }, { "entropy": 0.3619113082066178, "epoch": 0.24598854336837284, "grad_norm": 0.959578812122345, "learning_rate": 1.6397968084560523e-05, "loss": 0.3454, "mean_token_accuracy": 0.8919618211686611, "num_tokens": 105812980.0, "step": 15980 }, { "entropy": 0.3318849764764309, "epoch": 0.24614247862705144, "grad_norm": 1.2434515953063965, "learning_rate": 1.640823028374981e-05, "loss": 0.3273, "mean_token_accuracy": 0.9015708982944488, "num_tokens": 105874760.0, "step": 15990 }, { "entropy": 0.3491962690837681, "epoch": 0.24629641388573, "grad_norm": 1.0623756647109985, "learning_rate": 1.6418492482939094e-05, "loss": 0.3482, "mean_token_accuracy": 0.8959748581051826, "num_tokens": 105936181.0, "step": 16000 }, { "entropy": 0.34238131921738385, "epoch": 0.24645034914440858, "grad_norm": 0.8515142798423767, "learning_rate": 1.642875468212838e-05, "loss": 0.3355, "mean_token_accuracy": 0.8989916421473026, "num_tokens": 106011599.0, "step": 16010 }, { "entropy": 0.3229739356786013, "epoch": 0.24660428440308718, "grad_norm": 0.7867009043693542, "learning_rate": 1.643901688131767e-05, "loss": 0.3229, "mean_token_accuracy": 0.9027400054037571, "num_tokens": 106082431.0, "step": 16020 }, { "entropy": 0.32104221247136594, "epoch": 0.24675821966176575, "grad_norm": 0.8806533813476562, "learning_rate": 1.6449279080506956e-05, "loss": 0.3207, "mean_token_accuracy": 0.9029978081583977, "num_tokens": 106146402.0, "step": 16030 }, { "entropy": 0.36311257714405654, "epoch": 0.24691215492044433, "grad_norm": 1.0572434663772583, "learning_rate": 1.645954127969624e-05, "loss": 0.3503, "mean_token_accuracy": 0.8916069336235524, "num_tokens": 106212986.0, "step": 16040 }, { "entropy": 0.3417260365560651, "epoch": 0.24706609017912293, "grad_norm": 0.899534285068512, "learning_rate": 1.6469803478885528e-05, "loss": 0.3479, "mean_token_accuracy": 0.8960200570523739, "num_tokens": 106274285.0, "step": 16050 }, { "entropy": 0.34047026857733725, "epoch": 0.2472200254378015, "grad_norm": 1.0377014875411987, "learning_rate": 1.6480065678074814e-05, "loss": 0.3473, "mean_token_accuracy": 0.8979247406125068, "num_tokens": 106332018.0, "step": 16060 }, { "entropy": 0.332492945715785, "epoch": 0.24737396069648007, "grad_norm": 1.0436041355133057, "learning_rate": 1.64903278772641e-05, "loss": 0.3245, "mean_token_accuracy": 0.9012723281979561, "num_tokens": 106393730.0, "step": 16070 }, { "entropy": 0.3354235951788723, "epoch": 0.24752789595515867, "grad_norm": 0.8511201739311218, "learning_rate": 1.6500590076453385e-05, "loss": 0.34, "mean_token_accuracy": 0.8983665265142917, "num_tokens": 106454938.0, "step": 16080 }, { "entropy": 0.326163686811924, "epoch": 0.24768183121383724, "grad_norm": 1.1466076374053955, "learning_rate": 1.651085227564267e-05, "loss": 0.3272, "mean_token_accuracy": 0.8996806263923645, "num_tokens": 106509981.0, "step": 16090 }, { "entropy": 0.3456494530662894, "epoch": 0.24783576647251582, "grad_norm": 1.1873623132705688, "learning_rate": 1.6521114474831957e-05, "loss": 0.3402, "mean_token_accuracy": 0.8985186755657196, "num_tokens": 106581228.0, "step": 16100 }, { "entropy": 0.3199710000306368, "epoch": 0.2479897017311944, "grad_norm": 0.9408320188522339, "learning_rate": 1.6531376674021243e-05, "loss": 0.3184, "mean_token_accuracy": 0.9022484973073006, "num_tokens": 106645023.0, "step": 16110 }, { "entropy": 0.357442875392735, "epoch": 0.248143636989873, "grad_norm": 0.841323971748352, "learning_rate": 1.654163887321053e-05, "loss": 0.348, "mean_token_accuracy": 0.8943756222724915, "num_tokens": 106714415.0, "step": 16120 }, { "entropy": 0.32922798246145246, "epoch": 0.24829757224855156, "grad_norm": 1.0068529844284058, "learning_rate": 1.6551901072399815e-05, "loss": 0.3376, "mean_token_accuracy": 0.9003820337355137, "num_tokens": 106783043.0, "step": 16130 }, { "entropy": 0.35208101850003004, "epoch": 0.24845150750723013, "grad_norm": 1.0964876413345337, "learning_rate": 1.65621632715891e-05, "loss": 0.3572, "mean_token_accuracy": 0.8934477239847183, "num_tokens": 106846625.0, "step": 16140 }, { "entropy": 0.33243641071021557, "epoch": 0.24860544276590874, "grad_norm": 1.0336849689483643, "learning_rate": 1.6572425470778387e-05, "loss": 0.3311, "mean_token_accuracy": 0.9010642312467099, "num_tokens": 106918431.0, "step": 16150 }, { "entropy": 0.3190658316016197, "epoch": 0.2487593780245873, "grad_norm": 1.063986897468567, "learning_rate": 1.6582687669967676e-05, "loss": 0.3227, "mean_token_accuracy": 0.9034149676561356, "num_tokens": 106975828.0, "step": 16160 }, { "entropy": 0.3533791970461607, "epoch": 0.24891331328326588, "grad_norm": 0.9844688177108765, "learning_rate": 1.6592949869156962e-05, "loss": 0.355, "mean_token_accuracy": 0.8937878891825676, "num_tokens": 107050168.0, "step": 16170 }, { "entropy": 0.3514612914994359, "epoch": 0.24906724854194448, "grad_norm": 0.8945493698120117, "learning_rate": 1.6603212068346248e-05, "loss": 0.3354, "mean_token_accuracy": 0.8954433470964431, "num_tokens": 107111643.0, "step": 16180 }, { "entropy": 0.32226260416209696, "epoch": 0.24922118380062305, "grad_norm": 0.6903788447380066, "learning_rate": 1.6613474267535534e-05, "loss": 0.3229, "mean_token_accuracy": 0.8989117950201034, "num_tokens": 107177087.0, "step": 16190 }, { "entropy": 0.3446161936968565, "epoch": 0.24937511905930163, "grad_norm": 1.2332541942596436, "learning_rate": 1.662373646672482e-05, "loss": 0.3505, "mean_token_accuracy": 0.8970385521650315, "num_tokens": 107246931.0, "step": 16200 }, { "entropy": 0.342315760999918, "epoch": 0.24952905431798023, "grad_norm": 1.1110624074935913, "learning_rate": 1.6633998665914106e-05, "loss": 0.3384, "mean_token_accuracy": 0.8974533952772618, "num_tokens": 107308972.0, "step": 16210 }, { "entropy": 0.33524065408855674, "epoch": 0.2496829895766588, "grad_norm": 0.8714624047279358, "learning_rate": 1.6644260865103392e-05, "loss": 0.3356, "mean_token_accuracy": 0.8991208486258984, "num_tokens": 107377418.0, "step": 16220 }, { "entropy": 0.3421924138441682, "epoch": 0.24983692483533737, "grad_norm": 0.8137999773025513, "learning_rate": 1.665452306429268e-05, "loss": 0.3313, "mean_token_accuracy": 0.8985381081700325, "num_tokens": 107446018.0, "step": 16230 }, { "entropy": 0.33295349320396783, "epoch": 0.24999086009401597, "grad_norm": 1.0316303968429565, "learning_rate": 1.6664785263481967e-05, "loss": 0.3404, "mean_token_accuracy": 0.8981610603630543, "num_tokens": 107512523.0, "step": 16240 }, { "entropy": 0.33740181624889376, "epoch": 0.2501447953526945, "grad_norm": 0.8818724751472473, "learning_rate": 1.6675047462671253e-05, "loss": 0.3392, "mean_token_accuracy": 0.8986416630446911, "num_tokens": 107584497.0, "step": 16250 }, { "entropy": 0.3339621365070343, "epoch": 0.25029873061137314, "grad_norm": 0.9786550998687744, "learning_rate": 1.668530966186054e-05, "loss": 0.342, "mean_token_accuracy": 0.9002343900501728, "num_tokens": 107649118.0, "step": 16260 }, { "entropy": 0.34867025930434464, "epoch": 0.2504526658700517, "grad_norm": 1.0743556022644043, "learning_rate": 1.6695571861049825e-05, "loss": 0.3499, "mean_token_accuracy": 0.8976674988865853, "num_tokens": 107714469.0, "step": 16270 }, { "entropy": 0.3321712641976774, "epoch": 0.2506066011287303, "grad_norm": 1.0733642578125, "learning_rate": 1.670583406023911e-05, "loss": 0.3231, "mean_token_accuracy": 0.901260431110859, "num_tokens": 107777712.0, "step": 16280 }, { "entropy": 0.33334287963807585, "epoch": 0.25076053638740886, "grad_norm": 0.8656998872756958, "learning_rate": 1.6716096259428397e-05, "loss": 0.3281, "mean_token_accuracy": 0.9011980392038822, "num_tokens": 107846508.0, "step": 16290 }, { "entropy": 0.3227114871144295, "epoch": 0.25091447164608743, "grad_norm": 1.091599464416504, "learning_rate": 1.6726358458617683e-05, "loss": 0.3326, "mean_token_accuracy": 0.902744372934103, "num_tokens": 107906218.0, "step": 16300 }, { "entropy": 0.33530948869884014, "epoch": 0.251068406904766, "grad_norm": 0.8254163265228271, "learning_rate": 1.673662065780697e-05, "loss": 0.3455, "mean_token_accuracy": 0.8989288285374641, "num_tokens": 107971337.0, "step": 16310 }, { "entropy": 0.34882230600342157, "epoch": 0.25122234216344463, "grad_norm": 0.8712889552116394, "learning_rate": 1.6746882856996255e-05, "loss": 0.3317, "mean_token_accuracy": 0.8957455605268478, "num_tokens": 108034984.0, "step": 16320 }, { "entropy": 0.33186702858656647, "epoch": 0.2513762774221232, "grad_norm": 0.9015682339668274, "learning_rate": 1.675714505618554e-05, "loss": 0.3324, "mean_token_accuracy": 0.9007310412824154, "num_tokens": 108100258.0, "step": 16330 }, { "entropy": 0.33508351184427737, "epoch": 0.2515302126808018, "grad_norm": 0.9207953214645386, "learning_rate": 1.6767407255374827e-05, "loss": 0.3366, "mean_token_accuracy": 0.8983037792146206, "num_tokens": 108165912.0, "step": 16340 }, { "entropy": 0.35582395177334547, "epoch": 0.25168414793948035, "grad_norm": 0.8936296701431274, "learning_rate": 1.6777669454564113e-05, "loss": 0.3409, "mean_token_accuracy": 0.8918878749012947, "num_tokens": 108230874.0, "step": 16350 }, { "entropy": 0.33780971765518186, "epoch": 0.2518380831981589, "grad_norm": 0.8525583148002625, "learning_rate": 1.67879316537534e-05, "loss": 0.3329, "mean_token_accuracy": 0.900156807899475, "num_tokens": 108298636.0, "step": 16360 }, { "entropy": 0.36775310020893814, "epoch": 0.2519920184568375, "grad_norm": 0.9596392512321472, "learning_rate": 1.6798193852942688e-05, "loss": 0.3576, "mean_token_accuracy": 0.8898947820067405, "num_tokens": 108364370.0, "step": 16370 }, { "entropy": 0.3490007985383272, "epoch": 0.2521459537155161, "grad_norm": 0.8366541266441345, "learning_rate": 1.6808456052131974e-05, "loss": 0.3534, "mean_token_accuracy": 0.8926501862704754, "num_tokens": 108426635.0, "step": 16380 }, { "entropy": 0.33974553979933264, "epoch": 0.2522998889741947, "grad_norm": 0.8112787008285522, "learning_rate": 1.681871825132126e-05, "loss": 0.3303, "mean_token_accuracy": 0.8996138326823712, "num_tokens": 108496676.0, "step": 16390 }, { "entropy": 0.33119035679847003, "epoch": 0.25245382423287327, "grad_norm": 1.0813089609146118, "learning_rate": 1.6828980450510546e-05, "loss": 0.3288, "mean_token_accuracy": 0.899722857028246, "num_tokens": 108566996.0, "step": 16400 }, { "entropy": 0.32833350617438556, "epoch": 0.25260775949155184, "grad_norm": 1.0968685150146484, "learning_rate": 1.6839242649699832e-05, "loss": 0.3253, "mean_token_accuracy": 0.9006564475595951, "num_tokens": 108637791.0, "step": 16410 }, { "entropy": 0.34526224955916407, "epoch": 0.2527616947502304, "grad_norm": 1.1314514875411987, "learning_rate": 1.6849504848889118e-05, "loss": 0.3353, "mean_token_accuracy": 0.8957458563148976, "num_tokens": 108699136.0, "step": 16420 }, { "entropy": 0.34868491981178523, "epoch": 0.252915630008909, "grad_norm": 0.9016819596290588, "learning_rate": 1.6859767048078404e-05, "loss": 0.3518, "mean_token_accuracy": 0.894491970539093, "num_tokens": 108772244.0, "step": 16430 }, { "entropy": 0.34525676127523186, "epoch": 0.25306956526758756, "grad_norm": 0.853712797164917, "learning_rate": 1.6870029247267693e-05, "loss": 0.3391, "mean_token_accuracy": 0.8974761739373207, "num_tokens": 108834286.0, "step": 16440 }, { "entropy": 0.3342965183779597, "epoch": 0.2532235005262662, "grad_norm": 0.8686178922653198, "learning_rate": 1.688029144645698e-05, "loss": 0.3309, "mean_token_accuracy": 0.8977761194109917, "num_tokens": 108902881.0, "step": 16450 }, { "entropy": 0.33827430233359335, "epoch": 0.25337743578494476, "grad_norm": 0.9074018597602844, "learning_rate": 1.6890553645646265e-05, "loss": 0.3329, "mean_token_accuracy": 0.8996971525251866, "num_tokens": 108965330.0, "step": 16460 }, { "entropy": 0.3621349258348346, "epoch": 0.25353137104362333, "grad_norm": 0.8960493206977844, "learning_rate": 1.690081584483555e-05, "loss": 0.3602, "mean_token_accuracy": 0.8907706804573536, "num_tokens": 109032798.0, "step": 16470 }, { "entropy": 0.32791166082024575, "epoch": 0.2536853063023019, "grad_norm": 0.8625102639198303, "learning_rate": 1.6911078044024837e-05, "loss": 0.336, "mean_token_accuracy": 0.9014259375631809, "num_tokens": 109099748.0, "step": 16480 }, { "entropy": 0.32702276259660723, "epoch": 0.2538392415609805, "grad_norm": 0.8026692271232605, "learning_rate": 1.6921340243214123e-05, "loss": 0.3232, "mean_token_accuracy": 0.9026642739772797, "num_tokens": 109172367.0, "step": 16490 }, { "entropy": 0.32554697841405866, "epoch": 0.25399317681965905, "grad_norm": 0.8814818263053894, "learning_rate": 1.693160244240341e-05, "loss": 0.3356, "mean_token_accuracy": 0.9008306920528412, "num_tokens": 109238352.0, "step": 16500 }, { "entropy": 0.3272884997539222, "epoch": 0.2541471120783377, "grad_norm": 0.8247383832931519, "learning_rate": 1.6941864641592695e-05, "loss": 0.3195, "mean_token_accuracy": 0.9010601349174976, "num_tokens": 109304496.0, "step": 16510 }, { "entropy": 0.3364904932677746, "epoch": 0.25430104733701625, "grad_norm": 0.7876343727111816, "learning_rate": 1.695212684078198e-05, "loss": 0.3235, "mean_token_accuracy": 0.9005557857453823, "num_tokens": 109375518.0, "step": 16520 }, { "entropy": 0.3264143725857139, "epoch": 0.2544549825956948, "grad_norm": 0.9131514430046082, "learning_rate": 1.6962389039971267e-05, "loss": 0.3342, "mean_token_accuracy": 0.9020302139222622, "num_tokens": 109441954.0, "step": 16530 }, { "entropy": 0.35303243827074765, "epoch": 0.2546089178543734, "grad_norm": 1.0246782302856445, "learning_rate": 1.6972651239160553e-05, "loss": 0.3511, "mean_token_accuracy": 0.8940095692873001, "num_tokens": 109505875.0, "step": 16540 }, { "entropy": 0.35140175092965364, "epoch": 0.25476285311305197, "grad_norm": 1.0343540906906128, "learning_rate": 1.698291343834984e-05, "loss": 0.3462, "mean_token_accuracy": 0.8962739527225494, "num_tokens": 109567716.0, "step": 16550 }, { "entropy": 0.3298405255191028, "epoch": 0.25491678837173054, "grad_norm": 0.997744619846344, "learning_rate": 1.6993175637539124e-05, "loss": 0.3399, "mean_token_accuracy": 0.8983732648193836, "num_tokens": 109634855.0, "step": 16560 }, { "entropy": 0.3324326411820948, "epoch": 0.2550707236304091, "grad_norm": 0.635003924369812, "learning_rate": 1.700343783672841e-05, "loss": 0.3318, "mean_token_accuracy": 0.8998781122267246, "num_tokens": 109708554.0, "step": 16570 }, { "entropy": 0.3544067082926631, "epoch": 0.25522465888908774, "grad_norm": 1.0923149585723877, "learning_rate": 1.70137000359177e-05, "loss": 0.3477, "mean_token_accuracy": 0.8932966157793999, "num_tokens": 109779549.0, "step": 16580 }, { "entropy": 0.35319234849885106, "epoch": 0.2553785941477663, "grad_norm": 0.9240097403526306, "learning_rate": 1.7023962235106986e-05, "loss": 0.3519, "mean_token_accuracy": 0.8926128089427948, "num_tokens": 109855654.0, "step": 16590 }, { "entropy": 0.34121435061097144, "epoch": 0.2555325294064449, "grad_norm": 0.8845734596252441, "learning_rate": 1.703422443429627e-05, "loss": 0.3356, "mean_token_accuracy": 0.897245005518198, "num_tokens": 109919736.0, "step": 16600 }, { "entropy": 0.33174843387678266, "epoch": 0.25568646466512346, "grad_norm": 1.0618396997451782, "learning_rate": 1.7044486633485558e-05, "loss": 0.3294, "mean_token_accuracy": 0.9008065938949585, "num_tokens": 109985946.0, "step": 16610 }, { "entropy": 0.33583765383809805, "epoch": 0.25584039992380203, "grad_norm": 0.8417641520500183, "learning_rate": 1.7054748832674843e-05, "loss": 0.3475, "mean_token_accuracy": 0.8972208686172962, "num_tokens": 110058489.0, "step": 16620 }, { "entropy": 0.3255060622468591, "epoch": 0.2559943351824806, "grad_norm": 0.7920299768447876, "learning_rate": 1.706501103186413e-05, "loss": 0.3261, "mean_token_accuracy": 0.9043900206685066, "num_tokens": 110126991.0, "step": 16630 }, { "entropy": 0.3305272198282182, "epoch": 0.25614827044115923, "grad_norm": 1.0227088928222656, "learning_rate": 1.7075273231053415e-05, "loss": 0.338, "mean_token_accuracy": 0.898469390720129, "num_tokens": 110189247.0, "step": 16640 }, { "entropy": 0.36185980327427386, "epoch": 0.2563022056998378, "grad_norm": 1.0211687088012695, "learning_rate": 1.7085535430242705e-05, "loss": 0.3562, "mean_token_accuracy": 0.8938643790781498, "num_tokens": 110264131.0, "step": 16650 }, { "entropy": 0.35299806874245404, "epoch": 0.2564561409585164, "grad_norm": 0.8562279939651489, "learning_rate": 1.709579762943199e-05, "loss": 0.3502, "mean_token_accuracy": 0.8956657484173774, "num_tokens": 110340176.0, "step": 16660 }, { "entropy": 0.3247136306017637, "epoch": 0.25661007621719495, "grad_norm": 0.8523799180984497, "learning_rate": 1.7106059828621277e-05, "loss": 0.3262, "mean_token_accuracy": 0.9032290875911713, "num_tokens": 110407885.0, "step": 16670 }, { "entropy": 0.345378583855927, "epoch": 0.2567640114758735, "grad_norm": 0.9117225408554077, "learning_rate": 1.7116322027810563e-05, "loss": 0.3508, "mean_token_accuracy": 0.8965706318616867, "num_tokens": 110473632.0, "step": 16680 }, { "entropy": 0.3475194538012147, "epoch": 0.2569179467345521, "grad_norm": 1.1615569591522217, "learning_rate": 1.712658422699985e-05, "loss": 0.3315, "mean_token_accuracy": 0.8959298640489578, "num_tokens": 110540502.0, "step": 16690 }, { "entropy": 0.3243978124111891, "epoch": 0.2570718819932307, "grad_norm": 0.8032922744750977, "learning_rate": 1.7136846426189134e-05, "loss": 0.3206, "mean_token_accuracy": 0.9013020284473896, "num_tokens": 110610842.0, "step": 16700 }, { "entropy": 0.33311784816905854, "epoch": 0.2572258172519093, "grad_norm": 0.744279682636261, "learning_rate": 1.714710862537842e-05, "loss": 0.3387, "mean_token_accuracy": 0.8993645422160625, "num_tokens": 110672857.0, "step": 16710 }, { "entropy": 0.3241079894825816, "epoch": 0.25737975251058787, "grad_norm": 0.977256178855896, "learning_rate": 1.7157370824567706e-05, "loss": 0.3291, "mean_token_accuracy": 0.9003175847232342, "num_tokens": 110734513.0, "step": 16720 }, { "entropy": 0.3510695926845074, "epoch": 0.25753368776926644, "grad_norm": 0.9229866862297058, "learning_rate": 1.7167633023756992e-05, "loss": 0.3512, "mean_token_accuracy": 0.8968770027160644, "num_tokens": 110806334.0, "step": 16730 }, { "entropy": 0.33965141596272586, "epoch": 0.257687623027945, "grad_norm": 1.0575710535049438, "learning_rate": 1.7177895222946278e-05, "loss": 0.3479, "mean_token_accuracy": 0.9001315869390965, "num_tokens": 110874416.0, "step": 16740 }, { "entropy": 0.34147209357470276, "epoch": 0.2578415582866236, "grad_norm": 0.9233494400978088, "learning_rate": 1.7188157422135564e-05, "loss": 0.347, "mean_token_accuracy": 0.8957997597754002, "num_tokens": 110954614.0, "step": 16750 }, { "entropy": 0.3530618315562606, "epoch": 0.25799549354530216, "grad_norm": 1.0119072198867798, "learning_rate": 1.719841962132485e-05, "loss": 0.3366, "mean_token_accuracy": 0.8938606031239033, "num_tokens": 111019786.0, "step": 16760 }, { "entropy": 0.32874724101275205, "epoch": 0.2581494288039808, "grad_norm": 0.7428898215293884, "learning_rate": 1.7208681820514136e-05, "loss": 0.3268, "mean_token_accuracy": 0.8974316999316215, "num_tokens": 111091321.0, "step": 16770 }, { "entropy": 0.3433641832321882, "epoch": 0.25830336406265936, "grad_norm": 1.0505794286727905, "learning_rate": 1.7218944019703422e-05, "loss": 0.3409, "mean_token_accuracy": 0.8986420497298241, "num_tokens": 111160336.0, "step": 16780 }, { "entropy": 0.32341102277860045, "epoch": 0.25845729932133793, "grad_norm": 1.3296717405319214, "learning_rate": 1.722920621889271e-05, "loss": 0.33, "mean_token_accuracy": 0.9032950282096863, "num_tokens": 111215136.0, "step": 16790 }, { "entropy": 0.3394572322256863, "epoch": 0.2586112345800165, "grad_norm": 1.091303825378418, "learning_rate": 1.7239468418081997e-05, "loss": 0.3495, "mean_token_accuracy": 0.8949193134903908, "num_tokens": 111275888.0, "step": 16800 }, { "entropy": 0.34933882374316455, "epoch": 0.2587651698386951, "grad_norm": 0.6614044904708862, "learning_rate": 1.7249730617271283e-05, "loss": 0.3484, "mean_token_accuracy": 0.8962488912045956, "num_tokens": 111347656.0, "step": 16810 }, { "entropy": 0.3364198154769838, "epoch": 0.25891910509737365, "grad_norm": 0.8187477588653564, "learning_rate": 1.725999281646057e-05, "loss": 0.3387, "mean_token_accuracy": 0.8999514542520046, "num_tokens": 111410701.0, "step": 16820 }, { "entropy": 0.32810430293902754, "epoch": 0.2590730403560523, "grad_norm": 0.8305529356002808, "learning_rate": 1.7270255015649855e-05, "loss": 0.3243, "mean_token_accuracy": 0.9004438042640686, "num_tokens": 111482417.0, "step": 16830 }, { "entropy": 0.35502192601561544, "epoch": 0.25922697561473085, "grad_norm": 0.8178640007972717, "learning_rate": 1.728051721483914e-05, "loss": 0.3545, "mean_token_accuracy": 0.8924784235656261, "num_tokens": 111565400.0, "step": 16840 }, { "entropy": 0.3309402149170637, "epoch": 0.2593809108734094, "grad_norm": 1.343480110168457, "learning_rate": 1.7290779414028427e-05, "loss": 0.3299, "mean_token_accuracy": 0.9025194711983204, "num_tokens": 111624704.0, "step": 16850 }, { "entropy": 0.3232613928616047, "epoch": 0.259534846132088, "grad_norm": 0.7818382978439331, "learning_rate": 1.7301041613217716e-05, "loss": 0.3309, "mean_token_accuracy": 0.8998104520142078, "num_tokens": 111697211.0, "step": 16860 }, { "entropy": 0.349194491840899, "epoch": 0.25968878139076657, "grad_norm": 0.7491682767868042, "learning_rate": 1.7311303812407002e-05, "loss": 0.3388, "mean_token_accuracy": 0.8939537450671196, "num_tokens": 111756839.0, "step": 16870 }, { "entropy": 0.32003407934680583, "epoch": 0.25984271664944514, "grad_norm": 0.98880535364151, "learning_rate": 1.7321566011596288e-05, "loss": 0.3171, "mean_token_accuracy": 0.9049525938928127, "num_tokens": 111814962.0, "step": 16880 }, { "entropy": 0.34551804019138216, "epoch": 0.25999665190812377, "grad_norm": 0.7400873899459839, "learning_rate": 1.7331828210785574e-05, "loss": 0.3405, "mean_token_accuracy": 0.8978461675345898, "num_tokens": 111883186.0, "step": 16890 }, { "entropy": 0.3433715313673019, "epoch": 0.26015058716680234, "grad_norm": 0.8725573420524597, "learning_rate": 1.734209040997486e-05, "loss": 0.3298, "mean_token_accuracy": 0.8969067424535752, "num_tokens": 111949757.0, "step": 16900 }, { "entropy": 0.3422919840551913, "epoch": 0.2603045224254809, "grad_norm": 0.878601610660553, "learning_rate": 1.7352352609164146e-05, "loss": 0.3466, "mean_token_accuracy": 0.8961548067629337, "num_tokens": 112015326.0, "step": 16910 }, { "entropy": 0.3267262790352106, "epoch": 0.2604584576841595, "grad_norm": 1.008274793624878, "learning_rate": 1.7362614808353432e-05, "loss": 0.3291, "mean_token_accuracy": 0.9015838727355003, "num_tokens": 112078120.0, "step": 16920 }, { "entropy": 0.3432713944464922, "epoch": 0.26061239294283806, "grad_norm": 1.0016416311264038, "learning_rate": 1.7372877007542718e-05, "loss": 0.3454, "mean_token_accuracy": 0.9003993228077889, "num_tokens": 112147027.0, "step": 16930 }, { "entropy": 0.3162102567963302, "epoch": 0.26076632820151663, "grad_norm": 0.8950538039207458, "learning_rate": 1.7383139206732004e-05, "loss": 0.3239, "mean_token_accuracy": 0.9039274267852306, "num_tokens": 112212995.0, "step": 16940 }, { "entropy": 0.33742969995364547, "epoch": 0.2609202634601952, "grad_norm": 0.9371647834777832, "learning_rate": 1.739340140592129e-05, "loss": 0.3368, "mean_token_accuracy": 0.8983174599707127, "num_tokens": 112279670.0, "step": 16950 }, { "entropy": 0.3377174368128181, "epoch": 0.26107419871887383, "grad_norm": 0.8180281519889832, "learning_rate": 1.7403663605110576e-05, "loss": 0.336, "mean_token_accuracy": 0.9002619981765747, "num_tokens": 112347459.0, "step": 16960 }, { "entropy": 0.3472407415509224, "epoch": 0.2612281339775524, "grad_norm": 0.7680931091308594, "learning_rate": 1.7413925804299862e-05, "loss": 0.3444, "mean_token_accuracy": 0.8923381201922893, "num_tokens": 112407438.0, "step": 16970 }, { "entropy": 0.3500169165432453, "epoch": 0.261382069236231, "grad_norm": 0.9076848030090332, "learning_rate": 1.7424188003489148e-05, "loss": 0.3407, "mean_token_accuracy": 0.8955918706953525, "num_tokens": 112478532.0, "step": 16980 }, { "entropy": 0.3368359410203993, "epoch": 0.26153600449490955, "grad_norm": 0.6429152488708496, "learning_rate": 1.7434450202678434e-05, "loss": 0.3255, "mean_token_accuracy": 0.899484134465456, "num_tokens": 112555035.0, "step": 16990 }, { "entropy": 0.32718883343040944, "epoch": 0.2616899397535881, "grad_norm": 0.8545633554458618, "learning_rate": 1.7444712401867723e-05, "loss": 0.3291, "mean_token_accuracy": 0.9018556095659733, "num_tokens": 112621675.0, "step": 17000 }, { "entropy": 0.32663317769765854, "epoch": 0.2618438750122667, "grad_norm": 1.060789704322815, "learning_rate": 1.745497460105701e-05, "loss": 0.3251, "mean_token_accuracy": 0.9014083690941334, "num_tokens": 112689537.0, "step": 17010 }, { "entropy": 0.34951881803572177, "epoch": 0.2619978102709453, "grad_norm": 0.874295175075531, "learning_rate": 1.7465236800246295e-05, "loss": 0.3522, "mean_token_accuracy": 0.8939487211406231, "num_tokens": 112757274.0, "step": 17020 }, { "entropy": 0.32634014952927826, "epoch": 0.2621517455296239, "grad_norm": 0.8614416122436523, "learning_rate": 1.747549899943558e-05, "loss": 0.3294, "mean_token_accuracy": 0.9025229081511498, "num_tokens": 112824301.0, "step": 17030 }, { "entropy": 0.32828684905543926, "epoch": 0.26230568078830246, "grad_norm": 0.9670031666755676, "learning_rate": 1.7485761198624867e-05, "loss": 0.3296, "mean_token_accuracy": 0.9006827071309089, "num_tokens": 112886204.0, "step": 17040 }, { "entropy": 0.33315091459080576, "epoch": 0.26245961604698104, "grad_norm": 0.7824294567108154, "learning_rate": 1.7496023397814153e-05, "loss": 0.3403, "mean_token_accuracy": 0.9013552956283093, "num_tokens": 112951116.0, "step": 17050 }, { "entropy": 0.3381863536313176, "epoch": 0.2626135513056596, "grad_norm": 1.3116976022720337, "learning_rate": 1.750628559700344e-05, "loss": 0.3338, "mean_token_accuracy": 0.8976793609559536, "num_tokens": 113011319.0, "step": 17060 }, { "entropy": 0.3408623540773988, "epoch": 0.2627674865643382, "grad_norm": 1.0368373394012451, "learning_rate": 1.7516547796192728e-05, "loss": 0.3407, "mean_token_accuracy": 0.8970039263367653, "num_tokens": 113078934.0, "step": 17070 }, { "entropy": 0.3310428064316511, "epoch": 0.2629214218230168, "grad_norm": 0.9086218476295471, "learning_rate": 1.7526809995382014e-05, "loss": 0.3253, "mean_token_accuracy": 0.9006276018917561, "num_tokens": 113146413.0, "step": 17080 }, { "entropy": 0.35879295859485866, "epoch": 0.2630753570816954, "grad_norm": 0.9020190238952637, "learning_rate": 1.75370721945713e-05, "loss": 0.3565, "mean_token_accuracy": 0.8945882752537727, "num_tokens": 113205823.0, "step": 17090 }, { "entropy": 0.33560264892876146, "epoch": 0.26322929234037395, "grad_norm": 0.9563761949539185, "learning_rate": 1.7547334393760586e-05, "loss": 0.3328, "mean_token_accuracy": 0.8997765950858593, "num_tokens": 113270574.0, "step": 17100 }, { "entropy": 0.3357530771754682, "epoch": 0.2633832275990525, "grad_norm": 0.9724740386009216, "learning_rate": 1.7557596592949872e-05, "loss": 0.3356, "mean_token_accuracy": 0.9001648962497711, "num_tokens": 113338963.0, "step": 17110 }, { "entropy": 0.3295282498933375, "epoch": 0.2635371628577311, "grad_norm": 0.7319137454032898, "learning_rate": 1.7567858792139158e-05, "loss": 0.3347, "mean_token_accuracy": 0.9009747497737408, "num_tokens": 113407607.0, "step": 17120 }, { "entropy": 0.32881121095269916, "epoch": 0.2636910981164097, "grad_norm": 0.786820650100708, "learning_rate": 1.7578120991328444e-05, "loss": 0.3343, "mean_token_accuracy": 0.8997563540935516, "num_tokens": 113473507.0, "step": 17130 }, { "entropy": 0.32696691397577526, "epoch": 0.26384503337508824, "grad_norm": 1.238505482673645, "learning_rate": 1.758838319051773e-05, "loss": 0.3283, "mean_token_accuracy": 0.9018128991127015, "num_tokens": 113534641.0, "step": 17140 }, { "entropy": 0.32944930270314216, "epoch": 0.2639989686337669, "grad_norm": 0.8669398427009583, "learning_rate": 1.7598645389707016e-05, "loss": 0.3275, "mean_token_accuracy": 0.9012321896851063, "num_tokens": 113604937.0, "step": 17150 }, { "entropy": 0.34239002522081136, "epoch": 0.26415290389244545, "grad_norm": 1.167815089225769, "learning_rate": 1.76089075888963e-05, "loss": 0.3405, "mean_token_accuracy": 0.8971305549144745, "num_tokens": 113664951.0, "step": 17160 }, { "entropy": 0.3292405246756971, "epoch": 0.264306839151124, "grad_norm": 0.8754289746284485, "learning_rate": 1.7619169788085587e-05, "loss": 0.3286, "mean_token_accuracy": 0.8989837057888508, "num_tokens": 113736661.0, "step": 17170 }, { "entropy": 0.33321822974830867, "epoch": 0.2644607744098026, "grad_norm": 0.760918140411377, "learning_rate": 1.7629431987274873e-05, "loss": 0.3426, "mean_token_accuracy": 0.8983546905219555, "num_tokens": 113803382.0, "step": 17180 }, { "entropy": 0.32121702190488577, "epoch": 0.26461470966848116, "grad_norm": 0.9277869462966919, "learning_rate": 1.763969418646416e-05, "loss": 0.3199, "mean_token_accuracy": 0.903628671169281, "num_tokens": 113875576.0, "step": 17190 }, { "entropy": 0.34431537613272667, "epoch": 0.26476864492715974, "grad_norm": 1.1185203790664673, "learning_rate": 1.7649956385653445e-05, "loss": 0.3544, "mean_token_accuracy": 0.8939193300902843, "num_tokens": 113935925.0, "step": 17200 }, { "entropy": 0.32231566589325666, "epoch": 0.26492258018583836, "grad_norm": 0.8001461625099182, "learning_rate": 1.7660218584842735e-05, "loss": 0.322, "mean_token_accuracy": 0.9045388855040073, "num_tokens": 114000819.0, "step": 17210 }, { "entropy": 0.35068774707615374, "epoch": 0.26507651544451694, "grad_norm": 1.2036925554275513, "learning_rate": 1.767048078403202e-05, "loss": 0.3595, "mean_token_accuracy": 0.8955626435577869, "num_tokens": 114064519.0, "step": 17220 }, { "entropy": 0.332734390348196, "epoch": 0.2652304507031955, "grad_norm": 1.1434742212295532, "learning_rate": 1.7680742983221307e-05, "loss": 0.3317, "mean_token_accuracy": 0.9017195709049701, "num_tokens": 114122220.0, "step": 17230 }, { "entropy": 0.34667324777692554, "epoch": 0.2653843859618741, "grad_norm": 0.7313071489334106, "learning_rate": 1.7691005182410593e-05, "loss": 0.3502, "mean_token_accuracy": 0.8956535518169403, "num_tokens": 114195444.0, "step": 17240 }, { "entropy": 0.34252557028084996, "epoch": 0.26553832122055265, "grad_norm": 0.9716715216636658, "learning_rate": 1.770126738159988e-05, "loss": 0.339, "mean_token_accuracy": 0.8994065225124359, "num_tokens": 114259172.0, "step": 17250 }, { "entropy": 0.3333128709346056, "epoch": 0.2656922564792312, "grad_norm": 0.8473114371299744, "learning_rate": 1.7711529580789164e-05, "loss": 0.329, "mean_token_accuracy": 0.8983029715716839, "num_tokens": 114331929.0, "step": 17260 }, { "entropy": 0.3458598479628563, "epoch": 0.26584619173790985, "grad_norm": 1.0433317422866821, "learning_rate": 1.772179177997845e-05, "loss": 0.3343, "mean_token_accuracy": 0.8963445216417313, "num_tokens": 114396803.0, "step": 17270 }, { "entropy": 0.3350240648724139, "epoch": 0.2660001269965884, "grad_norm": 0.7127974033355713, "learning_rate": 1.773205397916774e-05, "loss": 0.3347, "mean_token_accuracy": 0.9008177511394024, "num_tokens": 114463902.0, "step": 17280 }, { "entropy": 0.32696425849571825, "epoch": 0.266154062255267, "grad_norm": 0.8280390501022339, "learning_rate": 1.7742316178357026e-05, "loss": 0.3359, "mean_token_accuracy": 0.9005702018737793, "num_tokens": 114539056.0, "step": 17290 }, { "entropy": 0.33062227480113504, "epoch": 0.26630799751394557, "grad_norm": 0.9462801218032837, "learning_rate": 1.775257837754631e-05, "loss": 0.331, "mean_token_accuracy": 0.902378911525011, "num_tokens": 114602457.0, "step": 17300 }, { "entropy": 0.3430726915597916, "epoch": 0.26646193277262414, "grad_norm": 0.9516341686248779, "learning_rate": 1.7762840576735598e-05, "loss": 0.3426, "mean_token_accuracy": 0.8965471282601356, "num_tokens": 114660038.0, "step": 17310 }, { "entropy": 0.3418724987655878, "epoch": 0.2666158680313027, "grad_norm": 0.6261414289474487, "learning_rate": 1.7773102775924883e-05, "loss": 0.3247, "mean_token_accuracy": 0.8988322265446186, "num_tokens": 114735484.0, "step": 17320 }, { "entropy": 0.34584970911964774, "epoch": 0.2667698032899813, "grad_norm": 0.7104029655456543, "learning_rate": 1.778336497511417e-05, "loss": 0.3502, "mean_token_accuracy": 0.8937083020806312, "num_tokens": 114805711.0, "step": 17330 }, { "entropy": 0.33415639977902173, "epoch": 0.2669237385486599, "grad_norm": 1.195180058479309, "learning_rate": 1.7793627174303455e-05, "loss": 0.3305, "mean_token_accuracy": 0.9017716914415359, "num_tokens": 114863989.0, "step": 17340 }, { "entropy": 0.3267094448208809, "epoch": 0.2670776738073385, "grad_norm": 0.9495595097541809, "learning_rate": 1.780388937349274e-05, "loss": 0.3321, "mean_token_accuracy": 0.9014309540390968, "num_tokens": 114922570.0, "step": 17350 }, { "entropy": 0.3224093994125724, "epoch": 0.26723160906601706, "grad_norm": 0.9073861241340637, "learning_rate": 1.7814151572682027e-05, "loss": 0.3335, "mean_token_accuracy": 0.9019424229860306, "num_tokens": 114991260.0, "step": 17360 }, { "entropy": 0.3282756397500634, "epoch": 0.26738554432469563, "grad_norm": 0.9272039532661438, "learning_rate": 1.7824413771871313e-05, "loss": 0.3274, "mean_token_accuracy": 0.9004266299307346, "num_tokens": 115057200.0, "step": 17370 }, { "entropy": 0.3267279269173741, "epoch": 0.2675394795833742, "grad_norm": 0.9846112728118896, "learning_rate": 1.78346759710606e-05, "loss": 0.3291, "mean_token_accuracy": 0.8995296455919742, "num_tokens": 115116645.0, "step": 17380 }, { "entropy": 0.3185133788734674, "epoch": 0.2676934148420528, "grad_norm": 0.9397248029708862, "learning_rate": 1.7844938170249885e-05, "loss": 0.3359, "mean_token_accuracy": 0.9037774935364723, "num_tokens": 115179263.0, "step": 17390 }, { "entropy": 0.33048973940312865, "epoch": 0.2678473501007314, "grad_norm": 1.3580721616744995, "learning_rate": 1.785520036943917e-05, "loss": 0.327, "mean_token_accuracy": 0.9030727662146092, "num_tokens": 115246132.0, "step": 17400 }, { "entropy": 0.31641656514257194, "epoch": 0.26800128535941, "grad_norm": 0.9955900311470032, "learning_rate": 1.7865462568628457e-05, "loss": 0.3225, "mean_token_accuracy": 0.9036760345101357, "num_tokens": 115312051.0, "step": 17410 }, { "entropy": 0.33160144854336976, "epoch": 0.26815522061808855, "grad_norm": 0.9492262005805969, "learning_rate": 1.7875724767817746e-05, "loss": 0.3121, "mean_token_accuracy": 0.9016811370849609, "num_tokens": 115373337.0, "step": 17420 }, { "entropy": 0.3458529172465205, "epoch": 0.2683091558767671, "grad_norm": 0.9125699996948242, "learning_rate": 1.7885986967007032e-05, "loss": 0.3495, "mean_token_accuracy": 0.8957816883921623, "num_tokens": 115440753.0, "step": 17430 }, { "entropy": 0.35891821710392835, "epoch": 0.2684630911354457, "grad_norm": 1.0887140035629272, "learning_rate": 1.7896249166196318e-05, "loss": 0.3625, "mean_token_accuracy": 0.890265741199255, "num_tokens": 115509993.0, "step": 17440 }, { "entropy": 0.36482307575643064, "epoch": 0.26861702639412427, "grad_norm": 0.8981031179428101, "learning_rate": 1.7906511365385604e-05, "loss": 0.3408, "mean_token_accuracy": 0.8946661293506623, "num_tokens": 115568893.0, "step": 17450 }, { "entropy": 0.3244128798134625, "epoch": 0.2687709616528029, "grad_norm": 1.209650993347168, "learning_rate": 1.791677356457489e-05, "loss": 0.3166, "mean_token_accuracy": 0.9020876966416835, "num_tokens": 115638137.0, "step": 17460 }, { "entropy": 0.3380894744768739, "epoch": 0.26892489691148147, "grad_norm": 1.0418038368225098, "learning_rate": 1.7927035763764176e-05, "loss": 0.3454, "mean_token_accuracy": 0.8951732762157917, "num_tokens": 115708465.0, "step": 17470 }, { "entropy": 0.3292379710823298, "epoch": 0.26907883217016004, "grad_norm": 0.6921848058700562, "learning_rate": 1.7937297962953462e-05, "loss": 0.3215, "mean_token_accuracy": 0.9039123483002186, "num_tokens": 115773386.0, "step": 17480 }, { "entropy": 0.3496047928929329, "epoch": 0.2692327674288386, "grad_norm": 1.084485650062561, "learning_rate": 1.794756016214275e-05, "loss": 0.3402, "mean_token_accuracy": 0.8923346310853958, "num_tokens": 115838482.0, "step": 17490 }, { "entropy": 0.3233501508831978, "epoch": 0.2693867026875172, "grad_norm": 0.6718249320983887, "learning_rate": 1.7957822361332037e-05, "loss": 0.3271, "mean_token_accuracy": 0.9024628028273582, "num_tokens": 115904568.0, "step": 17500 }, { "entropy": 0.32324060015380385, "epoch": 0.26954063794619576, "grad_norm": 0.9488973617553711, "learning_rate": 1.7968084560521323e-05, "loss": 0.3229, "mean_token_accuracy": 0.9046642817556858, "num_tokens": 115964639.0, "step": 17510 }, { "entropy": 0.33987170848995446, "epoch": 0.26969457320487433, "grad_norm": 0.9004833698272705, "learning_rate": 1.797834675971061e-05, "loss": 0.3345, "mean_token_accuracy": 0.897523558139801, "num_tokens": 116028177.0, "step": 17520 }, { "entropy": 0.33562736166641116, "epoch": 0.26984850846355296, "grad_norm": 1.117241382598877, "learning_rate": 1.7988608958899895e-05, "loss": 0.3343, "mean_token_accuracy": 0.9005119487643242, "num_tokens": 116098631.0, "step": 17530 }, { "entropy": 0.34842162784188985, "epoch": 0.27000244372223153, "grad_norm": 1.202579140663147, "learning_rate": 1.799887115808918e-05, "loss": 0.3622, "mean_token_accuracy": 0.8951206840574741, "num_tokens": 116162228.0, "step": 17540 }, { "entropy": 0.3319896414875984, "epoch": 0.2701563789809101, "grad_norm": 0.7575272917747498, "learning_rate": 1.8009133357278467e-05, "loss": 0.324, "mean_token_accuracy": 0.9012371838092804, "num_tokens": 116230522.0, "step": 17550 }, { "entropy": 0.33420419683679936, "epoch": 0.2703103142395887, "grad_norm": 0.6977285146713257, "learning_rate": 1.8019395556467753e-05, "loss": 0.3377, "mean_token_accuracy": 0.899548900872469, "num_tokens": 116298618.0, "step": 17560 }, { "entropy": 0.34322519712150096, "epoch": 0.27046424949826725, "grad_norm": 0.8822567462921143, "learning_rate": 1.802965775565704e-05, "loss": 0.3501, "mean_token_accuracy": 0.8985599122941494, "num_tokens": 116370836.0, "step": 17570 }, { "entropy": 0.318590141646564, "epoch": 0.2706181847569458, "grad_norm": 0.91290283203125, "learning_rate": 1.8039919954846325e-05, "loss": 0.3268, "mean_token_accuracy": 0.9015832155942917, "num_tokens": 116441794.0, "step": 17580 }, { "entropy": 0.3407038267701864, "epoch": 0.27077212001562445, "grad_norm": 1.190450668334961, "learning_rate": 1.805018215403561e-05, "loss": 0.3328, "mean_token_accuracy": 0.896803268045187, "num_tokens": 116510498.0, "step": 17590 }, { "entropy": 0.3351840785704553, "epoch": 0.270926055274303, "grad_norm": 0.9177846908569336, "learning_rate": 1.8060444353224897e-05, "loss": 0.3366, "mean_token_accuracy": 0.8980799987912178, "num_tokens": 116576586.0, "step": 17600 }, { "entropy": 0.33689290527254345, "epoch": 0.2710799905329816, "grad_norm": 0.6941289901733398, "learning_rate": 1.8070706552414183e-05, "loss": 0.3389, "mean_token_accuracy": 0.8958573952317238, "num_tokens": 116650574.0, "step": 17610 }, { "entropy": 0.3399543246254325, "epoch": 0.27123392579166017, "grad_norm": 0.9315921068191528, "learning_rate": 1.808096875160347e-05, "loss": 0.3337, "mean_token_accuracy": 0.8991213329136372, "num_tokens": 116719571.0, "step": 17620 }, { "entropy": 0.33986415062099695, "epoch": 0.27138786105033874, "grad_norm": 1.0691410303115845, "learning_rate": 1.8091230950792755e-05, "loss": 0.3275, "mean_token_accuracy": 0.8958191476762295, "num_tokens": 116778962.0, "step": 17630 }, { "entropy": 0.339136203750968, "epoch": 0.2715417963090173, "grad_norm": 0.8881390690803528, "learning_rate": 1.8101493149982044e-05, "loss": 0.3447, "mean_token_accuracy": 0.8982717260718346, "num_tokens": 116840376.0, "step": 17640 }, { "entropy": 0.3207347812131047, "epoch": 0.27169573156769594, "grad_norm": 0.856797456741333, "learning_rate": 1.811175534917133e-05, "loss": 0.3264, "mean_token_accuracy": 0.9027819439768792, "num_tokens": 116907622.0, "step": 17650 }, { "entropy": 0.3513524286448956, "epoch": 0.2718496668263745, "grad_norm": 1.0036264657974243, "learning_rate": 1.8122017548360616e-05, "loss": 0.352, "mean_token_accuracy": 0.894532623142004, "num_tokens": 116966587.0, "step": 17660 }, { "entropy": 0.33447758238762615, "epoch": 0.2720036020850531, "grad_norm": 0.7602262496948242, "learning_rate": 1.8132279747549902e-05, "loss": 0.3334, "mean_token_accuracy": 0.898228795081377, "num_tokens": 117032045.0, "step": 17670 }, { "entropy": 0.3240508737042546, "epoch": 0.27215753734373166, "grad_norm": 0.9256485104560852, "learning_rate": 1.8142541946739188e-05, "loss": 0.3264, "mean_token_accuracy": 0.902124635130167, "num_tokens": 117094927.0, "step": 17680 }, { "entropy": 0.33878416009247303, "epoch": 0.27231147260241023, "grad_norm": 1.0349743366241455, "learning_rate": 1.8152804145928474e-05, "loss": 0.3489, "mean_token_accuracy": 0.8967732183635235, "num_tokens": 117159194.0, "step": 17690 }, { "entropy": 0.31656110994517805, "epoch": 0.2724654078610888, "grad_norm": 0.8137916326522827, "learning_rate": 1.816306634511776e-05, "loss": 0.3184, "mean_token_accuracy": 0.9053230866789818, "num_tokens": 117225234.0, "step": 17700 }, { "entropy": 0.3719494355842471, "epoch": 0.2726193431197674, "grad_norm": 1.078629970550537, "learning_rate": 1.817332854430705e-05, "loss": 0.369, "mean_token_accuracy": 0.8875271417200565, "num_tokens": 117281877.0, "step": 17710 }, { "entropy": 0.330704138148576, "epoch": 0.272773278378446, "grad_norm": 0.9304664134979248, "learning_rate": 1.8183590743496335e-05, "loss": 0.3177, "mean_token_accuracy": 0.9013441808521747, "num_tokens": 117345477.0, "step": 17720 }, { "entropy": 0.3328275098465383, "epoch": 0.2729272136371246, "grad_norm": 0.7553912997245789, "learning_rate": 1.819385294268562e-05, "loss": 0.3363, "mean_token_accuracy": 0.8982691809535026, "num_tokens": 117415159.0, "step": 17730 }, { "entropy": 0.33077959725633266, "epoch": 0.27308114889580315, "grad_norm": 0.7435310482978821, "learning_rate": 1.8204115141874907e-05, "loss": 0.3437, "mean_token_accuracy": 0.9005403317511081, "num_tokens": 117479093.0, "step": 17740 }, { "entropy": 0.30998784601688384, "epoch": 0.2732350841544817, "grad_norm": 0.834969162940979, "learning_rate": 1.8214377341064193e-05, "loss": 0.3139, "mean_token_accuracy": 0.9058653369545937, "num_tokens": 117546729.0, "step": 17750 }, { "entropy": 0.3138628302142024, "epoch": 0.2733890194131603, "grad_norm": 0.8698156476020813, "learning_rate": 1.822463954025348e-05, "loss": 0.3171, "mean_token_accuracy": 0.9059965811669827, "num_tokens": 117609182.0, "step": 17760 }, { "entropy": 0.3502957566641271, "epoch": 0.27354295467183887, "grad_norm": 0.8994895815849304, "learning_rate": 1.8234901739442765e-05, "loss": 0.3511, "mean_token_accuracy": 0.8948268003761768, "num_tokens": 117673051.0, "step": 17770 }, { "entropy": 0.3429972339421511, "epoch": 0.2736968899305175, "grad_norm": 0.9377636909484863, "learning_rate": 1.824516393863205e-05, "loss": 0.3387, "mean_token_accuracy": 0.8971825934946537, "num_tokens": 117740395.0, "step": 17780 }, { "entropy": 0.35126289604231714, "epoch": 0.27385082518919607, "grad_norm": 0.9297899007797241, "learning_rate": 1.8255426137821337e-05, "loss": 0.3438, "mean_token_accuracy": 0.8941214941442013, "num_tokens": 117804265.0, "step": 17790 }, { "entropy": 0.3317994685843587, "epoch": 0.27400476044787464, "grad_norm": 0.814372181892395, "learning_rate": 1.8265688337010622e-05, "loss": 0.3236, "mean_token_accuracy": 0.8993019595742225, "num_tokens": 117873827.0, "step": 17800 }, { "entropy": 0.32088489178568125, "epoch": 0.2741586957065532, "grad_norm": 0.6984691023826599, "learning_rate": 1.827595053619991e-05, "loss": 0.3222, "mean_token_accuracy": 0.9046843655407428, "num_tokens": 117942517.0, "step": 17810 }, { "entropy": 0.32344599831849336, "epoch": 0.2743126309652318, "grad_norm": 0.9602479338645935, "learning_rate": 1.8286212735389194e-05, "loss": 0.3252, "mean_token_accuracy": 0.9008918285369873, "num_tokens": 118004560.0, "step": 17820 }, { "entropy": 0.32744576800614594, "epoch": 0.27446656622391036, "grad_norm": 1.0785682201385498, "learning_rate": 1.829647493457848e-05, "loss": 0.3203, "mean_token_accuracy": 0.9056738309562207, "num_tokens": 118065710.0, "step": 17830 }, { "entropy": 0.31549134757369757, "epoch": 0.274620501482589, "grad_norm": 0.8278864622116089, "learning_rate": 1.8306737133767766e-05, "loss": 0.3112, "mean_token_accuracy": 0.903442082554102, "num_tokens": 118134457.0, "step": 17840 }, { "entropy": 0.32847188748419287, "epoch": 0.27477443674126756, "grad_norm": 1.0605436563491821, "learning_rate": 1.8316999332957056e-05, "loss": 0.3309, "mean_token_accuracy": 0.898414571583271, "num_tokens": 118199098.0, "step": 17850 }, { "entropy": 0.3308624433353543, "epoch": 0.27492837199994613, "grad_norm": 1.2811568975448608, "learning_rate": 1.832726153214634e-05, "loss": 0.3297, "mean_token_accuracy": 0.9025349333882332, "num_tokens": 118259699.0, "step": 17860 }, { "entropy": 0.3382772298529744, "epoch": 0.2750823072586247, "grad_norm": 1.0225830078125, "learning_rate": 1.8337523731335627e-05, "loss": 0.346, "mean_token_accuracy": 0.8976543061435223, "num_tokens": 118323374.0, "step": 17870 }, { "entropy": 0.3355094992555678, "epoch": 0.2752362425173033, "grad_norm": 0.7772510647773743, "learning_rate": 1.8347785930524913e-05, "loss": 0.3337, "mean_token_accuracy": 0.8982699505984784, "num_tokens": 118390035.0, "step": 17880 }, { "entropy": 0.3153948625549674, "epoch": 0.27539017777598185, "grad_norm": 0.8413348197937012, "learning_rate": 1.83580481297142e-05, "loss": 0.321, "mean_token_accuracy": 0.9023184008896351, "num_tokens": 118465649.0, "step": 17890 }, { "entropy": 0.346817615441978, "epoch": 0.2755441130346604, "grad_norm": 0.9238510131835938, "learning_rate": 1.8368310328903485e-05, "loss": 0.3554, "mean_token_accuracy": 0.8932204321026802, "num_tokens": 118529720.0, "step": 17900 }, { "entropy": 0.33223065072670577, "epoch": 0.27569804829333905, "grad_norm": 1.0054680109024048, "learning_rate": 1.837857252809277e-05, "loss": 0.3187, "mean_token_accuracy": 0.9023290403187275, "num_tokens": 118595551.0, "step": 17910 }, { "entropy": 0.3175911670550704, "epoch": 0.2758519835520176, "grad_norm": 0.8565828800201416, "learning_rate": 1.8388834727282057e-05, "loss": 0.3231, "mean_token_accuracy": 0.9018827937543392, "num_tokens": 118666237.0, "step": 17920 }, { "entropy": 0.32941658133640883, "epoch": 0.2760059188106962, "grad_norm": 0.7954923510551453, "learning_rate": 1.8399096926471343e-05, "loss": 0.3309, "mean_token_accuracy": 0.9030625447630882, "num_tokens": 118737644.0, "step": 17930 }, { "entropy": 0.31545482417568566, "epoch": 0.27615985406937477, "grad_norm": 0.7722528576850891, "learning_rate": 1.840935912566063e-05, "loss": 0.3111, "mean_token_accuracy": 0.9037425480782986, "num_tokens": 118804605.0, "step": 17940 }, { "entropy": 0.3119498655200005, "epoch": 0.27631378932805334, "grad_norm": 1.0152130126953125, "learning_rate": 1.8419621324849915e-05, "loss": 0.3203, "mean_token_accuracy": 0.9050949320197106, "num_tokens": 118867292.0, "step": 17950 }, { "entropy": 0.35318719428032636, "epoch": 0.2764677245867319, "grad_norm": 0.9829860329627991, "learning_rate": 1.84298835240392e-05, "loss": 0.3552, "mean_token_accuracy": 0.8925770796835423, "num_tokens": 118929157.0, "step": 17960 }, { "entropy": 0.3393428960815072, "epoch": 0.27662165984541054, "grad_norm": 0.9313311576843262, "learning_rate": 1.8440145723228487e-05, "loss": 0.3391, "mean_token_accuracy": 0.8989765681326389, "num_tokens": 118994554.0, "step": 17970 }, { "entropy": 0.3177703684195876, "epoch": 0.2767755951040891, "grad_norm": 0.8115026354789734, "learning_rate": 1.8450407922417773e-05, "loss": 0.322, "mean_token_accuracy": 0.902335163205862, "num_tokens": 119058458.0, "step": 17980 }, { "entropy": 0.32877353504300116, "epoch": 0.2769295303627677, "grad_norm": 0.886058509349823, "learning_rate": 1.8460670121607062e-05, "loss": 0.3261, "mean_token_accuracy": 0.9037520006299019, "num_tokens": 119129350.0, "step": 17990 }, { "entropy": 0.3334943439811468, "epoch": 0.27708346562144626, "grad_norm": 0.6872387528419495, "learning_rate": 1.8470932320796348e-05, "loss": 0.3341, "mean_token_accuracy": 0.8979296267032624, "num_tokens": 119202118.0, "step": 18000 }, { "entropy": 0.34408335397019985, "epoch": 0.27723740088012483, "grad_norm": 1.0940088033676147, "learning_rate": 1.8481194519985634e-05, "loss": 0.3598, "mean_token_accuracy": 0.8945767514407634, "num_tokens": 119264650.0, "step": 18010 }, { "entropy": 0.3404914395883679, "epoch": 0.2773913361388034, "grad_norm": 1.160576581954956, "learning_rate": 1.849145671917492e-05, "loss": 0.3341, "mean_token_accuracy": 0.8994818747043609, "num_tokens": 119327828.0, "step": 18020 }, { "entropy": 0.3262810755521059, "epoch": 0.27754527139748203, "grad_norm": 0.8363104462623596, "learning_rate": 1.8501718918364206e-05, "loss": 0.3283, "mean_token_accuracy": 0.9007451735436917, "num_tokens": 119393383.0, "step": 18030 }, { "entropy": 0.3451887616887689, "epoch": 0.2776992066561606, "grad_norm": 0.9473867416381836, "learning_rate": 1.8511981117553492e-05, "loss": 0.3409, "mean_token_accuracy": 0.8950209952890873, "num_tokens": 119470021.0, "step": 18040 }, { "entropy": 0.32390596680343153, "epoch": 0.2778531419148392, "grad_norm": 0.9084698557853699, "learning_rate": 1.8522243316742778e-05, "loss": 0.3112, "mean_token_accuracy": 0.9035158783197403, "num_tokens": 119534217.0, "step": 18050 }, { "entropy": 0.32387550696730616, "epoch": 0.27800707717351775, "grad_norm": 0.7257477641105652, "learning_rate": 1.8532505515932067e-05, "loss": 0.3219, "mean_token_accuracy": 0.9017894007265568, "num_tokens": 119599278.0, "step": 18060 }, { "entropy": 0.32100676763802766, "epoch": 0.2781610124321963, "grad_norm": 0.8377177715301514, "learning_rate": 1.8542767715121353e-05, "loss": 0.3295, "mean_token_accuracy": 0.9021190159022808, "num_tokens": 119677642.0, "step": 18070 }, { "entropy": 0.3147047184407711, "epoch": 0.2783149476908749, "grad_norm": 1.225521445274353, "learning_rate": 1.855302991431064e-05, "loss": 0.3209, "mean_token_accuracy": 0.9055307827889919, "num_tokens": 119741441.0, "step": 18080 }, { "entropy": 0.34841421749442814, "epoch": 0.27846888294955346, "grad_norm": 0.753430962562561, "learning_rate": 1.8563292113499925e-05, "loss": 0.3553, "mean_token_accuracy": 0.8956600032746792, "num_tokens": 119808331.0, "step": 18090 }, { "entropy": 0.3192646037787199, "epoch": 0.2786228182082321, "grad_norm": 0.9461034536361694, "learning_rate": 1.857355431268921e-05, "loss": 0.3128, "mean_token_accuracy": 0.9045535750687123, "num_tokens": 119869116.0, "step": 18100 }, { "entropy": 0.31945878360420465, "epoch": 0.27877675346691067, "grad_norm": 0.8182504773139954, "learning_rate": 1.8583816511878497e-05, "loss": 0.3261, "mean_token_accuracy": 0.9044293686747551, "num_tokens": 119934642.0, "step": 18110 }, { "entropy": 0.3033894492313266, "epoch": 0.27893068872558924, "grad_norm": 0.7271350622177124, "learning_rate": 1.8594078711067783e-05, "loss": 0.3056, "mean_token_accuracy": 0.905570087581873, "num_tokens": 120011030.0, "step": 18120 }, { "entropy": 0.3206852206028998, "epoch": 0.2790846239842678, "grad_norm": 0.7902372479438782, "learning_rate": 1.860434091025707e-05, "loss": 0.3347, "mean_token_accuracy": 0.9041827782988549, "num_tokens": 120078287.0, "step": 18130 }, { "entropy": 0.34205353166908026, "epoch": 0.2792385592429464, "grad_norm": 0.9019453525543213, "learning_rate": 1.8614603109446355e-05, "loss": 0.3488, "mean_token_accuracy": 0.8993356332182885, "num_tokens": 120144822.0, "step": 18140 }, { "entropy": 0.3260689870454371, "epoch": 0.27939249450162496, "grad_norm": 0.7553229331970215, "learning_rate": 1.862486530863564e-05, "loss": 0.325, "mean_token_accuracy": 0.9010759718716145, "num_tokens": 120213167.0, "step": 18150 }, { "entropy": 0.3252909860573709, "epoch": 0.2795464297603036, "grad_norm": 1.070816159248352, "learning_rate": 1.8635127507824927e-05, "loss": 0.3274, "mean_token_accuracy": 0.9008226372301579, "num_tokens": 120279347.0, "step": 18160 }, { "entropy": 0.33978174766525626, "epoch": 0.27970036501898216, "grad_norm": 0.9909418821334839, "learning_rate": 1.8645389707014213e-05, "loss": 0.3358, "mean_token_accuracy": 0.9000211350619793, "num_tokens": 120344400.0, "step": 18170 }, { "entropy": 0.3182203622534871, "epoch": 0.27985430027766073, "grad_norm": 0.9559057950973511, "learning_rate": 1.86556519062035e-05, "loss": 0.3245, "mean_token_accuracy": 0.9028216220438481, "num_tokens": 120416970.0, "step": 18180 }, { "entropy": 0.33926021996885536, "epoch": 0.2800082355363393, "grad_norm": 0.7449178695678711, "learning_rate": 1.8665914105392785e-05, "loss": 0.3359, "mean_token_accuracy": 0.897214786708355, "num_tokens": 120487169.0, "step": 18190 }, { "entropy": 0.35417638421058656, "epoch": 0.2801621707950179, "grad_norm": 0.7608311176300049, "learning_rate": 1.8676176304582074e-05, "loss": 0.353, "mean_token_accuracy": 0.8925740242004394, "num_tokens": 120553201.0, "step": 18200 }, { "entropy": 0.31654341686517, "epoch": 0.28031610605369645, "grad_norm": 0.9928494095802307, "learning_rate": 1.868643850377136e-05, "loss": 0.3158, "mean_token_accuracy": 0.9037090092897415, "num_tokens": 120623577.0, "step": 18210 }, { "entropy": 0.3314059882424772, "epoch": 0.2804700413123751, "grad_norm": 0.7749924063682556, "learning_rate": 1.8696700702960646e-05, "loss": 0.3352, "mean_token_accuracy": 0.8993528179824353, "num_tokens": 120702684.0, "step": 18220 }, { "entropy": 0.3420862969011068, "epoch": 0.28062397657105365, "grad_norm": 0.9246329069137573, "learning_rate": 1.8706962902149932e-05, "loss": 0.3466, "mean_token_accuracy": 0.8968097820878029, "num_tokens": 120767986.0, "step": 18230 }, { "entropy": 0.32253103591501714, "epoch": 0.2807779118297322, "grad_norm": 0.9243056774139404, "learning_rate": 1.8717225101339218e-05, "loss": 0.3223, "mean_token_accuracy": 0.9017111822962761, "num_tokens": 120832273.0, "step": 18240 }, { "entropy": 0.33565416354686023, "epoch": 0.2809318470884108, "grad_norm": 0.9674714803695679, "learning_rate": 1.8727487300528504e-05, "loss": 0.3363, "mean_token_accuracy": 0.9000039622187614, "num_tokens": 120891024.0, "step": 18250 }, { "entropy": 0.3144376671873033, "epoch": 0.28108578234708936, "grad_norm": 0.7190493941307068, "learning_rate": 1.873774949971779e-05, "loss": 0.3119, "mean_token_accuracy": 0.908236201107502, "num_tokens": 120954018.0, "step": 18260 }, { "entropy": 0.3305823964998126, "epoch": 0.28123971760576794, "grad_norm": 0.8126688599586487, "learning_rate": 1.874801169890708e-05, "loss": 0.3352, "mean_token_accuracy": 0.8973361492156983, "num_tokens": 121019348.0, "step": 18270 }, { "entropy": 0.33019577227532865, "epoch": 0.2813936528644465, "grad_norm": 0.9119473695755005, "learning_rate": 1.8758273898096365e-05, "loss": 0.3335, "mean_token_accuracy": 0.9013904921710492, "num_tokens": 121080882.0, "step": 18280 }, { "entropy": 0.32298508314415814, "epoch": 0.28154758812312514, "grad_norm": 0.7447236776351929, "learning_rate": 1.876853609728565e-05, "loss": 0.3256, "mean_token_accuracy": 0.9013004519045353, "num_tokens": 121147256.0, "step": 18290 }, { "entropy": 0.3230129054747522, "epoch": 0.2817015233818037, "grad_norm": 0.9033135771751404, "learning_rate": 1.8778798296474937e-05, "loss": 0.3268, "mean_token_accuracy": 0.9026780411601066, "num_tokens": 121213361.0, "step": 18300 }, { "entropy": 0.31332110743969677, "epoch": 0.2818554586404823, "grad_norm": 0.7670292258262634, "learning_rate": 1.8789060495664223e-05, "loss": 0.3198, "mean_token_accuracy": 0.9044810645282269, "num_tokens": 121282477.0, "step": 18310 }, { "entropy": 0.3150912248529494, "epoch": 0.28200939389916085, "grad_norm": 0.7784136533737183, "learning_rate": 1.879932269485351e-05, "loss": 0.3214, "mean_token_accuracy": 0.905445821583271, "num_tokens": 121348101.0, "step": 18320 }, { "entropy": 0.32992134960368275, "epoch": 0.2821633291578394, "grad_norm": 0.8437848687171936, "learning_rate": 1.8809584894042795e-05, "loss": 0.3347, "mean_token_accuracy": 0.9007382288575172, "num_tokens": 121411181.0, "step": 18330 }, { "entropy": 0.33764418605715035, "epoch": 0.282317264416518, "grad_norm": 0.8823918104171753, "learning_rate": 1.881984709323208e-05, "loss": 0.3453, "mean_token_accuracy": 0.8972452595829964, "num_tokens": 121481639.0, "step": 18340 }, { "entropy": 0.31537831397727134, "epoch": 0.2824711996751966, "grad_norm": 0.9654721021652222, "learning_rate": 1.8830109292421366e-05, "loss": 0.3192, "mean_token_accuracy": 0.9069120280444622, "num_tokens": 121547733.0, "step": 18350 }, { "entropy": 0.32444445211440326, "epoch": 0.2826251349338752, "grad_norm": 0.972398579120636, "learning_rate": 1.8840371491610652e-05, "loss": 0.343, "mean_token_accuracy": 0.8984679192304611, "num_tokens": 121611504.0, "step": 18360 }, { "entropy": 0.3220734564587474, "epoch": 0.28277907019255377, "grad_norm": 0.8639740347862244, "learning_rate": 1.885063369079994e-05, "loss": 0.3102, "mean_token_accuracy": 0.9044206023216248, "num_tokens": 121678065.0, "step": 18370 }, { "entropy": 0.3394918445497751, "epoch": 0.28293300545123234, "grad_norm": 0.8275685906410217, "learning_rate": 1.8860895889989224e-05, "loss": 0.3347, "mean_token_accuracy": 0.8987101890146733, "num_tokens": 121745890.0, "step": 18380 }, { "entropy": 0.3224625298753381, "epoch": 0.2830869407099109, "grad_norm": 0.9466720223426819, "learning_rate": 1.887115808917851e-05, "loss": 0.33, "mean_token_accuracy": 0.9042651824653148, "num_tokens": 121815678.0, "step": 18390 }, { "entropy": 0.3609408846125007, "epoch": 0.2832408759685895, "grad_norm": 0.6700239777565002, "learning_rate": 1.8881420288367796e-05, "loss": 0.3561, "mean_token_accuracy": 0.8935481742024421, "num_tokens": 121882961.0, "step": 18400 }, { "entropy": 0.32733147060498596, "epoch": 0.2833948112272681, "grad_norm": 1.0627809762954712, "learning_rate": 1.8891682487557086e-05, "loss": 0.3251, "mean_token_accuracy": 0.9018705911934376, "num_tokens": 121952895.0, "step": 18410 }, { "entropy": 0.3153926435858011, "epoch": 0.2835487464859467, "grad_norm": 1.23601496219635, "learning_rate": 1.890194468674637e-05, "loss": 0.3098, "mean_token_accuracy": 0.9064513020217418, "num_tokens": 122019906.0, "step": 18420 }, { "entropy": 0.3122779069468379, "epoch": 0.28370268174462526, "grad_norm": 0.8609316349029541, "learning_rate": 1.8912206885935657e-05, "loss": 0.3164, "mean_token_accuracy": 0.9034980505704879, "num_tokens": 122079283.0, "step": 18430 }, { "entropy": 0.3427103723399341, "epoch": 0.28385661700330383, "grad_norm": 0.950169026851654, "learning_rate": 1.8922469085124943e-05, "loss": 0.3446, "mean_token_accuracy": 0.8971457973122596, "num_tokens": 122144382.0, "step": 18440 }, { "entropy": 0.34319494403898715, "epoch": 0.2840105522619824, "grad_norm": 0.9444339275360107, "learning_rate": 1.893273128431423e-05, "loss": 0.3387, "mean_token_accuracy": 0.8964257128536701, "num_tokens": 122204168.0, "step": 18450 }, { "entropy": 0.32015845738351345, "epoch": 0.284164487520661, "grad_norm": 0.973302960395813, "learning_rate": 1.8942993483503515e-05, "loss": 0.3231, "mean_token_accuracy": 0.9039513505995274, "num_tokens": 122262437.0, "step": 18460 }, { "entropy": 0.2932649128139019, "epoch": 0.28431842277933955, "grad_norm": 0.6887993812561035, "learning_rate": 1.89532556826928e-05, "loss": 0.3066, "mean_token_accuracy": 0.912144111841917, "num_tokens": 122331885.0, "step": 18470 }, { "entropy": 0.34332219399511815, "epoch": 0.2844723580380182, "grad_norm": 0.8867966532707214, "learning_rate": 1.896351788188209e-05, "loss": 0.3495, "mean_token_accuracy": 0.8959463514387608, "num_tokens": 122393795.0, "step": 18480 }, { "entropy": 0.3311495829373598, "epoch": 0.28462629329669675, "grad_norm": 0.9707996845245361, "learning_rate": 1.8973780081071376e-05, "loss": 0.3326, "mean_token_accuracy": 0.9001730173826218, "num_tokens": 122460255.0, "step": 18490 }, { "entropy": 0.3141169817186892, "epoch": 0.2847802285553753, "grad_norm": 0.8669468760490417, "learning_rate": 1.8984042280260662e-05, "loss": 0.3192, "mean_token_accuracy": 0.9026700273156166, "num_tokens": 122519827.0, "step": 18500 }, { "entropy": 0.3267633724957705, "epoch": 0.2849341638140539, "grad_norm": 0.8246471881866455, "learning_rate": 1.899430447944995e-05, "loss": 0.3162, "mean_token_accuracy": 0.9044230557978153, "num_tokens": 122587413.0, "step": 18510 }, { "entropy": 0.328487266972661, "epoch": 0.28508809907273247, "grad_norm": 0.8656097650527954, "learning_rate": 1.9004566678639234e-05, "loss": 0.3232, "mean_token_accuracy": 0.8979416154325008, "num_tokens": 122652627.0, "step": 18520 }, { "entropy": 0.32284033661708234, "epoch": 0.28524203433141104, "grad_norm": 0.5975400805473328, "learning_rate": 1.901482887782852e-05, "loss": 0.3293, "mean_token_accuracy": 0.9009480752050877, "num_tokens": 122731722.0, "step": 18530 }, { "entropy": 0.3374594641849399, "epoch": 0.28539596959008967, "grad_norm": 0.7893279194831848, "learning_rate": 1.9025091077017806e-05, "loss": 0.3367, "mean_token_accuracy": 0.8990962877869606, "num_tokens": 122793640.0, "step": 18540 }, { "entropy": 0.3194980889558792, "epoch": 0.28554990484876824, "grad_norm": 0.8450246453285217, "learning_rate": 1.9035353276207092e-05, "loss": 0.3162, "mean_token_accuracy": 0.9024796769022941, "num_tokens": 122859383.0, "step": 18550 }, { "entropy": 0.33162004351615904, "epoch": 0.2857038401074468, "grad_norm": 0.753115177154541, "learning_rate": 1.9045615475396378e-05, "loss": 0.337, "mean_token_accuracy": 0.8988065227866173, "num_tokens": 122932841.0, "step": 18560 }, { "entropy": 0.33100267173722386, "epoch": 0.2858577753661254, "grad_norm": 0.7864899039268494, "learning_rate": 1.9055877674585664e-05, "loss": 0.3232, "mean_token_accuracy": 0.9002846643328667, "num_tokens": 123001739.0, "step": 18570 }, { "entropy": 0.32212676014751196, "epoch": 0.28601171062480396, "grad_norm": 0.9949078559875488, "learning_rate": 1.906613987377495e-05, "loss": 0.326, "mean_token_accuracy": 0.9034456670284271, "num_tokens": 123073585.0, "step": 18580 }, { "entropy": 0.33012649985030296, "epoch": 0.28616564588348253, "grad_norm": 0.865713357925415, "learning_rate": 1.9076402072964236e-05, "loss": 0.3334, "mean_token_accuracy": 0.8991921000182629, "num_tokens": 123141461.0, "step": 18590 }, { "entropy": 0.3090610635466874, "epoch": 0.28631958114216116, "grad_norm": 0.8200778365135193, "learning_rate": 1.9086664272153522e-05, "loss": 0.3127, "mean_token_accuracy": 0.9079195253551007, "num_tokens": 123209052.0, "step": 18600 }, { "entropy": 0.3373510794714093, "epoch": 0.28647351640083973, "grad_norm": 0.714702308177948, "learning_rate": 1.9096926471342808e-05, "loss": 0.3392, "mean_token_accuracy": 0.8968178391456604, "num_tokens": 123280583.0, "step": 18610 }, { "entropy": 0.33674519322812557, "epoch": 0.2866274516595183, "grad_norm": 0.6024294495582581, "learning_rate": 1.9107188670532097e-05, "loss": 0.3279, "mean_token_accuracy": 0.8989624418318272, "num_tokens": 123346847.0, "step": 18620 }, { "entropy": 0.31936446763575077, "epoch": 0.2867813869181969, "grad_norm": 0.7530230283737183, "learning_rate": 1.9117450869721383e-05, "loss": 0.3175, "mean_token_accuracy": 0.9057679817080497, "num_tokens": 123410823.0, "step": 18630 }, { "entropy": 0.33497188799083233, "epoch": 0.28693532217687545, "grad_norm": 0.6995027661323547, "learning_rate": 1.912771306891067e-05, "loss": 0.3309, "mean_token_accuracy": 0.899405661970377, "num_tokens": 123478224.0, "step": 18640 }, { "entropy": 0.32617325941100717, "epoch": 0.287089257435554, "grad_norm": 0.7989957928657532, "learning_rate": 1.9137975268099955e-05, "loss": 0.3203, "mean_token_accuracy": 0.9050661280751229, "num_tokens": 123538235.0, "step": 18650 }, { "entropy": 0.3225190402939916, "epoch": 0.2872431926942326, "grad_norm": 0.8685078620910645, "learning_rate": 1.914823746728924e-05, "loss": 0.3216, "mean_token_accuracy": 0.9018322393298149, "num_tokens": 123603433.0, "step": 18660 }, { "entropy": 0.33108960120007397, "epoch": 0.2873971279529112, "grad_norm": 1.1431646347045898, "learning_rate": 1.9158499666478527e-05, "loss": 0.3288, "mean_token_accuracy": 0.8980362087488174, "num_tokens": 123675345.0, "step": 18670 }, { "entropy": 0.3223969384096563, "epoch": 0.2875510632115898, "grad_norm": 0.8823962211608887, "learning_rate": 1.9168761865667813e-05, "loss": 0.3274, "mean_token_accuracy": 0.9038758382201195, "num_tokens": 123741254.0, "step": 18680 }, { "entropy": 0.34826893396675584, "epoch": 0.28770499847026837, "grad_norm": 1.1902899742126465, "learning_rate": 1.9179024064857102e-05, "loss": 0.3596, "mean_token_accuracy": 0.8938956841826439, "num_tokens": 123813415.0, "step": 18690 }, { "entropy": 0.33403853960335256, "epoch": 0.28785893372894694, "grad_norm": 0.9236346483230591, "learning_rate": 1.9189286264046388e-05, "loss": 0.3322, "mean_token_accuracy": 0.8999733053147793, "num_tokens": 123875878.0, "step": 18700 }, { "entropy": 0.33894577231258155, "epoch": 0.2880128689876255, "grad_norm": 0.9568053483963013, "learning_rate": 1.9199548463235674e-05, "loss": 0.3315, "mean_token_accuracy": 0.8983626760542393, "num_tokens": 123937948.0, "step": 18710 }, { "entropy": 0.3219125496223569, "epoch": 0.2881668042463041, "grad_norm": 0.7980546355247498, "learning_rate": 1.920981066242496e-05, "loss": 0.325, "mean_token_accuracy": 0.9023103557527066, "num_tokens": 124000265.0, "step": 18720 }, { "entropy": 0.30076578399166465, "epoch": 0.2883207395049827, "grad_norm": 0.7189030051231384, "learning_rate": 1.9220072861614246e-05, "loss": 0.3155, "mean_token_accuracy": 0.9085955910384655, "num_tokens": 124064156.0, "step": 18730 }, { "entropy": 0.32313946383073927, "epoch": 0.2884746747636613, "grad_norm": 0.8689915537834167, "learning_rate": 1.9230335060803532e-05, "loss": 0.3227, "mean_token_accuracy": 0.9044741302728653, "num_tokens": 124132330.0, "step": 18740 }, { "entropy": 0.3353875996544957, "epoch": 0.28862861002233986, "grad_norm": 0.7994841933250427, "learning_rate": 1.9240597259992818e-05, "loss": 0.3382, "mean_token_accuracy": 0.8989833690226078, "num_tokens": 124194766.0, "step": 18750 }, { "entropy": 0.34182289661839604, "epoch": 0.28878254528101843, "grad_norm": 1.418738842010498, "learning_rate": 1.9250859459182104e-05, "loss": 0.3387, "mean_token_accuracy": 0.8987330310046673, "num_tokens": 124246613.0, "step": 18760 }, { "entropy": 0.3425668646581471, "epoch": 0.288936480539697, "grad_norm": 0.8356424570083618, "learning_rate": 1.926112165837139e-05, "loss": 0.3366, "mean_token_accuracy": 0.8959762990474701, "num_tokens": 124312397.0, "step": 18770 }, { "entropy": 0.3211830432526767, "epoch": 0.2890904157983756, "grad_norm": 0.9722719192504883, "learning_rate": 1.9271383857560676e-05, "loss": 0.3342, "mean_token_accuracy": 0.9047818377614021, "num_tokens": 124377394.0, "step": 18780 }, { "entropy": 0.3291613722220063, "epoch": 0.2892443510570542, "grad_norm": 0.7549973130226135, "learning_rate": 1.928164605674996e-05, "loss": 0.328, "mean_token_accuracy": 0.9007111825048923, "num_tokens": 124451342.0, "step": 18790 }, { "entropy": 0.3333371676504612, "epoch": 0.2893982863157328, "grad_norm": 1.0745495557785034, "learning_rate": 1.9291908255939248e-05, "loss": 0.3345, "mean_token_accuracy": 0.9003669664263725, "num_tokens": 124509280.0, "step": 18800 }, { "entropy": 0.3069153239950538, "epoch": 0.28955222157441135, "grad_norm": 0.8741242289543152, "learning_rate": 1.9302170455128534e-05, "loss": 0.2973, "mean_token_accuracy": 0.90470185354352, "num_tokens": 124571508.0, "step": 18810 }, { "entropy": 0.348543930798769, "epoch": 0.2897061568330899, "grad_norm": 0.9234386086463928, "learning_rate": 1.931243265431782e-05, "loss": 0.3604, "mean_token_accuracy": 0.8926108784973621, "num_tokens": 124636356.0, "step": 18820 }, { "entropy": 0.3283236464485526, "epoch": 0.2898600920917685, "grad_norm": 0.709385871887207, "learning_rate": 1.932269485350711e-05, "loss": 0.3329, "mean_token_accuracy": 0.9041508346796036, "num_tokens": 124706081.0, "step": 18830 }, { "entropy": 0.3360951265320182, "epoch": 0.29001402735044707, "grad_norm": 0.8011663556098938, "learning_rate": 1.9332957052696395e-05, "loss": 0.3278, "mean_token_accuracy": 0.8976170502603054, "num_tokens": 124773414.0, "step": 18840 }, { "entropy": 0.3161689517088234, "epoch": 0.29016796260912564, "grad_norm": 0.7538257837295532, "learning_rate": 1.934321925188568e-05, "loss": 0.3192, "mean_token_accuracy": 0.9064976952970027, "num_tokens": 124836983.0, "step": 18850 }, { "entropy": 0.3363803943619132, "epoch": 0.29032189786780427, "grad_norm": 1.1080001592636108, "learning_rate": 1.9353481451074967e-05, "loss": 0.3392, "mean_token_accuracy": 0.8979464426636696, "num_tokens": 124900155.0, "step": 18860 }, { "entropy": 0.3317596189677715, "epoch": 0.29047583312648284, "grad_norm": 0.727939784526825, "learning_rate": 1.9363743650264253e-05, "loss": 0.3304, "mean_token_accuracy": 0.8997397512197495, "num_tokens": 124961240.0, "step": 18870 }, { "entropy": 0.3214283221401274, "epoch": 0.2906297683851614, "grad_norm": 0.7206705808639526, "learning_rate": 1.937400584945354e-05, "loss": 0.3262, "mean_token_accuracy": 0.9037024497985839, "num_tokens": 125032302.0, "step": 18880 }, { "entropy": 0.3126100546680391, "epoch": 0.29078370364384, "grad_norm": 0.721017599105835, "learning_rate": 1.9384268048642825e-05, "loss": 0.3289, "mean_token_accuracy": 0.9044610694050789, "num_tokens": 125093535.0, "step": 18890 }, { "entropy": 0.3136457627639174, "epoch": 0.29093763890251856, "grad_norm": 0.7851596474647522, "learning_rate": 1.9394530247832114e-05, "loss": 0.3141, "mean_token_accuracy": 0.9062489628791809, "num_tokens": 125165642.0, "step": 18900 }, { "entropy": 0.3213448886759579, "epoch": 0.29109157416119713, "grad_norm": 0.9231459498405457, "learning_rate": 1.94047924470214e-05, "loss": 0.3334, "mean_token_accuracy": 0.9022372499108314, "num_tokens": 125228667.0, "step": 18910 }, { "entropy": 0.3412524709478021, "epoch": 0.29124550941987576, "grad_norm": 0.8685522675514221, "learning_rate": 1.9415054646210686e-05, "loss": 0.335, "mean_token_accuracy": 0.8981604754924775, "num_tokens": 125295320.0, "step": 18920 }, { "entropy": 0.3223870312795043, "epoch": 0.29139944467855433, "grad_norm": 0.8753737807273865, "learning_rate": 1.942531684539997e-05, "loss": 0.3169, "mean_token_accuracy": 0.9050321854650974, "num_tokens": 125356716.0, "step": 18930 }, { "entropy": 0.32664001900702716, "epoch": 0.2915533799372329, "grad_norm": 1.0639865398406982, "learning_rate": 1.9435579044589258e-05, "loss": 0.3399, "mean_token_accuracy": 0.8999121680855751, "num_tokens": 125422316.0, "step": 18940 }, { "entropy": 0.34976419899612665, "epoch": 0.2917073151959115, "grad_norm": 0.760429322719574, "learning_rate": 1.9445841243778544e-05, "loss": 0.3388, "mean_token_accuracy": 0.8959164015948773, "num_tokens": 125493408.0, "step": 18950 }, { "entropy": 0.33665865724906324, "epoch": 0.29186125045459005, "grad_norm": 1.4777162075042725, "learning_rate": 1.945610344296783e-05, "loss": 0.3232, "mean_token_accuracy": 0.8983580566942692, "num_tokens": 125556668.0, "step": 18960 }, { "entropy": 0.34609358180314304, "epoch": 0.2920151857132686, "grad_norm": 0.7222606539726257, "learning_rate": 1.9466365642157115e-05, "loss": 0.3395, "mean_token_accuracy": 0.8934235885739327, "num_tokens": 125625434.0, "step": 18970 }, { "entropy": 0.32533278558403256, "epoch": 0.29216912097194725, "grad_norm": 0.9197064638137817, "learning_rate": 1.94766278413464e-05, "loss": 0.3158, "mean_token_accuracy": 0.9050007507205009, "num_tokens": 125688620.0, "step": 18980 }, { "entropy": 0.3292812298052013, "epoch": 0.2923230562306258, "grad_norm": 0.7825255990028381, "learning_rate": 1.9486890040535687e-05, "loss": 0.3396, "mean_token_accuracy": 0.8983354821801186, "num_tokens": 125748194.0, "step": 18990 }, { "entropy": 0.3371522225439548, "epoch": 0.2924769914893044, "grad_norm": 0.8718425035476685, "learning_rate": 1.9497152239724973e-05, "loss": 0.3334, "mean_token_accuracy": 0.8983974762260913, "num_tokens": 125815224.0, "step": 19000 }, { "entropy": 0.3195478274486959, "epoch": 0.29263092674798297, "grad_norm": 0.943288266658783, "learning_rate": 1.950741443891426e-05, "loss": 0.3351, "mean_token_accuracy": 0.9029567867517472, "num_tokens": 125885115.0, "step": 19010 }, { "entropy": 0.3297803720459342, "epoch": 0.29278486200666154, "grad_norm": 0.7769254446029663, "learning_rate": 1.9517676638103545e-05, "loss": 0.3287, "mean_token_accuracy": 0.9009628802537918, "num_tokens": 125954572.0, "step": 19020 }, { "entropy": 0.333796694688499, "epoch": 0.2929387972653401, "grad_norm": 0.8674343824386597, "learning_rate": 1.952793883729283e-05, "loss": 0.3431, "mean_token_accuracy": 0.9008184254169465, "num_tokens": 126021684.0, "step": 19030 }, { "entropy": 0.3210636245086789, "epoch": 0.2930927325240187, "grad_norm": 0.7259174585342407, "learning_rate": 1.953820103648212e-05, "loss": 0.31, "mean_token_accuracy": 0.9037628553807735, "num_tokens": 126086698.0, "step": 19040 }, { "entropy": 0.3282139152288437, "epoch": 0.2932466677826973, "grad_norm": 0.8118738532066345, "learning_rate": 1.9548463235671406e-05, "loss": 0.3451, "mean_token_accuracy": 0.900468809902668, "num_tokens": 126150225.0, "step": 19050 }, { "entropy": 0.33605186082422733, "epoch": 0.2934006030413759, "grad_norm": 0.866915225982666, "learning_rate": 1.9558725434860692e-05, "loss": 0.3367, "mean_token_accuracy": 0.8994342058897018, "num_tokens": 126210305.0, "step": 19060 }, { "entropy": 0.3276091646403074, "epoch": 0.29355453830005446, "grad_norm": 0.8588497042655945, "learning_rate": 1.956898763404998e-05, "loss": 0.3306, "mean_token_accuracy": 0.9017309308052063, "num_tokens": 126278550.0, "step": 19070 }, { "entropy": 0.33005668930709364, "epoch": 0.29370847355873303, "grad_norm": 0.7890899181365967, "learning_rate": 1.9579249833239264e-05, "loss": 0.3294, "mean_token_accuracy": 0.9007031463086606, "num_tokens": 126343446.0, "step": 19080 }, { "entropy": 0.3372112323530018, "epoch": 0.2938624088174116, "grad_norm": 0.7905132174491882, "learning_rate": 1.958951203242855e-05, "loss": 0.331, "mean_token_accuracy": 0.8982608288526535, "num_tokens": 126407256.0, "step": 19090 }, { "entropy": 0.34158627623692156, "epoch": 0.2940163440760902, "grad_norm": 0.7442073822021484, "learning_rate": 1.9599774231617836e-05, "loss": 0.3462, "mean_token_accuracy": 0.8953173398971558, "num_tokens": 126482084.0, "step": 19100 }, { "entropy": 0.34382039150223137, "epoch": 0.2941702793347688, "grad_norm": 1.0335508584976196, "learning_rate": 1.9610036430807126e-05, "loss": 0.3264, "mean_token_accuracy": 0.8964486919343472, "num_tokens": 126547384.0, "step": 19110 }, { "entropy": 0.32044007861986756, "epoch": 0.2943242145934474, "grad_norm": 0.846759557723999, "learning_rate": 1.962029862999641e-05, "loss": 0.3316, "mean_token_accuracy": 0.9012502908706665, "num_tokens": 126617455.0, "step": 19120 }, { "entropy": 0.33829523231834174, "epoch": 0.29447814985212595, "grad_norm": 0.8764218091964722, "learning_rate": 1.9630560829185697e-05, "loss": 0.3353, "mean_token_accuracy": 0.8982304103672505, "num_tokens": 126688119.0, "step": 19130 }, { "entropy": 0.3214715336449444, "epoch": 0.2946320851108045, "grad_norm": 0.7332745790481567, "learning_rate": 1.9640823028374983e-05, "loss": 0.3346, "mean_token_accuracy": 0.9025524832308293, "num_tokens": 126756598.0, "step": 19140 }, { "entropy": 0.34083395581692455, "epoch": 0.2947860203694831, "grad_norm": 0.7267321944236755, "learning_rate": 1.965108522756427e-05, "loss": 0.3331, "mean_token_accuracy": 0.8977976836264133, "num_tokens": 126826015.0, "step": 19150 }, { "entropy": 0.3202080473303795, "epoch": 0.29493995562816167, "grad_norm": 0.9876533150672913, "learning_rate": 1.9661347426753555e-05, "loss": 0.3324, "mean_token_accuracy": 0.9025640793144702, "num_tokens": 126893548.0, "step": 19160 }, { "entropy": 0.3173240368254483, "epoch": 0.2950938908868403, "grad_norm": 0.7122992277145386, "learning_rate": 1.967160962594284e-05, "loss": 0.321, "mean_token_accuracy": 0.9026424705982208, "num_tokens": 126954338.0, "step": 19170 }, { "entropy": 0.3486126588657498, "epoch": 0.29524782614551887, "grad_norm": 0.7788870334625244, "learning_rate": 1.9681871825132127e-05, "loss": 0.325, "mean_token_accuracy": 0.8938244514167308, "num_tokens": 127015096.0, "step": 19180 }, { "entropy": 0.30542272701859474, "epoch": 0.29540176140419744, "grad_norm": 0.7434176206588745, "learning_rate": 1.9692134024321413e-05, "loss": 0.3079, "mean_token_accuracy": 0.9082300193607807, "num_tokens": 127081338.0, "step": 19190 }, { "entropy": 0.33732370464131234, "epoch": 0.295555696662876, "grad_norm": 0.7154481410980225, "learning_rate": 1.97023962235107e-05, "loss": 0.3544, "mean_token_accuracy": 0.896927110850811, "num_tokens": 127148104.0, "step": 19200 }, { "entropy": 0.32429257286712526, "epoch": 0.2957096319215546, "grad_norm": 0.8775435090065002, "learning_rate": 1.9712658422699985e-05, "loss": 0.3271, "mean_token_accuracy": 0.9069225870072841, "num_tokens": 127210417.0, "step": 19210 }, { "entropy": 0.3290254076011479, "epoch": 0.29586356718023316, "grad_norm": 0.7428288459777832, "learning_rate": 1.972292062188927e-05, "loss": 0.3292, "mean_token_accuracy": 0.9010067731142044, "num_tokens": 127278556.0, "step": 19220 }, { "entropy": 0.31314332559704783, "epoch": 0.29601750243891173, "grad_norm": 0.7737833857536316, "learning_rate": 1.9733182821078557e-05, "loss": 0.321, "mean_token_accuracy": 0.9052186094224453, "num_tokens": 127344265.0, "step": 19230 }, { "entropy": 0.3306963432580233, "epoch": 0.29617143769759036, "grad_norm": 0.8414455056190491, "learning_rate": 1.9743445020267843e-05, "loss": 0.34, "mean_token_accuracy": 0.8984043627977372, "num_tokens": 127417216.0, "step": 19240 }, { "entropy": 0.318381373398006, "epoch": 0.29632537295626893, "grad_norm": 0.8173977732658386, "learning_rate": 1.9753707219457132e-05, "loss": 0.3198, "mean_token_accuracy": 0.9028556972742081, "num_tokens": 127488913.0, "step": 19250 }, { "entropy": 0.3243398381397128, "epoch": 0.2964793082149475, "grad_norm": 1.151339054107666, "learning_rate": 1.9763969418646418e-05, "loss": 0.3377, "mean_token_accuracy": 0.9015326112508774, "num_tokens": 127549456.0, "step": 19260 }, { "entropy": 0.31312176762148736, "epoch": 0.2966332434736261, "grad_norm": 0.8327497243881226, "learning_rate": 1.9774231617835704e-05, "loss": 0.308, "mean_token_accuracy": 0.905054833739996, "num_tokens": 127617387.0, "step": 19270 }, { "entropy": 0.30886292541399596, "epoch": 0.29678717873230465, "grad_norm": 0.9432815909385681, "learning_rate": 1.978449381702499e-05, "loss": 0.2981, "mean_token_accuracy": 0.9067428521811962, "num_tokens": 127671926.0, "step": 19280 }, { "entropy": 0.34039590423926713, "epoch": 0.2969411139909832, "grad_norm": 0.8275793790817261, "learning_rate": 1.9794756016214276e-05, "loss": 0.3491, "mean_token_accuracy": 0.8970563165843487, "num_tokens": 127736827.0, "step": 19290 }, { "entropy": 0.3274091688916087, "epoch": 0.29709504924966185, "grad_norm": 0.9054062962532043, "learning_rate": 1.9805018215403562e-05, "loss": 0.3196, "mean_token_accuracy": 0.9018181875348091, "num_tokens": 127803942.0, "step": 19300 }, { "entropy": 0.3211236183531582, "epoch": 0.2972489845083404, "grad_norm": 0.8000773787498474, "learning_rate": 1.9815280414592848e-05, "loss": 0.3222, "mean_token_accuracy": 0.9009824454784393, "num_tokens": 127866676.0, "step": 19310 }, { "entropy": 0.32450553281232714, "epoch": 0.297402919767019, "grad_norm": 0.895027756690979, "learning_rate": 1.9825542613782137e-05, "loss": 0.3166, "mean_token_accuracy": 0.9025765568017959, "num_tokens": 127938555.0, "step": 19320 }, { "entropy": 0.3214075221680105, "epoch": 0.29755685502569756, "grad_norm": 0.7933201789855957, "learning_rate": 1.9835804812971423e-05, "loss": 0.3159, "mean_token_accuracy": 0.900906240195036, "num_tokens": 128001628.0, "step": 19330 }, { "entropy": 0.3171557173132896, "epoch": 0.29771079028437614, "grad_norm": 0.8892960548400879, "learning_rate": 1.984606701216071e-05, "loss": 0.3282, "mean_token_accuracy": 0.9027291238307953, "num_tokens": 128067266.0, "step": 19340 }, { "entropy": 0.3337212834507227, "epoch": 0.2978647255430547, "grad_norm": 0.7185205221176147, "learning_rate": 1.9856329211349995e-05, "loss": 0.3329, "mean_token_accuracy": 0.8993307769298553, "num_tokens": 128135189.0, "step": 19350 }, { "entropy": 0.33095887266099455, "epoch": 0.29801866080173334, "grad_norm": 0.7963835000991821, "learning_rate": 1.986659141053928e-05, "loss": 0.3365, "mean_token_accuracy": 0.9005683168768883, "num_tokens": 128201742.0, "step": 19360 }, { "entropy": 0.34348348937928674, "epoch": 0.2981725960604119, "grad_norm": 0.8043532967567444, "learning_rate": 1.9876853609728567e-05, "loss": 0.3356, "mean_token_accuracy": 0.8976098485291004, "num_tokens": 128275779.0, "step": 19370 }, { "entropy": 0.3093395301140845, "epoch": 0.2983265313190905, "grad_norm": 0.8707140684127808, "learning_rate": 1.9887115808917853e-05, "loss": 0.3188, "mean_token_accuracy": 0.9070346049964428, "num_tokens": 128338598.0, "step": 19380 }, { "entropy": 0.3317289683967829, "epoch": 0.29848046657776905, "grad_norm": 0.8499150276184082, "learning_rate": 1.989737800810714e-05, "loss": 0.347, "mean_token_accuracy": 0.8990559913218021, "num_tokens": 128407672.0, "step": 19390 }, { "entropy": 0.321701158862561, "epoch": 0.2986344018364476, "grad_norm": 0.8470450639724731, "learning_rate": 1.9907640207296425e-05, "loss": 0.3312, "mean_token_accuracy": 0.9036515459418297, "num_tokens": 128477895.0, "step": 19400 }, { "entropy": 0.32691779676824806, "epoch": 0.2987883370951262, "grad_norm": 0.7854071855545044, "learning_rate": 1.991790240648571e-05, "loss": 0.3298, "mean_token_accuracy": 0.9016014315187931, "num_tokens": 128544995.0, "step": 19410 }, { "entropy": 0.3137723653577268, "epoch": 0.29894227235380477, "grad_norm": 0.778806209564209, "learning_rate": 1.9928164605674997e-05, "loss": 0.3215, "mean_token_accuracy": 0.9025790803134441, "num_tokens": 128609791.0, "step": 19420 }, { "entropy": 0.3408910881727934, "epoch": 0.2990962076124834, "grad_norm": 0.8324446678161621, "learning_rate": 1.9938426804864283e-05, "loss": 0.3417, "mean_token_accuracy": 0.8955629408359528, "num_tokens": 128680770.0, "step": 19430 }, { "entropy": 0.3322860665619373, "epoch": 0.299250142871162, "grad_norm": 0.8694840669631958, "learning_rate": 1.994868900405357e-05, "loss": 0.3293, "mean_token_accuracy": 0.9019720427691936, "num_tokens": 128739186.0, "step": 19440 }, { "entropy": 0.3197678321972489, "epoch": 0.29940407812984055, "grad_norm": 0.8559448719024658, "learning_rate": 1.9958951203242854e-05, "loss": 0.3278, "mean_token_accuracy": 0.9044013902544975, "num_tokens": 128804975.0, "step": 19450 }, { "entropy": 0.32509424537420273, "epoch": 0.2995580133885191, "grad_norm": 0.7125030159950256, "learning_rate": 1.9969213402432144e-05, "loss": 0.3291, "mean_token_accuracy": 0.8992971859872341, "num_tokens": 128878104.0, "step": 19460 }, { "entropy": 0.33232873529195783, "epoch": 0.2997119486471977, "grad_norm": 0.8866647481918335, "learning_rate": 1.997947560162143e-05, "loss": 0.3294, "mean_token_accuracy": 0.9026747800409793, "num_tokens": 128950071.0, "step": 19470 }, { "entropy": 0.3277430723421276, "epoch": 0.29986588390587626, "grad_norm": 0.7980249524116516, "learning_rate": 1.9989737800810716e-05, "loss": 0.3328, "mean_token_accuracy": 0.9004473224282264, "num_tokens": 129024805.0, "step": 19480 }, { "entropy": 0.3397134566679597, "epoch": 0.3000198191645549, "grad_norm": 0.8709871172904968, "learning_rate": 2e-05, "loss": 0.3374, "mean_token_accuracy": 0.8950427561998368, "num_tokens": 129094221.0, "step": 19490 }, { "entropy": 0.3110186429694295, "epoch": 0.30017375442323346, "grad_norm": 0.7501894235610962, "learning_rate": 1.999999983959771e-05, "loss": 0.3255, "mean_token_accuracy": 0.906337684392929, "num_tokens": 129156922.0, "step": 19500 }, { "entropy": 0.32589228730648756, "epoch": 0.30032768968191204, "grad_norm": 1.071861743927002, "learning_rate": 1.9999999358390837e-05, "loss": 0.3205, "mean_token_accuracy": 0.8992316380143166, "num_tokens": 129216812.0, "step": 19510 }, { "entropy": 0.3267574620433152, "epoch": 0.3004816249405906, "grad_norm": 0.8347981572151184, "learning_rate": 1.99999985563794e-05, "loss": 0.3361, "mean_token_accuracy": 0.9006208159029484, "num_tokens": 129280562.0, "step": 19520 }, { "entropy": 0.3246456513181329, "epoch": 0.3006355601992692, "grad_norm": 0.7376304268836975, "learning_rate": 1.9999997433563426e-05, "loss": 0.3246, "mean_token_accuracy": 0.9036637417972088, "num_tokens": 129343029.0, "step": 19530 }, { "entropy": 0.34202362671494485, "epoch": 0.30078949545794775, "grad_norm": 0.7266255617141724, "learning_rate": 1.999999598994295e-05, "loss": 0.3388, "mean_token_accuracy": 0.8952557191252708, "num_tokens": 129418011.0, "step": 19540 }, { "entropy": 0.3685049916617572, "epoch": 0.3009434307166264, "grad_norm": 1.1553267240524292, "learning_rate": 1.999999422551802e-05, "loss": 0.3625, "mean_token_accuracy": 0.8884289413690567, "num_tokens": 129478243.0, "step": 19550 }, { "entropy": 0.335553190857172, "epoch": 0.30109736597530495, "grad_norm": 0.6836233139038086, "learning_rate": 1.9999992140288686e-05, "loss": 0.3257, "mean_token_accuracy": 0.8989661931991577, "num_tokens": 129549271.0, "step": 19560 }, { "entropy": 0.3254010515287519, "epoch": 0.3012513012339835, "grad_norm": 0.900862455368042, "learning_rate": 1.999998973425502e-05, "loss": 0.3267, "mean_token_accuracy": 0.8996990233659744, "num_tokens": 129624545.0, "step": 19570 }, { "entropy": 0.3246078667230904, "epoch": 0.3014052364926621, "grad_norm": 0.8562678694725037, "learning_rate": 1.9999987007417104e-05, "loss": 0.3239, "mean_token_accuracy": 0.9025262542068958, "num_tokens": 129684076.0, "step": 19580 }, { "entropy": 0.3213284742087126, "epoch": 0.30155917175134067, "grad_norm": 0.7437049150466919, "learning_rate": 1.9999983959775016e-05, "loss": 0.3263, "mean_token_accuracy": 0.9019702203571797, "num_tokens": 129752291.0, "step": 19590 }, { "entropy": 0.31470332723110916, "epoch": 0.30171310701001924, "grad_norm": 0.6939236521720886, "learning_rate": 1.999998059132886e-05, "loss": 0.321, "mean_token_accuracy": 0.9024942100048066, "num_tokens": 129820065.0, "step": 19600 }, { "entropy": 0.3325593626126647, "epoch": 0.3018670422686978, "grad_norm": 0.7724924683570862, "learning_rate": 1.9999976902078738e-05, "loss": 0.3371, "mean_token_accuracy": 0.8991015508770943, "num_tokens": 129892428.0, "step": 19610 }, { "entropy": 0.3026108662597835, "epoch": 0.30202097752737644, "grad_norm": 0.6901084780693054, "learning_rate": 1.9999972892024778e-05, "loss": 0.3037, "mean_token_accuracy": 0.9106933020055294, "num_tokens": 129951549.0, "step": 19620 }, { "entropy": 0.3181462939828634, "epoch": 0.302174912786055, "grad_norm": 1.2185308933258057, "learning_rate": 1.9999968561167097e-05, "loss": 0.3196, "mean_token_accuracy": 0.8991687692701816, "num_tokens": 130014714.0, "step": 19630 }, { "entropy": 0.35568520035594703, "epoch": 0.3023288480447336, "grad_norm": 0.9317598938941956, "learning_rate": 1.9999963909505842e-05, "loss": 0.3409, "mean_token_accuracy": 0.8923911087214946, "num_tokens": 130082971.0, "step": 19640 }, { "entropy": 0.360156371537596, "epoch": 0.30248278330341216, "grad_norm": 0.9982752203941345, "learning_rate": 1.9999958937041165e-05, "loss": 0.3498, "mean_token_accuracy": 0.8936807997524738, "num_tokens": 130153094.0, "step": 19650 }, { "entropy": 0.33335075713694096, "epoch": 0.30263671856209073, "grad_norm": 0.7421761155128479, "learning_rate": 1.9999953643773215e-05, "loss": 0.3293, "mean_token_accuracy": 0.9002755858004093, "num_tokens": 130216317.0, "step": 19660 }, { "entropy": 0.3433493396267295, "epoch": 0.3027906538207693, "grad_norm": 0.9346762895584106, "learning_rate": 1.999994802970217e-05, "loss": 0.3333, "mean_token_accuracy": 0.894124711304903, "num_tokens": 130280794.0, "step": 19670 }, { "entropy": 0.3350706595927477, "epoch": 0.30294458907944793, "grad_norm": 0.8668228387832642, "learning_rate": 1.9999942094828208e-05, "loss": 0.329, "mean_token_accuracy": 0.9005538478493691, "num_tokens": 130346151.0, "step": 19680 }, { "entropy": 0.34198490492999556, "epoch": 0.3030985243381265, "grad_norm": 0.7757605910301208, "learning_rate": 1.9999935839151513e-05, "loss": 0.3461, "mean_token_accuracy": 0.8954013377428055, "num_tokens": 130414706.0, "step": 19690 }, { "entropy": 0.3203779198229313, "epoch": 0.3032524595968051, "grad_norm": 0.7683009505271912, "learning_rate": 1.99999292626723e-05, "loss": 0.3253, "mean_token_accuracy": 0.9040663719177247, "num_tokens": 130475149.0, "step": 19700 }, { "entropy": 0.3367359958589077, "epoch": 0.30340639485548365, "grad_norm": 0.7105514407157898, "learning_rate": 1.9999922365390767e-05, "loss": 0.3359, "mean_token_accuracy": 0.8982115767896175, "num_tokens": 130539908.0, "step": 19710 }, { "entropy": 0.33744219075888393, "epoch": 0.3035603301141622, "grad_norm": 0.8697171211242676, "learning_rate": 1.9999915147307143e-05, "loss": 0.3251, "mean_token_accuracy": 0.9001261346042156, "num_tokens": 130609168.0, "step": 19720 }, { "entropy": 0.3266807278618217, "epoch": 0.3037142653728408, "grad_norm": 0.8554345369338989, "learning_rate": 1.9999907608421652e-05, "loss": 0.323, "mean_token_accuracy": 0.900694639980793, "num_tokens": 130672545.0, "step": 19730 }, { "entropy": 0.324900827743113, "epoch": 0.3038682006315194, "grad_norm": 0.7487205266952515, "learning_rate": 1.9999899748734542e-05, "loss": 0.3253, "mean_token_accuracy": 0.9009183019399643, "num_tokens": 130736774.0, "step": 19740 }, { "entropy": 0.3214568785391748, "epoch": 0.304022135890198, "grad_norm": 0.9999540448188782, "learning_rate": 1.9999891568246066e-05, "loss": 0.3361, "mean_token_accuracy": 0.9041481740772724, "num_tokens": 130799905.0, "step": 19750 }, { "entropy": 0.31954907439649105, "epoch": 0.30417607114887657, "grad_norm": 0.7857552170753479, "learning_rate": 1.9999883066956482e-05, "loss": 0.3304, "mean_token_accuracy": 0.9046133302152157, "num_tokens": 130862628.0, "step": 19760 }, { "entropy": 0.3362925169989467, "epoch": 0.30433000640755514, "grad_norm": 1.2004436254501343, "learning_rate": 1.9999874244866068e-05, "loss": 0.3347, "mean_token_accuracy": 0.8970183849334716, "num_tokens": 130923908.0, "step": 19770 }, { "entropy": 0.33613842325285076, "epoch": 0.3044839416662337, "grad_norm": 0.8491421341896057, "learning_rate": 1.99998651019751e-05, "loss": 0.3304, "mean_token_accuracy": 0.9008071012794971, "num_tokens": 130982296.0, "step": 19780 }, { "entropy": 0.33051209300756457, "epoch": 0.3046378769249123, "grad_norm": 0.6922972202301025, "learning_rate": 1.9999855638283875e-05, "loss": 0.3135, "mean_token_accuracy": 0.9002452924847603, "num_tokens": 131045657.0, "step": 19790 }, { "entropy": 0.307290802616626, "epoch": 0.30479181218359086, "grad_norm": 0.9036703705787659, "learning_rate": 1.9999845853792698e-05, "loss": 0.3216, "mean_token_accuracy": 0.9062139622867107, "num_tokens": 131115045.0, "step": 19800 }, { "entropy": 0.32537705786526205, "epoch": 0.3049457474422695, "grad_norm": 1.237472414970398, "learning_rate": 1.999983574850188e-05, "loss": 0.3259, "mean_token_accuracy": 0.9002103164792061, "num_tokens": 131176005.0, "step": 19810 }, { "entropy": 0.3445144886150956, "epoch": 0.30509968270094806, "grad_norm": 0.8789829611778259, "learning_rate": 1.9999825322411744e-05, "loss": 0.3285, "mean_token_accuracy": 0.8976062819361686, "num_tokens": 131242777.0, "step": 19820 }, { "entropy": 0.30491778487339616, "epoch": 0.30525361795962663, "grad_norm": 0.9373149275779724, "learning_rate": 1.9999814575522634e-05, "loss": 0.2974, "mean_token_accuracy": 0.9022399462759495, "num_tokens": 131306070.0, "step": 19830 }, { "entropy": 0.3340556975454092, "epoch": 0.3054075532183052, "grad_norm": 0.8670139312744141, "learning_rate": 1.9999803507834883e-05, "loss": 0.3239, "mean_token_accuracy": 0.900655473023653, "num_tokens": 131367821.0, "step": 19840 }, { "entropy": 0.3242602193728089, "epoch": 0.3055614884769838, "grad_norm": 0.7933480739593506, "learning_rate": 1.9999792119348852e-05, "loss": 0.3288, "mean_token_accuracy": 0.9002412281930446, "num_tokens": 131430197.0, "step": 19850 }, { "entropy": 0.3405096993781626, "epoch": 0.30571542373566235, "grad_norm": 0.803017258644104, "learning_rate": 1.9999780410064906e-05, "loss": 0.349, "mean_token_accuracy": 0.8962196856737137, "num_tokens": 131494705.0, "step": 19860 }, { "entropy": 0.3380158488638699, "epoch": 0.305869358994341, "grad_norm": 0.8139877915382385, "learning_rate": 1.999976837998342e-05, "loss": 0.3301, "mean_token_accuracy": 0.8969452247023583, "num_tokens": 131556765.0, "step": 19870 }, { "entropy": 0.3431771328672767, "epoch": 0.30602329425301955, "grad_norm": 0.6491170525550842, "learning_rate": 1.9999756029104778e-05, "loss": 0.3463, "mean_token_accuracy": 0.896998206526041, "num_tokens": 131621036.0, "step": 19880 }, { "entropy": 0.3247729953378439, "epoch": 0.3061772295116981, "grad_norm": 0.9475200772285461, "learning_rate": 1.999974335742938e-05, "loss": 0.3277, "mean_token_accuracy": 0.9024634249508381, "num_tokens": 131693775.0, "step": 19890 }, { "entropy": 0.31089992839843034, "epoch": 0.3063311647703767, "grad_norm": 0.755856454372406, "learning_rate": 1.999973036495763e-05, "loss": 0.3169, "mean_token_accuracy": 0.9064569145441055, "num_tokens": 131759754.0, "step": 19900 }, { "entropy": 0.33354764264076947, "epoch": 0.30648510002905527, "grad_norm": 0.6360670924186707, "learning_rate": 1.9999717051689942e-05, "loss": 0.3288, "mean_token_accuracy": 0.8971344292163849, "num_tokens": 131832535.0, "step": 19910 }, { "entropy": 0.3399680665694177, "epoch": 0.30663903528773384, "grad_norm": 0.7720867395401001, "learning_rate": 1.9999703417626755e-05, "loss": 0.3311, "mean_token_accuracy": 0.8991780556738377, "num_tokens": 131900013.0, "step": 19920 }, { "entropy": 0.3314864174462855, "epoch": 0.30679297054641247, "grad_norm": 0.7482878565788269, "learning_rate": 1.999968946276849e-05, "loss": 0.3347, "mean_token_accuracy": 0.898952991515398, "num_tokens": 131968736.0, "step": 19930 }, { "entropy": 0.3047964217141271, "epoch": 0.30694690580509104, "grad_norm": 0.9101071357727051, "learning_rate": 1.9999675187115608e-05, "loss": 0.3218, "mean_token_accuracy": 0.9065073527395725, "num_tokens": 132034398.0, "step": 19940 }, { "entropy": 0.32592560881748794, "epoch": 0.3071008410637696, "grad_norm": 0.9789097309112549, "learning_rate": 1.9999660590668558e-05, "loss": 0.3264, "mean_token_accuracy": 0.9030951067805291, "num_tokens": 132098626.0, "step": 19950 }, { "entropy": 0.333867771923542, "epoch": 0.3072547763224482, "grad_norm": 0.7680266499519348, "learning_rate": 1.9999645673427813e-05, "loss": 0.3319, "mean_token_accuracy": 0.8997580409049988, "num_tokens": 132162242.0, "step": 19960 }, { "entropy": 0.3272513972595334, "epoch": 0.30740871158112676, "grad_norm": 0.9250402450561523, "learning_rate": 1.999963043539385e-05, "loss": 0.3287, "mean_token_accuracy": 0.9013773269951344, "num_tokens": 132228186.0, "step": 19970 }, { "entropy": 0.3442480109632015, "epoch": 0.30756264683980533, "grad_norm": 0.6405095458030701, "learning_rate": 1.9999614876567156e-05, "loss": 0.3409, "mean_token_accuracy": 0.8979146733880043, "num_tokens": 132291512.0, "step": 19980 }, { "entropy": 0.30606405083090066, "epoch": 0.3077165820984839, "grad_norm": 0.6245688199996948, "learning_rate": 1.9999598996948237e-05, "loss": 0.3172, "mean_token_accuracy": 0.9079359665513038, "num_tokens": 132357274.0, "step": 19990 }, { "entropy": 0.3158779742196202, "epoch": 0.30787051735716253, "grad_norm": 1.2739683389663696, "learning_rate": 1.9999582796537595e-05, "loss": 0.323, "mean_token_accuracy": 0.905526278167963, "num_tokens": 132422362.0, "step": 20000 }, { "epoch": 0.30787051735716253, "eval_entropy": 0.32280719651395695, "eval_loss": 0.32545405626296997, "eval_mean_token_accuracy": 0.901307802606294, "eval_num_tokens": 132422362.0, "eval_runtime": 7697.9583, "eval_samples_per_second": 4.22, "eval_steps_per_second": 4.22, "step": 20000 }, { "entropy": 0.3347072254866362, "epoch": 0.3080244526158411, "grad_norm": 0.9041303396224976, "learning_rate": 1.999956627533575e-05, "loss": 0.3313, "mean_token_accuracy": 0.8964965790510178, "num_tokens": 61981.0, "step": 20010 }, { "entropy": 0.33035993967205285, "epoch": 0.3081783878745197, "grad_norm": 0.861560046672821, "learning_rate": 1.999954943334324e-05, "loss": 0.3358, "mean_token_accuracy": 0.9019302926957607, "num_tokens": 123944.0, "step": 20020 }, { "entropy": 0.3224920451641083, "epoch": 0.30833232313319825, "grad_norm": 0.7752121686935425, "learning_rate": 1.9999532270560597e-05, "loss": 0.3206, "mean_token_accuracy": 0.9009259589016437, "num_tokens": 191778.0, "step": 20030 }, { "entropy": 0.3307066596113145, "epoch": 0.3084862583918768, "grad_norm": 0.7593549489974976, "learning_rate": 1.9999514786988373e-05, "loss": 0.3426, "mean_token_accuracy": 0.8986778497695923, "num_tokens": 257413.0, "step": 20040 }, { "entropy": 0.34766760300844907, "epoch": 0.3086401936505554, "grad_norm": 0.8223702311515808, "learning_rate": 1.999949698262713e-05, "loss": 0.336, "mean_token_accuracy": 0.8963188670575619, "num_tokens": 312534.0, "step": 20050 }, { "entropy": 0.3203911466524005, "epoch": 0.308794128909234, "grad_norm": 0.6495424509048462, "learning_rate": 1.999947885747744e-05, "loss": 0.3202, "mean_token_accuracy": 0.9030540652573109, "num_tokens": 380427.0, "step": 20060 }, { "entropy": 0.33497158009558914, "epoch": 0.3089480641679126, "grad_norm": 1.1022565364837646, "learning_rate": 1.9999460411539886e-05, "loss": 0.3285, "mean_token_accuracy": 0.8967165946960449, "num_tokens": 444234.0, "step": 20070 }, { "entropy": 0.3407123415730894, "epoch": 0.30910199942659117, "grad_norm": 0.7305319905281067, "learning_rate": 1.9999441644815055e-05, "loss": 0.3317, "mean_token_accuracy": 0.899373161047697, "num_tokens": 512375.0, "step": 20080 }, { "entropy": 0.34480364676564934, "epoch": 0.30925593468526974, "grad_norm": 0.8187503814697266, "learning_rate": 1.9999422557303553e-05, "loss": 0.3528, "mean_token_accuracy": 0.897357776761055, "num_tokens": 575623.0, "step": 20090 }, { "entropy": 0.3211612654849887, "epoch": 0.3094098699439483, "grad_norm": 0.789129376411438, "learning_rate": 1.9999403149005993e-05, "loss": 0.3175, "mean_token_accuracy": 0.9017923474311829, "num_tokens": 641519.0, "step": 20100 }, { "entropy": 0.33596133552491664, "epoch": 0.3095638052026269, "grad_norm": 0.7894786596298218, "learning_rate": 1.9999383419922994e-05, "loss": 0.341, "mean_token_accuracy": 0.8999257504940033, "num_tokens": 711419.0, "step": 20110 }, { "entropy": 0.320742604508996, "epoch": 0.3097177404613055, "grad_norm": 0.8032031655311584, "learning_rate": 1.999936337005519e-05, "loss": 0.3254, "mean_token_accuracy": 0.9051031261682511, "num_tokens": 782170.0, "step": 20120 }, { "entropy": 0.3204994902946055, "epoch": 0.3098716757199841, "grad_norm": 0.8819548487663269, "learning_rate": 1.9999342999403222e-05, "loss": 0.3273, "mean_token_accuracy": 0.902387448400259, "num_tokens": 840706.0, "step": 20130 }, { "entropy": 0.31915026465430857, "epoch": 0.31002561097866266, "grad_norm": 0.9971718788146973, "learning_rate": 1.999932230796775e-05, "loss": 0.3266, "mean_token_accuracy": 0.9034750774502754, "num_tokens": 906330.0, "step": 20140 }, { "entropy": 0.3391741840168834, "epoch": 0.31017954623734123, "grad_norm": 0.7820737957954407, "learning_rate": 1.9999301295749434e-05, "loss": 0.329, "mean_token_accuracy": 0.8995667144656181, "num_tokens": 973186.0, "step": 20150 }, { "entropy": 0.33998251985758543, "epoch": 0.3103334814960198, "grad_norm": 0.6700939536094666, "learning_rate": 1.9999279962748947e-05, "loss": 0.3465, "mean_token_accuracy": 0.8964553765952588, "num_tokens": 1038043.0, "step": 20160 }, { "entropy": 0.35810003904625776, "epoch": 0.3104874167546984, "grad_norm": 0.9016314744949341, "learning_rate": 1.9999258308966975e-05, "loss": 0.3511, "mean_token_accuracy": 0.890233151614666, "num_tokens": 1103324.0, "step": 20170 }, { "entropy": 0.33071319088339807, "epoch": 0.31064135201337695, "grad_norm": 0.7779943346977234, "learning_rate": 1.999923633440421e-05, "loss": 0.329, "mean_token_accuracy": 0.9020752087235451, "num_tokens": 1167501.0, "step": 20180 }, { "entropy": 0.3247944932430983, "epoch": 0.3107952872720556, "grad_norm": 0.9149175882339478, "learning_rate": 1.9999214039061363e-05, "loss": 0.3245, "mean_token_accuracy": 0.8990068592131137, "num_tokens": 1229576.0, "step": 20190 }, { "entropy": 0.3338072143495083, "epoch": 0.31094922253073415, "grad_norm": 0.7549015283584595, "learning_rate": 1.9999191422939145e-05, "loss": 0.3406, "mean_token_accuracy": 0.8995651409029961, "num_tokens": 1295858.0, "step": 20200 }, { "entropy": 0.34247630164027215, "epoch": 0.3111031577894127, "grad_norm": 0.8262788653373718, "learning_rate": 1.999916848603828e-05, "loss": 0.3352, "mean_token_accuracy": 0.8983215972781181, "num_tokens": 1362501.0, "step": 20210 }, { "entropy": 0.3004088549874723, "epoch": 0.3112570930480913, "grad_norm": 0.8069485425949097, "learning_rate": 1.9999145228359506e-05, "loss": 0.3148, "mean_token_accuracy": 0.9085034124553204, "num_tokens": 1425711.0, "step": 20220 }, { "entropy": 0.3248146019876003, "epoch": 0.31141102830676987, "grad_norm": 0.8618872761726379, "learning_rate": 1.9999121649903566e-05, "loss": 0.3472, "mean_token_accuracy": 0.9007827617228031, "num_tokens": 1481310.0, "step": 20230 }, { "entropy": 0.32999210711568594, "epoch": 0.31156496356544844, "grad_norm": 0.9175731539726257, "learning_rate": 1.9999097750671225e-05, "loss": 0.3234, "mean_token_accuracy": 0.9020946636795998, "num_tokens": 1544688.0, "step": 20240 }, { "entropy": 0.33640504349023104, "epoch": 0.31171889882412707, "grad_norm": 0.7880797982215881, "learning_rate": 1.999907353066324e-05, "loss": 0.3454, "mean_token_accuracy": 0.8982562065124512, "num_tokens": 1609054.0, "step": 20250 }, { "entropy": 0.3447146167978644, "epoch": 0.31187283408280564, "grad_norm": 1.3436784744262695, "learning_rate": 1.9999048989880393e-05, "loss": 0.3367, "mean_token_accuracy": 0.8986137859523297, "num_tokens": 1668263.0, "step": 20260 }, { "entropy": 0.32000910881906747, "epoch": 0.3120267693414842, "grad_norm": 0.8852439522743225, "learning_rate": 1.999902412832347e-05, "loss": 0.3217, "mean_token_accuracy": 0.9036146491765976, "num_tokens": 1729262.0, "step": 20270 }, { "entropy": 0.3233157303184271, "epoch": 0.3121807046001628, "grad_norm": 0.6732173562049866, "learning_rate": 1.9998998945993267e-05, "loss": 0.3244, "mean_token_accuracy": 0.9001628942787647, "num_tokens": 1801599.0, "step": 20280 }, { "entropy": 0.32776245921850206, "epoch": 0.31233463985884136, "grad_norm": 0.747530996799469, "learning_rate": 1.99989734428906e-05, "loss": 0.3249, "mean_token_accuracy": 0.9025560513138771, "num_tokens": 1867694.0, "step": 20290 }, { "entropy": 0.3012982369400561, "epoch": 0.31248857511751993, "grad_norm": 0.6400830149650574, "learning_rate": 1.9998947619016277e-05, "loss": 0.3194, "mean_token_accuracy": 0.9068549551069737, "num_tokens": 1939803.0, "step": 20300 }, { "entropy": 0.3169936267659068, "epoch": 0.31264251037619856, "grad_norm": 0.9237083792686462, "learning_rate": 1.999892147437113e-05, "loss": 0.3157, "mean_token_accuracy": 0.9035529345273972, "num_tokens": 2004478.0, "step": 20310 }, { "entropy": 0.3275701323524117, "epoch": 0.31279644563487713, "grad_norm": 0.80589359998703, "learning_rate": 1.9998895008955993e-05, "loss": 0.3202, "mean_token_accuracy": 0.9011306025087833, "num_tokens": 2071716.0, "step": 20320 }, { "entropy": 0.32274338472634556, "epoch": 0.3129503808935557, "grad_norm": 1.006787896156311, "learning_rate": 1.9998868222771727e-05, "loss": 0.327, "mean_token_accuracy": 0.901336070150137, "num_tokens": 2135138.0, "step": 20330 }, { "entropy": 0.3137738867662847, "epoch": 0.3131043161522343, "grad_norm": 0.8040913939476013, "learning_rate": 1.9998841115819183e-05, "loss": 0.3149, "mean_token_accuracy": 0.9054370261728764, "num_tokens": 2198481.0, "step": 20340 }, { "entropy": 0.3429859759286046, "epoch": 0.31325825141091285, "grad_norm": 0.6540361046791077, "learning_rate": 1.999881368809923e-05, "loss": 0.3571, "mean_token_accuracy": 0.8941518656909466, "num_tokens": 2266774.0, "step": 20350 }, { "entropy": 0.3325257131829858, "epoch": 0.3134121866695914, "grad_norm": 0.7895792126655579, "learning_rate": 1.9998785939612752e-05, "loss": 0.3255, "mean_token_accuracy": 0.9018323630094528, "num_tokens": 2324487.0, "step": 20360 }, { "entropy": 0.3127421744167805, "epoch": 0.31356612192827, "grad_norm": 0.609680712223053, "learning_rate": 1.9998757870360636e-05, "loss": 0.3235, "mean_token_accuracy": 0.902238404750824, "num_tokens": 2388325.0, "step": 20370 }, { "entropy": 0.3157782141119242, "epoch": 0.3137200571869486, "grad_norm": 0.67366623878479, "learning_rate": 1.999872948034378e-05, "loss": 0.3211, "mean_token_accuracy": 0.9036032505333423, "num_tokens": 2463665.0, "step": 20380 }, { "entropy": 0.33369248965755105, "epoch": 0.3138739924456272, "grad_norm": 0.739258348941803, "learning_rate": 1.9998700769563103e-05, "loss": 0.333, "mean_token_accuracy": 0.900524215400219, "num_tokens": 2533236.0, "step": 20390 }, { "entropy": 0.3104946153238416, "epoch": 0.31402792770430576, "grad_norm": 0.8736655116081238, "learning_rate": 1.999867173801952e-05, "loss": 0.3141, "mean_token_accuracy": 0.9045924440026283, "num_tokens": 2600479.0, "step": 20400 }, { "entropy": 0.3174760231748223, "epoch": 0.31418186296298434, "grad_norm": 1.0883560180664062, "learning_rate": 1.9998642385713963e-05, "loss": 0.3203, "mean_token_accuracy": 0.9057169444859028, "num_tokens": 2659348.0, "step": 20410 }, { "entropy": 0.3144790745340288, "epoch": 0.3143357982216629, "grad_norm": 0.6872767210006714, "learning_rate": 1.9998612712647374e-05, "loss": 0.3033, "mean_token_accuracy": 0.9052769258618355, "num_tokens": 2728425.0, "step": 20420 }, { "entropy": 0.32876908788457515, "epoch": 0.3144897334803415, "grad_norm": 0.7846230268478394, "learning_rate": 1.9998582718820703e-05, "loss": 0.3255, "mean_token_accuracy": 0.9021555997431279, "num_tokens": 2793830.0, "step": 20430 }, { "entropy": 0.3222017565742135, "epoch": 0.3146436687390201, "grad_norm": 0.7308081984519958, "learning_rate": 1.999855240423492e-05, "loss": 0.3278, "mean_token_accuracy": 0.9023187100887299, "num_tokens": 2855272.0, "step": 20440 }, { "entropy": 0.34397133979946376, "epoch": 0.3147976039976987, "grad_norm": 0.9795470237731934, "learning_rate": 1.999852176889099e-05, "loss": 0.3388, "mean_token_accuracy": 0.8978821352124214, "num_tokens": 2924358.0, "step": 20450 }, { "entropy": 0.30738527411594985, "epoch": 0.31495153925637726, "grad_norm": 0.8881495594978333, "learning_rate": 1.9998490812789897e-05, "loss": 0.3222, "mean_token_accuracy": 0.9041716478765011, "num_tokens": 2994636.0, "step": 20460 }, { "entropy": 0.3319209169596434, "epoch": 0.31510547451505583, "grad_norm": 0.7563307881355286, "learning_rate": 1.9998459535932633e-05, "loss": 0.3318, "mean_token_accuracy": 0.8990750476717949, "num_tokens": 3057848.0, "step": 20470 }, { "entropy": 0.32492388105019926, "epoch": 0.3152594097737344, "grad_norm": 0.9577974081039429, "learning_rate": 1.9998427938320207e-05, "loss": 0.3213, "mean_token_accuracy": 0.9054618440568447, "num_tokens": 3121402.0, "step": 20480 }, { "entropy": 0.3363164685666561, "epoch": 0.315413345032413, "grad_norm": 0.9014992117881775, "learning_rate": 1.9998396019953627e-05, "loss": 0.3539, "mean_token_accuracy": 0.8966502644121647, "num_tokens": 3181786.0, "step": 20490 }, { "entropy": 0.34364599268883467, "epoch": 0.3155672802910916, "grad_norm": 0.6742159128189087, "learning_rate": 1.9998363780833914e-05, "loss": 0.3458, "mean_token_accuracy": 0.8965003348886966, "num_tokens": 3257587.0, "step": 20500 }, { "entropy": 0.3298993154428899, "epoch": 0.3157212155497702, "grad_norm": 1.1375958919525146, "learning_rate": 1.9998331220962113e-05, "loss": 0.331, "mean_token_accuracy": 0.9015438072383404, "num_tokens": 3321754.0, "step": 20510 }, { "entropy": 0.32871368853375316, "epoch": 0.31587515080844875, "grad_norm": 0.7640058994293213, "learning_rate": 1.999829834033926e-05, "loss": 0.3288, "mean_token_accuracy": 0.8993797950446606, "num_tokens": 3392021.0, "step": 20520 }, { "entropy": 0.3240631720051169, "epoch": 0.3160290860671273, "grad_norm": 0.9028982520103455, "learning_rate": 1.9998265138966416e-05, "loss": 0.3287, "mean_token_accuracy": 0.8999566994607449, "num_tokens": 3462567.0, "step": 20530 }, { "entropy": 0.3325187089852989, "epoch": 0.3161830213258059, "grad_norm": 0.7886708974838257, "learning_rate": 1.9998231616844638e-05, "loss": 0.3263, "mean_token_accuracy": 0.9003114968538284, "num_tokens": 3522944.0, "step": 20540 }, { "entropy": 0.3249139989726245, "epoch": 0.31633695658448446, "grad_norm": 0.8448202013969421, "learning_rate": 1.9998197773975007e-05, "loss": 0.3301, "mean_token_accuracy": 0.899862652271986, "num_tokens": 3584529.0, "step": 20550 }, { "entropy": 0.3163489722646773, "epoch": 0.31649089184316304, "grad_norm": 0.941638708114624, "learning_rate": 1.999816361035861e-05, "loss": 0.3349, "mean_token_accuracy": 0.903981164842844, "num_tokens": 3654674.0, "step": 20560 }, { "entropy": 0.31588115487247703, "epoch": 0.31664482710184166, "grad_norm": 0.746479332447052, "learning_rate": 1.999812912599654e-05, "loss": 0.3101, "mean_token_accuracy": 0.9066561579704284, "num_tokens": 3727516.0, "step": 20570 }, { "entropy": 0.32253835890442134, "epoch": 0.31679876236052024, "grad_norm": 0.7831437587738037, "learning_rate": 1.9998094320889902e-05, "loss": 0.3245, "mean_token_accuracy": 0.9013739421963691, "num_tokens": 3791736.0, "step": 20580 }, { "entropy": 0.31667505074292424, "epoch": 0.3169526976191988, "grad_norm": 0.8480985760688782, "learning_rate": 1.9998059195039818e-05, "loss": 0.3154, "mean_token_accuracy": 0.9054758682847023, "num_tokens": 3859616.0, "step": 20590 }, { "entropy": 0.3032220259308815, "epoch": 0.3171066328778774, "grad_norm": 0.7087958455085754, "learning_rate": 1.9998023748447406e-05, "loss": 0.317, "mean_token_accuracy": 0.9047880738973617, "num_tokens": 3933624.0, "step": 20600 }, { "entropy": 0.32701658494770525, "epoch": 0.31726056813655595, "grad_norm": 0.6910621523857117, "learning_rate": 1.9997987981113813e-05, "loss": 0.3246, "mean_token_accuracy": 0.9022379249334336, "num_tokens": 3994091.0, "step": 20610 }, { "entropy": 0.332864548265934, "epoch": 0.3174145033952345, "grad_norm": 0.7758082747459412, "learning_rate": 1.999795189304018e-05, "loss": 0.3254, "mean_token_accuracy": 0.9003124877810478, "num_tokens": 4055924.0, "step": 20620 }, { "entropy": 0.31469618398696186, "epoch": 0.31756843865391315, "grad_norm": 0.8365737199783325, "learning_rate": 1.9997915484227667e-05, "loss": 0.3321, "mean_token_accuracy": 0.9037733726203442, "num_tokens": 4119858.0, "step": 20630 }, { "entropy": 0.33256882168352603, "epoch": 0.3177223739125917, "grad_norm": 0.8803041577339172, "learning_rate": 1.999787875467744e-05, "loss": 0.3301, "mean_token_accuracy": 0.8997885681688785, "num_tokens": 4192130.0, "step": 20640 }, { "entropy": 0.30810202080756427, "epoch": 0.3178763091712703, "grad_norm": 0.6903203129768372, "learning_rate": 1.999784170439068e-05, "loss": 0.3075, "mean_token_accuracy": 0.9081846453249455, "num_tokens": 4269065.0, "step": 20650 }, { "entropy": 0.3107946041040123, "epoch": 0.31803024442994887, "grad_norm": 0.7259735465049744, "learning_rate": 1.9997804333368574e-05, "loss": 0.3199, "mean_token_accuracy": 0.9032630771398544, "num_tokens": 4340177.0, "step": 20660 }, { "entropy": 0.33907619630917907, "epoch": 0.31818417968862744, "grad_norm": 0.7867041230201721, "learning_rate": 1.9997766641612323e-05, "loss": 0.3278, "mean_token_accuracy": 0.8978477239608764, "num_tokens": 4408541.0, "step": 20670 }, { "entropy": 0.3122843835502863, "epoch": 0.318338114947306, "grad_norm": 0.8661094903945923, "learning_rate": 1.9997728629123134e-05, "loss": 0.2951, "mean_token_accuracy": 0.904891487956047, "num_tokens": 4476391.0, "step": 20680 }, { "entropy": 0.30846708854660393, "epoch": 0.31849205020598464, "grad_norm": 0.7461251020431519, "learning_rate": 1.9997690295902225e-05, "loss": 0.3196, "mean_token_accuracy": 0.9058919161558151, "num_tokens": 4540572.0, "step": 20690 }, { "entropy": 0.3161243820562959, "epoch": 0.3186459854646632, "grad_norm": 0.6394002437591553, "learning_rate": 1.9997651641950827e-05, "loss": 0.3274, "mean_token_accuracy": 0.9026125624775887, "num_tokens": 4599916.0, "step": 20700 }, { "entropy": 0.30663278894498947, "epoch": 0.3187999207233418, "grad_norm": 0.6339073181152344, "learning_rate": 1.9997612667270183e-05, "loss": 0.3059, "mean_token_accuracy": 0.9078592106699943, "num_tokens": 4668583.0, "step": 20710 }, { "entropy": 0.32488867491483686, "epoch": 0.31895385598202036, "grad_norm": 0.9084647297859192, "learning_rate": 1.9997573371861544e-05, "loss": 0.3383, "mean_token_accuracy": 0.8994366310536861, "num_tokens": 4741855.0, "step": 20720 }, { "entropy": 0.31782431453466414, "epoch": 0.31910779124069893, "grad_norm": 0.7065509557723999, "learning_rate": 1.9997533755726163e-05, "loss": 0.3175, "mean_token_accuracy": 0.9028032198548317, "num_tokens": 4811425.0, "step": 20730 }, { "entropy": 0.32952187610790135, "epoch": 0.3192617264993775, "grad_norm": 0.8253627419471741, "learning_rate": 1.9997493818865318e-05, "loss": 0.3371, "mean_token_accuracy": 0.8995845876634121, "num_tokens": 4871849.0, "step": 20740 }, { "entropy": 0.33169056158512833, "epoch": 0.3194156617580561, "grad_norm": 0.831104576587677, "learning_rate": 1.9997453561280288e-05, "loss": 0.3354, "mean_token_accuracy": 0.9010242328047753, "num_tokens": 4940556.0, "step": 20750 }, { "entropy": 0.3060676698572934, "epoch": 0.3195695970167347, "grad_norm": 1.1648696660995483, "learning_rate": 1.9997412982972363e-05, "loss": 0.3099, "mean_token_accuracy": 0.909056943655014, "num_tokens": 5000637.0, "step": 20760 }, { "entropy": 0.3214529614895582, "epoch": 0.3197235322754133, "grad_norm": 0.729767382144928, "learning_rate": 1.9997372083942845e-05, "loss": 0.3307, "mean_token_accuracy": 0.9013886846601963, "num_tokens": 5065874.0, "step": 20770 }, { "entropy": 0.32025805208832026, "epoch": 0.31987746753409185, "grad_norm": 0.8289809823036194, "learning_rate": 1.999733086419305e-05, "loss": 0.3171, "mean_token_accuracy": 0.9061845920979976, "num_tokens": 5134467.0, "step": 20780 }, { "entropy": 0.3174214235506952, "epoch": 0.3200314027927704, "grad_norm": 0.7311874628067017, "learning_rate": 1.9997289323724296e-05, "loss": 0.3262, "mean_token_accuracy": 0.9029562018811703, "num_tokens": 5201657.0, "step": 20790 }, { "entropy": 0.3214405408129096, "epoch": 0.320185338051449, "grad_norm": 0.928863525390625, "learning_rate": 1.9997247462537917e-05, "loss": 0.3178, "mean_token_accuracy": 0.9022215642035007, "num_tokens": 5267943.0, "step": 20800 }, { "entropy": 0.3281028586439788, "epoch": 0.32033927331012757, "grad_norm": 0.6420918107032776, "learning_rate": 1.9997205280635255e-05, "loss": 0.3222, "mean_token_accuracy": 0.900465478003025, "num_tokens": 5333933.0, "step": 20810 }, { "entropy": 0.352724084071815, "epoch": 0.3204932085688062, "grad_norm": 0.8455303907394409, "learning_rate": 1.9997162778017667e-05, "loss": 0.3586, "mean_token_accuracy": 0.8926255613565445, "num_tokens": 5402671.0, "step": 20820 }, { "entropy": 0.3109988898038864, "epoch": 0.32064714382748477, "grad_norm": 0.8385092616081238, "learning_rate": 1.9997119954686512e-05, "loss": 0.3188, "mean_token_accuracy": 0.9059574082493782, "num_tokens": 5470920.0, "step": 20830 }, { "entropy": 0.3395197711884975, "epoch": 0.32080107908616334, "grad_norm": 0.7554437518119812, "learning_rate": 1.9997076810643165e-05, "loss": 0.3334, "mean_token_accuracy": 0.8989646673202515, "num_tokens": 5539566.0, "step": 20840 }, { "entropy": 0.31853863801807164, "epoch": 0.3209550143448419, "grad_norm": 0.7634948492050171, "learning_rate": 1.9997033345889013e-05, "loss": 0.3234, "mean_token_accuracy": 0.9009221814572811, "num_tokens": 5604915.0, "step": 20850 }, { "entropy": 0.319579553604126, "epoch": 0.3211089496035205, "grad_norm": 0.7234779596328735, "learning_rate": 1.9996989560425446e-05, "loss": 0.3193, "mean_token_accuracy": 0.9038145035505295, "num_tokens": 5671723.0, "step": 20860 }, { "entropy": 0.2926713588647544, "epoch": 0.32126288486219906, "grad_norm": 0.823910653591156, "learning_rate": 1.999694545425387e-05, "loss": 0.2996, "mean_token_accuracy": 0.9103976584970951, "num_tokens": 5741796.0, "step": 20870 }, { "entropy": 0.29142162268981336, "epoch": 0.3214168201208777, "grad_norm": 1.0342694520950317, "learning_rate": 1.9996901027375703e-05, "loss": 0.3009, "mean_token_accuracy": 0.9098099328577518, "num_tokens": 5800457.0, "step": 20880 }, { "entropy": 0.33148889113217594, "epoch": 0.32157075537955626, "grad_norm": 0.7009119391441345, "learning_rate": 1.9996856279792368e-05, "loss": 0.3369, "mean_token_accuracy": 0.8993267767131329, "num_tokens": 5867964.0, "step": 20890 }, { "entropy": 0.32836490850895644, "epoch": 0.32172469063823483, "grad_norm": 0.7377252578735352, "learning_rate": 1.9996811211505297e-05, "loss": 0.3198, "mean_token_accuracy": 0.9007248856127262, "num_tokens": 5933572.0, "step": 20900 }, { "entropy": 0.32377056861296294, "epoch": 0.3218786258969134, "grad_norm": 0.8450233340263367, "learning_rate": 1.9996765822515943e-05, "loss": 0.3282, "mean_token_accuracy": 0.9024140805006027, "num_tokens": 5999933.0, "step": 20910 }, { "entropy": 0.29779250295832754, "epoch": 0.322032561155592, "grad_norm": 0.8110250234603882, "learning_rate": 1.9996720112825753e-05, "loss": 0.3148, "mean_token_accuracy": 0.9084112711250782, "num_tokens": 6071350.0, "step": 20920 }, { "entropy": 0.3042646306566894, "epoch": 0.32218649641427055, "grad_norm": 1.0449506044387817, "learning_rate": 1.9996674082436204e-05, "loss": 0.3172, "mean_token_accuracy": 0.9076802462339402, "num_tokens": 6132264.0, "step": 20930 }, { "entropy": 0.3407701740041375, "epoch": 0.3223404316729491, "grad_norm": 0.8841515779495239, "learning_rate": 1.9996627731348767e-05, "loss": 0.3285, "mean_token_accuracy": 0.897602079808712, "num_tokens": 6195549.0, "step": 20940 }, { "entropy": 0.34370814068242905, "epoch": 0.32249436693162775, "grad_norm": 0.7497528791427612, "learning_rate": 1.9996581059564925e-05, "loss": 0.3418, "mean_token_accuracy": 0.8966041721403599, "num_tokens": 6264315.0, "step": 20950 }, { "entropy": 0.31409940114244816, "epoch": 0.3226483021903063, "grad_norm": 0.6464594602584839, "learning_rate": 1.9996534067086186e-05, "loss": 0.3137, "mean_token_accuracy": 0.9049668565392495, "num_tokens": 6334291.0, "step": 20960 }, { "entropy": 0.3022282734513283, "epoch": 0.3228022374489849, "grad_norm": 0.7223424911499023, "learning_rate": 1.9996486753914044e-05, "loss": 0.3143, "mean_token_accuracy": 0.9062580078840256, "num_tokens": 6399983.0, "step": 20970 }, { "entropy": 0.311343730147928, "epoch": 0.32295617270766347, "grad_norm": 0.8762120008468628, "learning_rate": 1.9996439120050028e-05, "loss": 0.3125, "mean_token_accuracy": 0.9052100658416748, "num_tokens": 6462224.0, "step": 20980 }, { "entropy": 0.31933387331664564, "epoch": 0.32311010796634204, "grad_norm": 0.7594583630561829, "learning_rate": 1.999639116549566e-05, "loss": 0.3105, "mean_token_accuracy": 0.9046965971589088, "num_tokens": 6530189.0, "step": 20990 }, { "entropy": 0.3177080057561398, "epoch": 0.3232640432250206, "grad_norm": 1.1540496349334717, "learning_rate": 1.9996342890252483e-05, "loss": 0.3195, "mean_token_accuracy": 0.9025858141481876, "num_tokens": 6598467.0, "step": 21000 }, { "entropy": 0.31153180040419104, "epoch": 0.32341797848369924, "grad_norm": 0.8481530547142029, "learning_rate": 1.9996294294322042e-05, "loss": 0.3135, "mean_token_accuracy": 0.9066160455346107, "num_tokens": 6665216.0, "step": 21010 }, { "entropy": 0.3219974584877491, "epoch": 0.3235719137423778, "grad_norm": 0.911251962184906, "learning_rate": 1.9996245377705896e-05, "loss": 0.3287, "mean_token_accuracy": 0.9021592698991299, "num_tokens": 6727770.0, "step": 21020 }, { "entropy": 0.32309828139841557, "epoch": 0.3237258490010564, "grad_norm": 0.6803611516952515, "learning_rate": 1.9996196140405617e-05, "loss": 0.3294, "mean_token_accuracy": 0.90355968400836, "num_tokens": 6799021.0, "step": 21030 }, { "entropy": 0.3176530947908759, "epoch": 0.32387978425973496, "grad_norm": 0.6160640120506287, "learning_rate": 1.9996146582422784e-05, "loss": 0.3101, "mean_token_accuracy": 0.9047819837927819, "num_tokens": 6863656.0, "step": 21040 }, { "entropy": 0.30901142992079256, "epoch": 0.32403371951841353, "grad_norm": 0.7597231268882751, "learning_rate": 1.9996096703758983e-05, "loss": 0.309, "mean_token_accuracy": 0.9071716494858265, "num_tokens": 6930951.0, "step": 21050 }, { "entropy": 0.3094387024641037, "epoch": 0.3241876547770921, "grad_norm": 1.0766651630401611, "learning_rate": 1.9996046504415817e-05, "loss": 0.3165, "mean_token_accuracy": 0.9044065602123738, "num_tokens": 6995756.0, "step": 21060 }, { "entropy": 0.31660088188946245, "epoch": 0.32434159003577073, "grad_norm": 0.8407514691352844, "learning_rate": 1.9995995984394897e-05, "loss": 0.3391, "mean_token_accuracy": 0.9041324004530906, "num_tokens": 7062587.0, "step": 21070 }, { "entropy": 0.34154240917414425, "epoch": 0.3244955252944493, "grad_norm": 0.7545040845870972, "learning_rate": 1.9995945143697844e-05, "loss": 0.3392, "mean_token_accuracy": 0.8985571384429931, "num_tokens": 7137975.0, "step": 21080 }, { "entropy": 0.31895367372781036, "epoch": 0.3246494605531279, "grad_norm": 0.9967809319496155, "learning_rate": 1.9995893982326288e-05, "loss": 0.3177, "mean_token_accuracy": 0.9048464901745319, "num_tokens": 7200940.0, "step": 21090 }, { "entropy": 0.3141972843557596, "epoch": 0.32480339581180645, "grad_norm": 0.6984860897064209, "learning_rate": 1.9995842500281868e-05, "loss": 0.3239, "mean_token_accuracy": 0.9029401890933514, "num_tokens": 7271239.0, "step": 21100 }, { "entropy": 0.3282560114748776, "epoch": 0.324957331070485, "grad_norm": 0.746810793876648, "learning_rate": 1.999579069756624e-05, "loss": 0.3283, "mean_token_accuracy": 0.9016815856099129, "num_tokens": 7337044.0, "step": 21110 }, { "entropy": 0.31553209153935313, "epoch": 0.3251112663291636, "grad_norm": 0.9140670895576477, "learning_rate": 1.9995738574181063e-05, "loss": 0.3123, "mean_token_accuracy": 0.9036860302090645, "num_tokens": 7395087.0, "step": 21120 }, { "entropy": 0.3224813649430871, "epoch": 0.32526520158784217, "grad_norm": 0.856269121170044, "learning_rate": 1.9995686130128012e-05, "loss": 0.336, "mean_token_accuracy": 0.8999235920608044, "num_tokens": 7462858.0, "step": 21130 }, { "entropy": 0.30161890145391224, "epoch": 0.3254191368465208, "grad_norm": 0.6990885734558105, "learning_rate": 1.9995633365408766e-05, "loss": 0.3072, "mean_token_accuracy": 0.9112702630460262, "num_tokens": 7520993.0, "step": 21140 }, { "entropy": 0.3180363643914461, "epoch": 0.32557307210519937, "grad_norm": 0.8482049703598022, "learning_rate": 1.9995580280025017e-05, "loss": 0.3185, "mean_token_accuracy": 0.9024524226784706, "num_tokens": 7584048.0, "step": 21150 }, { "entropy": 0.3478139342740178, "epoch": 0.32572700736387794, "grad_norm": 0.7555426955223083, "learning_rate": 1.999552687397847e-05, "loss": 0.3372, "mean_token_accuracy": 0.8958433054387569, "num_tokens": 7651513.0, "step": 21160 }, { "entropy": 0.3330830929800868, "epoch": 0.3258809426225565, "grad_norm": 0.7335700392723083, "learning_rate": 1.999547314727084e-05, "loss": 0.3411, "mean_token_accuracy": 0.9015285648405552, "num_tokens": 7713404.0, "step": 21170 }, { "entropy": 0.3250585845671594, "epoch": 0.3260348778812351, "grad_norm": 0.7276309728622437, "learning_rate": 1.999541909990385e-05, "loss": 0.3295, "mean_token_accuracy": 0.8997125126421451, "num_tokens": 7770657.0, "step": 21180 }, { "entropy": 0.30642259689047935, "epoch": 0.32618881313991366, "grad_norm": 0.6842045187950134, "learning_rate": 1.999536473187923e-05, "loss": 0.3197, "mean_token_accuracy": 0.9061708681285381, "num_tokens": 7837182.0, "step": 21190 }, { "entropy": 0.30940080918371676, "epoch": 0.3263427483985923, "grad_norm": 1.0917699337005615, "learning_rate": 1.999531004319873e-05, "loss": 0.3024, "mean_token_accuracy": 0.9039850294589996, "num_tokens": 7906378.0, "step": 21200 }, { "entropy": 0.33123040813952687, "epoch": 0.32649668365727086, "grad_norm": 0.8052946329116821, "learning_rate": 1.9995255033864098e-05, "loss": 0.3347, "mean_token_accuracy": 0.8977292656898499, "num_tokens": 7971708.0, "step": 21210 }, { "entropy": 0.3394948675297201, "epoch": 0.32665061891594943, "grad_norm": 0.7219736576080322, "learning_rate": 1.9995199703877107e-05, "loss": 0.3333, "mean_token_accuracy": 0.9002779044210911, "num_tokens": 8033756.0, "step": 21220 }, { "entropy": 0.3224007304757833, "epoch": 0.326804554174628, "grad_norm": 0.9352275729179382, "learning_rate": 1.9995144053239522e-05, "loss": 0.3283, "mean_token_accuracy": 0.9037214510142804, "num_tokens": 8090046.0, "step": 21230 }, { "entropy": 0.32164302468299866, "epoch": 0.3269584894333066, "grad_norm": 0.8037646412849426, "learning_rate": 1.9995088081953136e-05, "loss": 0.3249, "mean_token_accuracy": 0.9018212720751763, "num_tokens": 8149798.0, "step": 21240 }, { "entropy": 0.33140066675841806, "epoch": 0.32711242469198515, "grad_norm": 0.7759402990341187, "learning_rate": 1.999503179001974e-05, "loss": 0.3238, "mean_token_accuracy": 0.897957157343626, "num_tokens": 8218590.0, "step": 21250 }, { "entropy": 0.32710194075480103, "epoch": 0.3272663599506638, "grad_norm": 0.7521681189537048, "learning_rate": 1.9994975177441143e-05, "loss": 0.3249, "mean_token_accuracy": 0.9016332909464836, "num_tokens": 8287975.0, "step": 21260 }, { "entropy": 0.33995426520705224, "epoch": 0.32742029520934235, "grad_norm": 0.8409105539321899, "learning_rate": 1.999491824421916e-05, "loss": 0.3515, "mean_token_accuracy": 0.8946811221539974, "num_tokens": 8355167.0, "step": 21270 }, { "entropy": 0.33882763525471093, "epoch": 0.3275742304680209, "grad_norm": 0.7188692688941956, "learning_rate": 1.999486099035562e-05, "loss": 0.3353, "mean_token_accuracy": 0.8974065437912941, "num_tokens": 8423161.0, "step": 21280 }, { "entropy": 0.3154967346228659, "epoch": 0.3277281657266995, "grad_norm": 0.6856040954589844, "learning_rate": 1.9994803415852352e-05, "loss": 0.3103, "mean_token_accuracy": 0.9058394074440003, "num_tokens": 8487590.0, "step": 21290 }, { "entropy": 0.30501783853396774, "epoch": 0.32788210098537807, "grad_norm": 0.6318585872650146, "learning_rate": 1.999474552071121e-05, "loss": 0.3106, "mean_token_accuracy": 0.9051105931401253, "num_tokens": 8552279.0, "step": 21300 }, { "entropy": 0.33164909267798065, "epoch": 0.32803603624405664, "grad_norm": 0.7497948408126831, "learning_rate": 1.999468730493405e-05, "loss": 0.3391, "mean_token_accuracy": 0.9012236542999744, "num_tokens": 8614428.0, "step": 21310 }, { "entropy": 0.32239878447726367, "epoch": 0.3281899715027352, "grad_norm": 0.6721944212913513, "learning_rate": 1.999462876852274e-05, "loss": 0.3278, "mean_token_accuracy": 0.9023381367325782, "num_tokens": 8681715.0, "step": 21320 }, { "entropy": 0.35052482783794403, "epoch": 0.32834390676141384, "grad_norm": 0.7066013813018799, "learning_rate": 1.999456991147916e-05, "loss": 0.3471, "mean_token_accuracy": 0.8913077406585217, "num_tokens": 8757491.0, "step": 21330 }, { "entropy": 0.29230973720550535, "epoch": 0.3284978420200924, "grad_norm": 0.8446633815765381, "learning_rate": 1.9994510733805188e-05, "loss": 0.2857, "mean_token_accuracy": 0.913440765440464, "num_tokens": 8822598.0, "step": 21340 }, { "entropy": 0.3188203953206539, "epoch": 0.328651777278771, "grad_norm": 0.7055615186691284, "learning_rate": 1.9994451235502734e-05, "loss": 0.3267, "mean_token_accuracy": 0.9019789583981037, "num_tokens": 8891700.0, "step": 21350 }, { "entropy": 0.32349367225542663, "epoch": 0.32880571253744956, "grad_norm": 0.8611619472503662, "learning_rate": 1.99943914165737e-05, "loss": 0.3245, "mean_token_accuracy": 0.9027596637606621, "num_tokens": 8957551.0, "step": 21360 }, { "entropy": 0.30869204523041843, "epoch": 0.32895964779612813, "grad_norm": 0.7465277910232544, "learning_rate": 1.999433127702001e-05, "loss": 0.3162, "mean_token_accuracy": 0.9073692634701729, "num_tokens": 9025281.0, "step": 21370 }, { "entropy": 0.3234313581138849, "epoch": 0.3291135830548067, "grad_norm": 0.7493135333061218, "learning_rate": 1.999427081684359e-05, "loss": 0.3363, "mean_token_accuracy": 0.9002398960292339, "num_tokens": 9088770.0, "step": 21380 }, { "entropy": 0.32174520697444675, "epoch": 0.32926751831348533, "grad_norm": 0.8062043786048889, "learning_rate": 1.9994210036046375e-05, "loss": 0.3093, "mean_token_accuracy": 0.9016591988503933, "num_tokens": 9155823.0, "step": 21390 }, { "entropy": 0.3257349546067417, "epoch": 0.3294214535721639, "grad_norm": 0.7243297696113586, "learning_rate": 1.9994148934630327e-05, "loss": 0.3386, "mean_token_accuracy": 0.9020783565938473, "num_tokens": 9218947.0, "step": 21400 }, { "entropy": 0.3360528042539954, "epoch": 0.3295753888308425, "grad_norm": 0.808285117149353, "learning_rate": 1.9994087512597394e-05, "loss": 0.3253, "mean_token_accuracy": 0.8973312243819237, "num_tokens": 9282288.0, "step": 21410 }, { "entropy": 0.3306349810212851, "epoch": 0.32972932408952105, "grad_norm": 0.8035721778869629, "learning_rate": 1.9994025769949555e-05, "loss": 0.3151, "mean_token_accuracy": 0.9001225627958774, "num_tokens": 9354671.0, "step": 21420 }, { "entropy": 0.328083177190274, "epoch": 0.3298832593481996, "grad_norm": 0.6182601451873779, "learning_rate": 1.9993963706688787e-05, "loss": 0.3334, "mean_token_accuracy": 0.8993057273328304, "num_tokens": 9417029.0, "step": 21430 }, { "entropy": 0.303246658295393, "epoch": 0.3300371946068782, "grad_norm": 0.7689828872680664, "learning_rate": 1.999390132281708e-05, "loss": 0.3012, "mean_token_accuracy": 0.9091710582375526, "num_tokens": 9479814.0, "step": 21440 }, { "entropy": 0.31079767299816013, "epoch": 0.3301911298655568, "grad_norm": 1.1726057529449463, "learning_rate": 1.9993838618336442e-05, "loss": 0.3287, "mean_token_accuracy": 0.9043589577078819, "num_tokens": 9543365.0, "step": 21450 }, { "entropy": 0.3227193206548691, "epoch": 0.3303450651242354, "grad_norm": 0.6824550628662109, "learning_rate": 1.9993775593248876e-05, "loss": 0.3298, "mean_token_accuracy": 0.9005361780524254, "num_tokens": 9603288.0, "step": 21460 }, { "entropy": 0.3107419306412339, "epoch": 0.33049900038291397, "grad_norm": 0.7983534336090088, "learning_rate": 1.99937122475564e-05, "loss": 0.3114, "mean_token_accuracy": 0.9069050207734108, "num_tokens": 9671332.0, "step": 21470 }, { "entropy": 0.31183935245499017, "epoch": 0.33065293564159254, "grad_norm": 0.659069836139679, "learning_rate": 1.9993648581261062e-05, "loss": 0.3074, "mean_token_accuracy": 0.9046528205275536, "num_tokens": 9740110.0, "step": 21480 }, { "entropy": 0.31509490674361584, "epoch": 0.3308068709002711, "grad_norm": 0.6838434934616089, "learning_rate": 1.9993584594364893e-05, "loss": 0.306, "mean_token_accuracy": 0.9047120742499828, "num_tokens": 9809976.0, "step": 21490 }, { "entropy": 0.31824709055945277, "epoch": 0.3309608061589497, "grad_norm": 0.704776406288147, "learning_rate": 1.999352028686995e-05, "loss": 0.3264, "mean_token_accuracy": 0.9037995375692844, "num_tokens": 9884714.0, "step": 21500 }, { "entropy": 0.32250343887135385, "epoch": 0.33111474141762826, "grad_norm": 0.61302250623703, "learning_rate": 1.9993455658778294e-05, "loss": 0.338, "mean_token_accuracy": 0.9048451967537403, "num_tokens": 9945513.0, "step": 21510 }, { "entropy": 0.3406271729618311, "epoch": 0.3312686766763069, "grad_norm": 0.733169436454773, "learning_rate": 1.9993390710092e-05, "loss": 0.3384, "mean_token_accuracy": 0.8986567407846451, "num_tokens": 10008504.0, "step": 21520 }, { "entropy": 0.3253984970971942, "epoch": 0.33142261193498546, "grad_norm": 0.6051371693611145, "learning_rate": 1.9993325440813146e-05, "loss": 0.3084, "mean_token_accuracy": 0.9024057656526565, "num_tokens": 10078130.0, "step": 21530 }, { "entropy": 0.3132121905684471, "epoch": 0.33157654719366403, "grad_norm": 0.7878724336624146, "learning_rate": 1.9993259850943832e-05, "loss": 0.3153, "mean_token_accuracy": 0.9044586546719074, "num_tokens": 10148090.0, "step": 21540 }, { "entropy": 0.3165295778773725, "epoch": 0.3317304824523426, "grad_norm": 0.7947086691856384, "learning_rate": 1.9993193940486165e-05, "loss": 0.3121, "mean_token_accuracy": 0.9056487508118153, "num_tokens": 10209094.0, "step": 21550 }, { "entropy": 0.3379816751927137, "epoch": 0.3318844177110212, "grad_norm": 0.8789849877357483, "learning_rate": 1.999312770944225e-05, "loss": 0.346, "mean_token_accuracy": 0.8975513108074665, "num_tokens": 10272187.0, "step": 21560 }, { "entropy": 0.3215874877758324, "epoch": 0.33203835296969975, "grad_norm": 0.7094950079917908, "learning_rate": 1.999306115781422e-05, "loss": 0.319, "mean_token_accuracy": 0.9012355767190456, "num_tokens": 10347675.0, "step": 21570 }, { "entropy": 0.34019933762028814, "epoch": 0.3321922882283784, "grad_norm": 0.7621740698814392, "learning_rate": 1.9992994285604202e-05, "loss": 0.3356, "mean_token_accuracy": 0.8974838092923164, "num_tokens": 10415223.0, "step": 21580 }, { "entropy": 0.3281352447345853, "epoch": 0.33234622348705695, "grad_norm": 0.7586294412612915, "learning_rate": 1.999292709281435e-05, "loss": 0.3258, "mean_token_accuracy": 0.8996278829872608, "num_tokens": 10479927.0, "step": 21590 }, { "entropy": 0.3125830490142107, "epoch": 0.3325001587457355, "grad_norm": 0.9568009376525879, "learning_rate": 1.9992859579446814e-05, "loss": 0.3151, "mean_token_accuracy": 0.9045048221945763, "num_tokens": 10541906.0, "step": 21600 }, { "entropy": 0.3207790692336857, "epoch": 0.3326540940044141, "grad_norm": 0.6377637982368469, "learning_rate": 1.999279174550376e-05, "loss": 0.3259, "mean_token_accuracy": 0.9018832735717297, "num_tokens": 10617515.0, "step": 21610 }, { "entropy": 0.32296547628939154, "epoch": 0.33280802926309266, "grad_norm": 0.9560837149620056, "learning_rate": 1.9992723590987368e-05, "loss": 0.3203, "mean_token_accuracy": 0.903771112114191, "num_tokens": 10676473.0, "step": 21620 }, { "entropy": 0.3020092289894819, "epoch": 0.33296196452177124, "grad_norm": 0.6576690077781677, "learning_rate": 1.999265511589982e-05, "loss": 0.3064, "mean_token_accuracy": 0.9081644400954246, "num_tokens": 10737690.0, "step": 21630 }, { "entropy": 0.3195934203453362, "epoch": 0.33311589978044986, "grad_norm": 0.7608587741851807, "learning_rate": 1.9992586320243317e-05, "loss": 0.3293, "mean_token_accuracy": 0.9021513588726521, "num_tokens": 10799069.0, "step": 21640 }, { "entropy": 0.3242793150246143, "epoch": 0.33326983503912844, "grad_norm": 0.6674180626869202, "learning_rate": 1.999251720402006e-05, "loss": 0.3136, "mean_token_accuracy": 0.9015105709433555, "num_tokens": 10862740.0, "step": 21650 }, { "entropy": 0.3054505571722984, "epoch": 0.333423770297807, "grad_norm": 0.7060570120811462, "learning_rate": 1.9992447767232273e-05, "loss": 0.3108, "mean_token_accuracy": 0.9077334120869637, "num_tokens": 10924409.0, "step": 21660 }, { "entropy": 0.3183489296585321, "epoch": 0.3335777055564856, "grad_norm": 1.0429401397705078, "learning_rate": 1.999237800988218e-05, "loss": 0.3107, "mean_token_accuracy": 0.9022411316633224, "num_tokens": 10992536.0, "step": 21670 }, { "entropy": 0.31695804903283714, "epoch": 0.33373164081516415, "grad_norm": 0.784873366355896, "learning_rate": 1.9992307931972016e-05, "loss": 0.3128, "mean_token_accuracy": 0.9043129928410053, "num_tokens": 11051236.0, "step": 21680 }, { "entropy": 0.32992726024240254, "epoch": 0.3338855760738427, "grad_norm": 0.6390624642372131, "learning_rate": 1.9992237533504037e-05, "loss": 0.3316, "mean_token_accuracy": 0.9011027745902538, "num_tokens": 11123179.0, "step": 21690 }, { "entropy": 0.30556295020505786, "epoch": 0.3340395113325213, "grad_norm": 0.6751687526702881, "learning_rate": 1.9992166814480497e-05, "loss": 0.3021, "mean_token_accuracy": 0.9041781470179557, "num_tokens": 11190264.0, "step": 21700 }, { "entropy": 0.3064158839173615, "epoch": 0.3341934465911999, "grad_norm": 0.6729692816734314, "learning_rate": 1.999209577490366e-05, "loss": 0.3161, "mean_token_accuracy": 0.9066353090107441, "num_tokens": 11262787.0, "step": 21710 }, { "entropy": 0.3278814608231187, "epoch": 0.3343473818498785, "grad_norm": 1.1114226579666138, "learning_rate": 1.9992024414775812e-05, "loss": 0.3412, "mean_token_accuracy": 0.9015132546424866, "num_tokens": 11324129.0, "step": 21720 }, { "entropy": 0.3108110249042511, "epoch": 0.3345013171085571, "grad_norm": 0.6909723877906799, "learning_rate": 1.999195273409924e-05, "loss": 0.3137, "mean_token_accuracy": 0.9059933885931969, "num_tokens": 11394628.0, "step": 21730 }, { "entropy": 0.3251840639859438, "epoch": 0.33465525236723564, "grad_norm": 0.7626580595970154, "learning_rate": 1.9991880732876246e-05, "loss": 0.3288, "mean_token_accuracy": 0.9016415148973465, "num_tokens": 11461177.0, "step": 21740 }, { "entropy": 0.30339964721351864, "epoch": 0.3348091876259142, "grad_norm": 0.7362270355224609, "learning_rate": 1.9991808411109134e-05, "loss": 0.3038, "mean_token_accuracy": 0.9068808928132057, "num_tokens": 11523942.0, "step": 21750 }, { "entropy": 0.3374151198193431, "epoch": 0.3349631228845928, "grad_norm": 0.7838295698165894, "learning_rate": 1.9991735768800228e-05, "loss": 0.3208, "mean_token_accuracy": 0.8988882884383201, "num_tokens": 11580680.0, "step": 21760 }, { "entropy": 0.3165876277722418, "epoch": 0.3351170581432714, "grad_norm": 0.6188244819641113, "learning_rate": 1.9991662805951856e-05, "loss": 0.319, "mean_token_accuracy": 0.9043731026351451, "num_tokens": 11650434.0, "step": 21770 }, { "entropy": 0.309801593516022, "epoch": 0.33527099340195, "grad_norm": 0.6685105562210083, "learning_rate": 1.9991589522566365e-05, "loss": 0.3092, "mean_token_accuracy": 0.9039145022630691, "num_tokens": 11717495.0, "step": 21780 }, { "entropy": 0.31629672832787037, "epoch": 0.33542492866062856, "grad_norm": 0.89154452085495, "learning_rate": 1.9991515918646098e-05, "loss": 0.3237, "mean_token_accuracy": 0.9045199036598206, "num_tokens": 11774946.0, "step": 21790 }, { "entropy": 0.31827989369630816, "epoch": 0.33557886391930714, "grad_norm": 0.7613545060157776, "learning_rate": 1.9991441994193418e-05, "loss": 0.3136, "mean_token_accuracy": 0.9031230092048645, "num_tokens": 11851851.0, "step": 21800 }, { "entropy": 0.3203876039944589, "epoch": 0.3357327991779857, "grad_norm": 0.9879562258720398, "learning_rate": 1.99913677492107e-05, "loss": 0.325, "mean_token_accuracy": 0.9014826528728008, "num_tokens": 11917309.0, "step": 21810 }, { "entropy": 0.29528325200080874, "epoch": 0.3358867344366643, "grad_norm": 0.7557036876678467, "learning_rate": 1.999129318370032e-05, "loss": 0.2954, "mean_token_accuracy": 0.9096343077719211, "num_tokens": 11984276.0, "step": 21820 }, { "entropy": 0.3127232295460999, "epoch": 0.3360406696953429, "grad_norm": 0.7958164811134338, "learning_rate": 1.9991218297664683e-05, "loss": 0.3069, "mean_token_accuracy": 0.9026836387813091, "num_tokens": 12054125.0, "step": 21830 }, { "entropy": 0.33447002042084933, "epoch": 0.3361946049540215, "grad_norm": 0.9219587445259094, "learning_rate": 1.9991143091106176e-05, "loss": 0.3306, "mean_token_accuracy": 0.8992726728320122, "num_tokens": 12119639.0, "step": 21840 }, { "entropy": 0.35323177706450226, "epoch": 0.33634854021270005, "grad_norm": 0.8104297518730164, "learning_rate": 1.9991067564027217e-05, "loss": 0.3431, "mean_token_accuracy": 0.8942392379045486, "num_tokens": 12192565.0, "step": 21850 }, { "entropy": 0.31885669827461244, "epoch": 0.3365024754713786, "grad_norm": 0.8399248123168945, "learning_rate": 1.9990991716430236e-05, "loss": 0.3157, "mean_token_accuracy": 0.9037266805768013, "num_tokens": 12254330.0, "step": 21860 }, { "entropy": 0.33622518312186, "epoch": 0.3366564107300572, "grad_norm": 0.546898603439331, "learning_rate": 1.9990915548317654e-05, "loss": 0.34, "mean_token_accuracy": 0.897280140966177, "num_tokens": 12329458.0, "step": 21870 }, { "entropy": 0.3142722500488162, "epoch": 0.33681034598873577, "grad_norm": 0.8413144946098328, "learning_rate": 1.9990839059691925e-05, "loss": 0.3179, "mean_token_accuracy": 0.906386086344719, "num_tokens": 12396794.0, "step": 21880 }, { "entropy": 0.3412132033146918, "epoch": 0.33696428124741434, "grad_norm": 0.9005770087242126, "learning_rate": 1.9990762250555495e-05, "loss": 0.3383, "mean_token_accuracy": 0.8967149086296559, "num_tokens": 12465113.0, "step": 21890 }, { "entropy": 0.34331868030130863, "epoch": 0.33711821650609297, "grad_norm": 0.7029370665550232, "learning_rate": 1.9990685120910837e-05, "loss": 0.3464, "mean_token_accuracy": 0.8946383066475392, "num_tokens": 12532939.0, "step": 21900 }, { "entropy": 0.3195636718533933, "epoch": 0.33727215176477154, "grad_norm": 0.713618814945221, "learning_rate": 1.9990607670760414e-05, "loss": 0.3314, "mean_token_accuracy": 0.9044018067419529, "num_tokens": 12605289.0, "step": 21910 }, { "entropy": 0.3366281546652317, "epoch": 0.3374260870234501, "grad_norm": 0.6985756754875183, "learning_rate": 1.999052990010672e-05, "loss": 0.3332, "mean_token_accuracy": 0.8992457464337349, "num_tokens": 12673881.0, "step": 21920 }, { "entropy": 0.2987397267483175, "epoch": 0.3375800222821287, "grad_norm": 0.9188125133514404, "learning_rate": 1.9990451808952244e-05, "loss": 0.3027, "mean_token_accuracy": 0.9082032337784767, "num_tokens": 12737369.0, "step": 21930 }, { "entropy": 0.32008592449128626, "epoch": 0.33773395754080726, "grad_norm": 0.5896173715591431, "learning_rate": 1.9990373397299496e-05, "loss": 0.3279, "mean_token_accuracy": 0.9001857459545135, "num_tokens": 12808685.0, "step": 21940 }, { "entropy": 0.3180290624499321, "epoch": 0.33788789279948583, "grad_norm": 0.7768407464027405, "learning_rate": 1.999029466515099e-05, "loss": 0.315, "mean_token_accuracy": 0.9060790747404098, "num_tokens": 12872139.0, "step": 21950 }, { "entropy": 0.34493645206093787, "epoch": 0.33804182805816446, "grad_norm": 0.646183431148529, "learning_rate": 1.9990215612509248e-05, "loss": 0.3479, "mean_token_accuracy": 0.8916840516030788, "num_tokens": 12932971.0, "step": 21960 }, { "entropy": 0.3373729952611029, "epoch": 0.33819576331684303, "grad_norm": 0.6914187669754028, "learning_rate": 1.9990136239376812e-05, "loss": 0.3365, "mean_token_accuracy": 0.9009270399808884, "num_tokens": 12995531.0, "step": 21970 }, { "entropy": 0.3384730763733387, "epoch": 0.3383496985755216, "grad_norm": 0.7085709571838379, "learning_rate": 1.9990056545756223e-05, "loss": 0.3439, "mean_token_accuracy": 0.8963684886693954, "num_tokens": 13063447.0, "step": 21980 }, { "entropy": 0.3124067196622491, "epoch": 0.3385036338342002, "grad_norm": 0.8621397018432617, "learning_rate": 1.9989976531650042e-05, "loss": 0.3199, "mean_token_accuracy": 0.9048227399587632, "num_tokens": 13130108.0, "step": 21990 }, { "entropy": 0.3261044792830944, "epoch": 0.33865756909287875, "grad_norm": 0.7906909584999084, "learning_rate": 1.998989619706083e-05, "loss": 0.3288, "mean_token_accuracy": 0.902996439486742, "num_tokens": 13187197.0, "step": 22000 }, { "entropy": 0.34708120990544555, "epoch": 0.3388115043515573, "grad_norm": 0.7705202102661133, "learning_rate": 1.9989815541991173e-05, "loss": 0.3496, "mean_token_accuracy": 0.8955181159079075, "num_tokens": 13249712.0, "step": 22010 }, { "entropy": 0.3123684834688902, "epoch": 0.33896543961023595, "grad_norm": 0.6270869970321655, "learning_rate": 1.998973456644365e-05, "loss": 0.309, "mean_token_accuracy": 0.9047353066504001, "num_tokens": 13322360.0, "step": 22020 }, { "entropy": 0.3159182289615273, "epoch": 0.3391193748689145, "grad_norm": 0.8217530250549316, "learning_rate": 1.9989653270420866e-05, "loss": 0.3204, "mean_token_accuracy": 0.9044232361018658, "num_tokens": 13393499.0, "step": 22030 }, { "entropy": 0.3061977018602192, "epoch": 0.3392733101275931, "grad_norm": 0.5073696970939636, "learning_rate": 1.998957165392542e-05, "loss": 0.3148, "mean_token_accuracy": 0.9070772796869278, "num_tokens": 13463218.0, "step": 22040 }, { "entropy": 0.3111883225850761, "epoch": 0.33942724538627167, "grad_norm": 0.7976201176643372, "learning_rate": 1.998948971695994e-05, "loss": 0.3088, "mean_token_accuracy": 0.9040212728083133, "num_tokens": 13527911.0, "step": 22050 }, { "entropy": 0.3095220452174544, "epoch": 0.33958118064495024, "grad_norm": 0.7730515599250793, "learning_rate": 1.9989407459527047e-05, "loss": 0.3247, "mean_token_accuracy": 0.9030368447303772, "num_tokens": 13593603.0, "step": 22060 }, { "entropy": 0.3205151869915426, "epoch": 0.3397351159036288, "grad_norm": 0.702601969242096, "learning_rate": 1.9989324881629385e-05, "loss": 0.3135, "mean_token_accuracy": 0.9017509624361992, "num_tokens": 13660263.0, "step": 22070 }, { "entropy": 0.329964061640203, "epoch": 0.3398890511623074, "grad_norm": 0.9036518335342407, "learning_rate": 1.99892419832696e-05, "loss": 0.3275, "mean_token_accuracy": 0.8997100323438645, "num_tokens": 13727061.0, "step": 22080 }, { "entropy": 0.31612818483263255, "epoch": 0.340042986420986, "grad_norm": 0.6427741050720215, "learning_rate": 1.9989158764450352e-05, "loss": 0.3113, "mean_token_accuracy": 0.9059128940105439, "num_tokens": 13794107.0, "step": 22090 }, { "entropy": 0.29632913153618573, "epoch": 0.3401969216796646, "grad_norm": 1.0279381275177002, "learning_rate": 1.9989075225174313e-05, "loss": 0.304, "mean_token_accuracy": 0.9079443588852882, "num_tokens": 13854264.0, "step": 22100 }, { "entropy": 0.2951574523001909, "epoch": 0.34035085693834316, "grad_norm": 0.7588937878608704, "learning_rate": 1.998899136544416e-05, "loss": 0.3116, "mean_token_accuracy": 0.9110758244991303, "num_tokens": 13925475.0, "step": 22110 }, { "entropy": 0.32508263550698757, "epoch": 0.34050479219702173, "grad_norm": 0.8268002271652222, "learning_rate": 1.9988907185262586e-05, "loss": 0.3128, "mean_token_accuracy": 0.900681371986866, "num_tokens": 13988266.0, "step": 22120 }, { "entropy": 0.31452884217724203, "epoch": 0.3406587274557003, "grad_norm": 0.8393418192863464, "learning_rate": 1.9988822684632287e-05, "loss": 0.3012, "mean_token_accuracy": 0.9047173008322715, "num_tokens": 14060303.0, "step": 22130 }, { "entropy": 0.31476883459836247, "epoch": 0.3408126627143789, "grad_norm": 0.7006365656852722, "learning_rate": 1.9988737863555977e-05, "loss": 0.3222, "mean_token_accuracy": 0.9031876914203167, "num_tokens": 14128217.0, "step": 22140 }, { "entropy": 0.3188710899092257, "epoch": 0.3409665979730575, "grad_norm": 0.7680262923240662, "learning_rate": 1.998865272203638e-05, "loss": 0.3192, "mean_token_accuracy": 0.9037799194455147, "num_tokens": 14199293.0, "step": 22150 }, { "entropy": 0.31918554436415436, "epoch": 0.3411205332317361, "grad_norm": 0.6579177379608154, "learning_rate": 1.9988567260076222e-05, "loss": 0.3237, "mean_token_accuracy": 0.9029463410377503, "num_tokens": 14263201.0, "step": 22160 }, { "entropy": 0.31800243612378837, "epoch": 0.34127446849041465, "grad_norm": 1.0138856172561646, "learning_rate": 1.9988481477678245e-05, "loss": 0.3312, "mean_token_accuracy": 0.9048355303704738, "num_tokens": 14328703.0, "step": 22170 }, { "entropy": 0.3250843619927764, "epoch": 0.3414284037490932, "grad_norm": 0.7845010161399841, "learning_rate": 1.9988395374845204e-05, "loss": 0.3265, "mean_token_accuracy": 0.9008662335574626, "num_tokens": 14397591.0, "step": 22180 }, { "entropy": 0.35432516857981683, "epoch": 0.3415823390077718, "grad_norm": 0.8876001238822937, "learning_rate": 1.998830895157986e-05, "loss": 0.35, "mean_token_accuracy": 0.8954906769096851, "num_tokens": 14466469.0, "step": 22190 }, { "entropy": 0.3220886575058103, "epoch": 0.34173627426645037, "grad_norm": 0.7191734910011292, "learning_rate": 1.998822220788499e-05, "loss": 0.3255, "mean_token_accuracy": 0.9040166981518268, "num_tokens": 14538929.0, "step": 22200 }, { "entropy": 0.2994925453327596, "epoch": 0.341890209525129, "grad_norm": 0.6772475242614746, "learning_rate": 1.9988135143763367e-05, "loss": 0.3183, "mean_token_accuracy": 0.9069646663963795, "num_tokens": 14604890.0, "step": 22210 }, { "entropy": 0.3110925691202283, "epoch": 0.34204414478380757, "grad_norm": 0.7542855739593506, "learning_rate": 1.9988047759217793e-05, "loss": 0.311, "mean_token_accuracy": 0.9054268084466457, "num_tokens": 14669588.0, "step": 22220 }, { "entropy": 0.2867104595527053, "epoch": 0.34219808004248614, "grad_norm": 0.6546667218208313, "learning_rate": 1.9987960054251064e-05, "loss": 0.3071, "mean_token_accuracy": 0.9109865978360177, "num_tokens": 14736902.0, "step": 22230 }, { "entropy": 0.3347309987060726, "epoch": 0.3423520153011647, "grad_norm": 0.8572447299957275, "learning_rate": 1.9987872028866e-05, "loss": 0.3328, "mean_token_accuracy": 0.8981968022882938, "num_tokens": 14803936.0, "step": 22240 }, { "entropy": 0.3028885948471725, "epoch": 0.3425059505598433, "grad_norm": 0.6640207767486572, "learning_rate": 1.9987783683065423e-05, "loss": 0.3048, "mean_token_accuracy": 0.9094952575862407, "num_tokens": 14878360.0, "step": 22250 }, { "entropy": 0.31735448390245435, "epoch": 0.34265988581852186, "grad_norm": 0.8989962935447693, "learning_rate": 1.9987695016852166e-05, "loss": 0.3304, "mean_token_accuracy": 0.9002150543034076, "num_tokens": 14945657.0, "step": 22260 }, { "entropy": 0.3294907037168741, "epoch": 0.34281382107720043, "grad_norm": 0.8086941242218018, "learning_rate": 1.9987606030229072e-05, "loss": 0.3314, "mean_token_accuracy": 0.8999433234333992, "num_tokens": 15011548.0, "step": 22270 }, { "entropy": 0.3183254225179553, "epoch": 0.34296775633587906, "grad_norm": 0.8444575071334839, "learning_rate": 1.9987516723199e-05, "loss": 0.3337, "mean_token_accuracy": 0.9029113970696926, "num_tokens": 15078659.0, "step": 22280 }, { "entropy": 0.3290675056166947, "epoch": 0.34312169159455763, "grad_norm": 0.8848020434379578, "learning_rate": 1.9987427095764808e-05, "loss": 0.3341, "mean_token_accuracy": 0.8991132467985153, "num_tokens": 15148215.0, "step": 22290 }, { "entropy": 0.315796450804919, "epoch": 0.3432756268532362, "grad_norm": 0.6608877182006836, "learning_rate": 1.998733714792938e-05, "loss": 0.3136, "mean_token_accuracy": 0.9032759137451649, "num_tokens": 15207289.0, "step": 22300 }, { "entropy": 0.31530854031443595, "epoch": 0.3434295621119148, "grad_norm": 0.7847200632095337, "learning_rate": 1.9987246879695598e-05, "loss": 0.3201, "mean_token_accuracy": 0.9032632678747177, "num_tokens": 15281242.0, "step": 22310 }, { "entropy": 0.3153892303816974, "epoch": 0.34358349737059335, "grad_norm": 0.9310738444328308, "learning_rate": 1.9987156291066355e-05, "loss": 0.3138, "mean_token_accuracy": 0.9069709911942482, "num_tokens": 15339731.0, "step": 22320 }, { "entropy": 0.3057523346506059, "epoch": 0.3437374326292719, "grad_norm": 0.8661115169525146, "learning_rate": 1.9987065382044563e-05, "loss": 0.3093, "mean_token_accuracy": 0.9062799632549285, "num_tokens": 15404501.0, "step": 22330 }, { "entropy": 0.325658545922488, "epoch": 0.34389136788795055, "grad_norm": 0.8674892783164978, "learning_rate": 1.998697415263313e-05, "loss": 0.3343, "mean_token_accuracy": 0.899762288480997, "num_tokens": 15465737.0, "step": 22340 }, { "entropy": 0.32471953677013515, "epoch": 0.3440453031466291, "grad_norm": 0.7135553956031799, "learning_rate": 1.998688260283499e-05, "loss": 0.3343, "mean_token_accuracy": 0.9037616357207299, "num_tokens": 15527049.0, "step": 22350 }, { "entropy": 0.3264730704948306, "epoch": 0.3441992384053077, "grad_norm": 0.7882671356201172, "learning_rate": 1.998679073265308e-05, "loss": 0.3284, "mean_token_accuracy": 0.9032473534345626, "num_tokens": 15591798.0, "step": 22360 }, { "entropy": 0.31193241914734243, "epoch": 0.34435317366398627, "grad_norm": 0.6930598616600037, "learning_rate": 1.998669854209034e-05, "loss": 0.3263, "mean_token_accuracy": 0.9060379445552826, "num_tokens": 15663302.0, "step": 22370 }, { "entropy": 0.34344130866229533, "epoch": 0.34450710892266484, "grad_norm": 0.7934554815292358, "learning_rate": 1.9986606031149734e-05, "loss": 0.3386, "mean_token_accuracy": 0.8948141172528267, "num_tokens": 15731217.0, "step": 22380 }, { "entropy": 0.30862381514161824, "epoch": 0.3446610441813434, "grad_norm": 1.0752453804016113, "learning_rate": 1.9986513199834227e-05, "loss": 0.3173, "mean_token_accuracy": 0.9064489148557187, "num_tokens": 15804387.0, "step": 22390 }, { "entropy": 0.32254926264286043, "epoch": 0.34481497944002204, "grad_norm": 0.7805715799331665, "learning_rate": 1.9986420048146803e-05, "loss": 0.335, "mean_token_accuracy": 0.9011410534381866, "num_tokens": 15873606.0, "step": 22400 }, { "entropy": 0.3063559292815626, "epoch": 0.3449689146987006, "grad_norm": 0.7780539393424988, "learning_rate": 1.998632657609044e-05, "loss": 0.3014, "mean_token_accuracy": 0.9065180785953999, "num_tokens": 15938263.0, "step": 22410 }, { "entropy": 0.27088852543383835, "epoch": 0.3451228499573792, "grad_norm": 0.6054472327232361, "learning_rate": 1.9986232783668143e-05, "loss": 0.2777, "mean_token_accuracy": 0.9168240658938884, "num_tokens": 15999109.0, "step": 22420 }, { "entropy": 0.32296714540570975, "epoch": 0.34527678521605776, "grad_norm": 0.8777714967727661, "learning_rate": 1.998613867088292e-05, "loss": 0.3258, "mean_token_accuracy": 0.9002107121050358, "num_tokens": 16066835.0, "step": 22430 }, { "entropy": 0.3228213648311794, "epoch": 0.34543072047473633, "grad_norm": 0.7729988098144531, "learning_rate": 1.9986044237737793e-05, "loss": 0.3303, "mean_token_accuracy": 0.8996409751474858, "num_tokens": 16124391.0, "step": 22440 }, { "entropy": 0.32858061166480185, "epoch": 0.3455846557334149, "grad_norm": 0.7189005017280579, "learning_rate": 1.998594948423579e-05, "loss": 0.3387, "mean_token_accuracy": 0.9023611709475517, "num_tokens": 16195774.0, "step": 22450 }, { "entropy": 0.32138034570962193, "epoch": 0.3457385909920935, "grad_norm": 1.0231417417526245, "learning_rate": 1.998585441037994e-05, "loss": 0.3195, "mean_token_accuracy": 0.9029594130814076, "num_tokens": 16251341.0, "step": 22460 }, { "entropy": 0.32227826341986654, "epoch": 0.3458925262507721, "grad_norm": 0.804178774356842, "learning_rate": 1.998575901617331e-05, "loss": 0.3253, "mean_token_accuracy": 0.9028976731002331, "num_tokens": 16313367.0, "step": 22470 }, { "entropy": 0.3092894903384149, "epoch": 0.3460464615094507, "grad_norm": 0.5136247277259827, "learning_rate": 1.9985663301618952e-05, "loss": 0.3071, "mean_token_accuracy": 0.906329832226038, "num_tokens": 16379042.0, "step": 22480 }, { "entropy": 0.29923151088878513, "epoch": 0.34620039676812925, "grad_norm": 0.9127590656280518, "learning_rate": 1.9985567266719935e-05, "loss": 0.3026, "mean_token_accuracy": 0.9090830102562905, "num_tokens": 16450265.0, "step": 22490 }, { "entropy": 0.3424237518571317, "epoch": 0.3463543320268078, "grad_norm": 0.8019019961357117, "learning_rate": 1.9985470911479346e-05, "loss": 0.3495, "mean_token_accuracy": 0.8955218717455864, "num_tokens": 16515648.0, "step": 22500 }, { "entropy": 0.312747785076499, "epoch": 0.3465082672854864, "grad_norm": 1.2019537687301636, "learning_rate": 1.9985374235900267e-05, "loss": 0.3156, "mean_token_accuracy": 0.9055553480982781, "num_tokens": 16574229.0, "step": 22510 }, { "entropy": 0.335863172262907, "epoch": 0.34666220254416497, "grad_norm": 0.8594834804534912, "learning_rate": 1.998527723998581e-05, "loss": 0.3309, "mean_token_accuracy": 0.8963372059166431, "num_tokens": 16632362.0, "step": 22520 }, { "entropy": 0.3334421357139945, "epoch": 0.3468161378028436, "grad_norm": 1.2539266347885132, "learning_rate": 1.9985179923739078e-05, "loss": 0.3353, "mean_token_accuracy": 0.9005231514573098, "num_tokens": 16696110.0, "step": 22530 }, { "entropy": 0.3226277997717261, "epoch": 0.34697007306152217, "grad_norm": 0.9457434415817261, "learning_rate": 1.9985082287163197e-05, "loss": 0.3264, "mean_token_accuracy": 0.9034535877406598, "num_tokens": 16755950.0, "step": 22540 }, { "entropy": 0.31078321384266017, "epoch": 0.34712400832020074, "grad_norm": 0.7772952318191528, "learning_rate": 1.9984984330261296e-05, "loss": 0.3121, "mean_token_accuracy": 0.9047380536794662, "num_tokens": 16824728.0, "step": 22550 }, { "entropy": 0.3458792475052178, "epoch": 0.3472779435788793, "grad_norm": 0.7328395843505859, "learning_rate": 1.998488605303652e-05, "loss": 0.3406, "mean_token_accuracy": 0.8959846891462803, "num_tokens": 16895570.0, "step": 22560 }, { "entropy": 0.3310255757533014, "epoch": 0.3474318788375579, "grad_norm": 0.7789574861526489, "learning_rate": 1.9984787455492025e-05, "loss": 0.3308, "mean_token_accuracy": 0.9002535946667194, "num_tokens": 16957996.0, "step": 22570 }, { "entropy": 0.3088794708251953, "epoch": 0.34758581409623646, "grad_norm": 0.7640385627746582, "learning_rate": 1.998468853763097e-05, "loss": 0.3278, "mean_token_accuracy": 0.9054429396986962, "num_tokens": 17015149.0, "step": 22580 }, { "entropy": 0.32428188603371383, "epoch": 0.3477397493549151, "grad_norm": 0.5977707505226135, "learning_rate": 1.998458929945653e-05, "loss": 0.3254, "mean_token_accuracy": 0.9034992948174476, "num_tokens": 17083556.0, "step": 22590 }, { "entropy": 0.33052520379424094, "epoch": 0.34789368461359366, "grad_norm": 0.8275395631790161, "learning_rate": 1.9984489740971888e-05, "loss": 0.3316, "mean_token_accuracy": 0.9000757202506066, "num_tokens": 17151441.0, "step": 22600 }, { "entropy": 0.33023833502084016, "epoch": 0.34804761987227223, "grad_norm": 0.883009672164917, "learning_rate": 1.9984389862180235e-05, "loss": 0.3351, "mean_token_accuracy": 0.9002456195652485, "num_tokens": 17224485.0, "step": 22610 }, { "entropy": 0.30533677199855447, "epoch": 0.3482015551309508, "grad_norm": 0.6956403255462646, "learning_rate": 1.998428966308478e-05, "loss": 0.3114, "mean_token_accuracy": 0.9086042404174804, "num_tokens": 17277150.0, "step": 22620 }, { "entropy": 0.31397290136665107, "epoch": 0.3483554903896294, "grad_norm": 0.8072841763496399, "learning_rate": 1.9984189143688732e-05, "loss": 0.321, "mean_token_accuracy": 0.9045050874352455, "num_tokens": 17340670.0, "step": 22630 }, { "entropy": 0.3257579393684864, "epoch": 0.34850942564830795, "grad_norm": 0.8359113931655884, "learning_rate": 1.9984088303995318e-05, "loss": 0.3207, "mean_token_accuracy": 0.8992670074105262, "num_tokens": 17404442.0, "step": 22640 }, { "entropy": 0.3107326484285295, "epoch": 0.3486633609069865, "grad_norm": 0.680881917476654, "learning_rate": 1.998398714400778e-05, "loss": 0.3007, "mean_token_accuracy": 0.9065340586006642, "num_tokens": 17471178.0, "step": 22650 }, { "entropy": 0.3102188930846751, "epoch": 0.34881729616566515, "grad_norm": 0.6565739512443542, "learning_rate": 1.998388566372935e-05, "loss": 0.3116, "mean_token_accuracy": 0.9061592511832715, "num_tokens": 17537370.0, "step": 22660 }, { "entropy": 0.3150072744116187, "epoch": 0.3489712314243437, "grad_norm": 0.8073962926864624, "learning_rate": 1.9983783863163297e-05, "loss": 0.3137, "mean_token_accuracy": 0.9037772297859192, "num_tokens": 17604317.0, "step": 22670 }, { "entropy": 0.314076644461602, "epoch": 0.3491251666830223, "grad_norm": 0.6371170878410339, "learning_rate": 1.998368174231288e-05, "loss": 0.3311, "mean_token_accuracy": 0.9046189032495022, "num_tokens": 17671725.0, "step": 22680 }, { "entropy": 0.32679656958207487, "epoch": 0.34927910194170086, "grad_norm": 1.085918664932251, "learning_rate": 1.9983579301181374e-05, "loss": 0.3289, "mean_token_accuracy": 0.8984683886170387, "num_tokens": 17733724.0, "step": 22690 }, { "entropy": 0.32469261148944495, "epoch": 0.34943303720037944, "grad_norm": 0.667458176612854, "learning_rate": 1.9983476539772068e-05, "loss": 0.3314, "mean_token_accuracy": 0.9015580423176288, "num_tokens": 17797284.0, "step": 22700 }, { "entropy": 0.3110892024822533, "epoch": 0.349586972459058, "grad_norm": 0.6966239809989929, "learning_rate": 1.9983373458088254e-05, "loss": 0.3142, "mean_token_accuracy": 0.9068196631968022, "num_tokens": 17863793.0, "step": 22710 }, { "entropy": 0.33349677715450526, "epoch": 0.34974090771773664, "grad_norm": 0.8872436881065369, "learning_rate": 1.9983270056133243e-05, "loss": 0.3338, "mean_token_accuracy": 0.8989570319652558, "num_tokens": 17929900.0, "step": 22720 }, { "entropy": 0.31333945160731674, "epoch": 0.3498948429764152, "grad_norm": 0.6086751818656921, "learning_rate": 1.9983166333910355e-05, "loss": 0.3183, "mean_token_accuracy": 0.9047303669154644, "num_tokens": 17994614.0, "step": 22730 }, { "entropy": 0.32666941694915297, "epoch": 0.3500487782350938, "grad_norm": 0.7440767288208008, "learning_rate": 1.998306229142291e-05, "loss": 0.3278, "mean_token_accuracy": 0.9000937007367611, "num_tokens": 18067433.0, "step": 22740 }, { "entropy": 0.31002767384052277, "epoch": 0.35020271349377236, "grad_norm": 0.7312213778495789, "learning_rate": 1.9982957928674252e-05, "loss": 0.3108, "mean_token_accuracy": 0.9067370362579823, "num_tokens": 18131899.0, "step": 22750 }, { "entropy": 0.327476323582232, "epoch": 0.3503566487524509, "grad_norm": 0.6471096277236938, "learning_rate": 1.998285324566773e-05, "loss": 0.3239, "mean_token_accuracy": 0.8997953355312347, "num_tokens": 18201334.0, "step": 22760 }, { "entropy": 0.32165315030142666, "epoch": 0.3505105840111295, "grad_norm": 0.6706848740577698, "learning_rate": 1.9982748242406697e-05, "loss": 0.3185, "mean_token_accuracy": 0.9032859578728676, "num_tokens": 18270127.0, "step": 22770 }, { "entropy": 0.30486612739041447, "epoch": 0.35066451926980813, "grad_norm": 0.7650002837181091, "learning_rate": 1.998264291889452e-05, "loss": 0.3108, "mean_token_accuracy": 0.9087849743664265, "num_tokens": 18330850.0, "step": 22780 }, { "entropy": 0.316231194883585, "epoch": 0.3508184545284867, "grad_norm": 0.8083269596099854, "learning_rate": 1.9982537275134582e-05, "loss": 0.3364, "mean_token_accuracy": 0.9027035512030125, "num_tokens": 18408193.0, "step": 22790 }, { "entropy": 0.3152686612680554, "epoch": 0.3509723897871653, "grad_norm": 0.6582704782485962, "learning_rate": 1.9982431311130277e-05, "loss": 0.3262, "mean_token_accuracy": 0.9046223483979702, "num_tokens": 18477668.0, "step": 22800 }, { "entropy": 0.3033268116414547, "epoch": 0.35112632504584385, "grad_norm": 0.7050147652626038, "learning_rate": 1.9982325026884994e-05, "loss": 0.3173, "mean_token_accuracy": 0.9087815977632999, "num_tokens": 18535867.0, "step": 22810 }, { "entropy": 0.3035153294913471, "epoch": 0.3512802603045224, "grad_norm": 0.7876719236373901, "learning_rate": 1.998221842240215e-05, "loss": 0.3133, "mean_token_accuracy": 0.9062198519706726, "num_tokens": 18597097.0, "step": 22820 }, { "entropy": 0.32225736454129217, "epoch": 0.351434195563201, "grad_norm": 0.6835848093032837, "learning_rate": 1.9982111497685162e-05, "loss": 0.3214, "mean_token_accuracy": 0.900603549182415, "num_tokens": 18658361.0, "step": 22830 }, { "entropy": 0.33616746999323366, "epoch": 0.35158813082187956, "grad_norm": 0.780742883682251, "learning_rate": 1.998200425273746e-05, "loss": 0.324, "mean_token_accuracy": 0.8996063336730004, "num_tokens": 18719886.0, "step": 22840 }, { "entropy": 0.3309591661207378, "epoch": 0.3517420660805582, "grad_norm": 0.7095834016799927, "learning_rate": 1.9981896687562486e-05, "loss": 0.316, "mean_token_accuracy": 0.9008444786071778, "num_tokens": 18789271.0, "step": 22850 }, { "entropy": 0.31067327708005904, "epoch": 0.35189600133923676, "grad_norm": 0.9341641068458557, "learning_rate": 1.998178880216369e-05, "loss": 0.3105, "mean_token_accuracy": 0.90602016299963, "num_tokens": 18855407.0, "step": 22860 }, { "entropy": 0.3024227666668594, "epoch": 0.35204993659791534, "grad_norm": 0.6767318248748779, "learning_rate": 1.9981680596544533e-05, "loss": 0.3102, "mean_token_accuracy": 0.907864122837782, "num_tokens": 18927184.0, "step": 22870 }, { "entropy": 0.31554132606834173, "epoch": 0.3522038718565939, "grad_norm": 0.8028276562690735, "learning_rate": 1.9981572070708486e-05, "loss": 0.3246, "mean_token_accuracy": 0.9041713699698448, "num_tokens": 18997347.0, "step": 22880 }, { "entropy": 0.27726228814572096, "epoch": 0.3523578071152725, "grad_norm": 0.5414261817932129, "learning_rate": 1.9981463224659034e-05, "loss": 0.2846, "mean_token_accuracy": 0.9159536443650722, "num_tokens": 19061408.0, "step": 22890 }, { "entropy": 0.2941951255314052, "epoch": 0.35251174237395105, "grad_norm": 0.7344881296157837, "learning_rate": 1.9981354058399658e-05, "loss": 0.2956, "mean_token_accuracy": 0.9100583121180534, "num_tokens": 19130901.0, "step": 22900 }, { "entropy": 0.3113068314269185, "epoch": 0.3526656776326297, "grad_norm": 0.7312901616096497, "learning_rate": 1.9981244571933873e-05, "loss": 0.3102, "mean_token_accuracy": 0.905620476603508, "num_tokens": 19189068.0, "step": 22910 }, { "entropy": 0.31383316991850735, "epoch": 0.35281961289130825, "grad_norm": 0.7123277187347412, "learning_rate": 1.9981134765265185e-05, "loss": 0.3157, "mean_token_accuracy": 0.9025836564600468, "num_tokens": 19256069.0, "step": 22920 }, { "entropy": 0.30656777285039427, "epoch": 0.3529735481499868, "grad_norm": 0.7684402465820312, "learning_rate": 1.998102463839712e-05, "loss": 0.313, "mean_token_accuracy": 0.906920999288559, "num_tokens": 19313393.0, "step": 22930 }, { "entropy": 0.3043548157438636, "epoch": 0.3531274834086654, "grad_norm": 0.6843296885490417, "learning_rate": 1.9980914191333205e-05, "loss": 0.3097, "mean_token_accuracy": 0.906833977997303, "num_tokens": 19380722.0, "step": 22940 }, { "entropy": 0.3016151297837496, "epoch": 0.35328141866734397, "grad_norm": 0.533971905708313, "learning_rate": 1.9980803424076988e-05, "loss": 0.2952, "mean_token_accuracy": 0.907032848149538, "num_tokens": 19455894.0, "step": 22950 }, { "entropy": 0.3133284538984299, "epoch": 0.35343535392602254, "grad_norm": 0.5676846504211426, "learning_rate": 1.9980692336632023e-05, "loss": 0.3142, "mean_token_accuracy": 0.9050533935427666, "num_tokens": 19520899.0, "step": 22960 }, { "entropy": 0.338543504755944, "epoch": 0.3535892891847011, "grad_norm": 0.6819412708282471, "learning_rate": 1.9980580929001872e-05, "loss": 0.3386, "mean_token_accuracy": 0.8992750883102417, "num_tokens": 19581513.0, "step": 22970 }, { "entropy": 0.3249441374093294, "epoch": 0.35374322444337974, "grad_norm": 1.013296127319336, "learning_rate": 1.9980469201190106e-05, "loss": 0.3355, "mean_token_accuracy": 0.8997890211641788, "num_tokens": 19639259.0, "step": 22980 }, { "entropy": 0.3184667005203664, "epoch": 0.3538971597020583, "grad_norm": 0.7601450085639954, "learning_rate": 1.9980357153200317e-05, "loss": 0.3167, "mean_token_accuracy": 0.9022683583199977, "num_tokens": 19705763.0, "step": 22990 }, { "entropy": 0.32995788054540753, "epoch": 0.3540510949607369, "grad_norm": 0.645913302898407, "learning_rate": 1.9980244785036088e-05, "loss": 0.3278, "mean_token_accuracy": 0.9020815789699554, "num_tokens": 19769527.0, "step": 23000 }, { "entropy": 0.34380579050630333, "epoch": 0.35420503021941546, "grad_norm": 0.8576709032058716, "learning_rate": 1.9980132096701038e-05, "loss": 0.3376, "mean_token_accuracy": 0.8964173309504986, "num_tokens": 19837387.0, "step": 23010 }, { "entropy": 0.3228688883595169, "epoch": 0.35435896547809403, "grad_norm": 0.8860835433006287, "learning_rate": 1.9980019088198773e-05, "loss": 0.3199, "mean_token_accuracy": 0.9013651572167873, "num_tokens": 19902401.0, "step": 23020 }, { "entropy": 0.3111913370899856, "epoch": 0.3545129007367726, "grad_norm": 0.7504994869232178, "learning_rate": 1.9979905759532914e-05, "loss": 0.3127, "mean_token_accuracy": 0.9068504191935063, "num_tokens": 19965680.0, "step": 23030 }, { "entropy": 0.3078837622888386, "epoch": 0.35466683599545124, "grad_norm": 0.734421968460083, "learning_rate": 1.997979211070711e-05, "loss": 0.3091, "mean_token_accuracy": 0.9061454936861992, "num_tokens": 20034139.0, "step": 23040 }, { "entropy": 0.32001781156286596, "epoch": 0.3548207712541298, "grad_norm": 0.6728128790855408, "learning_rate": 1.9979678141724998e-05, "loss": 0.3167, "mean_token_accuracy": 0.9030817478895188, "num_tokens": 20100683.0, "step": 23050 }, { "entropy": 0.31887772101908923, "epoch": 0.3549747065128084, "grad_norm": 1.1058647632598877, "learning_rate": 1.997956385259024e-05, "loss": 0.327, "mean_token_accuracy": 0.9007700346410275, "num_tokens": 20168556.0, "step": 23060 }, { "entropy": 0.31685448372736574, "epoch": 0.35512864177148695, "grad_norm": 0.5945132970809937, "learning_rate": 1.9979449243306492e-05, "loss": 0.3198, "mean_token_accuracy": 0.9035773567855359, "num_tokens": 20245614.0, "step": 23070 }, { "entropy": 0.30906652556732295, "epoch": 0.3552825770301655, "grad_norm": 0.6121296882629395, "learning_rate": 1.9979334313877437e-05, "loss": 0.3078, "mean_token_accuracy": 0.9046322233974934, "num_tokens": 20313971.0, "step": 23080 }, { "entropy": 0.3138235383667052, "epoch": 0.3554365122888441, "grad_norm": 0.7228451371192932, "learning_rate": 1.9979219064306764e-05, "loss": 0.3214, "mean_token_accuracy": 0.9050215743482113, "num_tokens": 20374793.0, "step": 23090 }, { "entropy": 0.31712428471073506, "epoch": 0.3555904475475227, "grad_norm": 0.7861889004707336, "learning_rate": 1.997910349459817e-05, "loss": 0.32, "mean_token_accuracy": 0.9038146652281285, "num_tokens": 20443967.0, "step": 23100 }, { "entropy": 0.32994882203638554, "epoch": 0.3557443828062013, "grad_norm": 0.7946621775627136, "learning_rate": 1.997898760475536e-05, "loss": 0.3328, "mean_token_accuracy": 0.8990198500454426, "num_tokens": 20505306.0, "step": 23110 }, { "entropy": 0.3196871059015393, "epoch": 0.35589831806487987, "grad_norm": 0.81160968542099, "learning_rate": 1.997887139478205e-05, "loss": 0.3159, "mean_token_accuracy": 0.9057561203837394, "num_tokens": 20570228.0, "step": 23120 }, { "entropy": 0.31187468888238074, "epoch": 0.35605225332355844, "grad_norm": 0.6220771670341492, "learning_rate": 1.997875486468197e-05, "loss": 0.3038, "mean_token_accuracy": 0.9047398805618286, "num_tokens": 20631239.0, "step": 23130 }, { "entropy": 0.3175012127496302, "epoch": 0.356206188582237, "grad_norm": 0.5926417708396912, "learning_rate": 1.9978638014458865e-05, "loss": 0.3167, "mean_token_accuracy": 0.9050295002758503, "num_tokens": 20695721.0, "step": 23140 }, { "entropy": 0.3019552898593247, "epoch": 0.3563601238409156, "grad_norm": 0.7853606343269348, "learning_rate": 1.997852084411647e-05, "loss": 0.2959, "mean_token_accuracy": 0.9089874155819416, "num_tokens": 20755910.0, "step": 23150 }, { "entropy": 0.3122219108045101, "epoch": 0.35651405909959416, "grad_norm": 0.8015866875648499, "learning_rate": 1.9978403353658556e-05, "loss": 0.316, "mean_token_accuracy": 0.9042653687298298, "num_tokens": 20813759.0, "step": 23160 }, { "entropy": 0.3162457692436874, "epoch": 0.3566679943582728, "grad_norm": 0.7675080895423889, "learning_rate": 1.997828554308889e-05, "loss": 0.3188, "mean_token_accuracy": 0.9026547513902188, "num_tokens": 20878088.0, "step": 23170 }, { "entropy": 0.3165657501667738, "epoch": 0.35682192961695136, "grad_norm": 0.6563487648963928, "learning_rate": 1.9978167412411246e-05, "loss": 0.3159, "mean_token_accuracy": 0.9038542062044144, "num_tokens": 20951472.0, "step": 23180 }, { "entropy": 0.3038612915202975, "epoch": 0.35697586487562993, "grad_norm": 0.7552381753921509, "learning_rate": 1.9978048961629416e-05, "loss": 0.3149, "mean_token_accuracy": 0.9067206770181656, "num_tokens": 21015985.0, "step": 23190 }, { "entropy": 0.31254281271249057, "epoch": 0.3571298001343085, "grad_norm": 0.8421341776847839, "learning_rate": 1.9977930190747204e-05, "loss": 0.3313, "mean_token_accuracy": 0.9070429854094982, "num_tokens": 21079924.0, "step": 23200 }, { "entropy": 0.33144065607339146, "epoch": 0.3572837353929871, "grad_norm": 0.8750845789909363, "learning_rate": 1.9977811099768413e-05, "loss": 0.3236, "mean_token_accuracy": 0.9000006452202797, "num_tokens": 21143162.0, "step": 23210 }, { "entropy": 0.29894145037978886, "epoch": 0.35743767065166565, "grad_norm": 0.6089403033256531, "learning_rate": 1.997769168869687e-05, "loss": 0.3007, "mean_token_accuracy": 0.9109389267861843, "num_tokens": 21210969.0, "step": 23220 }, { "entropy": 0.32412985535338523, "epoch": 0.3575916059103443, "grad_norm": 0.6936253309249878, "learning_rate": 1.9977571957536405e-05, "loss": 0.3361, "mean_token_accuracy": 0.8992355808615684, "num_tokens": 21279800.0, "step": 23230 }, { "entropy": 0.3101286256685853, "epoch": 0.35774554116902285, "grad_norm": 0.8576826453208923, "learning_rate": 1.9977451906290853e-05, "loss": 0.3085, "mean_token_accuracy": 0.9042671233415603, "num_tokens": 21339924.0, "step": 23240 }, { "entropy": 0.33077242579311134, "epoch": 0.3578994764277014, "grad_norm": 0.5774107575416565, "learning_rate": 1.9977331534964072e-05, "loss": 0.3377, "mean_token_accuracy": 0.9009350918233394, "num_tokens": 21400996.0, "step": 23250 }, { "entropy": 0.31013451926410196, "epoch": 0.35805341168638, "grad_norm": 0.7507098913192749, "learning_rate": 1.9977210843559922e-05, "loss": 0.3142, "mean_token_accuracy": 0.9080297835171223, "num_tokens": 21463280.0, "step": 23260 }, { "entropy": 0.2954201280139387, "epoch": 0.35820734694505857, "grad_norm": 0.6823977828025818, "learning_rate": 1.9977089832082272e-05, "loss": 0.3109, "mean_token_accuracy": 0.911161632835865, "num_tokens": 21531039.0, "step": 23270 }, { "entropy": 0.3135211165994406, "epoch": 0.35836128220373714, "grad_norm": 0.8797701597213745, "learning_rate": 1.9976968500535005e-05, "loss": 0.3257, "mean_token_accuracy": 0.9040906757116318, "num_tokens": 21593002.0, "step": 23280 }, { "entropy": 0.31533478386700153, "epoch": 0.35851521746241577, "grad_norm": 0.7868728041648865, "learning_rate": 1.9976846848922018e-05, "loss": 0.3195, "mean_token_accuracy": 0.9047038428485393, "num_tokens": 21658125.0, "step": 23290 }, { "entropy": 0.32736375695094466, "epoch": 0.35866915272109434, "grad_norm": 0.7164791822433472, "learning_rate": 1.997672487724721e-05, "loss": 0.3221, "mean_token_accuracy": 0.9026046082377434, "num_tokens": 21714060.0, "step": 23300 }, { "entropy": 0.31679877834394576, "epoch": 0.3588230879797729, "grad_norm": 0.6101922988891602, "learning_rate": 1.997660258551449e-05, "loss": 0.3201, "mean_token_accuracy": 0.90361939817667, "num_tokens": 21782047.0, "step": 23310 }, { "entropy": 0.32554817739874126, "epoch": 0.3589770232384515, "grad_norm": 0.7318460941314697, "learning_rate": 1.997647997372779e-05, "loss": 0.3408, "mean_token_accuracy": 0.9024608567357063, "num_tokens": 21848928.0, "step": 23320 }, { "entropy": 0.3046639686450362, "epoch": 0.35913095849713006, "grad_norm": 0.6081191897392273, "learning_rate": 1.9976357041891034e-05, "loss": 0.3178, "mean_token_accuracy": 0.9075599186122417, "num_tokens": 21921810.0, "step": 23330 }, { "entropy": 0.306543701980263, "epoch": 0.35928489375580863, "grad_norm": 0.9828604459762573, "learning_rate": 1.9976233790008172e-05, "loss": 0.3093, "mean_token_accuracy": 0.9052274703979493, "num_tokens": 21980830.0, "step": 23340 }, { "entropy": 0.30426696613430976, "epoch": 0.3594388290144872, "grad_norm": 0.6000154614448547, "learning_rate": 1.9976110218083158e-05, "loss": 0.3055, "mean_token_accuracy": 0.9095471121370793, "num_tokens": 22048379.0, "step": 23350 }, { "entropy": 0.3113205130212009, "epoch": 0.35959276427316583, "grad_norm": 0.6484618186950684, "learning_rate": 1.9975986326119955e-05, "loss": 0.3218, "mean_token_accuracy": 0.9057395584881306, "num_tokens": 22113580.0, "step": 23360 }, { "entropy": 0.31465021809563043, "epoch": 0.3597466995318444, "grad_norm": 0.7290979623794556, "learning_rate": 1.9975862114122534e-05, "loss": 0.3164, "mean_token_accuracy": 0.9030299544334411, "num_tokens": 22176699.0, "step": 23370 }, { "entropy": 0.30646268334239724, "epoch": 0.359900634790523, "grad_norm": 1.0354928970336914, "learning_rate": 1.9975737582094882e-05, "loss": 0.319, "mean_token_accuracy": 0.9042933560907841, "num_tokens": 22245866.0, "step": 23380 }, { "entropy": 0.2808321843855083, "epoch": 0.36005457004920155, "grad_norm": 0.703535258769989, "learning_rate": 1.9975612730040996e-05, "loss": 0.2953, "mean_token_accuracy": 0.9146287225186824, "num_tokens": 22301375.0, "step": 23390 }, { "entropy": 0.32899646935984495, "epoch": 0.3602085053078801, "grad_norm": 0.6601384878158569, "learning_rate": 1.997548755796488e-05, "loss": 0.3244, "mean_token_accuracy": 0.9004277199506759, "num_tokens": 22362521.0, "step": 23400 }, { "entropy": 0.32191678900271653, "epoch": 0.3603624405665587, "grad_norm": 0.6567837595939636, "learning_rate": 1.9975362065870555e-05, "loss": 0.326, "mean_token_accuracy": 0.9021859034895897, "num_tokens": 22433659.0, "step": 23410 }, { "entropy": 0.333246857393533, "epoch": 0.3605163758252373, "grad_norm": 0.7456390261650085, "learning_rate": 1.9975236253762035e-05, "loss": 0.3354, "mean_token_accuracy": 0.9005053035914898, "num_tokens": 22502200.0, "step": 23420 }, { "entropy": 0.29962367098778486, "epoch": 0.3606703110839159, "grad_norm": 0.6338890790939331, "learning_rate": 1.9975110121643366e-05, "loss": 0.2962, "mean_token_accuracy": 0.9081262119114399, "num_tokens": 22557419.0, "step": 23430 }, { "entropy": 0.31183787565678356, "epoch": 0.36082424634259447, "grad_norm": 0.8387086391448975, "learning_rate": 1.997498366951859e-05, "loss": 0.3118, "mean_token_accuracy": 0.9062176093459129, "num_tokens": 22622657.0, "step": 23440 }, { "entropy": 0.32328963559120893, "epoch": 0.36097818160127304, "grad_norm": 0.6905167102813721, "learning_rate": 1.9974856897391764e-05, "loss": 0.321, "mean_token_accuracy": 0.9037507548928261, "num_tokens": 22689261.0, "step": 23450 }, { "entropy": 0.295623988751322, "epoch": 0.3611321168599516, "grad_norm": 0.796995997428894, "learning_rate": 1.997472980526696e-05, "loss": 0.3063, "mean_token_accuracy": 0.9101363696157933, "num_tokens": 22748096.0, "step": 23460 }, { "entropy": 0.3012785171158612, "epoch": 0.3612860521186302, "grad_norm": 0.772588312625885, "learning_rate": 1.9974602393148244e-05, "loss": 0.3106, "mean_token_accuracy": 0.9074118115007878, "num_tokens": 22818296.0, "step": 23470 }, { "entropy": 0.3143458621576428, "epoch": 0.3614399873773088, "grad_norm": 0.5633523464202881, "learning_rate": 1.9974474661039713e-05, "loss": 0.3141, "mean_token_accuracy": 0.9050880551338196, "num_tokens": 22888496.0, "step": 23480 }, { "entropy": 0.3034037120640278, "epoch": 0.3615939226359874, "grad_norm": 0.598654568195343, "learning_rate": 1.9974346608945466e-05, "loss": 0.2996, "mean_token_accuracy": 0.9085473716259003, "num_tokens": 22962230.0, "step": 23490 }, { "entropy": 0.31001080479472876, "epoch": 0.36174785789466596, "grad_norm": 0.6672106981277466, "learning_rate": 1.99742182368696e-05, "loss": 0.3148, "mean_token_accuracy": 0.9050404630601406, "num_tokens": 23028126.0, "step": 23500 }, { "entropy": 0.31494851782917976, "epoch": 0.36190179315334453, "grad_norm": 0.6217115521430969, "learning_rate": 1.9974089544816246e-05, "loss": 0.3155, "mean_token_accuracy": 0.9033957488834858, "num_tokens": 23092140.0, "step": 23510 }, { "entropy": 0.2892609572969377, "epoch": 0.3620557284120231, "grad_norm": 1.0037569999694824, "learning_rate": 1.9973960532789524e-05, "loss": 0.3048, "mean_token_accuracy": 0.9096511095762253, "num_tokens": 23162240.0, "step": 23520 }, { "entropy": 0.3113910714164376, "epoch": 0.3622096636707017, "grad_norm": 0.6339561343193054, "learning_rate": 1.9973831200793575e-05, "loss": 0.3116, "mean_token_accuracy": 0.9076832219958305, "num_tokens": 23237836.0, "step": 23530 }, { "entropy": 0.33574255239218476, "epoch": 0.36236359892938025, "grad_norm": 0.6915492415428162, "learning_rate": 1.997370154883255e-05, "loss": 0.3346, "mean_token_accuracy": 0.8980366639792919, "num_tokens": 23306043.0, "step": 23540 }, { "entropy": 0.31098546385765075, "epoch": 0.3625175341880589, "grad_norm": 0.7549551725387573, "learning_rate": 1.9973571576910605e-05, "loss": 0.3198, "mean_token_accuracy": 0.9033577024936676, "num_tokens": 23370986.0, "step": 23550 }, { "entropy": 0.31468042731285095, "epoch": 0.36267146944673745, "grad_norm": 0.8313280344009399, "learning_rate": 1.9973441285031913e-05, "loss": 0.3267, "mean_token_accuracy": 0.9049945399165154, "num_tokens": 23436246.0, "step": 23560 }, { "entropy": 0.31729614529758693, "epoch": 0.362825404705416, "grad_norm": 0.6399728059768677, "learning_rate": 1.9973310673200653e-05, "loss": 0.3018, "mean_token_accuracy": 0.9043165519833565, "num_tokens": 23504330.0, "step": 23570 }, { "entropy": 0.3125679102726281, "epoch": 0.3629793399640946, "grad_norm": 0.7195016741752625, "learning_rate": 1.9973179741421013e-05, "loss": 0.32, "mean_token_accuracy": 0.9036789633333683, "num_tokens": 23568655.0, "step": 23580 }, { "entropy": 0.299930686596781, "epoch": 0.36313327522277317, "grad_norm": 0.7559289336204529, "learning_rate": 1.9973048489697192e-05, "loss": 0.312, "mean_token_accuracy": 0.9086351498961449, "num_tokens": 23637140.0, "step": 23590 }, { "entropy": 0.31720883222296836, "epoch": 0.36328721048145174, "grad_norm": 0.7476507425308228, "learning_rate": 1.9972916918033404e-05, "loss": 0.3144, "mean_token_accuracy": 0.9037539884448051, "num_tokens": 23711980.0, "step": 23600 }, { "entropy": 0.3110018242150545, "epoch": 0.36344114574013037, "grad_norm": 0.8268715739250183, "learning_rate": 1.997278502643387e-05, "loss": 0.3153, "mean_token_accuracy": 0.9040335573256015, "num_tokens": 23783735.0, "step": 23610 }, { "entropy": 0.31937989946454765, "epoch": 0.36359508099880894, "grad_norm": 0.6595677137374878, "learning_rate": 1.997265281490282e-05, "loss": 0.3241, "mean_token_accuracy": 0.9062789656221867, "num_tokens": 23849768.0, "step": 23620 }, { "entropy": 0.31646197512745855, "epoch": 0.3637490162574875, "grad_norm": 0.7706979513168335, "learning_rate": 1.99725202834445e-05, "loss": 0.3265, "mean_token_accuracy": 0.9036854594945908, "num_tokens": 23909656.0, "step": 23630 }, { "entropy": 0.3333808358758688, "epoch": 0.3639029515161661, "grad_norm": 0.5455695390701294, "learning_rate": 1.997238743206315e-05, "loss": 0.3345, "mean_token_accuracy": 0.8983547724783421, "num_tokens": 23979388.0, "step": 23640 }, { "entropy": 0.3332245582714677, "epoch": 0.36405688677484466, "grad_norm": 0.5915918350219727, "learning_rate": 1.9972254260763043e-05, "loss": 0.331, "mean_token_accuracy": 0.8988476954400539, "num_tokens": 24049075.0, "step": 23650 }, { "entropy": 0.29702964490279554, "epoch": 0.36421082203352323, "grad_norm": 0.8695030808448792, "learning_rate": 1.997212076954845e-05, "loss": 0.2958, "mean_token_accuracy": 0.9091203793883323, "num_tokens": 24110416.0, "step": 23660 }, { "entropy": 0.30681328205391767, "epoch": 0.36436475729220186, "grad_norm": 0.7343393564224243, "learning_rate": 1.9971986958423648e-05, "loss": 0.3163, "mean_token_accuracy": 0.9049829214811325, "num_tokens": 24174806.0, "step": 23670 }, { "entropy": 0.33428387315943836, "epoch": 0.36451869255088043, "grad_norm": 0.5918293595314026, "learning_rate": 1.997185282739293e-05, "loss": 0.3365, "mean_token_accuracy": 0.899001894146204, "num_tokens": 24246138.0, "step": 23680 }, { "entropy": 0.32716428972780703, "epoch": 0.364672627809559, "grad_norm": 0.9501112103462219, "learning_rate": 1.9971718376460605e-05, "loss": 0.3296, "mean_token_accuracy": 0.9024996101856232, "num_tokens": 24306917.0, "step": 23690 }, { "entropy": 0.32034989055246116, "epoch": 0.3648265630682376, "grad_norm": 0.5893335342407227, "learning_rate": 1.997158360563098e-05, "loss": 0.3199, "mean_token_accuracy": 0.901951989531517, "num_tokens": 24369576.0, "step": 23700 }, { "entropy": 0.3183416025713086, "epoch": 0.36498049832691615, "grad_norm": 0.7949944734573364, "learning_rate": 1.9971448514908386e-05, "loss": 0.3277, "mean_token_accuracy": 0.9006408892571927, "num_tokens": 24436606.0, "step": 23710 }, { "entropy": 0.3381361082196236, "epoch": 0.3651344335855947, "grad_norm": 0.6596007943153381, "learning_rate": 1.997131310429715e-05, "loss": 0.3389, "mean_token_accuracy": 0.8976947858929634, "num_tokens": 24500287.0, "step": 23720 }, { "entropy": 0.3229618499986827, "epoch": 0.3652883688442733, "grad_norm": 0.8230901956558228, "learning_rate": 1.997117737380162e-05, "loss": 0.3138, "mean_token_accuracy": 0.9021942302584648, "num_tokens": 24565717.0, "step": 23730 }, { "entropy": 0.309796985052526, "epoch": 0.3654423041029519, "grad_norm": 0.6285703778266907, "learning_rate": 1.9971041323426142e-05, "loss": 0.3116, "mean_token_accuracy": 0.9049277067184448, "num_tokens": 24633101.0, "step": 23740 }, { "entropy": 0.30879982374608517, "epoch": 0.3655962393616305, "grad_norm": 0.8299192190170288, "learning_rate": 1.997090495317509e-05, "loss": 0.3134, "mean_token_accuracy": 0.9053399659693241, "num_tokens": 24692873.0, "step": 23750 }, { "entropy": 0.32355168983340266, "epoch": 0.36575017462030907, "grad_norm": 0.647565484046936, "learning_rate": 1.997076826305284e-05, "loss": 0.3256, "mean_token_accuracy": 0.9018921300768852, "num_tokens": 24758652.0, "step": 23760 }, { "entropy": 0.3038117161951959, "epoch": 0.36590410987898764, "grad_norm": 0.7044543027877808, "learning_rate": 1.9970631253063768e-05, "loss": 0.3045, "mean_token_accuracy": 0.9088686414062976, "num_tokens": 24823392.0, "step": 23770 }, { "entropy": 0.31972640175372363, "epoch": 0.3660580451376662, "grad_norm": 0.6119922995567322, "learning_rate": 1.9970493923212277e-05, "loss": 0.3184, "mean_token_accuracy": 0.9045259930193424, "num_tokens": 24899079.0, "step": 23780 }, { "entropy": 0.30567996520549057, "epoch": 0.3662119803963448, "grad_norm": 0.646751880645752, "learning_rate": 1.9970356273502765e-05, "loss": 0.3121, "mean_token_accuracy": 0.9057943008840084, "num_tokens": 24955024.0, "step": 23790 }, { "entropy": 0.3101213444955647, "epoch": 0.3663659156550234, "grad_norm": 0.5621387362480164, "learning_rate": 1.997021830393966e-05, "loss": 0.3242, "mean_token_accuracy": 0.9063423693180084, "num_tokens": 25023043.0, "step": 23800 }, { "entropy": 0.3224111529998481, "epoch": 0.366519850913702, "grad_norm": 0.6180081963539124, "learning_rate": 1.9970080014527376e-05, "loss": 0.3113, "mean_token_accuracy": 0.9033336281776428, "num_tokens": 25096051.0, "step": 23810 }, { "entropy": 0.3173413410782814, "epoch": 0.36667378617238056, "grad_norm": 0.759417712688446, "learning_rate": 1.9969941405270353e-05, "loss": 0.3217, "mean_token_accuracy": 0.9021435759961605, "num_tokens": 25161962.0, "step": 23820 }, { "entropy": 0.31134605724364517, "epoch": 0.36682772143105913, "grad_norm": 0.9152867794036865, "learning_rate": 1.9969802476173044e-05, "loss": 0.3107, "mean_token_accuracy": 0.9062475040555, "num_tokens": 25232891.0, "step": 23830 }, { "entropy": 0.3156177127733827, "epoch": 0.3669816566897377, "grad_norm": 0.8478100299835205, "learning_rate": 1.99696632272399e-05, "loss": 0.3263, "mean_token_accuracy": 0.9019363440573216, "num_tokens": 25295736.0, "step": 23840 }, { "entropy": 0.32186301350593566, "epoch": 0.3671355919484163, "grad_norm": 0.8278965950012207, "learning_rate": 1.9969523658475385e-05, "loss": 0.3135, "mean_token_accuracy": 0.9044503822922707, "num_tokens": 25358840.0, "step": 23850 }, { "entropy": 0.34168895371258257, "epoch": 0.3672895272070949, "grad_norm": 0.8854647874832153, "learning_rate": 1.9969383769883983e-05, "loss": 0.3373, "mean_token_accuracy": 0.895020293444395, "num_tokens": 25429732.0, "step": 23860 }, { "entropy": 0.3369906423613429, "epoch": 0.3674434624657735, "grad_norm": 0.8210282325744629, "learning_rate": 1.9969243561470178e-05, "loss": 0.3267, "mean_token_accuracy": 0.9007311522960663, "num_tokens": 25489113.0, "step": 23870 }, { "entropy": 0.31883188635110854, "epoch": 0.36759739772445205, "grad_norm": 0.8256662487983704, "learning_rate": 1.996910303323847e-05, "loss": 0.3227, "mean_token_accuracy": 0.9034341864287854, "num_tokens": 25558113.0, "step": 23880 }, { "entropy": 0.31447718581184747, "epoch": 0.3677513329831306, "grad_norm": 0.6270312666893005, "learning_rate": 1.9968962185193367e-05, "loss": 0.3092, "mean_token_accuracy": 0.9052457295358181, "num_tokens": 25624970.0, "step": 23890 }, { "entropy": 0.2957416166551411, "epoch": 0.3679052682418092, "grad_norm": 0.9467841982841492, "learning_rate": 1.9968821017339386e-05, "loss": 0.2962, "mean_token_accuracy": 0.9095651507377625, "num_tokens": 25683597.0, "step": 23900 }, { "entropy": 0.31762309549376366, "epoch": 0.36805920350048776, "grad_norm": 0.6448183059692383, "learning_rate": 1.9968679529681056e-05, "loss": 0.3163, "mean_token_accuracy": 0.9041934624314308, "num_tokens": 25739450.0, "step": 23910 }, { "entropy": 0.3154096495360136, "epoch": 0.36821313875916634, "grad_norm": 0.8959481120109558, "learning_rate": 1.9968537722222915e-05, "loss": 0.3133, "mean_token_accuracy": 0.9038734950125218, "num_tokens": 25811935.0, "step": 23920 }, { "entropy": 0.32124657556414604, "epoch": 0.36836707401784496, "grad_norm": 0.6476492285728455, "learning_rate": 1.9968395594969515e-05, "loss": 0.3274, "mean_token_accuracy": 0.9012029893696308, "num_tokens": 25885655.0, "step": 23930 }, { "entropy": 0.30034850593656304, "epoch": 0.36852100927652354, "grad_norm": 0.7123849987983704, "learning_rate": 1.9968253147925415e-05, "loss": 0.3006, "mean_token_accuracy": 0.9071980006992817, "num_tokens": 25953239.0, "step": 23940 }, { "entropy": 0.326381048373878, "epoch": 0.3686749445352021, "grad_norm": 0.719023585319519, "learning_rate": 1.9968110381095183e-05, "loss": 0.3111, "mean_token_accuracy": 0.9009270377457141, "num_tokens": 26020329.0, "step": 23950 }, { "entropy": 0.31534684645012023, "epoch": 0.3688288797938807, "grad_norm": 0.591740608215332, "learning_rate": 1.99679672944834e-05, "loss": 0.3333, "mean_token_accuracy": 0.901650470495224, "num_tokens": 26087754.0, "step": 23960 }, { "entropy": 0.30069461204111575, "epoch": 0.36898281505255925, "grad_norm": 0.6671233773231506, "learning_rate": 1.9967823888094657e-05, "loss": 0.3066, "mean_token_accuracy": 0.91038958132267, "num_tokens": 26154525.0, "step": 23970 }, { "entropy": 0.31745749730616807, "epoch": 0.3691367503112378, "grad_norm": 0.5795464515686035, "learning_rate": 1.9967680161933554e-05, "loss": 0.3205, "mean_token_accuracy": 0.9035241477191448, "num_tokens": 26222181.0, "step": 23980 }, { "entropy": 0.3320769258774817, "epoch": 0.36929068556991645, "grad_norm": 0.7740620374679565, "learning_rate": 1.99675361160047e-05, "loss": 0.3257, "mean_token_accuracy": 0.9002109497785569, "num_tokens": 26288376.0, "step": 23990 }, { "entropy": 0.3179560878314078, "epoch": 0.369444620828595, "grad_norm": 0.612586498260498, "learning_rate": 1.9967391750312718e-05, "loss": 0.3264, "mean_token_accuracy": 0.9041740000247955, "num_tokens": 26359056.0, "step": 24000 }, { "entropy": 0.3270796125754714, "epoch": 0.3695985560872736, "grad_norm": 0.660561740398407, "learning_rate": 1.9967247064862242e-05, "loss": 0.3333, "mean_token_accuracy": 0.8999091759324074, "num_tokens": 26418755.0, "step": 24010 }, { "entropy": 0.2976296544075012, "epoch": 0.3697524913459522, "grad_norm": 0.7354880571365356, "learning_rate": 1.9967102059657905e-05, "loss": 0.2969, "mean_token_accuracy": 0.9083767041563988, "num_tokens": 26491827.0, "step": 24020 }, { "entropy": 0.3055484010837972, "epoch": 0.36990642660463074, "grad_norm": 0.6352931261062622, "learning_rate": 1.9966956734704368e-05, "loss": 0.3133, "mean_token_accuracy": 0.9078402876853943, "num_tokens": 26555579.0, "step": 24030 }, { "entropy": 0.31945009548217057, "epoch": 0.3700603618633093, "grad_norm": 0.6534560918807983, "learning_rate": 1.9966811090006287e-05, "loss": 0.3251, "mean_token_accuracy": 0.9020285673439503, "num_tokens": 26616784.0, "step": 24040 }, { "entropy": 0.3028695579618216, "epoch": 0.37021429712198795, "grad_norm": 0.7984675168991089, "learning_rate": 1.9966665125568342e-05, "loss": 0.3059, "mean_token_accuracy": 0.9076647877693176, "num_tokens": 26692955.0, "step": 24050 }, { "entropy": 0.29141416735947134, "epoch": 0.3703682323806665, "grad_norm": 0.7493507266044617, "learning_rate": 1.996651884139521e-05, "loss": 0.3038, "mean_token_accuracy": 0.9114543810486794, "num_tokens": 26753106.0, "step": 24060 }, { "entropy": 0.3216040008701384, "epoch": 0.3705221676393451, "grad_norm": 0.7238502502441406, "learning_rate": 1.9966372237491578e-05, "loss": 0.325, "mean_token_accuracy": 0.9029701963067055, "num_tokens": 26823900.0, "step": 24070 }, { "entropy": 0.2971381567418575, "epoch": 0.37067610289802366, "grad_norm": 0.5978664755821228, "learning_rate": 1.996622531386216e-05, "loss": 0.3055, "mean_token_accuracy": 0.9110109359025955, "num_tokens": 26893442.0, "step": 24080 }, { "entropy": 0.3326671156100929, "epoch": 0.37083003815670224, "grad_norm": 0.9005787372589111, "learning_rate": 1.9966078070511663e-05, "loss": 0.3239, "mean_token_accuracy": 0.9017243832349777, "num_tokens": 26947977.0, "step": 24090 }, { "entropy": 0.3163498696871102, "epoch": 0.3709839734153808, "grad_norm": 0.6026418209075928, "learning_rate": 1.996593050744481e-05, "loss": 0.3289, "mean_token_accuracy": 0.9030040696263313, "num_tokens": 27013767.0, "step": 24100 }, { "entropy": 0.2983822622336447, "epoch": 0.3711379086740594, "grad_norm": 0.7614117860794067, "learning_rate": 1.9965782624666343e-05, "loss": 0.3196, "mean_token_accuracy": 0.9091682493686676, "num_tokens": 27072498.0, "step": 24110 }, { "entropy": 0.29762095436453817, "epoch": 0.371291843932738, "grad_norm": 0.6658770442008972, "learning_rate": 1.9965634422180998e-05, "loss": 0.2978, "mean_token_accuracy": 0.9106610290706157, "num_tokens": 27125837.0, "step": 24120 }, { "entropy": 0.30941572804003953, "epoch": 0.3714457791914166, "grad_norm": 0.8526014685630798, "learning_rate": 1.9965485899993533e-05, "loss": 0.3035, "mean_token_accuracy": 0.9071419626474381, "num_tokens": 27199247.0, "step": 24130 }, { "entropy": 0.32369440579786896, "epoch": 0.37159971445009515, "grad_norm": 0.7982751131057739, "learning_rate": 1.9965337058108708e-05, "loss": 0.3147, "mean_token_accuracy": 0.9023417644202709, "num_tokens": 27260395.0, "step": 24140 }, { "entropy": 0.30728986030444505, "epoch": 0.3717536497087737, "grad_norm": 0.7587102055549622, "learning_rate": 1.9965187896531307e-05, "loss": 0.3254, "mean_token_accuracy": 0.906478800624609, "num_tokens": 27331633.0, "step": 24150 }, { "entropy": 0.3152029004879296, "epoch": 0.3719075849674523, "grad_norm": 0.9018804430961609, "learning_rate": 1.9965038415266105e-05, "loss": 0.3233, "mean_token_accuracy": 0.9038520582020283, "num_tokens": 27394079.0, "step": 24160 }, { "entropy": 0.33206995818763974, "epoch": 0.37206152022613087, "grad_norm": 0.8186246156692505, "learning_rate": 1.9964888614317902e-05, "loss": 0.3338, "mean_token_accuracy": 0.8967523537576199, "num_tokens": 27452063.0, "step": 24170 }, { "entropy": 0.32296742033213377, "epoch": 0.3722154554848095, "grad_norm": 0.9030364155769348, "learning_rate": 1.9964738493691506e-05, "loss": 0.3148, "mean_token_accuracy": 0.9051904305815697, "num_tokens": 27518515.0, "step": 24180 }, { "entropy": 0.3213732578791678, "epoch": 0.37236939074348807, "grad_norm": 0.8417277336120605, "learning_rate": 1.9964588053391728e-05, "loss": 0.3228, "mean_token_accuracy": 0.9002837620675563, "num_tokens": 27586115.0, "step": 24190 }, { "entropy": 0.3036590712144971, "epoch": 0.37252332600216664, "grad_norm": 0.7150784134864807, "learning_rate": 1.99644372934234e-05, "loss": 0.3184, "mean_token_accuracy": 0.9069882586598397, "num_tokens": 27649756.0, "step": 24200 }, { "entropy": 0.3169761053286493, "epoch": 0.3726772612608452, "grad_norm": 0.822959840297699, "learning_rate": 1.9964286213791353e-05, "loss": 0.3004, "mean_token_accuracy": 0.9054484747350215, "num_tokens": 27714106.0, "step": 24210 }, { "entropy": 0.3108423165977001, "epoch": 0.3728311965195238, "grad_norm": 0.6576409339904785, "learning_rate": 1.996413481450044e-05, "loss": 0.3176, "mean_token_accuracy": 0.9039301328361035, "num_tokens": 27784980.0, "step": 24220 }, { "entropy": 0.3592915639281273, "epoch": 0.37298513177820236, "grad_norm": 0.5528927445411682, "learning_rate": 1.996398309555551e-05, "loss": 0.3526, "mean_token_accuracy": 0.89077078551054, "num_tokens": 27853744.0, "step": 24230 }, { "entropy": 0.3129045152105391, "epoch": 0.373139067036881, "grad_norm": 0.824577808380127, "learning_rate": 1.9963831056961433e-05, "loss": 0.3238, "mean_token_accuracy": 0.9064977623522281, "num_tokens": 27909673.0, "step": 24240 }, { "entropy": 0.3133428184315562, "epoch": 0.37329300229555956, "grad_norm": 0.8756524324417114, "learning_rate": 1.996367869872309e-05, "loss": 0.3231, "mean_token_accuracy": 0.9024774000048638, "num_tokens": 27972955.0, "step": 24250 }, { "entropy": 0.32160260882228614, "epoch": 0.37344693755423813, "grad_norm": 0.775426983833313, "learning_rate": 1.9963526020845366e-05, "loss": 0.3244, "mean_token_accuracy": 0.9031195171177387, "num_tokens": 28046948.0, "step": 24260 }, { "entropy": 0.3074630414135754, "epoch": 0.3736008728129167, "grad_norm": 0.7787631154060364, "learning_rate": 1.9963373023333158e-05, "loss": 0.3212, "mean_token_accuracy": 0.9044622972607612, "num_tokens": 28111907.0, "step": 24270 }, { "entropy": 0.3241295050829649, "epoch": 0.3737548080715953, "grad_norm": 0.7078209519386292, "learning_rate": 1.996321970619138e-05, "loss": 0.3324, "mean_token_accuracy": 0.8990539096295833, "num_tokens": 28179700.0, "step": 24280 }, { "entropy": 0.3479930647648871, "epoch": 0.37390874333027385, "grad_norm": 0.5948483347892761, "learning_rate": 1.9963066069424944e-05, "loss": 0.3514, "mean_token_accuracy": 0.8944721467792988, "num_tokens": 28252750.0, "step": 24290 }, { "entropy": 0.3076492233201861, "epoch": 0.3740626785889524, "grad_norm": 0.636975884437561, "learning_rate": 1.996291211303878e-05, "loss": 0.3019, "mean_token_accuracy": 0.9095054268836975, "num_tokens": 28319163.0, "step": 24300 }, { "entropy": 0.3120477644726634, "epoch": 0.37421661384763105, "grad_norm": 0.8139878511428833, "learning_rate": 1.9962757837037827e-05, "loss": 0.3185, "mean_token_accuracy": 0.9035201892256737, "num_tokens": 28389669.0, "step": 24310 }, { "entropy": 0.31568331066519023, "epoch": 0.3743705491063096, "grad_norm": 1.1128860712051392, "learning_rate": 1.9962603241427035e-05, "loss": 0.3249, "mean_token_accuracy": 0.902273914963007, "num_tokens": 28450417.0, "step": 24320 }, { "entropy": 0.3301850212737918, "epoch": 0.3745244843649882, "grad_norm": 1.001133918762207, "learning_rate": 1.9962448326211363e-05, "loss": 0.3149, "mean_token_accuracy": 0.9012772977352143, "num_tokens": 28521654.0, "step": 24330 }, { "entropy": 0.31129483235999944, "epoch": 0.37467841962366677, "grad_norm": 0.7131984233856201, "learning_rate": 1.9962293091395785e-05, "loss": 0.313, "mean_token_accuracy": 0.9028408490121365, "num_tokens": 28588077.0, "step": 24340 }, { "entropy": 0.3106164598837495, "epoch": 0.37483235488234534, "grad_norm": 0.6569303870201111, "learning_rate": 1.9962137536985273e-05, "loss": 0.3111, "mean_token_accuracy": 0.9052676685154438, "num_tokens": 28664621.0, "step": 24350 }, { "entropy": 0.35048610288649795, "epoch": 0.3749862901410239, "grad_norm": 0.6426203846931458, "learning_rate": 1.9961981662984825e-05, "loss": 0.3381, "mean_token_accuracy": 0.8935618326067924, "num_tokens": 28733518.0, "step": 24360 }, { "entropy": 0.321132945176214, "epoch": 0.37514022539970254, "grad_norm": 0.7106867432594299, "learning_rate": 1.9961825469399433e-05, "loss": 0.3211, "mean_token_accuracy": 0.9026364587247372, "num_tokens": 28798092.0, "step": 24370 }, { "entropy": 0.3143374825827777, "epoch": 0.3752941606583811, "grad_norm": 0.7849541902542114, "learning_rate": 1.9961668956234117e-05, "loss": 0.3203, "mean_token_accuracy": 0.9045069962739944, "num_tokens": 28860025.0, "step": 24380 }, { "entropy": 0.32134253773838284, "epoch": 0.3754480959170597, "grad_norm": 0.7182298302650452, "learning_rate": 1.9961512123493895e-05, "loss": 0.3214, "mean_token_accuracy": 0.9038788512349129, "num_tokens": 28925955.0, "step": 24390 }, { "entropy": 0.2961650582961738, "epoch": 0.37560203117573826, "grad_norm": 0.7251172065734863, "learning_rate": 1.9961354971183795e-05, "loss": 0.2965, "mean_token_accuracy": 0.9104778729379177, "num_tokens": 28988546.0, "step": 24400 }, { "entropy": 0.3105804258957505, "epoch": 0.37575596643441683, "grad_norm": 0.7461613416671753, "learning_rate": 1.9961197499308863e-05, "loss": 0.3204, "mean_token_accuracy": 0.9032128110527993, "num_tokens": 29066685.0, "step": 24410 }, { "entropy": 0.3354851774871349, "epoch": 0.3759099016930954, "grad_norm": 0.6222379207611084, "learning_rate": 1.9961039707874142e-05, "loss": 0.33, "mean_token_accuracy": 0.8980173826217651, "num_tokens": 29141976.0, "step": 24420 }, { "entropy": 0.3145941342227161, "epoch": 0.37606383695177403, "grad_norm": 0.7265601754188538, "learning_rate": 1.9960881596884706e-05, "loss": 0.3013, "mean_token_accuracy": 0.9035117328166962, "num_tokens": 29214622.0, "step": 24430 }, { "entropy": 0.30646559596061707, "epoch": 0.3762177722104526, "grad_norm": 0.6077368259429932, "learning_rate": 1.9960723166345622e-05, "loss": 0.3041, "mean_token_accuracy": 0.9037439428269863, "num_tokens": 29285440.0, "step": 24440 }, { "entropy": 0.3190978097729385, "epoch": 0.3763717074691312, "grad_norm": 0.7787706851959229, "learning_rate": 1.996056441626197e-05, "loss": 0.3149, "mean_token_accuracy": 0.9021379865705967, "num_tokens": 29345187.0, "step": 24450 }, { "entropy": 0.3120700094848871, "epoch": 0.37652564272780975, "grad_norm": 0.7333254814147949, "learning_rate": 1.9960405346638845e-05, "loss": 0.3139, "mean_token_accuracy": 0.9063153363764286, "num_tokens": 29411307.0, "step": 24460 }, { "entropy": 0.3157899593003094, "epoch": 0.3766795779864883, "grad_norm": 0.7851434946060181, "learning_rate": 1.996024595748135e-05, "loss": 0.3253, "mean_token_accuracy": 0.9037524588406086, "num_tokens": 29474524.0, "step": 24470 }, { "entropy": 0.32036717543378473, "epoch": 0.3768335132451669, "grad_norm": 0.7015953660011292, "learning_rate": 1.99600862487946e-05, "loss": 0.3088, "mean_token_accuracy": 0.9033918276429176, "num_tokens": 29537329.0, "step": 24480 }, { "entropy": 0.3375622329302132, "epoch": 0.37698744850384547, "grad_norm": 0.7056258916854858, "learning_rate": 1.9959926220583713e-05, "loss": 0.3342, "mean_token_accuracy": 0.8981992609798908, "num_tokens": 29603143.0, "step": 24490 }, { "entropy": 0.3011350595392287, "epoch": 0.3771413837625241, "grad_norm": 0.5546168684959412, "learning_rate": 1.9959765872853828e-05, "loss": 0.3089, "mean_token_accuracy": 0.9076470680534839, "num_tokens": 29675192.0, "step": 24500 }, { "entropy": 0.3119169004261494, "epoch": 0.37729531902120267, "grad_norm": 0.7655936479568481, "learning_rate": 1.995960520561009e-05, "loss": 0.3116, "mean_token_accuracy": 0.90387869104743, "num_tokens": 29737511.0, "step": 24510 }, { "entropy": 0.33419267283752563, "epoch": 0.37744925427988124, "grad_norm": 0.7680578827857971, "learning_rate": 1.995944421885765e-05, "loss": 0.3221, "mean_token_accuracy": 0.898661108314991, "num_tokens": 29806915.0, "step": 24520 }, { "entropy": 0.32351861111819746, "epoch": 0.3776031895385598, "grad_norm": 0.6731173992156982, "learning_rate": 1.9959282912601674e-05, "loss": 0.3204, "mean_token_accuracy": 0.9017509691417217, "num_tokens": 29873574.0, "step": 24530 }, { "entropy": 0.3101672226563096, "epoch": 0.3777571247972384, "grad_norm": 0.8110003471374512, "learning_rate": 1.9959121286847336e-05, "loss": 0.3178, "mean_token_accuracy": 0.9063662812113762, "num_tokens": 29941849.0, "step": 24540 }, { "entropy": 0.3145000383257866, "epoch": 0.37791106005591696, "grad_norm": 0.6394909620285034, "learning_rate": 1.995895934159982e-05, "loss": 0.3192, "mean_token_accuracy": 0.9055570207536221, "num_tokens": 30005578.0, "step": 24550 }, { "entropy": 0.2897805940359831, "epoch": 0.3780649953145956, "grad_norm": 0.7279020547866821, "learning_rate": 1.995879707686432e-05, "loss": 0.313, "mean_token_accuracy": 0.9114047653973103, "num_tokens": 30074349.0, "step": 24560 }, { "entropy": 0.29500934900715947, "epoch": 0.37821893057327416, "grad_norm": 0.8975698351860046, "learning_rate": 1.9958634492646053e-05, "loss": 0.3053, "mean_token_accuracy": 0.9120386838912964, "num_tokens": 30143440.0, "step": 24570 }, { "entropy": 0.3152789250947535, "epoch": 0.37837286583195273, "grad_norm": 0.6769262552261353, "learning_rate": 1.9958471588950217e-05, "loss": 0.3212, "mean_token_accuracy": 0.9061571188271046, "num_tokens": 30205610.0, "step": 24580 }, { "entropy": 0.2955694392323494, "epoch": 0.3785268010906313, "grad_norm": 0.6980963349342346, "learning_rate": 1.9958308365782047e-05, "loss": 0.3136, "mean_token_accuracy": 0.909553873538971, "num_tokens": 30271263.0, "step": 24590 }, { "entropy": 0.3135385149158537, "epoch": 0.3786807363493099, "grad_norm": 0.5837219953536987, "learning_rate": 1.9958144823146782e-05, "loss": 0.3045, "mean_token_accuracy": 0.9052187636494636, "num_tokens": 30333292.0, "step": 24600 }, { "entropy": 0.302809158526361, "epoch": 0.37883467160798845, "grad_norm": 0.5899837017059326, "learning_rate": 1.995798096104967e-05, "loss": 0.3098, "mean_token_accuracy": 0.9093459397554398, "num_tokens": 30388133.0, "step": 24610 }, { "entropy": 0.30163239743560555, "epoch": 0.3789886068666671, "grad_norm": 1.0383524894714355, "learning_rate": 1.995781677949596e-05, "loss": 0.3102, "mean_token_accuracy": 0.9050278432667256, "num_tokens": 30451852.0, "step": 24620 }, { "entropy": 0.3153795043937862, "epoch": 0.37914254212534565, "grad_norm": 0.5407894849777222, "learning_rate": 1.9957652278490917e-05, "loss": 0.3103, "mean_token_accuracy": 0.9051769658923149, "num_tokens": 30525924.0, "step": 24630 }, { "entropy": 0.3183408245444298, "epoch": 0.3792964773840242, "grad_norm": 0.6569169163703918, "learning_rate": 1.995748745803983e-05, "loss": 0.3172, "mean_token_accuracy": 0.9016592219471932, "num_tokens": 30593373.0, "step": 24640 }, { "entropy": 0.3059252819046378, "epoch": 0.3794504126427028, "grad_norm": 0.7885048389434814, "learning_rate": 1.9957322318147977e-05, "loss": 0.3062, "mean_token_accuracy": 0.9064842708408832, "num_tokens": 30664000.0, "step": 24650 }, { "entropy": 0.29423928558826445, "epoch": 0.37960434790138137, "grad_norm": 0.755490779876709, "learning_rate": 1.995715685882066e-05, "loss": 0.3097, "mean_token_accuracy": 0.9106752276420593, "num_tokens": 30732029.0, "step": 24660 }, { "entropy": 0.31936578694731, "epoch": 0.37975828316005994, "grad_norm": 0.8081358671188354, "learning_rate": 1.9956991080063188e-05, "loss": 0.3205, "mean_token_accuracy": 0.903112705796957, "num_tokens": 30792415.0, "step": 24670 }, { "entropy": 0.3342219203710556, "epoch": 0.3799122184187385, "grad_norm": 0.7402741312980652, "learning_rate": 1.9956824981880874e-05, "loss": 0.3162, "mean_token_accuracy": 0.8969711177051067, "num_tokens": 30870914.0, "step": 24680 }, { "entropy": 0.3306219527497888, "epoch": 0.38006615367741714, "grad_norm": 0.9618604183197021, "learning_rate": 1.995665856427905e-05, "loss": 0.3173, "mean_token_accuracy": 0.9009860448539257, "num_tokens": 30926370.0, "step": 24690 }, { "entropy": 0.2996103061363101, "epoch": 0.3802200889360957, "grad_norm": 0.8708629608154297, "learning_rate": 1.995649182726306e-05, "loss": 0.31, "mean_token_accuracy": 0.904794541746378, "num_tokens": 30993943.0, "step": 24700 }, { "entropy": 0.33841804983094337, "epoch": 0.3803740241947743, "grad_norm": 0.6427260637283325, "learning_rate": 1.9956324770838244e-05, "loss": 0.3313, "mean_token_accuracy": 0.8979269407689572, "num_tokens": 31062010.0, "step": 24710 }, { "entropy": 0.33489024052396416, "epoch": 0.38052795945345286, "grad_norm": 0.6586459875106812, "learning_rate": 1.9956157395009962e-05, "loss": 0.3251, "mean_token_accuracy": 0.9020677641034126, "num_tokens": 31126142.0, "step": 24720 }, { "entropy": 0.29952854216098784, "epoch": 0.38068189471213143, "grad_norm": 0.8196510076522827, "learning_rate": 1.9955989699783588e-05, "loss": 0.3065, "mean_token_accuracy": 0.9092638432979584, "num_tokens": 31185955.0, "step": 24730 }, { "entropy": 0.32325118873268366, "epoch": 0.38083582997081, "grad_norm": 0.6772305369377136, "learning_rate": 1.99558216851645e-05, "loss": 0.3191, "mean_token_accuracy": 0.8995734110474587, "num_tokens": 31251915.0, "step": 24740 }, { "entropy": 0.33032481372356415, "epoch": 0.38098976522948863, "grad_norm": 0.7328757643699646, "learning_rate": 1.9955653351158085e-05, "loss": 0.3256, "mean_token_accuracy": 0.8995803855359554, "num_tokens": 31317994.0, "step": 24750 }, { "entropy": 0.3117421210743487, "epoch": 0.3811437004881672, "grad_norm": 0.7500578165054321, "learning_rate": 1.9955484697769752e-05, "loss": 0.2967, "mean_token_accuracy": 0.9079851262271404, "num_tokens": 31388172.0, "step": 24760 }, { "entropy": 0.31059277299791577, "epoch": 0.3812976357468458, "grad_norm": 0.6390094757080078, "learning_rate": 1.9955315725004905e-05, "loss": 0.3309, "mean_token_accuracy": 0.9036707744002342, "num_tokens": 31457435.0, "step": 24770 }, { "entropy": 0.3316069170832634, "epoch": 0.38145157100552435, "grad_norm": 0.7779527306556702, "learning_rate": 1.995514643286896e-05, "loss": 0.3195, "mean_token_accuracy": 0.9012675799429417, "num_tokens": 31522350.0, "step": 24780 }, { "entropy": 0.3406974129378796, "epoch": 0.3816055062642029, "grad_norm": 0.6652946472167969, "learning_rate": 1.995497682136736e-05, "loss": 0.3348, "mean_token_accuracy": 0.8979453414678573, "num_tokens": 31588956.0, "step": 24790 }, { "entropy": 0.30248015047982335, "epoch": 0.3817594415228815, "grad_norm": 0.6770617365837097, "learning_rate": 1.9954806890505535e-05, "loss": 0.3069, "mean_token_accuracy": 0.9066692784428596, "num_tokens": 31657431.0, "step": 24800 }, { "entropy": 0.327003868855536, "epoch": 0.3819133767815601, "grad_norm": 0.7820358872413635, "learning_rate": 1.995463664028894e-05, "loss": 0.3262, "mean_token_accuracy": 0.9000002168118953, "num_tokens": 31725831.0, "step": 24810 }, { "entropy": 0.3071428766474128, "epoch": 0.3820673120402387, "grad_norm": 0.6069677472114563, "learning_rate": 1.9954466070723043e-05, "loss": 0.3136, "mean_token_accuracy": 0.9049650341272354, "num_tokens": 31788666.0, "step": 24820 }, { "entropy": 0.2907501167617738, "epoch": 0.38222124729891727, "grad_norm": 0.842039942741394, "learning_rate": 1.9954295181813307e-05, "loss": 0.2918, "mean_token_accuracy": 0.9137908145785332, "num_tokens": 31852165.0, "step": 24830 }, { "entropy": 0.3058939939364791, "epoch": 0.38237518255759584, "grad_norm": 0.6801638007164001, "learning_rate": 1.995412397356522e-05, "loss": 0.3118, "mean_token_accuracy": 0.9044063076376915, "num_tokens": 31922460.0, "step": 24840 }, { "entropy": 0.30028272047638893, "epoch": 0.3825291178162744, "grad_norm": 0.6212348341941833, "learning_rate": 1.995395244598427e-05, "loss": 0.3086, "mean_token_accuracy": 0.9104204796254635, "num_tokens": 31989810.0, "step": 24850 }, { "entropy": 0.3137330707162619, "epoch": 0.382683053074953, "grad_norm": 0.5868242383003235, "learning_rate": 1.9953780599075965e-05, "loss": 0.316, "mean_token_accuracy": 0.9061118595302105, "num_tokens": 32050057.0, "step": 24860 }, { "entropy": 0.29749295869842174, "epoch": 0.38283698833363156, "grad_norm": 0.5984684824943542, "learning_rate": 1.9953608432845813e-05, "loss": 0.2967, "mean_token_accuracy": 0.9095104925334454, "num_tokens": 32122651.0, "step": 24870 }, { "entropy": 0.2965376297011971, "epoch": 0.3829909235923102, "grad_norm": 0.6407161951065063, "learning_rate": 1.995343594729934e-05, "loss": 0.303, "mean_token_accuracy": 0.9086657933890819, "num_tokens": 32200778.0, "step": 24880 }, { "entropy": 0.31065296214073895, "epoch": 0.38314485885098876, "grad_norm": 0.7421790957450867, "learning_rate": 1.9953263142442078e-05, "loss": 0.3151, "mean_token_accuracy": 0.9035020843148232, "num_tokens": 32274257.0, "step": 24890 }, { "entropy": 0.32334099868312477, "epoch": 0.38329879410966733, "grad_norm": 0.5841197967529297, "learning_rate": 1.9953090018279567e-05, "loss": 0.3202, "mean_token_accuracy": 0.9033744491636753, "num_tokens": 32349319.0, "step": 24900 }, { "entropy": 0.31271837493404747, "epoch": 0.3834527293683459, "grad_norm": 0.7438344955444336, "learning_rate": 1.9952916574817372e-05, "loss": 0.3086, "mean_token_accuracy": 0.9049708940088749, "num_tokens": 32421896.0, "step": 24910 }, { "entropy": 0.30942350905388594, "epoch": 0.3836066646270245, "grad_norm": 0.5933628082275391, "learning_rate": 1.9952742812061045e-05, "loss": 0.3127, "mean_token_accuracy": 0.9044400222599507, "num_tokens": 32490015.0, "step": 24920 }, { "entropy": 0.3127131363376975, "epoch": 0.38376059988570305, "grad_norm": 0.8434152007102966, "learning_rate": 1.9952568730016167e-05, "loss": 0.3164, "mean_token_accuracy": 0.9047483459115029, "num_tokens": 32561917.0, "step": 24930 }, { "entropy": 0.3157628562301397, "epoch": 0.3839145351443817, "grad_norm": 0.7066140174865723, "learning_rate": 1.9952394328688326e-05, "loss": 0.3162, "mean_token_accuracy": 0.9044956713914871, "num_tokens": 32629951.0, "step": 24940 }, { "entropy": 0.31119238678365946, "epoch": 0.38406847040306025, "grad_norm": 0.6452502012252808, "learning_rate": 1.9952219608083107e-05, "loss": 0.3141, "mean_token_accuracy": 0.9048009142279625, "num_tokens": 32686167.0, "step": 24950 }, { "entropy": 0.31432589665055277, "epoch": 0.3842224056617388, "grad_norm": 0.6752919554710388, "learning_rate": 1.995204456820612e-05, "loss": 0.3133, "mean_token_accuracy": 0.9018369078636169, "num_tokens": 32755705.0, "step": 24960 }, { "entropy": 0.3085582030005753, "epoch": 0.3843763409204174, "grad_norm": 0.5666850805282593, "learning_rate": 1.9951869209062984e-05, "loss": 0.3036, "mean_token_accuracy": 0.9070447511970997, "num_tokens": 32815104.0, "step": 24970 }, { "entropy": 0.31172906178981064, "epoch": 0.38453027617909596, "grad_norm": 0.7743751406669617, "learning_rate": 1.995169353065932e-05, "loss": 0.3171, "mean_token_accuracy": 0.9072008214890956, "num_tokens": 32889668.0, "step": 24980 }, { "entropy": 0.29427353767678144, "epoch": 0.38468421143777454, "grad_norm": 0.9895784854888916, "learning_rate": 1.9951517533000764e-05, "loss": 0.3012, "mean_token_accuracy": 0.9079718217253685, "num_tokens": 32961444.0, "step": 24990 }, { "entropy": 0.32035280764102936, "epoch": 0.38483814669645316, "grad_norm": 0.6503774523735046, "learning_rate": 1.9951341216092964e-05, "loss": 0.3085, "mean_token_accuracy": 0.9005698278546334, "num_tokens": 33024319.0, "step": 25000 }, { "entropy": 0.3113529147580266, "epoch": 0.38499208195513174, "grad_norm": 0.6822073459625244, "learning_rate": 1.9951164579941573e-05, "loss": 0.3074, "mean_token_accuracy": 0.9047565191984177, "num_tokens": 33091687.0, "step": 25010 }, { "entropy": 0.31280507389456036, "epoch": 0.3851460172138103, "grad_norm": 0.6387028098106384, "learning_rate": 1.9950987624552263e-05, "loss": 0.3186, "mean_token_accuracy": 0.9031152442097664, "num_tokens": 33161204.0, "step": 25020 }, { "entropy": 0.3309008710086346, "epoch": 0.3852999524724889, "grad_norm": 0.7996625304222107, "learning_rate": 1.9950810349930706e-05, "loss": 0.337, "mean_token_accuracy": 0.9003351517021656, "num_tokens": 33223816.0, "step": 25030 }, { "entropy": 0.3082101444713771, "epoch": 0.38545388773116745, "grad_norm": 0.6517648100852966, "learning_rate": 1.9950632756082594e-05, "loss": 0.3125, "mean_token_accuracy": 0.9079162061214447, "num_tokens": 33292727.0, "step": 25040 }, { "entropy": 0.312999857775867, "epoch": 0.385607822989846, "grad_norm": 0.7677973508834839, "learning_rate": 1.995045484301362e-05, "loss": 0.3276, "mean_token_accuracy": 0.9034610912203789, "num_tokens": 33354212.0, "step": 25050 }, { "entropy": 0.3070693048648536, "epoch": 0.3857617582485246, "grad_norm": 0.781257152557373, "learning_rate": 1.995027661072949e-05, "loss": 0.313, "mean_token_accuracy": 0.9061156652867794, "num_tokens": 33414476.0, "step": 25060 }, { "entropy": 0.32958370856940744, "epoch": 0.38591569350720323, "grad_norm": 0.7385582327842712, "learning_rate": 1.9950098059235926e-05, "loss": 0.34, "mean_token_accuracy": 0.8999774530529976, "num_tokens": 33473940.0, "step": 25070 }, { "entropy": 0.3088692149147391, "epoch": 0.3860696287658818, "grad_norm": 0.6231366395950317, "learning_rate": 1.9949919188538657e-05, "loss": 0.3001, "mean_token_accuracy": 0.9078176699578762, "num_tokens": 33546081.0, "step": 25080 }, { "entropy": 0.28204342499375346, "epoch": 0.3862235640245604, "grad_norm": 0.5838094353675842, "learning_rate": 1.9949739998643415e-05, "loss": 0.2914, "mean_token_accuracy": 0.9122403845191002, "num_tokens": 33620940.0, "step": 25090 }, { "entropy": 0.31184825021773577, "epoch": 0.38637749928323895, "grad_norm": 0.6591858267784119, "learning_rate": 1.9949560489555953e-05, "loss": 0.3095, "mean_token_accuracy": 0.9047812096774578, "num_tokens": 33688644.0, "step": 25100 }, { "entropy": 0.32148861652240157, "epoch": 0.3865314345419175, "grad_norm": 0.6169883012771606, "learning_rate": 1.9949380661282028e-05, "loss": 0.3176, "mean_token_accuracy": 0.9027347922325134, "num_tokens": 33749901.0, "step": 25110 }, { "entropy": 0.3107544032856822, "epoch": 0.3866853698005961, "grad_norm": 0.6555787324905396, "learning_rate": 1.9949200513827413e-05, "loss": 0.3119, "mean_token_accuracy": 0.9050154238939285, "num_tokens": 33817832.0, "step": 25120 }, { "entropy": 0.29883573167026045, "epoch": 0.3868393050592747, "grad_norm": 0.7322931289672852, "learning_rate": 1.9949020047197885e-05, "loss": 0.3077, "mean_token_accuracy": 0.9073532052338124, "num_tokens": 33895033.0, "step": 25130 }, { "entropy": 0.31156841311603783, "epoch": 0.3869932403179533, "grad_norm": 0.6109002232551575, "learning_rate": 1.994883926139923e-05, "loss": 0.3041, "mean_token_accuracy": 0.9050654627382755, "num_tokens": 33964580.0, "step": 25140 }, { "entropy": 0.3138269426301122, "epoch": 0.38714717557663186, "grad_norm": 0.5688770413398743, "learning_rate": 1.994865815643725e-05, "loss": 0.3175, "mean_token_accuracy": 0.9024610921740532, "num_tokens": 34021792.0, "step": 25150 }, { "entropy": 0.297438080701977, "epoch": 0.38730111083531044, "grad_norm": 0.6070348620414734, "learning_rate": 1.994847673231776e-05, "loss": 0.2962, "mean_token_accuracy": 0.9076153255999089, "num_tokens": 34087871.0, "step": 25160 }, { "entropy": 0.3423095565289259, "epoch": 0.387455046093989, "grad_norm": 0.9147930145263672, "learning_rate": 1.994829498904657e-05, "loss": 0.3426, "mean_token_accuracy": 0.9002889037132263, "num_tokens": 34145458.0, "step": 25170 }, { "entropy": 0.28979621743783357, "epoch": 0.3876089813526676, "grad_norm": 0.7113038897514343, "learning_rate": 1.9948112926629517e-05, "loss": 0.3068, "mean_token_accuracy": 0.9113945432007313, "num_tokens": 34213168.0, "step": 25180 }, { "entropy": 0.29331289073452355, "epoch": 0.3877629166113462, "grad_norm": 0.8397672176361084, "learning_rate": 1.9947930545072442e-05, "loss": 0.295, "mean_token_accuracy": 0.9082976132631302, "num_tokens": 34279371.0, "step": 25190 }, { "entropy": 0.31668848264962435, "epoch": 0.3879168518700248, "grad_norm": 0.6811426281929016, "learning_rate": 1.9947747844381193e-05, "loss": 0.3293, "mean_token_accuracy": 0.9035331569612026, "num_tokens": 34350038.0, "step": 25200 }, { "entropy": 0.3178696818649769, "epoch": 0.38807078712870335, "grad_norm": 0.8388307094573975, "learning_rate": 1.994756482456163e-05, "loss": 0.3219, "mean_token_accuracy": 0.903001169860363, "num_tokens": 34416377.0, "step": 25210 }, { "entropy": 0.31387786446139215, "epoch": 0.3882247223873819, "grad_norm": 0.8250382542610168, "learning_rate": 1.994738148561963e-05, "loss": 0.3166, "mean_token_accuracy": 0.9013803370296956, "num_tokens": 34478832.0, "step": 25220 }, { "entropy": 0.29592917999252677, "epoch": 0.3883786576460605, "grad_norm": 0.8838919997215271, "learning_rate": 1.9947197827561072e-05, "loss": 0.301, "mean_token_accuracy": 0.9104483835399151, "num_tokens": 34535869.0, "step": 25230 }, { "entropy": 0.3101503402926028, "epoch": 0.38853259290473907, "grad_norm": 0.686122715473175, "learning_rate": 1.994701385039185e-05, "loss": 0.3321, "mean_token_accuracy": 0.9060406677424908, "num_tokens": 34594298.0, "step": 25240 }, { "entropy": 0.287445352319628, "epoch": 0.38868652816341764, "grad_norm": 0.727407693862915, "learning_rate": 1.9946829554117857e-05, "loss": 0.2982, "mean_token_accuracy": 0.9107441745698452, "num_tokens": 34658157.0, "step": 25250 }, { "entropy": 0.2962276549078524, "epoch": 0.38884046342209627, "grad_norm": 0.8180723786354065, "learning_rate": 1.9946644938745013e-05, "loss": 0.3022, "mean_token_accuracy": 0.9079459838569164, "num_tokens": 34722037.0, "step": 25260 }, { "entropy": 0.31783777214586734, "epoch": 0.38899439868077484, "grad_norm": 0.8494791388511658, "learning_rate": 1.994646000427924e-05, "loss": 0.3172, "mean_token_accuracy": 0.9028706327080727, "num_tokens": 34783393.0, "step": 25270 }, { "entropy": 0.31648428495973346, "epoch": 0.3891483339394534, "grad_norm": 0.7591761350631714, "learning_rate": 1.994627475072647e-05, "loss": 0.3065, "mean_token_accuracy": 0.903795451670885, "num_tokens": 34852333.0, "step": 25280 }, { "entropy": 0.30819278471171857, "epoch": 0.389302269198132, "grad_norm": 0.9116161465644836, "learning_rate": 1.994608917809265e-05, "loss": 0.3015, "mean_token_accuracy": 0.9067599475383759, "num_tokens": 34919579.0, "step": 25290 }, { "entropy": 0.3348661420866847, "epoch": 0.38945620445681056, "grad_norm": 0.6423830389976501, "learning_rate": 1.9945903286383723e-05, "loss": 0.345, "mean_token_accuracy": 0.8997922450304031, "num_tokens": 34985413.0, "step": 25300 }, { "entropy": 0.31591659374535086, "epoch": 0.38961013971548913, "grad_norm": 0.8306542634963989, "learning_rate": 1.9945717075605664e-05, "loss": 0.3178, "mean_token_accuracy": 0.9039601571857929, "num_tokens": 35047324.0, "step": 25310 }, { "entropy": 0.32095763450488446, "epoch": 0.38976407497416776, "grad_norm": 0.926153838634491, "learning_rate": 1.994553054576444e-05, "loss": 0.3192, "mean_token_accuracy": 0.902543855458498, "num_tokens": 35113064.0, "step": 25320 }, { "entropy": 0.30306916050612925, "epoch": 0.38991801023284633, "grad_norm": 0.6587352752685547, "learning_rate": 1.9945343696866035e-05, "loss": 0.3069, "mean_token_accuracy": 0.9068064138293266, "num_tokens": 35183701.0, "step": 25330 }, { "entropy": 0.3050047469325364, "epoch": 0.3900719454915249, "grad_norm": 0.6623231768608093, "learning_rate": 1.9945156528916442e-05, "loss": 0.3131, "mean_token_accuracy": 0.9083381801843643, "num_tokens": 35252083.0, "step": 25340 }, { "entropy": 0.30105825727805496, "epoch": 0.3902258807502035, "grad_norm": 0.6224210262298584, "learning_rate": 1.9944969041921675e-05, "loss": 0.3042, "mean_token_accuracy": 0.9096469298005104, "num_tokens": 35323634.0, "step": 25350 }, { "entropy": 0.30666703889146446, "epoch": 0.39037981600888205, "grad_norm": 0.6981361508369446, "learning_rate": 1.9944781235887736e-05, "loss": 0.3102, "mean_token_accuracy": 0.9058722443878651, "num_tokens": 35395390.0, "step": 25360 }, { "entropy": 0.30536632165312766, "epoch": 0.3905337512675606, "grad_norm": 0.5819618105888367, "learning_rate": 1.9944593110820657e-05, "loss": 0.3169, "mean_token_accuracy": 0.9067915350198745, "num_tokens": 35463190.0, "step": 25370 }, { "entropy": 0.29110747678205373, "epoch": 0.39068768652623925, "grad_norm": 0.6849663853645325, "learning_rate": 1.9944404666726473e-05, "loss": 0.3043, "mean_token_accuracy": 0.9107661850750446, "num_tokens": 35529983.0, "step": 25380 }, { "entropy": 0.2949559030123055, "epoch": 0.3908416217849178, "grad_norm": 0.7201901078224182, "learning_rate": 1.9944215903611227e-05, "loss": 0.3122, "mean_token_accuracy": 0.9084416471421719, "num_tokens": 35594517.0, "step": 25390 }, { "entropy": 0.31990933418273926, "epoch": 0.3909955570435964, "grad_norm": 0.690693736076355, "learning_rate": 1.9944026821480977e-05, "loss": 0.318, "mean_token_accuracy": 0.902931022644043, "num_tokens": 35659282.0, "step": 25400 }, { "entropy": 0.32272197799757124, "epoch": 0.39114949230227497, "grad_norm": 0.8070246577262878, "learning_rate": 1.9943837420341786e-05, "loss": 0.3181, "mean_token_accuracy": 0.9024054735898972, "num_tokens": 35732896.0, "step": 25410 }, { "entropy": 0.3062912117689848, "epoch": 0.39130342756095354, "grad_norm": 0.8865343928337097, "learning_rate": 1.9943647700199734e-05, "loss": 0.3123, "mean_token_accuracy": 0.9035627894103527, "num_tokens": 35795794.0, "step": 25420 }, { "entropy": 0.3066105949692428, "epoch": 0.3914573628196321, "grad_norm": 0.7052505612373352, "learning_rate": 1.9943457661060903e-05, "loss": 0.2987, "mean_token_accuracy": 0.9047758437693119, "num_tokens": 35860217.0, "step": 25430 }, { "entropy": 0.31288065100088713, "epoch": 0.3916112980783107, "grad_norm": 0.6195175647735596, "learning_rate": 1.9943267302931392e-05, "loss": 0.3212, "mean_token_accuracy": 0.9049779504537583, "num_tokens": 35923701.0, "step": 25440 }, { "entropy": 0.32984203528612854, "epoch": 0.3917652333369893, "grad_norm": 0.629358172416687, "learning_rate": 1.9943076625817306e-05, "loss": 0.3269, "mean_token_accuracy": 0.898980662971735, "num_tokens": 35992309.0, "step": 25450 }, { "entropy": 0.326076640188694, "epoch": 0.3919191685956679, "grad_norm": 0.6713451147079468, "learning_rate": 1.9942885629724765e-05, "loss": 0.3222, "mean_token_accuracy": 0.8984007894992828, "num_tokens": 36058798.0, "step": 25460 }, { "entropy": 0.3010153035633266, "epoch": 0.39207310385434646, "grad_norm": 0.7900897860527039, "learning_rate": 1.9942694314659894e-05, "loss": 0.3041, "mean_token_accuracy": 0.9082496173679828, "num_tokens": 36115626.0, "step": 25470 }, { "entropy": 0.34613923048600553, "epoch": 0.39222703911302503, "grad_norm": 0.7444325685501099, "learning_rate": 1.9942502680628832e-05, "loss": 0.3464, "mean_token_accuracy": 0.89523391649127, "num_tokens": 36179401.0, "step": 25480 }, { "entropy": 0.3058058027178049, "epoch": 0.3923809743717036, "grad_norm": 0.7337254285812378, "learning_rate": 1.9942310727637725e-05, "loss": 0.3147, "mean_token_accuracy": 0.9075497783720493, "num_tokens": 36238060.0, "step": 25490 }, { "entropy": 0.31730923932045696, "epoch": 0.3925349096303822, "grad_norm": 0.7199666500091553, "learning_rate": 1.994211845569273e-05, "loss": 0.3165, "mean_token_accuracy": 0.9030690617859364, "num_tokens": 36302707.0, "step": 25500 }, { "entropy": 0.32235789503902196, "epoch": 0.3926888448890608, "grad_norm": 0.8912791609764099, "learning_rate": 1.994192586480002e-05, "loss": 0.3226, "mean_token_accuracy": 0.9035673305392266, "num_tokens": 36363659.0, "step": 25510 }, { "entropy": 0.3049864345230162, "epoch": 0.3928427801477394, "grad_norm": 1.2656811475753784, "learning_rate": 1.9941732954965768e-05, "loss": 0.303, "mean_token_accuracy": 0.9068434357643127, "num_tokens": 36428561.0, "step": 25520 }, { "entropy": 0.3074431726709008, "epoch": 0.39299671540641795, "grad_norm": 0.8034022450447083, "learning_rate": 1.9941539726196168e-05, "loss": 0.3106, "mean_token_accuracy": 0.9054245740175247, "num_tokens": 36491028.0, "step": 25530 }, { "entropy": 0.3064233331941068, "epoch": 0.3931506506650965, "grad_norm": 0.7486039400100708, "learning_rate": 1.9941346178497412e-05, "loss": 0.3071, "mean_token_accuracy": 0.9076719298958779, "num_tokens": 36555006.0, "step": 25540 }, { "entropy": 0.3108274425379932, "epoch": 0.3933045859237751, "grad_norm": 0.6159899234771729, "learning_rate": 1.9941152311875712e-05, "loss": 0.3227, "mean_token_accuracy": 0.9024525515735149, "num_tokens": 36615315.0, "step": 25550 }, { "entropy": 0.30429720170795915, "epoch": 0.39345852118245367, "grad_norm": 0.7288026809692383, "learning_rate": 1.9940958126337295e-05, "loss": 0.31, "mean_token_accuracy": 0.9087689101696015, "num_tokens": 36690665.0, "step": 25560 }, { "entropy": 0.2996250458061695, "epoch": 0.3936124564411323, "grad_norm": 0.9667459726333618, "learning_rate": 1.994076362188838e-05, "loss": 0.2969, "mean_token_accuracy": 0.9103990077972413, "num_tokens": 36754971.0, "step": 25570 }, { "entropy": 0.2916441353969276, "epoch": 0.39376639169981087, "grad_norm": 0.7592459321022034, "learning_rate": 1.9940568798535208e-05, "loss": 0.3024, "mean_token_accuracy": 0.911149150878191, "num_tokens": 36823232.0, "step": 25580 }, { "entropy": 0.31994672361761334, "epoch": 0.39392032695848944, "grad_norm": 0.8026037216186523, "learning_rate": 1.9940373656284034e-05, "loss": 0.3198, "mean_token_accuracy": 0.9042415134608746, "num_tokens": 36883886.0, "step": 25590 }, { "entropy": 0.3098545583896339, "epoch": 0.394074262217168, "grad_norm": 0.8485789895057678, "learning_rate": 1.9940178195141113e-05, "loss": 0.3095, "mean_token_accuracy": 0.903688333928585, "num_tokens": 36949120.0, "step": 25600 }, { "entropy": 0.31680681426078083, "epoch": 0.3942281974758466, "grad_norm": 0.6086865067481995, "learning_rate": 1.993998241511272e-05, "loss": 0.3189, "mean_token_accuracy": 0.9060021042823792, "num_tokens": 37018554.0, "step": 25610 }, { "entropy": 0.3211313247680664, "epoch": 0.39438213273452516, "grad_norm": 0.668811023235321, "learning_rate": 1.9939786316205138e-05, "loss": 0.3164, "mean_token_accuracy": 0.9028065428137779, "num_tokens": 37091638.0, "step": 25620 }, { "entropy": 0.3202915436588228, "epoch": 0.39453606799320373, "grad_norm": 0.5692824721336365, "learning_rate": 1.9939589898424646e-05, "loss": 0.3268, "mean_token_accuracy": 0.9034079343080521, "num_tokens": 37163388.0, "step": 25630 }, { "entropy": 0.32469515381380915, "epoch": 0.39469000325188236, "grad_norm": 0.6402381658554077, "learning_rate": 1.9939393161777562e-05, "loss": 0.3288, "mean_token_accuracy": 0.9012384630739689, "num_tokens": 37231574.0, "step": 25640 }, { "entropy": 0.27905724281445143, "epoch": 0.39484393851056093, "grad_norm": 0.6673490405082703, "learning_rate": 1.9939196106270183e-05, "loss": 0.2912, "mean_token_accuracy": 0.9129556439816952, "num_tokens": 37295863.0, "step": 25650 }, { "entropy": 0.29854724556207657, "epoch": 0.3949978737692395, "grad_norm": 0.9365684390068054, "learning_rate": 1.9938998731908835e-05, "loss": 0.3015, "mean_token_accuracy": 0.9099769338965416, "num_tokens": 37352290.0, "step": 25660 }, { "entropy": 0.3123446349985898, "epoch": 0.3951518090279181, "grad_norm": 0.6756871342658997, "learning_rate": 1.9938801038699857e-05, "loss": 0.3084, "mean_token_accuracy": 0.905426012724638, "num_tokens": 37421957.0, "step": 25670 }, { "entropy": 0.3137248924933374, "epoch": 0.39530574428659665, "grad_norm": 0.7825495004653931, "learning_rate": 1.9938603026649578e-05, "loss": 0.3092, "mean_token_accuracy": 0.9022759243845939, "num_tokens": 37484563.0, "step": 25680 }, { "entropy": 0.32120346473529937, "epoch": 0.3954596795452752, "grad_norm": 0.6941694021224976, "learning_rate": 1.9938404695764362e-05, "loss": 0.3248, "mean_token_accuracy": 0.9024720601737499, "num_tokens": 37546937.0, "step": 25690 }, { "entropy": 0.3108428847976029, "epoch": 0.39561361480395385, "grad_norm": 0.5650362968444824, "learning_rate": 1.9938206046050567e-05, "loss": 0.3123, "mean_token_accuracy": 0.9070301584899425, "num_tokens": 37617414.0, "step": 25700 }, { "entropy": 0.31502691823989154, "epoch": 0.3957675500626324, "grad_norm": 0.6151547431945801, "learning_rate": 1.9938007077514564e-05, "loss": 0.3212, "mean_token_accuracy": 0.9045773684978485, "num_tokens": 37685881.0, "step": 25710 }, { "entropy": 0.32075723111629484, "epoch": 0.395921485321311, "grad_norm": 0.6157472133636475, "learning_rate": 1.993780779016274e-05, "loss": 0.3217, "mean_token_accuracy": 0.903880151361227, "num_tokens": 37745102.0, "step": 25720 }, { "entropy": 0.315744396019727, "epoch": 0.39607542057998957, "grad_norm": 0.7339977622032166, "learning_rate": 1.993760818400148e-05, "loss": 0.3073, "mean_token_accuracy": 0.9047527194023133, "num_tokens": 37814428.0, "step": 25730 }, { "entropy": 0.3185446307063103, "epoch": 0.39622935583866814, "grad_norm": 0.6987192034721375, "learning_rate": 1.9937408259037202e-05, "loss": 0.3335, "mean_token_accuracy": 0.9006725616753102, "num_tokens": 37887919.0, "step": 25740 }, { "entropy": 0.32971182372421026, "epoch": 0.3963832910973467, "grad_norm": 0.5675910711288452, "learning_rate": 1.9937208015276305e-05, "loss": 0.3232, "mean_token_accuracy": 0.9007733508944511, "num_tokens": 37949505.0, "step": 25750 }, { "entropy": 0.30220625130459666, "epoch": 0.39653722635602534, "grad_norm": 0.6668892502784729, "learning_rate": 1.993700745272522e-05, "loss": 0.3035, "mean_token_accuracy": 0.9051231607794762, "num_tokens": 38018686.0, "step": 25760 }, { "entropy": 0.3133438299410045, "epoch": 0.3966911616147039, "grad_norm": 0.646301805973053, "learning_rate": 1.9936806571390383e-05, "loss": 0.3207, "mean_token_accuracy": 0.902175348252058, "num_tokens": 38085188.0, "step": 25770 }, { "entropy": 0.32995221465826036, "epoch": 0.3968450968733825, "grad_norm": 0.8912884593009949, "learning_rate": 1.9936605371278232e-05, "loss": 0.3283, "mean_token_accuracy": 0.90003727003932, "num_tokens": 38146467.0, "step": 25780 }, { "entropy": 0.32113895416259763, "epoch": 0.39699903213206106, "grad_norm": 0.5905745029449463, "learning_rate": 1.9936403852395228e-05, "loss": 0.3136, "mean_token_accuracy": 0.9031200870871544, "num_tokens": 38219354.0, "step": 25790 }, { "entropy": 0.307401969935745, "epoch": 0.39715296739073963, "grad_norm": 0.6887242197990417, "learning_rate": 1.9936202014747832e-05, "loss": 0.3186, "mean_token_accuracy": 0.9080689884722233, "num_tokens": 38280315.0, "step": 25800 }, { "entropy": 0.32115612076595423, "epoch": 0.3973069026494182, "grad_norm": 0.695600152015686, "learning_rate": 1.993599985834252e-05, "loss": 0.3284, "mean_token_accuracy": 0.9026311852037907, "num_tokens": 38344125.0, "step": 25810 }, { "entropy": 0.2987439910881221, "epoch": 0.3974608379080968, "grad_norm": 0.80635666847229, "learning_rate": 1.9935797383185774e-05, "loss": 0.3035, "mean_token_accuracy": 0.9096709236502647, "num_tokens": 38409074.0, "step": 25820 }, { "entropy": 0.3359289163723588, "epoch": 0.3976147731667754, "grad_norm": 0.6500299572944641, "learning_rate": 1.9935594589284097e-05, "loss": 0.3349, "mean_token_accuracy": 0.896680997312069, "num_tokens": 38482278.0, "step": 25830 }, { "entropy": 0.30128770135343075, "epoch": 0.397768708425454, "grad_norm": 0.7487115263938904, "learning_rate": 1.9935391476643987e-05, "loss": 0.2942, "mean_token_accuracy": 0.9107349008321762, "num_tokens": 38537105.0, "step": 25840 }, { "entropy": 0.2855343556962907, "epoch": 0.39792264368413255, "grad_norm": 0.5987274646759033, "learning_rate": 1.993518804527197e-05, "loss": 0.2941, "mean_token_accuracy": 0.9094381876289844, "num_tokens": 38609302.0, "step": 25850 }, { "entropy": 0.32451644595712426, "epoch": 0.3980765789428111, "grad_norm": 0.5440521240234375, "learning_rate": 1.993498429517456e-05, "loss": 0.3248, "mean_token_accuracy": 0.8992338798940182, "num_tokens": 38672472.0, "step": 25860 }, { "entropy": 0.28715666830539704, "epoch": 0.3982305142014897, "grad_norm": 0.5759981274604797, "learning_rate": 1.99347802263583e-05, "loss": 0.3025, "mean_token_accuracy": 0.9140514500439167, "num_tokens": 38741410.0, "step": 25870 }, { "entropy": 0.3103853419423103, "epoch": 0.39838444946016827, "grad_norm": 0.567173421382904, "learning_rate": 1.9934575838829733e-05, "loss": 0.3103, "mean_token_accuracy": 0.9057046450674534, "num_tokens": 38805111.0, "step": 25880 }, { "entropy": 0.3028188947588205, "epoch": 0.3985383847188469, "grad_norm": 0.6677603125572205, "learning_rate": 1.9934371132595426e-05, "loss": 0.3174, "mean_token_accuracy": 0.9069710731506347, "num_tokens": 38882923.0, "step": 25890 }, { "entropy": 0.31786643490195277, "epoch": 0.39869231997752547, "grad_norm": 0.8701064586639404, "learning_rate": 1.9934166107661933e-05, "loss": 0.3044, "mean_token_accuracy": 0.9041231885552407, "num_tokens": 38941096.0, "step": 25900 }, { "entropy": 0.30496927108615635, "epoch": 0.39884625523620404, "grad_norm": 0.6206269860267639, "learning_rate": 1.9933960764035838e-05, "loss": 0.3132, "mean_token_accuracy": 0.9063624866306782, "num_tokens": 39005248.0, "step": 25910 }, { "entropy": 0.30786711713299153, "epoch": 0.3990001904948826, "grad_norm": 0.7399181127548218, "learning_rate": 1.9933755101723723e-05, "loss": 0.3045, "mean_token_accuracy": 0.9085447892546654, "num_tokens": 39067855.0, "step": 25920 }, { "entropy": 0.3123221708461642, "epoch": 0.3991541257535612, "grad_norm": 0.6984162330627441, "learning_rate": 1.9933549120732196e-05, "loss": 0.3095, "mean_token_accuracy": 0.9038801036775113, "num_tokens": 39140269.0, "step": 25930 }, { "entropy": 0.32416222533211114, "epoch": 0.39930806101223976, "grad_norm": 0.5082425475120544, "learning_rate": 1.9933342821067856e-05, "loss": 0.3243, "mean_token_accuracy": 0.8992840111255646, "num_tokens": 39207847.0, "step": 25940 }, { "entropy": 0.30770405270159246, "epoch": 0.3994619962709184, "grad_norm": 0.805156409740448, "learning_rate": 1.9933136202737325e-05, "loss": 0.302, "mean_token_accuracy": 0.9051887109875679, "num_tokens": 39274382.0, "step": 25950 }, { "entropy": 0.321415226906538, "epoch": 0.39961593152959696, "grad_norm": 0.7275740504264832, "learning_rate": 1.9932929265747232e-05, "loss": 0.3275, "mean_token_accuracy": 0.9012041866779328, "num_tokens": 39341114.0, "step": 25960 }, { "entropy": 0.32182229291647674, "epoch": 0.39976986678827553, "grad_norm": 0.7820711135864258, "learning_rate": 1.9932722010104215e-05, "loss": 0.335, "mean_token_accuracy": 0.9025260508060455, "num_tokens": 39412905.0, "step": 25970 }, { "entropy": 0.2987605548463762, "epoch": 0.3999238020469541, "grad_norm": 0.8059412837028503, "learning_rate": 1.993251443581492e-05, "loss": 0.2887, "mean_token_accuracy": 0.9094772092998028, "num_tokens": 39472364.0, "step": 25980 }, { "entropy": 0.3084768160246313, "epoch": 0.4000777373056327, "grad_norm": 0.6818887591362, "learning_rate": 1.993230654288601e-05, "loss": 0.3128, "mean_token_accuracy": 0.904141791909933, "num_tokens": 39538279.0, "step": 25990 }, { "entropy": 0.30124288257211446, "epoch": 0.40023167256431125, "grad_norm": 0.6271400451660156, "learning_rate": 1.993209833132415e-05, "loss": 0.3004, "mean_token_accuracy": 0.9081384509801864, "num_tokens": 39602609.0, "step": 26000 }, { "entropy": 0.29260414754971864, "epoch": 0.4003856078229898, "grad_norm": 0.6942412257194519, "learning_rate": 1.993188980113602e-05, "loss": 0.2933, "mean_token_accuracy": 0.9109213657677173, "num_tokens": 39668921.0, "step": 26010 }, { "entropy": 0.29861087892204524, "epoch": 0.40053954308166845, "grad_norm": 0.5867583155632019, "learning_rate": 1.9931680952328317e-05, "loss": 0.3046, "mean_token_accuracy": 0.9072656027972699, "num_tokens": 39742060.0, "step": 26020 }, { "entropy": 0.3055783394724131, "epoch": 0.400693478340347, "grad_norm": 0.6093422174453735, "learning_rate": 1.9931471784907734e-05, "loss": 0.3191, "mean_token_accuracy": 0.907817392796278, "num_tokens": 39810550.0, "step": 26030 }, { "entropy": 0.3170306765474379, "epoch": 0.4008474135990256, "grad_norm": 0.855684220790863, "learning_rate": 1.9931262298880982e-05, "loss": 0.3209, "mean_token_accuracy": 0.9038955703377723, "num_tokens": 39880334.0, "step": 26040 }, { "entropy": 0.2995344408787787, "epoch": 0.40100134885770417, "grad_norm": 0.7018293142318726, "learning_rate": 1.9931052494254782e-05, "loss": 0.2996, "mean_token_accuracy": 0.9075600907206536, "num_tokens": 39948281.0, "step": 26050 }, { "entropy": 0.3159010412171483, "epoch": 0.40115528411638274, "grad_norm": 0.6949157118797302, "learning_rate": 1.9930842371035868e-05, "loss": 0.3206, "mean_token_accuracy": 0.9034360520541668, "num_tokens": 40008731.0, "step": 26060 }, { "entropy": 0.31671278104186057, "epoch": 0.4013092193750613, "grad_norm": 0.5671148300170898, "learning_rate": 1.9930631929230976e-05, "loss": 0.316, "mean_token_accuracy": 0.9062964968383312, "num_tokens": 40084437.0, "step": 26070 }, { "entropy": 0.29306667102500794, "epoch": 0.40146315463373994, "grad_norm": 0.6960563063621521, "learning_rate": 1.9930421168846855e-05, "loss": 0.3058, "mean_token_accuracy": 0.9099674247205257, "num_tokens": 40145632.0, "step": 26080 }, { "entropy": 0.2985279267653823, "epoch": 0.4016170898924185, "grad_norm": 0.9603780508041382, "learning_rate": 1.9930210089890276e-05, "loss": 0.3069, "mean_token_accuracy": 0.9078042089939118, "num_tokens": 40213372.0, "step": 26090 }, { "entropy": 0.3219702378846705, "epoch": 0.4017710251510971, "grad_norm": 0.6506826877593994, "learning_rate": 1.9929998692368002e-05, "loss": 0.3273, "mean_token_accuracy": 0.9009219959378243, "num_tokens": 40282023.0, "step": 26100 }, { "entropy": 0.30243638921529054, "epoch": 0.40192496040977566, "grad_norm": 0.7005345225334167, "learning_rate": 1.992978697628682e-05, "loss": 0.309, "mean_token_accuracy": 0.9081327617168427, "num_tokens": 40349433.0, "step": 26110 }, { "entropy": 0.32702186638489367, "epoch": 0.40207889566845423, "grad_norm": 0.6256127953529358, "learning_rate": 1.9929574941653516e-05, "loss": 0.3273, "mean_token_accuracy": 0.9003435149788857, "num_tokens": 40413463.0, "step": 26120 }, { "entropy": 0.30482884887605904, "epoch": 0.4022328309271328, "grad_norm": 0.9129577875137329, "learning_rate": 1.9929362588474895e-05, "loss": 0.3108, "mean_token_accuracy": 0.9079157814383507, "num_tokens": 40477242.0, "step": 26130 }, { "entropy": 0.3048070431686938, "epoch": 0.40238676618581143, "grad_norm": 0.6524698734283447, "learning_rate": 1.9929149916757775e-05, "loss": 0.315, "mean_token_accuracy": 0.9030817002058029, "num_tokens": 40541846.0, "step": 26140 }, { "entropy": 0.30857401750981805, "epoch": 0.40254070144449, "grad_norm": 0.6859299540519714, "learning_rate": 1.9928936926508968e-05, "loss": 0.3137, "mean_token_accuracy": 0.9059811942279339, "num_tokens": 40610144.0, "step": 26150 }, { "entropy": 0.3311195235699415, "epoch": 0.4026946367031686, "grad_norm": 0.9393783807754517, "learning_rate": 1.9928723617735315e-05, "loss": 0.3271, "mean_token_accuracy": 0.8993563242256641, "num_tokens": 40687929.0, "step": 26160 }, { "entropy": 0.30959445107728245, "epoch": 0.40284857196184715, "grad_norm": 0.7933489084243774, "learning_rate": 1.9928509990443657e-05, "loss": 0.3234, "mean_token_accuracy": 0.9047790862619877, "num_tokens": 40755524.0, "step": 26170 }, { "entropy": 0.30353370709344746, "epoch": 0.4030025072205257, "grad_norm": 0.8109551668167114, "learning_rate": 1.9928296044640848e-05, "loss": 0.3045, "mean_token_accuracy": 0.9067848488688469, "num_tokens": 40814126.0, "step": 26180 }, { "entropy": 0.31690475502982735, "epoch": 0.4031564424792043, "grad_norm": 0.4856850206851959, "learning_rate": 1.9928081780333746e-05, "loss": 0.3136, "mean_token_accuracy": 0.9040864162147045, "num_tokens": 40884070.0, "step": 26190 }, { "entropy": 0.30461966898292303, "epoch": 0.40331037773788286, "grad_norm": 0.6133163571357727, "learning_rate": 1.9927867197529228e-05, "loss": 0.3087, "mean_token_accuracy": 0.9075732320547104, "num_tokens": 40953935.0, "step": 26200 }, { "entropy": 0.3158606817945838, "epoch": 0.4034643129965615, "grad_norm": 0.6626740097999573, "learning_rate": 1.9927652296234182e-05, "loss": 0.3143, "mean_token_accuracy": 0.9032938636839389, "num_tokens": 41017574.0, "step": 26210 }, { "entropy": 0.3090870562940836, "epoch": 0.40361824825524006, "grad_norm": 0.5925602912902832, "learning_rate": 1.99274370764555e-05, "loss": 0.3038, "mean_token_accuracy": 0.9041004098951817, "num_tokens": 41090929.0, "step": 26220 }, { "entropy": 0.29971629474312067, "epoch": 0.40377218351391864, "grad_norm": 0.7389164566993713, "learning_rate": 1.9927221538200085e-05, "loss": 0.3005, "mean_token_accuracy": 0.9085587307810783, "num_tokens": 41157834.0, "step": 26230 }, { "entropy": 0.3012526002712548, "epoch": 0.4039261187725972, "grad_norm": 0.9366985559463501, "learning_rate": 1.992700568147485e-05, "loss": 0.3147, "mean_token_accuracy": 0.9073975458741188, "num_tokens": 41220047.0, "step": 26240 }, { "entropy": 0.3045380367897451, "epoch": 0.4040800540312758, "grad_norm": 0.7071318030357361, "learning_rate": 1.992678950628672e-05, "loss": 0.3012, "mean_token_accuracy": 0.9080505594611168, "num_tokens": 41284041.0, "step": 26250 }, { "entropy": 0.3022130971774459, "epoch": 0.40423398928995435, "grad_norm": 0.5711658000946045, "learning_rate": 1.9926573012642634e-05, "loss": 0.3057, "mean_token_accuracy": 0.9061795704066753, "num_tokens": 41351305.0, "step": 26260 }, { "entropy": 0.30420170091092585, "epoch": 0.404387924548633, "grad_norm": 0.680116593837738, "learning_rate": 1.9926356200549534e-05, "loss": 0.3101, "mean_token_accuracy": 0.9075202845036984, "num_tokens": 41417030.0, "step": 26270 }, { "entropy": 0.3085710466839373, "epoch": 0.40454185980731155, "grad_norm": 0.750390887260437, "learning_rate": 1.9926139070014377e-05, "loss": 0.315, "mean_token_accuracy": 0.9061169669032096, "num_tokens": 41473591.0, "step": 26280 }, { "entropy": 0.30695258136838677, "epoch": 0.4046957950659901, "grad_norm": 0.6414017081260681, "learning_rate": 1.992592162104413e-05, "loss": 0.3202, "mean_token_accuracy": 0.9038797058165073, "num_tokens": 41535425.0, "step": 26290 }, { "entropy": 0.3101935907267034, "epoch": 0.4048497303246687, "grad_norm": 0.6571234464645386, "learning_rate": 1.992570385364576e-05, "loss": 0.3147, "mean_token_accuracy": 0.9044148467481137, "num_tokens": 41597938.0, "step": 26300 }, { "entropy": 0.3223195618018508, "epoch": 0.40500366558334727, "grad_norm": 0.9613175988197327, "learning_rate": 1.9925485767826266e-05, "loss": 0.3241, "mean_token_accuracy": 0.90373455286026, "num_tokens": 41661780.0, "step": 26310 }, { "entropy": 0.2992906069383025, "epoch": 0.40515760084202584, "grad_norm": 0.7301570177078247, "learning_rate": 1.9925267363592637e-05, "loss": 0.3001, "mean_token_accuracy": 0.9065308675169945, "num_tokens": 41719713.0, "step": 26320 }, { "entropy": 0.3130445218645036, "epoch": 0.4053115361007045, "grad_norm": 0.5816271305084229, "learning_rate": 1.992504864095188e-05, "loss": 0.3148, "mean_token_accuracy": 0.9050626493990421, "num_tokens": 41788790.0, "step": 26330 }, { "entropy": 0.34204594176262615, "epoch": 0.40546547135938305, "grad_norm": 0.6375913023948669, "learning_rate": 1.992482959991101e-05, "loss": 0.3313, "mean_token_accuracy": 0.8944518096745014, "num_tokens": 41856065.0, "step": 26340 }, { "entropy": 0.3059587378054857, "epoch": 0.4056194066180616, "grad_norm": 0.9119774103164673, "learning_rate": 1.9924610240477058e-05, "loss": 0.3153, "mean_token_accuracy": 0.9059174306690693, "num_tokens": 41916801.0, "step": 26350 }, { "entropy": 0.29243826484307645, "epoch": 0.4057733418767402, "grad_norm": 0.5629673004150391, "learning_rate": 1.9924390562657063e-05, "loss": 0.3044, "mean_token_accuracy": 0.9096020855009556, "num_tokens": 41991709.0, "step": 26360 }, { "entropy": 0.3073584631085396, "epoch": 0.40592727713541876, "grad_norm": 0.6373172402381897, "learning_rate": 1.992417056645806e-05, "loss": 0.3122, "mean_token_accuracy": 0.9072392717003822, "num_tokens": 42059824.0, "step": 26370 }, { "entropy": 0.29975686948746444, "epoch": 0.40608121239409734, "grad_norm": 0.7432464361190796, "learning_rate": 1.992395025188712e-05, "loss": 0.305, "mean_token_accuracy": 0.909536312520504, "num_tokens": 42117808.0, "step": 26380 }, { "entropy": 0.2920199966058135, "epoch": 0.4062351476527759, "grad_norm": 0.6675158143043518, "learning_rate": 1.9923729618951308e-05, "loss": 0.2958, "mean_token_accuracy": 0.9111813485622406, "num_tokens": 42181551.0, "step": 26390 }, { "entropy": 0.28301866436377165, "epoch": 0.40638908291145454, "grad_norm": 0.8630090355873108, "learning_rate": 1.99235086676577e-05, "loss": 0.2988, "mean_token_accuracy": 0.9129537463188171, "num_tokens": 42245691.0, "step": 26400 }, { "entropy": 0.2988882062956691, "epoch": 0.4065430181701331, "grad_norm": 0.8220312595367432, "learning_rate": 1.9923287398013383e-05, "loss": 0.291, "mean_token_accuracy": 0.9094663091003895, "num_tokens": 42301160.0, "step": 26410 }, { "entropy": 0.32248898558318617, "epoch": 0.4066969534288117, "grad_norm": 0.6977103352546692, "learning_rate": 1.9923065810025458e-05, "loss": 0.3145, "mean_token_accuracy": 0.9014907829463482, "num_tokens": 42367445.0, "step": 26420 }, { "entropy": 0.3182626623660326, "epoch": 0.40685088868749025, "grad_norm": 0.8095713257789612, "learning_rate": 1.9922843903701033e-05, "loss": 0.3178, "mean_token_accuracy": 0.9002846382558346, "num_tokens": 42427196.0, "step": 26430 }, { "entropy": 0.3094492219388485, "epoch": 0.4070048239461688, "grad_norm": 0.8443151116371155, "learning_rate": 1.9922621679047224e-05, "loss": 0.3031, "mean_token_accuracy": 0.9039819844067096, "num_tokens": 42493280.0, "step": 26440 }, { "entropy": 0.32101530209183693, "epoch": 0.4071587592048474, "grad_norm": 0.5487485527992249, "learning_rate": 1.9922399136071165e-05, "loss": 0.3191, "mean_token_accuracy": 0.9030265115201473, "num_tokens": 42565128.0, "step": 26450 }, { "entropy": 0.2922062423080206, "epoch": 0.407312694463526, "grad_norm": 0.7074294090270996, "learning_rate": 1.9922176274779993e-05, "loss": 0.3062, "mean_token_accuracy": 0.9103771448135376, "num_tokens": 42634589.0, "step": 26460 }, { "entropy": 0.2873690132983029, "epoch": 0.4074666297222046, "grad_norm": 0.7697380781173706, "learning_rate": 1.9921953095180853e-05, "loss": 0.2941, "mean_token_accuracy": 0.9126391254365445, "num_tokens": 42700268.0, "step": 26470 }, { "entropy": 0.30361936083063484, "epoch": 0.40762056498088317, "grad_norm": 0.6629871129989624, "learning_rate": 1.992172959728091e-05, "loss": 0.308, "mean_token_accuracy": 0.9054936625063419, "num_tokens": 42764684.0, "step": 26480 }, { "entropy": 0.29612840311601757, "epoch": 0.40777450023956174, "grad_norm": 0.562651515007019, "learning_rate": 1.9921505781087335e-05, "loss": 0.305, "mean_token_accuracy": 0.9096514761447907, "num_tokens": 42831568.0, "step": 26490 }, { "entropy": 0.3144083318300545, "epoch": 0.4079284354982403, "grad_norm": 0.8188650012016296, "learning_rate": 1.9921281646607306e-05, "loss": 0.3199, "mean_token_accuracy": 0.9042028576135636, "num_tokens": 42899519.0, "step": 26500 }, { "entropy": 0.297007305175066, "epoch": 0.4080823707569189, "grad_norm": 0.7543115615844727, "learning_rate": 1.9921057193848014e-05, "loss": 0.3015, "mean_token_accuracy": 0.9107096724212169, "num_tokens": 42963379.0, "step": 26510 }, { "entropy": 0.30059899752959607, "epoch": 0.4082363060155975, "grad_norm": 0.6056002974510193, "learning_rate": 1.9920832422816653e-05, "loss": 0.315, "mean_token_accuracy": 0.9069982439279556, "num_tokens": 43034463.0, "step": 26520 }, { "entropy": 0.3211320679634809, "epoch": 0.4083902412742761, "grad_norm": 0.6129964590072632, "learning_rate": 1.9920607333520446e-05, "loss": 0.3238, "mean_token_accuracy": 0.9040498800575734, "num_tokens": 43103681.0, "step": 26530 }, { "entropy": 0.30464274864643814, "epoch": 0.40854417653295466, "grad_norm": 0.6918623447418213, "learning_rate": 1.9920381925966604e-05, "loss": 0.3087, "mean_token_accuracy": 0.906680803745985, "num_tokens": 43172228.0, "step": 26540 }, { "entropy": 0.31928916927427053, "epoch": 0.40869811179163323, "grad_norm": 0.6867807507514954, "learning_rate": 1.992015620016236e-05, "loss": 0.3269, "mean_token_accuracy": 0.9024677321314811, "num_tokens": 43237197.0, "step": 26550 }, { "entropy": 0.30203221794217827, "epoch": 0.4088520470503118, "grad_norm": 1.0695364475250244, "learning_rate": 1.9919930156114963e-05, "loss": 0.3087, "mean_token_accuracy": 0.9062610976397991, "num_tokens": 43304275.0, "step": 26560 }, { "entropy": 0.3047090653330088, "epoch": 0.4090059823089904, "grad_norm": 0.6811420321464539, "learning_rate": 1.9919703793831655e-05, "loss": 0.3108, "mean_token_accuracy": 0.9074368134140969, "num_tokens": 43368653.0, "step": 26570 }, { "entropy": 0.31111263800412414, "epoch": 0.40915991756766895, "grad_norm": 0.9275579452514648, "learning_rate": 1.99194771133197e-05, "loss": 0.3159, "mean_token_accuracy": 0.9039652794599533, "num_tokens": 43436445.0, "step": 26580 }, { "entropy": 0.3176244298927486, "epoch": 0.4093138528263476, "grad_norm": 0.8931095600128174, "learning_rate": 1.9919250114586374e-05, "loss": 0.3168, "mean_token_accuracy": 0.9020593412220478, "num_tokens": 43495083.0, "step": 26590 }, { "entropy": 0.3124756318517029, "epoch": 0.40946778808502615, "grad_norm": 0.5493554472923279, "learning_rate": 1.991902279763896e-05, "loss": 0.3119, "mean_token_accuracy": 0.9063635833561421, "num_tokens": 43562935.0, "step": 26600 }, { "entropy": 0.30989315211772916, "epoch": 0.4096217233437047, "grad_norm": 0.5738824605941772, "learning_rate": 1.991879516248474e-05, "loss": 0.3174, "mean_token_accuracy": 0.9055542141199112, "num_tokens": 43640259.0, "step": 26610 }, { "entropy": 0.31347999004647137, "epoch": 0.4097756586023833, "grad_norm": 0.539924681186676, "learning_rate": 1.9918567209131028e-05, "loss": 0.3177, "mean_token_accuracy": 0.9053505033254623, "num_tokens": 43708572.0, "step": 26620 }, { "entropy": 0.3072673350572586, "epoch": 0.40992959386106187, "grad_norm": 0.6598727703094482, "learning_rate": 1.9918338937585133e-05, "loss": 0.306, "mean_token_accuracy": 0.905868974328041, "num_tokens": 43776499.0, "step": 26630 }, { "entropy": 0.2986434715799987, "epoch": 0.41008352911974044, "grad_norm": 0.7462072372436523, "learning_rate": 1.9918110347854377e-05, "loss": 0.2944, "mean_token_accuracy": 0.9098625957965851, "num_tokens": 43841629.0, "step": 26640 }, { "entropy": 0.2980769742280245, "epoch": 0.41023746437841907, "grad_norm": 0.7137727737426758, "learning_rate": 1.991788143994609e-05, "loss": 0.294, "mean_token_accuracy": 0.9078647285699845, "num_tokens": 43904376.0, "step": 26650 }, { "entropy": 0.32103090956807134, "epoch": 0.41039139963709764, "grad_norm": 0.6966286897659302, "learning_rate": 1.9917652213867623e-05, "loss": 0.3234, "mean_token_accuracy": 0.901121923327446, "num_tokens": 43969288.0, "step": 26660 }, { "entropy": 0.27990884985774755, "epoch": 0.4105453348957762, "grad_norm": 0.6005133390426636, "learning_rate": 1.9917422669626325e-05, "loss": 0.2851, "mean_token_accuracy": 0.9143617898225784, "num_tokens": 44038320.0, "step": 26670 }, { "entropy": 0.29316373774781823, "epoch": 0.4106992701544548, "grad_norm": 0.7198433876037598, "learning_rate": 1.9917192807229562e-05, "loss": 0.2896, "mean_token_accuracy": 0.9118744246661663, "num_tokens": 44104110.0, "step": 26680 }, { "entropy": 0.31922057177871466, "epoch": 0.41085320541313336, "grad_norm": 0.8294338583946228, "learning_rate": 1.9916962626684707e-05, "loss": 0.3218, "mean_token_accuracy": 0.9013502463698387, "num_tokens": 44168199.0, "step": 26690 }, { "entropy": 0.29279342824593185, "epoch": 0.41100714067181193, "grad_norm": 0.9673091173171997, "learning_rate": 1.9916732127999146e-05, "loss": 0.3057, "mean_token_accuracy": 0.9085810638964176, "num_tokens": 44230623.0, "step": 26700 }, { "entropy": 0.2970798810943961, "epoch": 0.41116107593049056, "grad_norm": 0.7180892825126648, "learning_rate": 1.991650131118027e-05, "loss": 0.3047, "mean_token_accuracy": 0.9097771525382996, "num_tokens": 44295839.0, "step": 26710 }, { "entropy": 0.33115675412118434, "epoch": 0.41131501118916913, "grad_norm": 0.7546360492706299, "learning_rate": 1.9916270176235487e-05, "loss": 0.3304, "mean_token_accuracy": 0.896971482783556, "num_tokens": 44365604.0, "step": 26720 }, { "entropy": 0.30046710800379517, "epoch": 0.4114689464478477, "grad_norm": 0.7124701142311096, "learning_rate": 1.9916038723172208e-05, "loss": 0.3034, "mean_token_accuracy": 0.9088523924350739, "num_tokens": 44426774.0, "step": 26730 }, { "entropy": 0.3178302073851228, "epoch": 0.4116228817065263, "grad_norm": 0.6147100925445557, "learning_rate": 1.9915806951997863e-05, "loss": 0.311, "mean_token_accuracy": 0.9012364029884339, "num_tokens": 44493428.0, "step": 26740 }, { "entropy": 0.30959378853440284, "epoch": 0.41177681696520485, "grad_norm": 0.6789140105247498, "learning_rate": 1.9915574862719884e-05, "loss": 0.3113, "mean_token_accuracy": 0.9048341572284698, "num_tokens": 44564810.0, "step": 26750 }, { "entropy": 0.3046029590070248, "epoch": 0.4119307522238834, "grad_norm": 0.8493073582649231, "learning_rate": 1.9915342455345716e-05, "loss": 0.3122, "mean_token_accuracy": 0.9057836182415485, "num_tokens": 44640405.0, "step": 26760 }, { "entropy": 0.3212901037186384, "epoch": 0.412084687482562, "grad_norm": 0.9585779309272766, "learning_rate": 1.991510972988282e-05, "loss": 0.3223, "mean_token_accuracy": 0.904084074497223, "num_tokens": 44693807.0, "step": 26770 }, { "entropy": 0.30963266557082536, "epoch": 0.4122386227412406, "grad_norm": 0.8020614981651306, "learning_rate": 1.9914876686338657e-05, "loss": 0.3208, "mean_token_accuracy": 0.9052711240947247, "num_tokens": 44755281.0, "step": 26780 }, { "entropy": 0.305694180727005, "epoch": 0.4123925579999192, "grad_norm": 0.6722801923751831, "learning_rate": 1.99146433247207e-05, "loss": 0.3072, "mean_token_accuracy": 0.9056058466434479, "num_tokens": 44814246.0, "step": 26790 }, { "entropy": 0.3128107053227723, "epoch": 0.41254649325859777, "grad_norm": 0.7262657284736633, "learning_rate": 1.9914409645036445e-05, "loss": 0.3216, "mean_token_accuracy": 0.903828126937151, "num_tokens": 44883727.0, "step": 26800 }, { "entropy": 0.3153581881895661, "epoch": 0.41270042851727634, "grad_norm": 0.6401700973510742, "learning_rate": 1.9914175647293378e-05, "loss": 0.3117, "mean_token_accuracy": 0.9061642244458199, "num_tokens": 44945464.0, "step": 26810 }, { "entropy": 0.32179684210568665, "epoch": 0.4128543637759549, "grad_norm": 0.7955727577209473, "learning_rate": 1.9913941331499017e-05, "loss": 0.3094, "mean_token_accuracy": 0.9034419491887092, "num_tokens": 45007219.0, "step": 26820 }, { "entropy": 0.3244527880102396, "epoch": 0.4130082990346335, "grad_norm": 0.6572422385215759, "learning_rate": 1.9913706697660867e-05, "loss": 0.3213, "mean_token_accuracy": 0.9012073948979378, "num_tokens": 45073247.0, "step": 26830 }, { "entropy": 0.2966752745211124, "epoch": 0.4131622342933121, "grad_norm": 0.6049423217773438, "learning_rate": 1.9913471745786464e-05, "loss": 0.305, "mean_token_accuracy": 0.9102147936820983, "num_tokens": 45137318.0, "step": 26840 }, { "entropy": 0.2998836928978562, "epoch": 0.4133161695519907, "grad_norm": 0.7141998410224915, "learning_rate": 1.9913236475883343e-05, "loss": 0.2967, "mean_token_accuracy": 0.908133701980114, "num_tokens": 45212482.0, "step": 26850 }, { "entropy": 0.30139523297548293, "epoch": 0.41347010481066926, "grad_norm": 0.531173586845398, "learning_rate": 1.9913000887959052e-05, "loss": 0.3059, "mean_token_accuracy": 0.9073968827724457, "num_tokens": 45283943.0, "step": 26860 }, { "entropy": 0.3179065169766545, "epoch": 0.41362404006934783, "grad_norm": 0.6343366503715515, "learning_rate": 1.9912764982021144e-05, "loss": 0.3181, "mean_token_accuracy": 0.9034175060689449, "num_tokens": 45353973.0, "step": 26870 }, { "entropy": 0.3247424319386482, "epoch": 0.4137779753280264, "grad_norm": 0.7394707798957825, "learning_rate": 1.991252875807719e-05, "loss": 0.3259, "mean_token_accuracy": 0.9030341297388077, "num_tokens": 45417472.0, "step": 26880 }, { "entropy": 0.324956058897078, "epoch": 0.413931910586705, "grad_norm": 0.5921822786331177, "learning_rate": 1.9912292216134775e-05, "loss": 0.3225, "mean_token_accuracy": 0.9021261028945446, "num_tokens": 45477648.0, "step": 26890 }, { "entropy": 0.3056440004147589, "epoch": 0.4140858458453836, "grad_norm": 0.4869864583015442, "learning_rate": 1.991205535620148e-05, "loss": 0.3086, "mean_token_accuracy": 0.9056720331311225, "num_tokens": 45540118.0, "step": 26900 }, { "entropy": 0.31879280097782614, "epoch": 0.4142397811040622, "grad_norm": 0.7568949460983276, "learning_rate": 1.99118181782849e-05, "loss": 0.3209, "mean_token_accuracy": 0.9020446464419365, "num_tokens": 45610760.0, "step": 26910 }, { "entropy": 0.3207519484683871, "epoch": 0.41439371636274075, "grad_norm": 0.7702557444572449, "learning_rate": 1.9911580682392652e-05, "loss": 0.3215, "mean_token_accuracy": 0.9026577159762382, "num_tokens": 45686439.0, "step": 26920 }, { "entropy": 0.3325859323143959, "epoch": 0.4145476516214193, "grad_norm": 0.721810519695282, "learning_rate": 1.9911342868532353e-05, "loss": 0.343, "mean_token_accuracy": 0.8968240678310394, "num_tokens": 45750362.0, "step": 26930 }, { "entropy": 0.32083233250305054, "epoch": 0.4147015868800979, "grad_norm": 0.749262273311615, "learning_rate": 1.991110473671163e-05, "loss": 0.3159, "mean_token_accuracy": 0.9038623727858066, "num_tokens": 45822970.0, "step": 26940 }, { "entropy": 0.2941782400943339, "epoch": 0.41485552213877647, "grad_norm": 0.7700084447860718, "learning_rate": 1.991086628693812e-05, "loss": 0.2989, "mean_token_accuracy": 0.9102820619940758, "num_tokens": 45885943.0, "step": 26950 }, { "entropy": 0.3277132433839142, "epoch": 0.41500945739745504, "grad_norm": 0.5932334661483765, "learning_rate": 1.991062751921948e-05, "loss": 0.334, "mean_token_accuracy": 0.8969912536442279, "num_tokens": 45951108.0, "step": 26960 }, { "entropy": 0.3156210331246257, "epoch": 0.41516339265613367, "grad_norm": 0.7051202058792114, "learning_rate": 1.991038843356336e-05, "loss": 0.3161, "mean_token_accuracy": 0.9072240464389324, "num_tokens": 46009250.0, "step": 26970 }, { "entropy": 0.2948564887978137, "epoch": 0.41531732791481224, "grad_norm": 0.5628758072853088, "learning_rate": 1.9910149029977443e-05, "loss": 0.2936, "mean_token_accuracy": 0.9107703655958176, "num_tokens": 46074668.0, "step": 26980 }, { "entropy": 0.30869811652228235, "epoch": 0.4154712631734908, "grad_norm": 0.693184494972229, "learning_rate": 1.9909909308469398e-05, "loss": 0.3263, "mean_token_accuracy": 0.9083169415593147, "num_tokens": 46132021.0, "step": 26990 }, { "entropy": 0.31900833547115326, "epoch": 0.4156251984321694, "grad_norm": 0.7010278105735779, "learning_rate": 1.9909669269046916e-05, "loss": 0.334, "mean_token_accuracy": 0.9012772247195244, "num_tokens": 46198247.0, "step": 27000 }, { "entropy": 0.30173703096807003, "epoch": 0.41577913369084796, "grad_norm": 0.8315527439117432, "learning_rate": 1.9909428911717702e-05, "loss": 0.3021, "mean_token_accuracy": 0.9083228401839734, "num_tokens": 46264488.0, "step": 27010 }, { "entropy": 0.3080227178521454, "epoch": 0.41593306894952653, "grad_norm": 0.6658578515052795, "learning_rate": 1.9909188236489467e-05, "loss": 0.3057, "mean_token_accuracy": 0.9067093141376972, "num_tokens": 46325887.0, "step": 27020 }, { "entropy": 0.2985944588668644, "epoch": 0.41608700420820516, "grad_norm": 0.735312819480896, "learning_rate": 1.990894724336993e-05, "loss": 0.3018, "mean_token_accuracy": 0.9082526043057442, "num_tokens": 46387753.0, "step": 27030 }, { "entropy": 0.31153921792283656, "epoch": 0.41624093946688373, "grad_norm": 0.6144599318504333, "learning_rate": 1.9908705932366822e-05, "loss": 0.3131, "mean_token_accuracy": 0.9053078390657902, "num_tokens": 46455054.0, "step": 27040 }, { "entropy": 0.2977529370225966, "epoch": 0.4163948747255623, "grad_norm": 0.6816189885139465, "learning_rate": 1.9908464303487886e-05, "loss": 0.3149, "mean_token_accuracy": 0.9076840169727802, "num_tokens": 46518841.0, "step": 27050 }, { "entropy": 0.30132136689499023, "epoch": 0.4165488099842409, "grad_norm": 0.6159217357635498, "learning_rate": 1.9908222356740872e-05, "loss": 0.296, "mean_token_accuracy": 0.9083411045372486, "num_tokens": 46587993.0, "step": 27060 }, { "entropy": 0.29830464217811825, "epoch": 0.41670274524291945, "grad_norm": 0.7015092372894287, "learning_rate": 1.9907980092133544e-05, "loss": 0.3081, "mean_token_accuracy": 0.9068642228841781, "num_tokens": 46659847.0, "step": 27070 }, { "entropy": 0.3159900365397334, "epoch": 0.416856680501598, "grad_norm": 0.5835812091827393, "learning_rate": 1.990773750967367e-05, "loss": 0.3008, "mean_token_accuracy": 0.9060671478509903, "num_tokens": 46726131.0, "step": 27080 }, { "entropy": 0.2839926891960204, "epoch": 0.41701061576027665, "grad_norm": 0.7454734444618225, "learning_rate": 1.9907494609369037e-05, "loss": 0.2934, "mean_token_accuracy": 0.9112546771764756, "num_tokens": 46794820.0, "step": 27090 }, { "entropy": 0.3212497955188155, "epoch": 0.4171645510189552, "grad_norm": 0.6045587658882141, "learning_rate": 1.990725139122743e-05, "loss": 0.3133, "mean_token_accuracy": 0.902609221637249, "num_tokens": 46866023.0, "step": 27100 }, { "entropy": 0.30494731524959207, "epoch": 0.4173184862776338, "grad_norm": 0.5335627198219299, "learning_rate": 1.990700785525666e-05, "loss": 0.3147, "mean_token_accuracy": 0.9073425121605396, "num_tokens": 46934321.0, "step": 27110 }, { "entropy": 0.2991961143910885, "epoch": 0.41747242153631237, "grad_norm": 0.62852942943573, "learning_rate": 1.9906764001464536e-05, "loss": 0.3124, "mean_token_accuracy": 0.9069581583142281, "num_tokens": 47006000.0, "step": 27120 }, { "entropy": 0.2908892784267664, "epoch": 0.41762635679499094, "grad_norm": 0.580099880695343, "learning_rate": 1.9906519829858882e-05, "loss": 0.2865, "mean_token_accuracy": 0.9111432753503322, "num_tokens": 47082972.0, "step": 27130 }, { "entropy": 0.3113209753297269, "epoch": 0.4177802920536695, "grad_norm": 0.807106614112854, "learning_rate": 1.9906275340447527e-05, "loss": 0.306, "mean_token_accuracy": 0.9059384636580944, "num_tokens": 47138116.0, "step": 27140 }, { "entropy": 0.3040598763152957, "epoch": 0.4179342273123481, "grad_norm": 0.6594510078430176, "learning_rate": 1.9906030533238317e-05, "loss": 0.3047, "mean_token_accuracy": 0.907172505557537, "num_tokens": 47204788.0, "step": 27150 }, { "entropy": 0.3303468408063054, "epoch": 0.4180881625710267, "grad_norm": 0.5856615900993347, "learning_rate": 1.990578540823911e-05, "loss": 0.3193, "mean_token_accuracy": 0.8999118804931641, "num_tokens": 47278777.0, "step": 27160 }, { "entropy": 0.3225341094657779, "epoch": 0.4182420978297053, "grad_norm": 0.6260884404182434, "learning_rate": 1.990553996545776e-05, "loss": 0.3169, "mean_token_accuracy": 0.9027343705296517, "num_tokens": 47338403.0, "step": 27170 }, { "entropy": 0.3204022116959095, "epoch": 0.41839603308838386, "grad_norm": 0.5574000477790833, "learning_rate": 1.990529420490215e-05, "loss": 0.3301, "mean_token_accuracy": 0.8999538943171501, "num_tokens": 47408099.0, "step": 27180 }, { "entropy": 0.3058486418798566, "epoch": 0.41854996834706243, "grad_norm": 0.7174505591392517, "learning_rate": 1.990504812658016e-05, "loss": 0.3164, "mean_token_accuracy": 0.9065449945628643, "num_tokens": 47471338.0, "step": 27190 }, { "entropy": 0.32812973484396935, "epoch": 0.418703903605741, "grad_norm": 0.6977081894874573, "learning_rate": 1.9904801730499685e-05, "loss": 0.3184, "mean_token_accuracy": 0.8998264789581298, "num_tokens": 47534629.0, "step": 27200 }, { "entropy": 0.31119681438431146, "epoch": 0.4188578388644196, "grad_norm": 1.0928330421447754, "learning_rate": 1.9904555016668632e-05, "loss": 0.3207, "mean_token_accuracy": 0.903719849139452, "num_tokens": 47597599.0, "step": 27210 }, { "entropy": 0.321012020111084, "epoch": 0.4190117741230982, "grad_norm": 0.5911897420883179, "learning_rate": 1.990430798509491e-05, "loss": 0.3154, "mean_token_accuracy": 0.9023505575954914, "num_tokens": 47672736.0, "step": 27220 }, { "entropy": 0.3084384492598474, "epoch": 0.4191657093817768, "grad_norm": 0.9152031540870667, "learning_rate": 1.990406063578645e-05, "loss": 0.3078, "mean_token_accuracy": 0.9059621021151543, "num_tokens": 47733673.0, "step": 27230 }, { "entropy": 0.32814218252897265, "epoch": 0.41931964464045535, "grad_norm": 0.6716633439064026, "learning_rate": 1.9903812968751182e-05, "loss": 0.3382, "mean_token_accuracy": 0.8997465088963509, "num_tokens": 47802549.0, "step": 27240 }, { "entropy": 0.33206268567591907, "epoch": 0.4194735798991339, "grad_norm": 0.7617354393005371, "learning_rate": 1.9903564983997052e-05, "loss": 0.3285, "mean_token_accuracy": 0.8983004912734032, "num_tokens": 47879865.0, "step": 27250 }, { "entropy": 0.3095676300115883, "epoch": 0.4196275151578125, "grad_norm": 1.0317893028259277, "learning_rate": 1.990331668153202e-05, "loss": 0.3108, "mean_token_accuracy": 0.9050423718988896, "num_tokens": 47940466.0, "step": 27260 }, { "entropy": 0.295662781316787, "epoch": 0.41978145041649106, "grad_norm": 0.6928068995475769, "learning_rate": 1.9903068061364046e-05, "loss": 0.2992, "mean_token_accuracy": 0.9094168424606324, "num_tokens": 48007100.0, "step": 27270 }, { "entropy": 0.2918867086991668, "epoch": 0.4199353856751697, "grad_norm": 0.7840443253517151, "learning_rate": 1.990281912350111e-05, "loss": 0.3106, "mean_token_accuracy": 0.9083827510476112, "num_tokens": 48072377.0, "step": 27280 }, { "entropy": 0.3226533823646605, "epoch": 0.42008932093384826, "grad_norm": 0.7998830676078796, "learning_rate": 1.9902569867951195e-05, "loss": 0.3084, "mean_token_accuracy": 0.9011823855340481, "num_tokens": 48130265.0, "step": 27290 }, { "entropy": 0.3001393660902977, "epoch": 0.42024325619252684, "grad_norm": 0.69426429271698, "learning_rate": 1.99023202947223e-05, "loss": 0.3095, "mean_token_accuracy": 0.9063654273748398, "num_tokens": 48194803.0, "step": 27300 }, { "entropy": 0.32880763821303843, "epoch": 0.4203971914512054, "grad_norm": 0.6918032169342041, "learning_rate": 1.9902070403822427e-05, "loss": 0.3311, "mean_token_accuracy": 0.8997108325362205, "num_tokens": 48257520.0, "step": 27310 }, { "entropy": 0.29624890219420197, "epoch": 0.420551126709884, "grad_norm": 0.6478685140609741, "learning_rate": 1.99018201952596e-05, "loss": 0.2979, "mean_token_accuracy": 0.9113232024013996, "num_tokens": 48323500.0, "step": 27320 }, { "entropy": 0.30456789005547763, "epoch": 0.42070506196856255, "grad_norm": 0.6947031617164612, "learning_rate": 1.9901569669041843e-05, "loss": 0.3052, "mean_token_accuracy": 0.9055012546479702, "num_tokens": 48398824.0, "step": 27330 }, { "entropy": 0.2853333180770278, "epoch": 0.4208589972272411, "grad_norm": 0.7042633295059204, "learning_rate": 1.9901318825177188e-05, "loss": 0.2887, "mean_token_accuracy": 0.9111742466688156, "num_tokens": 48459034.0, "step": 27340 }, { "entropy": 0.3090991759672761, "epoch": 0.42101293248591976, "grad_norm": 0.7030131220817566, "learning_rate": 1.9901067663673687e-05, "loss": 0.307, "mean_token_accuracy": 0.905002485960722, "num_tokens": 48523533.0, "step": 27350 }, { "entropy": 0.2989234450273216, "epoch": 0.42116686774459833, "grad_norm": 0.5786672830581665, "learning_rate": 1.9900816184539393e-05, "loss": 0.3047, "mean_token_accuracy": 0.9087803408503532, "num_tokens": 48585396.0, "step": 27360 }, { "entropy": 0.2985526309348643, "epoch": 0.4213208030032769, "grad_norm": 0.6373277306556702, "learning_rate": 1.9900564387782378e-05, "loss": 0.3018, "mean_token_accuracy": 0.9078466542065143, "num_tokens": 48659464.0, "step": 27370 }, { "entropy": 0.31533564189448954, "epoch": 0.4214747382619555, "grad_norm": 0.6744126081466675, "learning_rate": 1.990031227341072e-05, "loss": 0.3223, "mean_token_accuracy": 0.9033081650733947, "num_tokens": 48730278.0, "step": 27380 }, { "entropy": 0.31416638363152743, "epoch": 0.42162867352063405, "grad_norm": 0.6271792650222778, "learning_rate": 1.9900059841432505e-05, "loss": 0.3174, "mean_token_accuracy": 0.9025910206139087, "num_tokens": 48802774.0, "step": 27390 }, { "entropy": 0.3242522496730089, "epoch": 0.4217826087793126, "grad_norm": 0.6464406847953796, "learning_rate": 1.9899807091855835e-05, "loss": 0.3191, "mean_token_accuracy": 0.9023568406701088, "num_tokens": 48871941.0, "step": 27400 }, { "entropy": 0.2882756466977298, "epoch": 0.42193654403799125, "grad_norm": 0.5623602867126465, "learning_rate": 1.9899554024688807e-05, "loss": 0.3065, "mean_token_accuracy": 0.9113137140870095, "num_tokens": 48943664.0, "step": 27410 }, { "entropy": 0.3167576783336699, "epoch": 0.4220904792966698, "grad_norm": 0.5182552337646484, "learning_rate": 1.9899300639939552e-05, "loss": 0.3137, "mean_token_accuracy": 0.9031381875276565, "num_tokens": 49013204.0, "step": 27420 }, { "entropy": 0.296485917083919, "epoch": 0.4222444145553484, "grad_norm": 0.601046621799469, "learning_rate": 1.9899046937616195e-05, "loss": 0.2964, "mean_token_accuracy": 0.911133423447609, "num_tokens": 49077945.0, "step": 27430 }, { "entropy": 0.28217288935557006, "epoch": 0.42239834981402696, "grad_norm": 0.6842182874679565, "learning_rate": 1.989879291772687e-05, "loss": 0.2823, "mean_token_accuracy": 0.9125446893274785, "num_tokens": 49146384.0, "step": 27440 }, { "entropy": 0.28478891458362343, "epoch": 0.42255228507270554, "grad_norm": 0.6683467626571655, "learning_rate": 1.9898538580279734e-05, "loss": 0.2879, "mean_token_accuracy": 0.9118358045816422, "num_tokens": 49215508.0, "step": 27450 }, { "entropy": 0.29877873566001656, "epoch": 0.4227062203313841, "grad_norm": 0.6879851818084717, "learning_rate": 1.989828392528294e-05, "loss": 0.2998, "mean_token_accuracy": 0.9089184097945691, "num_tokens": 49285814.0, "step": 27460 }, { "entropy": 0.3104163005948067, "epoch": 0.42286015559006274, "grad_norm": 0.8031854629516602, "learning_rate": 1.9898028952744663e-05, "loss": 0.311, "mean_token_accuracy": 0.9051888160407543, "num_tokens": 49361001.0, "step": 27470 }, { "entropy": 0.29945674929767846, "epoch": 0.4230140908487413, "grad_norm": 1.0862979888916016, "learning_rate": 1.9897773662673076e-05, "loss": 0.2901, "mean_token_accuracy": 0.9079838342964649, "num_tokens": 49426813.0, "step": 27480 }, { "entropy": 0.2926743606105447, "epoch": 0.4231680261074199, "grad_norm": 0.6698525547981262, "learning_rate": 1.9897518055076372e-05, "loss": 0.3083, "mean_token_accuracy": 0.9080202594399452, "num_tokens": 49491663.0, "step": 27490 }, { "entropy": 0.3136614698916674, "epoch": 0.42332196136609845, "grad_norm": 0.7823673486709595, "learning_rate": 1.989726212996275e-05, "loss": 0.3307, "mean_token_accuracy": 0.9009228773415089, "num_tokens": 49554674.0, "step": 27500 }, { "entropy": 0.303316371422261, "epoch": 0.423475896624777, "grad_norm": 0.7175101637840271, "learning_rate": 1.9897005887340423e-05, "loss": 0.3089, "mean_token_accuracy": 0.9094071835279465, "num_tokens": 49618635.0, "step": 27510 }, { "entropy": 0.31137896701693535, "epoch": 0.4236298318834556, "grad_norm": 0.7134029269218445, "learning_rate": 1.989674932721761e-05, "loss": 0.3031, "mean_token_accuracy": 0.9056868717074394, "num_tokens": 49680438.0, "step": 27520 }, { "entropy": 0.2950993521139026, "epoch": 0.42378376714213417, "grad_norm": 0.6524708271026611, "learning_rate": 1.9896492449602544e-05, "loss": 0.3062, "mean_token_accuracy": 0.9092950046062469, "num_tokens": 49750056.0, "step": 27530 }, { "entropy": 0.320847911760211, "epoch": 0.4239377024008128, "grad_norm": 0.5969999432563782, "learning_rate": 1.989623525450346e-05, "loss": 0.3094, "mean_token_accuracy": 0.9042626224458218, "num_tokens": 49817235.0, "step": 27540 }, { "entropy": 0.3085036168806255, "epoch": 0.42409163765949137, "grad_norm": 0.9033710360527039, "learning_rate": 1.989597774192861e-05, "loss": 0.3278, "mean_token_accuracy": 0.9041787661612034, "num_tokens": 49881815.0, "step": 27550 }, { "entropy": 0.2948911624029279, "epoch": 0.42424557291816994, "grad_norm": 0.7212362289428711, "learning_rate": 1.989571991188626e-05, "loss": 0.3035, "mean_token_accuracy": 0.9082578450441361, "num_tokens": 49942141.0, "step": 27560 }, { "entropy": 0.3108102552592754, "epoch": 0.4243995081768485, "grad_norm": 0.6079918742179871, "learning_rate": 1.989546176438468e-05, "loss": 0.3144, "mean_token_accuracy": 0.9089535616338253, "num_tokens": 50008971.0, "step": 27570 }, { "entropy": 0.30061958990991117, "epoch": 0.4245534434355271, "grad_norm": 0.7424546480178833, "learning_rate": 1.9895203299432144e-05, "loss": 0.3012, "mean_token_accuracy": 0.9077500946819782, "num_tokens": 50067420.0, "step": 27580 }, { "entropy": 0.2888146275654435, "epoch": 0.42470737869420566, "grad_norm": 0.7823603749275208, "learning_rate": 1.9894944517036956e-05, "loss": 0.2982, "mean_token_accuracy": 0.911719647794962, "num_tokens": 50134454.0, "step": 27590 }, { "entropy": 0.3155756704509258, "epoch": 0.4248613139528843, "grad_norm": 0.8455732464790344, "learning_rate": 1.9894685417207406e-05, "loss": 0.3188, "mean_token_accuracy": 0.905642069876194, "num_tokens": 50196946.0, "step": 27600 }, { "entropy": 0.29174650739878416, "epoch": 0.42501524921156286, "grad_norm": 0.6830773949623108, "learning_rate": 1.9894425999951816e-05, "loss": 0.3026, "mean_token_accuracy": 0.9111165136098862, "num_tokens": 50264042.0, "step": 27610 }, { "entropy": 0.30269325263798236, "epoch": 0.42516918447024143, "grad_norm": 0.717994749546051, "learning_rate": 1.9894166265278502e-05, "loss": 0.3173, "mean_token_accuracy": 0.9067228093743325, "num_tokens": 50325364.0, "step": 27620 }, { "entropy": 0.32220212165266277, "epoch": 0.42532311972892, "grad_norm": 0.7103472948074341, "learning_rate": 1.9893906213195797e-05, "loss": 0.3061, "mean_token_accuracy": 0.9041004799306392, "num_tokens": 50392847.0, "step": 27630 }, { "entropy": 0.2988027652725577, "epoch": 0.4254770549875986, "grad_norm": 0.7623358964920044, "learning_rate": 1.9893645843712045e-05, "loss": 0.3059, "mean_token_accuracy": 0.9084318108856678, "num_tokens": 50461472.0, "step": 27640 }, { "entropy": 0.29024833617731927, "epoch": 0.42563099024627715, "grad_norm": 0.6827003359794617, "learning_rate": 1.98933851568356e-05, "loss": 0.2964, "mean_token_accuracy": 0.9104725688695907, "num_tokens": 50522404.0, "step": 27650 }, { "entropy": 0.2990330087020993, "epoch": 0.4257849255049558, "grad_norm": 0.5637639164924622, "learning_rate": 1.989312415257482e-05, "loss": 0.3034, "mean_token_accuracy": 0.9067557752132416, "num_tokens": 50591078.0, "step": 27660 }, { "entropy": 0.31326928436756135, "epoch": 0.42593886076363435, "grad_norm": 0.7762871980667114, "learning_rate": 1.9892862830938088e-05, "loss": 0.3252, "mean_token_accuracy": 0.9054783768951893, "num_tokens": 50656562.0, "step": 27670 }, { "entropy": 0.3036535608582199, "epoch": 0.4260927960223129, "grad_norm": 0.5850859880447388, "learning_rate": 1.9892601191933772e-05, "loss": 0.3006, "mean_token_accuracy": 0.9077980779111385, "num_tokens": 50732768.0, "step": 27680 }, { "entropy": 0.3028957198373973, "epoch": 0.4262467312809915, "grad_norm": 0.5501613616943359, "learning_rate": 1.9892339235570284e-05, "loss": 0.3098, "mean_token_accuracy": 0.9066353820264339, "num_tokens": 50802704.0, "step": 27690 }, { "entropy": 0.3218564668670297, "epoch": 0.42640066653967007, "grad_norm": 0.7823810577392578, "learning_rate": 1.9892076961856012e-05, "loss": 0.3147, "mean_token_accuracy": 0.8999023735523224, "num_tokens": 50872660.0, "step": 27700 }, { "entropy": 0.3038877764716744, "epoch": 0.42655460179834864, "grad_norm": 0.6047643423080444, "learning_rate": 1.9891814370799376e-05, "loss": 0.2882, "mean_token_accuracy": 0.9069906160235405, "num_tokens": 50938395.0, "step": 27710 }, { "entropy": 0.2978515649214387, "epoch": 0.4267085370570272, "grad_norm": 0.6646595597267151, "learning_rate": 1.98915514624088e-05, "loss": 0.2833, "mean_token_accuracy": 0.9108671300113201, "num_tokens": 51005060.0, "step": 27720 }, { "entropy": 0.2912367622368038, "epoch": 0.42686247231570584, "grad_norm": 0.7293034791946411, "learning_rate": 1.989128823669272e-05, "loss": 0.2954, "mean_token_accuracy": 0.9097169518470765, "num_tokens": 51076922.0, "step": 27730 }, { "entropy": 0.29934232737869026, "epoch": 0.4270164075743844, "grad_norm": 0.7862244248390198, "learning_rate": 1.989102469365958e-05, "loss": 0.3044, "mean_token_accuracy": 0.9075880028307438, "num_tokens": 51148035.0, "step": 27740 }, { "entropy": 0.30185485938563944, "epoch": 0.427170342833063, "grad_norm": 1.007606863975525, "learning_rate": 1.9890760833317834e-05, "loss": 0.2937, "mean_token_accuracy": 0.9077267989516258, "num_tokens": 51218624.0, "step": 27750 }, { "entropy": 0.3077669894322753, "epoch": 0.42732427809174156, "grad_norm": 0.6334553360939026, "learning_rate": 1.989049665567594e-05, "loss": 0.3196, "mean_token_accuracy": 0.9053875662386417, "num_tokens": 51286713.0, "step": 27760 }, { "entropy": 0.3020851922221482, "epoch": 0.42747821335042013, "grad_norm": 0.6925987601280212, "learning_rate": 1.9890232160742383e-05, "loss": 0.2948, "mean_token_accuracy": 0.9081932432949543, "num_tokens": 51359675.0, "step": 27770 }, { "entropy": 0.3145247201435268, "epoch": 0.4276321486090987, "grad_norm": 0.5193847417831421, "learning_rate": 1.9889967348525642e-05, "loss": 0.3162, "mean_token_accuracy": 0.904154721647501, "num_tokens": 51424470.0, "step": 27780 }, { "entropy": 0.3025478133000433, "epoch": 0.42778608386777733, "grad_norm": 0.6528263688087463, "learning_rate": 1.9889702219034215e-05, "loss": 0.3065, "mean_token_accuracy": 0.9087739981710911, "num_tokens": 51497211.0, "step": 27790 }, { "entropy": 0.312215630710125, "epoch": 0.4279400191264559, "grad_norm": 0.755203902721405, "learning_rate": 1.9889436772276606e-05, "loss": 0.3184, "mean_token_accuracy": 0.9043018184602261, "num_tokens": 51564385.0, "step": 27800 }, { "entropy": 0.3118209283798933, "epoch": 0.4280939543851345, "grad_norm": 0.6633535623550415, "learning_rate": 1.9889171008261332e-05, "loss": 0.3023, "mean_token_accuracy": 0.9031897336244583, "num_tokens": 51627266.0, "step": 27810 }, { "entropy": 0.2792804893106222, "epoch": 0.42824788964381305, "grad_norm": 0.5548838973045349, "learning_rate": 1.988890492699692e-05, "loss": 0.2799, "mean_token_accuracy": 0.9133378393948078, "num_tokens": 51692984.0, "step": 27820 }, { "entropy": 0.2898480753414333, "epoch": 0.4284018249024916, "grad_norm": 0.6549878716468811, "learning_rate": 1.9888638528491905e-05, "loss": 0.2978, "mean_token_accuracy": 0.9114559769630433, "num_tokens": 51751730.0, "step": 27830 }, { "entropy": 0.31832596808671954, "epoch": 0.4285557601611702, "grad_norm": 0.6056057214736938, "learning_rate": 1.988837181275483e-05, "loss": 0.3286, "mean_token_accuracy": 0.9020627208054066, "num_tokens": 51812926.0, "step": 27840 }, { "entropy": 0.329242251906544, "epoch": 0.4287096954198488, "grad_norm": 0.7599592208862305, "learning_rate": 1.9888104779794253e-05, "loss": 0.3201, "mean_token_accuracy": 0.9004181779921054, "num_tokens": 51865764.0, "step": 27850 }, { "entropy": 0.2988056931644678, "epoch": 0.4288636306785274, "grad_norm": 0.6887120604515076, "learning_rate": 1.988783742961874e-05, "loss": 0.3112, "mean_token_accuracy": 0.9074786975979805, "num_tokens": 51928343.0, "step": 27860 }, { "entropy": 0.30677799228578806, "epoch": 0.42901756593720597, "grad_norm": 0.6177133321762085, "learning_rate": 1.9887569762236873e-05, "loss": 0.3114, "mean_token_accuracy": 0.9067078962922096, "num_tokens": 51994998.0, "step": 27870 }, { "entropy": 0.3077486649155617, "epoch": 0.42917150119588454, "grad_norm": 0.6627654433250427, "learning_rate": 1.9887301777657237e-05, "loss": 0.316, "mean_token_accuracy": 0.9055724151432514, "num_tokens": 52055526.0, "step": 27880 }, { "entropy": 0.3127062109299004, "epoch": 0.4293254364545631, "grad_norm": 0.7344763875007629, "learning_rate": 1.988703347588842e-05, "loss": 0.3161, "mean_token_accuracy": 0.9064129196107388, "num_tokens": 52118511.0, "step": 27890 }, { "entropy": 0.29545188387855886, "epoch": 0.4294793717132417, "grad_norm": 0.5891710519790649, "learning_rate": 1.988676485693904e-05, "loss": 0.3066, "mean_token_accuracy": 0.9094975382089615, "num_tokens": 52188039.0, "step": 27900 }, { "entropy": 0.2983818660490215, "epoch": 0.42963330697192026, "grad_norm": 0.619560956954956, "learning_rate": 1.988649592081771e-05, "loss": 0.3024, "mean_token_accuracy": 0.9082479394972325, "num_tokens": 52256026.0, "step": 27910 }, { "entropy": 0.28777927700430156, "epoch": 0.4297872422305989, "grad_norm": 0.6846375465393066, "learning_rate": 1.988622666753306e-05, "loss": 0.2753, "mean_token_accuracy": 0.9143072791397572, "num_tokens": 52322313.0, "step": 27920 }, { "entropy": 0.27869765516370537, "epoch": 0.42994117748927746, "grad_norm": 0.8747348189353943, "learning_rate": 1.9885957097093725e-05, "loss": 0.2949, "mean_token_accuracy": 0.9140624202787876, "num_tokens": 52381568.0, "step": 27930 }, { "entropy": 0.2990604774095118, "epoch": 0.43009511274795603, "grad_norm": 0.5549644231796265, "learning_rate": 1.9885687209508354e-05, "loss": 0.2995, "mean_token_accuracy": 0.9091459818184375, "num_tokens": 52437638.0, "step": 27940 }, { "entropy": 0.3017366025596857, "epoch": 0.4302490480066346, "grad_norm": 0.8321001529693604, "learning_rate": 1.98854170047856e-05, "loss": 0.3097, "mean_token_accuracy": 0.9086387753486633, "num_tokens": 52504368.0, "step": 27950 }, { "entropy": 0.2919153140857816, "epoch": 0.4304029832653132, "grad_norm": 1.131944179534912, "learning_rate": 1.9885146482934147e-05, "loss": 0.3012, "mean_token_accuracy": 0.9104367382824421, "num_tokens": 52559915.0, "step": 27960 }, { "entropy": 0.32204421795904636, "epoch": 0.43055691852399175, "grad_norm": 0.6023531556129456, "learning_rate": 1.9884875643962654e-05, "loss": 0.3192, "mean_token_accuracy": 0.9021780475974083, "num_tokens": 52624848.0, "step": 27970 }, { "entropy": 0.3063610717654228, "epoch": 0.4307108537826704, "grad_norm": 0.8700971603393555, "learning_rate": 1.9884604487879824e-05, "loss": 0.3108, "mean_token_accuracy": 0.9060028821229935, "num_tokens": 52687982.0, "step": 27980 }, { "entropy": 0.3138876920565963, "epoch": 0.43086478904134895, "grad_norm": 0.6572427749633789, "learning_rate": 1.9884333014694347e-05, "loss": 0.3241, "mean_token_accuracy": 0.9027061194181443, "num_tokens": 52751744.0, "step": 27990 }, { "entropy": 0.31475421972572803, "epoch": 0.4310187243000275, "grad_norm": 0.7010464668273926, "learning_rate": 1.9884061224414933e-05, "loss": 0.3209, "mean_token_accuracy": 0.9033787310123443, "num_tokens": 52814238.0, "step": 28000 }, { "entropy": 0.30808645794168116, "epoch": 0.4311726595587061, "grad_norm": 0.6168825626373291, "learning_rate": 1.9883789117050308e-05, "loss": 0.2923, "mean_token_accuracy": 0.9060308404266835, "num_tokens": 52883583.0, "step": 28010 }, { "entropy": 0.3047749719582498, "epoch": 0.43132659481738467, "grad_norm": 0.8939162492752075, "learning_rate": 1.9883516692609196e-05, "loss": 0.3107, "mean_token_accuracy": 0.9060496158897877, "num_tokens": 52949750.0, "step": 28020 }, { "entropy": 0.285431747790426, "epoch": 0.43148053007606324, "grad_norm": 0.7739930748939514, "learning_rate": 1.9883243951100337e-05, "loss": 0.3021, "mean_token_accuracy": 0.9106375351548195, "num_tokens": 53024137.0, "step": 28030 }, { "entropy": 0.3204363198950887, "epoch": 0.43163446533474187, "grad_norm": 0.645961344242096, "learning_rate": 1.988297089253248e-05, "loss": 0.3152, "mean_token_accuracy": 0.9018592834472656, "num_tokens": 53085033.0, "step": 28040 }, { "entropy": 0.3086090019904077, "epoch": 0.43178840059342044, "grad_norm": 0.6551128029823303, "learning_rate": 1.988269751691439e-05, "loss": 0.3152, "mean_token_accuracy": 0.9059003956615925, "num_tokens": 53146047.0, "step": 28050 }, { "entropy": 0.310097398981452, "epoch": 0.431942335852099, "grad_norm": 0.5550486445426941, "learning_rate": 1.9882423824254826e-05, "loss": 0.3071, "mean_token_accuracy": 0.9043182522058487, "num_tokens": 53221058.0, "step": 28060 }, { "entropy": 0.3028244859538972, "epoch": 0.4320962711107776, "grad_norm": 0.7607764005661011, "learning_rate": 1.9882149814562578e-05, "loss": 0.3078, "mean_token_accuracy": 0.9068647779524326, "num_tokens": 53281611.0, "step": 28070 }, { "entropy": 0.3347594890743494, "epoch": 0.43225020636945616, "grad_norm": 0.6801093816757202, "learning_rate": 1.9881875487846436e-05, "loss": 0.3309, "mean_token_accuracy": 0.8987414598464966, "num_tokens": 53352226.0, "step": 28080 }, { "entropy": 0.31376136317849157, "epoch": 0.43240414162813473, "grad_norm": 0.7990144491195679, "learning_rate": 1.9881600844115193e-05, "loss": 0.3048, "mean_token_accuracy": 0.904605271667242, "num_tokens": 53415789.0, "step": 28090 }, { "entropy": 0.2979456842876971, "epoch": 0.4325580768868133, "grad_norm": 0.726555585861206, "learning_rate": 1.9881325883377668e-05, "loss": 0.294, "mean_token_accuracy": 0.9094434946775436, "num_tokens": 53490214.0, "step": 28100 }, { "entropy": 0.3002763853408396, "epoch": 0.43271201214549193, "grad_norm": 0.4464186728000641, "learning_rate": 1.9881050605642677e-05, "loss": 0.3164, "mean_token_accuracy": 0.9073694176971913, "num_tokens": 53564762.0, "step": 28110 }, { "entropy": 0.3107990149408579, "epoch": 0.4328659474041705, "grad_norm": 0.6343061923980713, "learning_rate": 1.9880775010919054e-05, "loss": 0.3096, "mean_token_accuracy": 0.9056303516030312, "num_tokens": 53632623.0, "step": 28120 }, { "entropy": 0.3155827009119093, "epoch": 0.4330198826628491, "grad_norm": 0.6920119524002075, "learning_rate": 1.9880499099215637e-05, "loss": 0.309, "mean_token_accuracy": 0.9043539449572563, "num_tokens": 53697757.0, "step": 28130 }, { "entropy": 0.28630236256867647, "epoch": 0.43317381792152765, "grad_norm": 0.7937384247779846, "learning_rate": 1.9880222870541283e-05, "loss": 0.2977, "mean_token_accuracy": 0.9134231872856617, "num_tokens": 53761305.0, "step": 28140 }, { "entropy": 0.2976584668271244, "epoch": 0.4333277531802062, "grad_norm": 0.7972027063369751, "learning_rate": 1.9879946324904842e-05, "loss": 0.3212, "mean_token_accuracy": 0.9077294774353504, "num_tokens": 53826477.0, "step": 28150 }, { "entropy": 0.29317432837560775, "epoch": 0.4334816884388848, "grad_norm": 0.8899402618408203, "learning_rate": 1.9879669462315198e-05, "loss": 0.3025, "mean_token_accuracy": 0.9101955600082874, "num_tokens": 53892432.0, "step": 28160 }, { "entropy": 0.2871699234470725, "epoch": 0.4336356236975634, "grad_norm": 0.7335376143455505, "learning_rate": 1.987939228278123e-05, "loss": 0.2956, "mean_token_accuracy": 0.91226951405406, "num_tokens": 53951711.0, "step": 28170 }, { "entropy": 0.31371100721880796, "epoch": 0.433789558956242, "grad_norm": 0.6607787609100342, "learning_rate": 1.9879114786311824e-05, "loss": 0.3134, "mean_token_accuracy": 0.9046849936246872, "num_tokens": 54020050.0, "step": 28180 }, { "entropy": 0.302881426922977, "epoch": 0.43394349421492057, "grad_norm": 0.541305422782898, "learning_rate": 1.987883697291589e-05, "loss": 0.3087, "mean_token_accuracy": 0.9056659646332264, "num_tokens": 54088442.0, "step": 28190 }, { "entropy": 0.30643556658178567, "epoch": 0.43409742947359914, "grad_norm": 0.551150918006897, "learning_rate": 1.9878558842602333e-05, "loss": 0.3076, "mean_token_accuracy": 0.9056062079966068, "num_tokens": 54156743.0, "step": 28200 }, { "entropy": 0.31523193791508675, "epoch": 0.4342513647322777, "grad_norm": 0.774732768535614, "learning_rate": 1.9878280395380087e-05, "loss": 0.32, "mean_token_accuracy": 0.9027564570307731, "num_tokens": 54217859.0, "step": 28210 }, { "entropy": 0.3043846793472767, "epoch": 0.4344052999909563, "grad_norm": 0.6144709587097168, "learning_rate": 1.9878001631258068e-05, "loss": 0.3103, "mean_token_accuracy": 0.9077669195830822, "num_tokens": 54280728.0, "step": 28220 }, { "entropy": 0.29074178338050843, "epoch": 0.4345592352496349, "grad_norm": 0.6645740866661072, "learning_rate": 1.987772255024523e-05, "loss": 0.3085, "mean_token_accuracy": 0.908363950997591, "num_tokens": 54347266.0, "step": 28230 }, { "entropy": 0.3168956588022411, "epoch": 0.4347131705083135, "grad_norm": 0.6697700023651123, "learning_rate": 1.987744315235053e-05, "loss": 0.3251, "mean_token_accuracy": 0.9042929627001286, "num_tokens": 54405701.0, "step": 28240 }, { "entropy": 0.3112660192884505, "epoch": 0.43486710576699206, "grad_norm": 0.6778194308280945, "learning_rate": 1.987716343758292e-05, "loss": 0.3135, "mean_token_accuracy": 0.906395272910595, "num_tokens": 54474952.0, "step": 28250 }, { "entropy": 0.2921887906268239, "epoch": 0.43502104102567063, "grad_norm": 0.5758629441261292, "learning_rate": 1.9876883405951378e-05, "loss": 0.2971, "mean_token_accuracy": 0.9115381963551045, "num_tokens": 54543327.0, "step": 28260 }, { "entropy": 0.31562463408336044, "epoch": 0.4351749762843492, "grad_norm": 0.5582308769226074, "learning_rate": 1.987660305746489e-05, "loss": 0.3099, "mean_token_accuracy": 0.9033966608345508, "num_tokens": 54614750.0, "step": 28270 }, { "entropy": 0.3029437308199704, "epoch": 0.4353289115430278, "grad_norm": 0.7545730471611023, "learning_rate": 1.987632239213245e-05, "loss": 0.3151, "mean_token_accuracy": 0.9071681298315525, "num_tokens": 54677042.0, "step": 28280 }, { "entropy": 0.3077633300796151, "epoch": 0.43548284680170635, "grad_norm": 0.677215039730072, "learning_rate": 1.9876041409963055e-05, "loss": 0.3092, "mean_token_accuracy": 0.9055067606270313, "num_tokens": 54747546.0, "step": 28290 }, { "entropy": 0.2964548023417592, "epoch": 0.435636782060385, "grad_norm": 0.6727133393287659, "learning_rate": 1.987576011096573e-05, "loss": 0.3039, "mean_token_accuracy": 0.912313811480999, "num_tokens": 54806192.0, "step": 28300 }, { "entropy": 0.3116404363885522, "epoch": 0.43579071731906355, "grad_norm": 0.6530887484550476, "learning_rate": 1.9875478495149484e-05, "loss": 0.3191, "mean_token_accuracy": 0.9050770238041878, "num_tokens": 54868223.0, "step": 28310 }, { "entropy": 0.31929733054712417, "epoch": 0.4359446525777421, "grad_norm": 0.6931090354919434, "learning_rate": 1.987519656252337e-05, "loss": 0.3226, "mean_token_accuracy": 0.9027242176234722, "num_tokens": 54928365.0, "step": 28320 }, { "entropy": 0.31410774746909736, "epoch": 0.4360985878364207, "grad_norm": 0.6726746559143066, "learning_rate": 1.9874914313096415e-05, "loss": 0.3012, "mean_token_accuracy": 0.9061298280954361, "num_tokens": 55000905.0, "step": 28330 }, { "entropy": 0.2977668074890971, "epoch": 0.43625252309509926, "grad_norm": 0.7364420294761658, "learning_rate": 1.9874631746877687e-05, "loss": 0.309, "mean_token_accuracy": 0.9071997590363026, "num_tokens": 55075424.0, "step": 28340 }, { "entropy": 0.30433156006038187, "epoch": 0.43640645835377784, "grad_norm": 0.6886210441589355, "learning_rate": 1.9874348863876243e-05, "loss": 0.3017, "mean_token_accuracy": 0.9069453619420529, "num_tokens": 55146271.0, "step": 28350 }, { "entropy": 0.30536631597206, "epoch": 0.43656039361245647, "grad_norm": 0.7160617709159851, "learning_rate": 1.987406566410116e-05, "loss": 0.3086, "mean_token_accuracy": 0.9075088545680046, "num_tokens": 55214755.0, "step": 28360 }, { "entropy": 0.28831682493910193, "epoch": 0.43671432887113504, "grad_norm": 0.889567494392395, "learning_rate": 1.9873782147561525e-05, "loss": 0.2956, "mean_token_accuracy": 0.9118703730404377, "num_tokens": 55278335.0, "step": 28370 }, { "entropy": 0.2966877879574895, "epoch": 0.4368682641298136, "grad_norm": 0.6038804054260254, "learning_rate": 1.987349831426643e-05, "loss": 0.3088, "mean_token_accuracy": 0.9087071880698204, "num_tokens": 55346635.0, "step": 28380 }, { "entropy": 0.2818477379158139, "epoch": 0.4370221993884922, "grad_norm": 0.5755885243415833, "learning_rate": 1.9873214164224987e-05, "loss": 0.2937, "mean_token_accuracy": 0.9121284909546375, "num_tokens": 55413121.0, "step": 28390 }, { "entropy": 0.29140683328732847, "epoch": 0.43717613464717076, "grad_norm": 0.6591719388961792, "learning_rate": 1.9872929697446302e-05, "loss": 0.3029, "mean_token_accuracy": 0.9089686132967472, "num_tokens": 55478721.0, "step": 28400 }, { "entropy": 0.3238554522395134, "epoch": 0.43733006990584933, "grad_norm": 0.5810414552688599, "learning_rate": 1.9872644913939508e-05, "loss": 0.3189, "mean_token_accuracy": 0.9024020984768868, "num_tokens": 55549368.0, "step": 28410 }, { "entropy": 0.2872744478285313, "epoch": 0.43748400516452796, "grad_norm": 0.6617549061775208, "learning_rate": 1.9872359813713738e-05, "loss": 0.2995, "mean_token_accuracy": 0.910946948826313, "num_tokens": 55619328.0, "step": 28420 }, { "entropy": 0.30225573629140856, "epoch": 0.43763794042320653, "grad_norm": 0.5750695466995239, "learning_rate": 1.9872074396778142e-05, "loss": 0.3005, "mean_token_accuracy": 0.9078582726418972, "num_tokens": 55691023.0, "step": 28430 }, { "entropy": 0.3180083871819079, "epoch": 0.4377918756818851, "grad_norm": 1.1316475868225098, "learning_rate": 1.987178866314187e-05, "loss": 0.3182, "mean_token_accuracy": 0.8995433636009693, "num_tokens": 55753086.0, "step": 28440 }, { "entropy": 0.3100363747216761, "epoch": 0.4379458109405637, "grad_norm": 0.7758301496505737, "learning_rate": 1.9871502612814094e-05, "loss": 0.3012, "mean_token_accuracy": 0.9044413529336452, "num_tokens": 55825288.0, "step": 28450 }, { "entropy": 0.29233119944110514, "epoch": 0.43809974619924225, "grad_norm": 0.8904286026954651, "learning_rate": 1.9871216245803985e-05, "loss": 0.2961, "mean_token_accuracy": 0.9099707797169685, "num_tokens": 55890005.0, "step": 28460 }, { "entropy": 0.2984884441830218, "epoch": 0.4382536814579208, "grad_norm": 0.676877498626709, "learning_rate": 1.9870929562120736e-05, "loss": 0.3058, "mean_token_accuracy": 0.9077944241464138, "num_tokens": 55968496.0, "step": 28470 }, { "entropy": 0.28969781631603836, "epoch": 0.4384076167165994, "grad_norm": 0.600100576877594, "learning_rate": 1.9870642561773542e-05, "loss": 0.2856, "mean_token_accuracy": 0.9117626249790192, "num_tokens": 56029617.0, "step": 28480 }, { "entropy": 0.30177498869597913, "epoch": 0.438561551975278, "grad_norm": 0.7708256244659424, "learning_rate": 1.987035524477161e-05, "loss": 0.3162, "mean_token_accuracy": 0.9067918039858341, "num_tokens": 56081467.0, "step": 28490 }, { "entropy": 0.2946401511318982, "epoch": 0.4387154872339566, "grad_norm": 0.6428143382072449, "learning_rate": 1.9870067611124152e-05, "loss": 0.3006, "mean_token_accuracy": 0.9082478113472462, "num_tokens": 56152310.0, "step": 28500 }, { "entropy": 0.3173598114401102, "epoch": 0.43886942249263516, "grad_norm": 0.6372433304786682, "learning_rate": 1.9869779660840404e-05, "loss": 0.3127, "mean_token_accuracy": 0.9033277697861195, "num_tokens": 56218365.0, "step": 28510 }, { "entropy": 0.31257372377440334, "epoch": 0.43902335775131374, "grad_norm": 0.694607675075531, "learning_rate": 1.9869491393929598e-05, "loss": 0.3112, "mean_token_accuracy": 0.9060285426676273, "num_tokens": 56275405.0, "step": 28520 }, { "entropy": 0.2968608407303691, "epoch": 0.4391772930099923, "grad_norm": 0.9398279786109924, "learning_rate": 1.9869202810400986e-05, "loss": 0.3014, "mean_token_accuracy": 0.9097212471067906, "num_tokens": 56344286.0, "step": 28530 }, { "entropy": 0.32035089647397397, "epoch": 0.4393312282686709, "grad_norm": 0.5250552296638489, "learning_rate": 1.986891391026382e-05, "loss": 0.3205, "mean_token_accuracy": 0.9021671369671822, "num_tokens": 56410189.0, "step": 28540 }, { "entropy": 0.2962389885447919, "epoch": 0.4394851635273495, "grad_norm": 0.866380512714386, "learning_rate": 1.9868624693527372e-05, "loss": 0.3037, "mean_token_accuracy": 0.9102123856544495, "num_tokens": 56469548.0, "step": 28550 }, { "entropy": 0.28886409774422644, "epoch": 0.4396390987860281, "grad_norm": 0.5896332263946533, "learning_rate": 1.986833516020092e-05, "loss": 0.2968, "mean_token_accuracy": 0.9109774313867092, "num_tokens": 56539465.0, "step": 28560 }, { "entropy": 0.29662500973790884, "epoch": 0.43979303404470665, "grad_norm": 0.6133100390434265, "learning_rate": 1.9868045310293752e-05, "loss": 0.2943, "mean_token_accuracy": 0.9090922117233277, "num_tokens": 56613217.0, "step": 28570 }, { "entropy": 0.3079217137768865, "epoch": 0.4399469693033852, "grad_norm": 0.5868035554885864, "learning_rate": 1.9867755143815163e-05, "loss": 0.298, "mean_token_accuracy": 0.9058277457952499, "num_tokens": 56681106.0, "step": 28580 }, { "entropy": 0.291802757140249, "epoch": 0.4401009045620638, "grad_norm": 0.915803074836731, "learning_rate": 1.986746466077447e-05, "loss": 0.2975, "mean_token_accuracy": 0.9118874639272689, "num_tokens": 56744551.0, "step": 28590 }, { "entropy": 0.30692765293642876, "epoch": 0.44025483982074237, "grad_norm": 0.5708699226379395, "learning_rate": 1.986717386118098e-05, "loss": 0.2984, "mean_token_accuracy": 0.9057966627180576, "num_tokens": 56821049.0, "step": 28600 }, { "entropy": 0.3117764016613364, "epoch": 0.440408775079421, "grad_norm": 0.7186332941055298, "learning_rate": 1.9866882745044036e-05, "loss": 0.319, "mean_token_accuracy": 0.903804823756218, "num_tokens": 56894251.0, "step": 28610 }, { "entropy": 0.2925815592519939, "epoch": 0.4405627103380996, "grad_norm": 0.6552140116691589, "learning_rate": 1.9866591312372967e-05, "loss": 0.3021, "mean_token_accuracy": 0.910012847930193, "num_tokens": 56952431.0, "step": 28620 }, { "entropy": 0.30592889031395315, "epoch": 0.44071664559677814, "grad_norm": 0.593763530254364, "learning_rate": 1.9866299563177127e-05, "loss": 0.3123, "mean_token_accuracy": 0.9084803774952889, "num_tokens": 57018441.0, "step": 28630 }, { "entropy": 0.3010112361051142, "epoch": 0.4408705808554567, "grad_norm": 0.6523387432098389, "learning_rate": 1.986600749746587e-05, "loss": 0.3026, "mean_token_accuracy": 0.9063879393041134, "num_tokens": 57094208.0, "step": 28640 }, { "entropy": 0.3155913887545466, "epoch": 0.4410245161141353, "grad_norm": 0.717110276222229, "learning_rate": 1.9865715115248573e-05, "loss": 0.3264, "mean_token_accuracy": 0.9019225046038628, "num_tokens": 57161320.0, "step": 28650 }, { "entropy": 0.275880447588861, "epoch": 0.44117845137281386, "grad_norm": 0.6692810654640198, "learning_rate": 1.986542241653461e-05, "loss": 0.2826, "mean_token_accuracy": 0.9145985394716263, "num_tokens": 57232659.0, "step": 28660 }, { "entropy": 0.2877789097838104, "epoch": 0.44133238663149243, "grad_norm": 0.683647632598877, "learning_rate": 1.9865129401333375e-05, "loss": 0.2888, "mean_token_accuracy": 0.9098032839596272, "num_tokens": 57304164.0, "step": 28670 }, { "entropy": 0.3130753099918365, "epoch": 0.44148632189017106, "grad_norm": 0.6183835864067078, "learning_rate": 1.9864836069654262e-05, "loss": 0.3119, "mean_token_accuracy": 0.9034260742366313, "num_tokens": 57375607.0, "step": 28680 }, { "entropy": 0.29262628946453334, "epoch": 0.44164025714884964, "grad_norm": 0.6247296929359436, "learning_rate": 1.986454242150669e-05, "loss": 0.2977, "mean_token_accuracy": 0.9103852964937686, "num_tokens": 57444219.0, "step": 28690 }, { "entropy": 0.2947198929265141, "epoch": 0.4417941924075282, "grad_norm": 0.7496622204780579, "learning_rate": 1.986424845690007e-05, "loss": 0.3026, "mean_token_accuracy": 0.9080938249826431, "num_tokens": 57505233.0, "step": 28700 }, { "entropy": 0.30556996893137695, "epoch": 0.4419481276662068, "grad_norm": 0.5832917094230652, "learning_rate": 1.986395417584384e-05, "loss": 0.2971, "mean_token_accuracy": 0.90803914219141, "num_tokens": 57569858.0, "step": 28710 }, { "entropy": 0.2987947160378098, "epoch": 0.44210206292488535, "grad_norm": 0.7971284985542297, "learning_rate": 1.9863659578347437e-05, "loss": 0.3159, "mean_token_accuracy": 0.9081914946436882, "num_tokens": 57634171.0, "step": 28720 }, { "entropy": 0.2988855724222958, "epoch": 0.4422559981835639, "grad_norm": 0.7451384663581848, "learning_rate": 1.9863364664420313e-05, "loss": 0.2998, "mean_token_accuracy": 0.9080990083515644, "num_tokens": 57704567.0, "step": 28730 }, { "entropy": 0.31083751330152154, "epoch": 0.44240993344224255, "grad_norm": 0.6254254579544067, "learning_rate": 1.986306943407193e-05, "loss": 0.313, "mean_token_accuracy": 0.9058129541575909, "num_tokens": 57778928.0, "step": 28740 }, { "entropy": 0.31624507466331125, "epoch": 0.4425638687009211, "grad_norm": 0.6654564738273621, "learning_rate": 1.986277388731175e-05, "loss": 0.3192, "mean_token_accuracy": 0.9028505608439445, "num_tokens": 57853370.0, "step": 28750 }, { "entropy": 0.3032761816866696, "epoch": 0.4427178039595997, "grad_norm": 0.694149911403656, "learning_rate": 1.9862478024149273e-05, "loss": 0.3052, "mean_token_accuracy": 0.9065084077417851, "num_tokens": 57921970.0, "step": 28760 }, { "entropy": 0.31521760653704406, "epoch": 0.44287173921827827, "grad_norm": 0.6493293642997742, "learning_rate": 1.9862181844593972e-05, "loss": 0.3112, "mean_token_accuracy": 0.9049031369388103, "num_tokens": 57986480.0, "step": 28770 }, { "entropy": 0.30473718056455257, "epoch": 0.44302567447695684, "grad_norm": 0.6290785670280457, "learning_rate": 1.986188534865536e-05, "loss": 0.3123, "mean_token_accuracy": 0.9071151137351989, "num_tokens": 58052576.0, "step": 28780 }, { "entropy": 0.28898841263726355, "epoch": 0.4431796097356354, "grad_norm": 0.887597382068634, "learning_rate": 1.9861588536342943e-05, "loss": 0.2914, "mean_token_accuracy": 0.9103973530232906, "num_tokens": 58107164.0, "step": 28790 }, { "entropy": 0.29435537541285156, "epoch": 0.44333354499431404, "grad_norm": 0.7026850581169128, "learning_rate": 1.9861291407666242e-05, "loss": 0.2931, "mean_token_accuracy": 0.9123429030179977, "num_tokens": 58174943.0, "step": 28800 }, { "entropy": 0.3018002251163125, "epoch": 0.4434874802529926, "grad_norm": 0.892725944519043, "learning_rate": 1.9860993962634797e-05, "loss": 0.3197, "mean_token_accuracy": 0.9061097137629985, "num_tokens": 58240217.0, "step": 28810 }, { "entropy": 0.31441028136759996, "epoch": 0.4436414155116712, "grad_norm": 0.6251412630081177, "learning_rate": 1.9860696201258142e-05, "loss": 0.3201, "mean_token_accuracy": 0.9051227547228337, "num_tokens": 58309532.0, "step": 28820 }, { "entropy": 0.30184568781405685, "epoch": 0.44379535077034976, "grad_norm": 0.6932148933410645, "learning_rate": 1.9860398123545832e-05, "loss": 0.2882, "mean_token_accuracy": 0.9071275673806667, "num_tokens": 58382121.0, "step": 28830 }, { "entropy": 0.30340626183897257, "epoch": 0.44394928602902833, "grad_norm": 0.6082580089569092, "learning_rate": 1.9860099729507428e-05, "loss": 0.3086, "mean_token_accuracy": 0.904612523317337, "num_tokens": 58453999.0, "step": 28840 }, { "entropy": 0.296532632689923, "epoch": 0.4441032212877069, "grad_norm": 0.7113105654716492, "learning_rate": 1.9859801019152508e-05, "loss": 0.3087, "mean_token_accuracy": 0.9078173853456974, "num_tokens": 58525581.0, "step": 28850 }, { "entropy": 0.3178396594710648, "epoch": 0.4442571565463855, "grad_norm": 0.7431472539901733, "learning_rate": 1.9859501992490647e-05, "loss": 0.3128, "mean_token_accuracy": 0.9032329820096493, "num_tokens": 58593594.0, "step": 28860 }, { "entropy": 0.2964533243328333, "epoch": 0.4444110918050641, "grad_norm": 0.6086490750312805, "learning_rate": 1.9859202649531447e-05, "loss": 0.3003, "mean_token_accuracy": 0.9085000336170197, "num_tokens": 58658892.0, "step": 28870 }, { "entropy": 0.3013818201608956, "epoch": 0.4445650270637427, "grad_norm": 0.6247802972793579, "learning_rate": 1.98589029902845e-05, "loss": 0.2981, "mean_token_accuracy": 0.9087717890739441, "num_tokens": 58721621.0, "step": 28880 }, { "entropy": 0.2917134689167142, "epoch": 0.44471896232242125, "grad_norm": 0.7188671827316284, "learning_rate": 1.985860301475943e-05, "loss": 0.3024, "mean_token_accuracy": 0.9088465988636016, "num_tokens": 58786732.0, "step": 28890 }, { "entropy": 0.2999874968081713, "epoch": 0.4448728975810998, "grad_norm": 0.5861514210700989, "learning_rate": 1.9858302722965854e-05, "loss": 0.2943, "mean_token_accuracy": 0.9084188647568225, "num_tokens": 58859890.0, "step": 28900 }, { "entropy": 0.2896307394839823, "epoch": 0.4450268328397784, "grad_norm": 0.6567462086677551, "learning_rate": 1.9858002114913403e-05, "loss": 0.3049, "mean_token_accuracy": 0.9113890707492829, "num_tokens": 58925387.0, "step": 28910 }, { "entropy": 0.3103972556069493, "epoch": 0.44518076809845697, "grad_norm": 0.6924648284912109, "learning_rate": 1.985770119061173e-05, "loss": 0.3076, "mean_token_accuracy": 0.9046959608793259, "num_tokens": 58992636.0, "step": 28920 }, { "entropy": 0.29806662863120437, "epoch": 0.4453347033571356, "grad_norm": 0.6541290283203125, "learning_rate": 1.985739995007048e-05, "loss": 0.3017, "mean_token_accuracy": 0.9089355811476707, "num_tokens": 59064729.0, "step": 28930 }, { "entropy": 0.3146005506627262, "epoch": 0.44548863861581417, "grad_norm": 0.5870431661605835, "learning_rate": 1.9857098393299323e-05, "loss": 0.325, "mean_token_accuracy": 0.9030097223818302, "num_tokens": 59131716.0, "step": 28940 }, { "entropy": 0.2921004729345441, "epoch": 0.44564257387449274, "grad_norm": 0.7528036236763, "learning_rate": 1.985679652030793e-05, "loss": 0.2883, "mean_token_accuracy": 0.9139623299241066, "num_tokens": 59198524.0, "step": 28950 }, { "entropy": 0.30768423415720464, "epoch": 0.4457965091331713, "grad_norm": 0.6082051396369934, "learning_rate": 1.9856494331105984e-05, "loss": 0.3158, "mean_token_accuracy": 0.9066430293023586, "num_tokens": 59267326.0, "step": 28960 }, { "entropy": 0.2919261667877436, "epoch": 0.4459504443918499, "grad_norm": 0.6438739895820618, "learning_rate": 1.9856191825703184e-05, "loss": 0.2992, "mean_token_accuracy": 0.9089902549982071, "num_tokens": 59338773.0, "step": 28970 }, { "entropy": 0.30610564080998304, "epoch": 0.44610437965052846, "grad_norm": 0.6748772263526917, "learning_rate": 1.985588900410923e-05, "loss": 0.3123, "mean_token_accuracy": 0.9070577278733254, "num_tokens": 59405519.0, "step": 28980 }, { "entropy": 0.29516994692385196, "epoch": 0.4462583149092071, "grad_norm": 0.5416553616523743, "learning_rate": 1.9855585866333835e-05, "loss": 0.3024, "mean_token_accuracy": 0.9088552549481392, "num_tokens": 59468446.0, "step": 28990 }, { "entropy": 0.30894982600584625, "epoch": 0.44641225016788566, "grad_norm": 0.5850502848625183, "learning_rate": 1.9855282412386732e-05, "loss": 0.3258, "mean_token_accuracy": 0.9066998146474361, "num_tokens": 59532625.0, "step": 29000 }, { "entropy": 0.3049947296269238, "epoch": 0.44656618542656423, "grad_norm": 0.6210456490516663, "learning_rate": 1.9854978642277645e-05, "loss": 0.304, "mean_token_accuracy": 0.9058672487735748, "num_tokens": 59595079.0, "step": 29010 }, { "entropy": 0.301895852945745, "epoch": 0.4467201206852428, "grad_norm": 0.8129467964172363, "learning_rate": 1.985467455601633e-05, "loss": 0.3144, "mean_token_accuracy": 0.9065854839980603, "num_tokens": 59653892.0, "step": 29020 }, { "entropy": 0.29538535373285413, "epoch": 0.4468740559439214, "grad_norm": 0.6321992874145508, "learning_rate": 1.9854370153612537e-05, "loss": 0.2901, "mean_token_accuracy": 0.9078738830983639, "num_tokens": 59716436.0, "step": 29030 }, { "entropy": 0.30855622980743647, "epoch": 0.44702799120259995, "grad_norm": 0.5928578972816467, "learning_rate": 1.9854065435076028e-05, "loss": 0.3142, "mean_token_accuracy": 0.905761593580246, "num_tokens": 59783976.0, "step": 29040 }, { "entropy": 0.29745510322973134, "epoch": 0.4471819264612785, "grad_norm": 0.6369696855545044, "learning_rate": 1.9853760400416588e-05, "loss": 0.3025, "mean_token_accuracy": 0.9100111283361911, "num_tokens": 59846072.0, "step": 29050 }, { "entropy": 0.2930631073191762, "epoch": 0.44733586171995715, "grad_norm": 0.6534851789474487, "learning_rate": 1.985345504964399e-05, "loss": 0.3038, "mean_token_accuracy": 0.9116827823221684, "num_tokens": 59910115.0, "step": 29060 }, { "entropy": 0.3249824708327651, "epoch": 0.4474897969786357, "grad_norm": 0.6103017926216125, "learning_rate": 1.985314938276804e-05, "loss": 0.3215, "mean_token_accuracy": 0.9005982533097268, "num_tokens": 59979263.0, "step": 29070 }, { "entropy": 0.29704350205138325, "epoch": 0.4476437322373143, "grad_norm": 0.751054584980011, "learning_rate": 1.9852843399798543e-05, "loss": 0.3005, "mean_token_accuracy": 0.9085594959557056, "num_tokens": 60043546.0, "step": 29080 }, { "entropy": 0.2819291467778385, "epoch": 0.44779766749599287, "grad_norm": 0.7030400037765503, "learning_rate": 1.9852537100745307e-05, "loss": 0.2868, "mean_token_accuracy": 0.913824837654829, "num_tokens": 60115956.0, "step": 29090 }, { "entropy": 0.32913051573559643, "epoch": 0.44795160275467144, "grad_norm": 0.7451030015945435, "learning_rate": 1.985223048561817e-05, "loss": 0.3282, "mean_token_accuracy": 0.8985949918627739, "num_tokens": 60176119.0, "step": 29100 }, { "entropy": 0.31434405837208035, "epoch": 0.44810553801335, "grad_norm": 0.547315776348114, "learning_rate": 1.985192355442696e-05, "loss": 0.3053, "mean_token_accuracy": 0.905899553000927, "num_tokens": 60247846.0, "step": 29110 }, { "entropy": 0.2999236064963043, "epoch": 0.44825947327202864, "grad_norm": 0.8559381365776062, "learning_rate": 1.985161630718152e-05, "loss": 0.3146, "mean_token_accuracy": 0.9059007287025451, "num_tokens": 60314995.0, "step": 29120 }, { "entropy": 0.319025363586843, "epoch": 0.4484134085307072, "grad_norm": 0.7059308886528015, "learning_rate": 1.9851308743891717e-05, "loss": 0.3196, "mean_token_accuracy": 0.9030059687793255, "num_tokens": 60386957.0, "step": 29130 }, { "entropy": 0.3207985511049628, "epoch": 0.4485673437893858, "grad_norm": 0.6827106475830078, "learning_rate": 1.9851000864567417e-05, "loss": 0.3152, "mean_token_accuracy": 0.904264011234045, "num_tokens": 60457841.0, "step": 29140 }, { "entropy": 0.303292285092175, "epoch": 0.44872127904806436, "grad_norm": 0.6456571221351624, "learning_rate": 1.9850692669218486e-05, "loss": 0.2972, "mean_token_accuracy": 0.9067245699465275, "num_tokens": 60531257.0, "step": 29150 }, { "entropy": 0.3061292240396142, "epoch": 0.44887521430674293, "grad_norm": 0.7768036127090454, "learning_rate": 1.985038415785482e-05, "loss": 0.3096, "mean_token_accuracy": 0.9027407303452492, "num_tokens": 60593701.0, "step": 29160 }, { "entropy": 0.3175658782012761, "epoch": 0.4490291495654215, "grad_norm": 0.8002991676330566, "learning_rate": 1.9850075330486316e-05, "loss": 0.3265, "mean_token_accuracy": 0.9028694830834866, "num_tokens": 60656754.0, "step": 29170 }, { "entropy": 0.32925739251077174, "epoch": 0.44918308482410013, "grad_norm": 0.863787829875946, "learning_rate": 1.9849766187122878e-05, "loss": 0.3277, "mean_token_accuracy": 0.902561642229557, "num_tokens": 60717944.0, "step": 29180 }, { "entropy": 0.316574676707387, "epoch": 0.4493370200827787, "grad_norm": 0.8121773600578308, "learning_rate": 1.9849456727774425e-05, "loss": 0.3223, "mean_token_accuracy": 0.904185576736927, "num_tokens": 60784061.0, "step": 29190 }, { "entropy": 0.3074906338937581, "epoch": 0.4494909553414573, "grad_norm": 0.7498548626899719, "learning_rate": 1.9849146952450885e-05, "loss": 0.3083, "mean_token_accuracy": 0.9058119609951973, "num_tokens": 60850850.0, "step": 29200 }, { "entropy": 0.3091260138899088, "epoch": 0.44964489060013585, "grad_norm": 0.6473590731620789, "learning_rate": 1.9848836861162196e-05, "loss": 0.3151, "mean_token_accuracy": 0.9029917061328888, "num_tokens": 60925014.0, "step": 29210 }, { "entropy": 0.2923276557587087, "epoch": 0.4497988258588144, "grad_norm": 0.865498423576355, "learning_rate": 1.9848526453918305e-05, "loss": 0.2866, "mean_token_accuracy": 0.9099730759859085, "num_tokens": 60986899.0, "step": 29220 }, { "entropy": 0.3025450834073126, "epoch": 0.449952761117493, "grad_norm": 0.7002219557762146, "learning_rate": 1.984821573072917e-05, "loss": 0.3102, "mean_token_accuracy": 0.9083499535918236, "num_tokens": 61055865.0, "step": 29230 }, { "entropy": 0.3098035240545869, "epoch": 0.45010669637617157, "grad_norm": 0.8052839040756226, "learning_rate": 1.9847904691604757e-05, "loss": 0.3032, "mean_token_accuracy": 0.9042614385485649, "num_tokens": 61135215.0, "step": 29240 }, { "entropy": 0.28917374052107336, "epoch": 0.4502606316348502, "grad_norm": 0.6044452786445618, "learning_rate": 1.984759333655505e-05, "loss": 0.2911, "mean_token_accuracy": 0.9118543662130832, "num_tokens": 61202309.0, "step": 29250 }, { "entropy": 0.28131883265450597, "epoch": 0.45041456689352877, "grad_norm": 0.7777024507522583, "learning_rate": 1.9847281665590033e-05, "loss": 0.2871, "mean_token_accuracy": 0.9127372182905674, "num_tokens": 61270918.0, "step": 29260 }, { "entropy": 0.3153162945993245, "epoch": 0.45056850215220734, "grad_norm": 0.865909993648529, "learning_rate": 1.9846969678719705e-05, "loss": 0.3077, "mean_token_accuracy": 0.9035217605531216, "num_tokens": 61341417.0, "step": 29270 }, { "entropy": 0.30344665329903364, "epoch": 0.4507224374108859, "grad_norm": 0.6097361445426941, "learning_rate": 1.9846657375954076e-05, "loss": 0.299, "mean_token_accuracy": 0.9082898259162903, "num_tokens": 61409527.0, "step": 29280 }, { "entropy": 0.31243055323138835, "epoch": 0.4508763726695645, "grad_norm": 0.6643504500389099, "learning_rate": 1.9846344757303163e-05, "loss": 0.3242, "mean_token_accuracy": 0.9008510485291481, "num_tokens": 61472612.0, "step": 29290 }, { "entropy": 0.3004996013827622, "epoch": 0.45103030792824306, "grad_norm": 0.5530785322189331, "learning_rate": 1.9846031822776998e-05, "loss": 0.2959, "mean_token_accuracy": 0.9084445856511593, "num_tokens": 61541496.0, "step": 29300 }, { "entropy": 0.3023988394066691, "epoch": 0.4511842431869217, "grad_norm": 0.7500281929969788, "learning_rate": 1.9845718572385616e-05, "loss": 0.309, "mean_token_accuracy": 0.9095490470528602, "num_tokens": 61603848.0, "step": 29310 }, { "entropy": 0.30169574739411475, "epoch": 0.45133817844560026, "grad_norm": 0.7373437285423279, "learning_rate": 1.984540500613907e-05, "loss": 0.3034, "mean_token_accuracy": 0.9069956846535205, "num_tokens": 61667277.0, "step": 29320 }, { "entropy": 0.2976364830508828, "epoch": 0.45149211370427883, "grad_norm": 0.5996589064598083, "learning_rate": 1.9845091124047412e-05, "loss": 0.3021, "mean_token_accuracy": 0.9090096920728683, "num_tokens": 61735068.0, "step": 29330 }, { "entropy": 0.28087912816554306, "epoch": 0.4516460489629574, "grad_norm": 0.7080607414245605, "learning_rate": 1.9844776926120723e-05, "loss": 0.2935, "mean_token_accuracy": 0.9131565697491169, "num_tokens": 61796703.0, "step": 29340 }, { "entropy": 0.29027365408837796, "epoch": 0.451799984221636, "grad_norm": 0.6711325645446777, "learning_rate": 1.9844462412369075e-05, "loss": 0.3097, "mean_token_accuracy": 0.9104976020753384, "num_tokens": 61858428.0, "step": 29350 }, { "entropy": 0.30400209976360204, "epoch": 0.45195391948031455, "grad_norm": 0.5867029428482056, "learning_rate": 1.984414758280256e-05, "loss": 0.2995, "mean_token_accuracy": 0.9090206064283848, "num_tokens": 61919004.0, "step": 29360 }, { "entropy": 0.28924821661785244, "epoch": 0.4521078547389932, "grad_norm": 0.6967398524284363, "learning_rate": 1.984383243743128e-05, "loss": 0.2862, "mean_token_accuracy": 0.9121098063886166, "num_tokens": 61982192.0, "step": 29370 }, { "entropy": 0.3086257208138704, "epoch": 0.45226178999767175, "grad_norm": 0.6778244376182556, "learning_rate": 1.9843516976265338e-05, "loss": 0.3198, "mean_token_accuracy": 0.902478751540184, "num_tokens": 62056187.0, "step": 29380 }, { "entropy": 0.31649177707731724, "epoch": 0.4524157252563503, "grad_norm": 0.6964280009269714, "learning_rate": 1.984320119931486e-05, "loss": 0.31, "mean_token_accuracy": 0.9026323653757572, "num_tokens": 62117773.0, "step": 29390 }, { "entropy": 0.30377511531114576, "epoch": 0.4525696605150289, "grad_norm": 0.8224655985832214, "learning_rate": 1.9842885106589974e-05, "loss": 0.3026, "mean_token_accuracy": 0.9080650880932808, "num_tokens": 62182374.0, "step": 29400 }, { "entropy": 0.2802377752959728, "epoch": 0.45272359577370747, "grad_norm": 0.5336626172065735, "learning_rate": 1.9842568698100822e-05, "loss": 0.2807, "mean_token_accuracy": 0.9117032133042813, "num_tokens": 62253223.0, "step": 29410 }, { "entropy": 0.2972984932363033, "epoch": 0.45287753103238604, "grad_norm": 0.6842454075813293, "learning_rate": 1.9842251973857555e-05, "loss": 0.3004, "mean_token_accuracy": 0.9076485842466354, "num_tokens": 62310083.0, "step": 29420 }, { "entropy": 0.29256243659183384, "epoch": 0.4530314662910646, "grad_norm": 0.7037567496299744, "learning_rate": 1.984193493387033e-05, "loss": 0.3099, "mean_token_accuracy": 0.9098985247313977, "num_tokens": 62375638.0, "step": 29430 }, { "entropy": 0.307468253839761, "epoch": 0.45318540154974324, "grad_norm": 0.5674394965171814, "learning_rate": 1.984161757814932e-05, "loss": 0.3066, "mean_token_accuracy": 0.9072710834443569, "num_tokens": 62452877.0, "step": 29440 }, { "entropy": 0.30799544462934136, "epoch": 0.4533393368084218, "grad_norm": 0.8445190191268921, "learning_rate": 1.984129990670471e-05, "loss": 0.312, "mean_token_accuracy": 0.9033519878983498, "num_tokens": 62515923.0, "step": 29450 }, { "entropy": 0.31119491066783667, "epoch": 0.4534932720671004, "grad_norm": 0.5917114615440369, "learning_rate": 1.9840981919546686e-05, "loss": 0.3137, "mean_token_accuracy": 0.9047664202749729, "num_tokens": 62579911.0, "step": 29460 }, { "entropy": 0.28653032118454574, "epoch": 0.45364720732577896, "grad_norm": 0.5478231906890869, "learning_rate": 1.9840663616685448e-05, "loss": 0.3021, "mean_token_accuracy": 0.9126445792615414, "num_tokens": 62644143.0, "step": 29470 }, { "entropy": 0.30945668844506147, "epoch": 0.45380114258445753, "grad_norm": 0.5576807260513306, "learning_rate": 1.9840344998131212e-05, "loss": 0.3066, "mean_token_accuracy": 0.905463045835495, "num_tokens": 62710362.0, "step": 29480 }, { "entropy": 0.2928053751587868, "epoch": 0.4539550778431361, "grad_norm": 0.658364474773407, "learning_rate": 1.9840026063894193e-05, "loss": 0.301, "mean_token_accuracy": 0.910303134471178, "num_tokens": 62775125.0, "step": 29490 }, { "entropy": 0.29060793658718465, "epoch": 0.45410901310181473, "grad_norm": 0.71396404504776, "learning_rate": 1.983970681398463e-05, "loss": 0.2967, "mean_token_accuracy": 0.9093640737235547, "num_tokens": 62849402.0, "step": 29500 }, { "entropy": 0.31657837573438885, "epoch": 0.4542629483604933, "grad_norm": 0.7203885912895203, "learning_rate": 1.983938724841276e-05, "loss": 0.3166, "mean_token_accuracy": 0.9039005443453789, "num_tokens": 62915846.0, "step": 29510 }, { "entropy": 0.3274116553366184, "epoch": 0.4544168836191719, "grad_norm": 0.5895305871963501, "learning_rate": 1.9839067367188834e-05, "loss": 0.3195, "mean_token_accuracy": 0.8999674245715141, "num_tokens": 62984776.0, "step": 29520 }, { "entropy": 0.31508256727829576, "epoch": 0.45457081887785045, "grad_norm": 0.8389813899993896, "learning_rate": 1.9838747170323122e-05, "loss": 0.3282, "mean_token_accuracy": 0.9027167774736882, "num_tokens": 63047175.0, "step": 29530 }, { "entropy": 0.3072539241053164, "epoch": 0.454724754136529, "grad_norm": 0.6283929944038391, "learning_rate": 1.9838426657825886e-05, "loss": 0.302, "mean_token_accuracy": 0.9089178368449211, "num_tokens": 63116382.0, "step": 29540 }, { "entropy": 0.31097256811335683, "epoch": 0.4548786893952076, "grad_norm": 1.12034273147583, "learning_rate": 1.9838105829707413e-05, "loss": 0.3138, "mean_token_accuracy": 0.9057494275271892, "num_tokens": 63180050.0, "step": 29550 }, { "entropy": 0.2981690724380314, "epoch": 0.45503262465388616, "grad_norm": 0.6619350910186768, "learning_rate": 1.9837784685977996e-05, "loss": 0.3088, "mean_token_accuracy": 0.9057453200221062, "num_tokens": 63244221.0, "step": 29560 }, { "entropy": 0.28427096279338004, "epoch": 0.4551865599125648, "grad_norm": 0.7236504554748535, "learning_rate": 1.9837463226647934e-05, "loss": 0.2973, "mean_token_accuracy": 0.9134487137198448, "num_tokens": 63308333.0, "step": 29570 }, { "entropy": 0.28015727223828435, "epoch": 0.45534049517124336, "grad_norm": 0.6327548027038574, "learning_rate": 1.983714145172754e-05, "loss": 0.2861, "mean_token_accuracy": 0.914226806908846, "num_tokens": 63366841.0, "step": 29580 }, { "entropy": 0.28053792743012307, "epoch": 0.45549443042992194, "grad_norm": 0.5712384581565857, "learning_rate": 1.983681936122714e-05, "loss": 0.2973, "mean_token_accuracy": 0.9133526712656022, "num_tokens": 63441240.0, "step": 29590 }, { "entropy": 0.31586148831993344, "epoch": 0.4556483656886005, "grad_norm": 0.7099204659461975, "learning_rate": 1.9836496955157067e-05, "loss": 0.3143, "mean_token_accuracy": 0.902186457067728, "num_tokens": 63514055.0, "step": 29600 }, { "entropy": 0.31404051911085845, "epoch": 0.4558023009472791, "grad_norm": 0.7009071111679077, "learning_rate": 1.983617423352766e-05, "loss": 0.3006, "mean_token_accuracy": 0.9037651091814041, "num_tokens": 63583068.0, "step": 29610 }, { "entropy": 0.3099587867036462, "epoch": 0.45595623620595765, "grad_norm": 0.6110183596611023, "learning_rate": 1.9835851196349275e-05, "loss": 0.3113, "mean_token_accuracy": 0.9014001950621605, "num_tokens": 63652108.0, "step": 29620 }, { "entropy": 0.3087578701786697, "epoch": 0.4561101714646363, "grad_norm": 0.6707711815834045, "learning_rate": 1.9835527843632273e-05, "loss": 0.3071, "mean_token_accuracy": 0.9062794752418994, "num_tokens": 63716853.0, "step": 29630 }, { "entropy": 0.2938090811483562, "epoch": 0.45626410672331486, "grad_norm": 0.896425724029541, "learning_rate": 1.983520417538703e-05, "loss": 0.2901, "mean_token_accuracy": 0.9113183647394181, "num_tokens": 63768892.0, "step": 29640 }, { "entropy": 0.3110604747198522, "epoch": 0.4564180419819934, "grad_norm": 0.9240877628326416, "learning_rate": 1.9834880191623926e-05, "loss": 0.3119, "mean_token_accuracy": 0.9048765704035759, "num_tokens": 63836551.0, "step": 29650 }, { "entropy": 0.3127949514426291, "epoch": 0.456571977240672, "grad_norm": 0.673550546169281, "learning_rate": 1.983455589235336e-05, "loss": 0.3146, "mean_token_accuracy": 0.9047091633081437, "num_tokens": 63900799.0, "step": 29660 }, { "entropy": 0.3012548857368529, "epoch": 0.4567259124993506, "grad_norm": 0.5806161165237427, "learning_rate": 1.983423127758573e-05, "loss": 0.3102, "mean_token_accuracy": 0.9085797987878322, "num_tokens": 63966870.0, "step": 29670 }, { "entropy": 0.29981685457751156, "epoch": 0.45687984775802915, "grad_norm": 0.5300203561782837, "learning_rate": 1.983390634733145e-05, "loss": 0.3101, "mean_token_accuracy": 0.9069880560040474, "num_tokens": 64028265.0, "step": 29680 }, { "entropy": 0.30738999117165805, "epoch": 0.4570337830167078, "grad_norm": 0.6562466621398926, "learning_rate": 1.983358110160095e-05, "loss": 0.3084, "mean_token_accuracy": 0.9048863239586353, "num_tokens": 64088420.0, "step": 29690 }, { "entropy": 0.29656814616173505, "epoch": 0.45718771827538635, "grad_norm": 0.6376783847808838, "learning_rate": 1.9833255540404654e-05, "loss": 0.2994, "mean_token_accuracy": 0.9093433983623982, "num_tokens": 64151636.0, "step": 29700 }, { "entropy": 0.29243988879024985, "epoch": 0.4573416535340649, "grad_norm": 0.8653566837310791, "learning_rate": 1.983292966375302e-05, "loss": 0.3047, "mean_token_accuracy": 0.9096893608570099, "num_tokens": 64217960.0, "step": 29710 }, { "entropy": 0.2973176481202245, "epoch": 0.4574955887927435, "grad_norm": 0.6606300473213196, "learning_rate": 1.983260347165649e-05, "loss": 0.3026, "mean_token_accuracy": 0.9110989332199096, "num_tokens": 64277639.0, "step": 29720 }, { "entropy": 0.31443728329613807, "epoch": 0.45764952405142206, "grad_norm": 0.698207676410675, "learning_rate": 1.983227696412553e-05, "loss": 0.3138, "mean_token_accuracy": 0.9037510134279728, "num_tokens": 64350995.0, "step": 29730 }, { "entropy": 0.2880153788253665, "epoch": 0.45780345931010064, "grad_norm": 0.6003054976463318, "learning_rate": 1.9831950141170623e-05, "loss": 0.2925, "mean_token_accuracy": 0.9100025683641434, "num_tokens": 64419039.0, "step": 29740 }, { "entropy": 0.30148938270285724, "epoch": 0.4579573945687792, "grad_norm": 0.572963535785675, "learning_rate": 1.9831623002802244e-05, "loss": 0.304, "mean_token_accuracy": 0.9072396449744702, "num_tokens": 64486394.0, "step": 29750 }, { "entropy": 0.3156216263771057, "epoch": 0.45811132982745784, "grad_norm": 0.5749273300170898, "learning_rate": 1.9831295549030894e-05, "loss": 0.3077, "mean_token_accuracy": 0.9018870994448662, "num_tokens": 64559068.0, "step": 29760 }, { "entropy": 0.31528396941721437, "epoch": 0.4582652650861364, "grad_norm": 0.6326513290405273, "learning_rate": 1.9830967779867073e-05, "loss": 0.3158, "mean_token_accuracy": 0.9024253711104393, "num_tokens": 64622697.0, "step": 29770 }, { "entropy": 0.29280639700591565, "epoch": 0.458419200344815, "grad_norm": 0.8235968351364136, "learning_rate": 1.98306396953213e-05, "loss": 0.2949, "mean_token_accuracy": 0.9108583323657513, "num_tokens": 64689012.0, "step": 29780 }, { "entropy": 0.26688538305461407, "epoch": 0.45857313560349355, "grad_norm": 0.6659470796585083, "learning_rate": 1.9830311295404098e-05, "loss": 0.2769, "mean_token_accuracy": 0.9178809799253941, "num_tokens": 64756311.0, "step": 29790 }, { "entropy": 0.28948162412270906, "epoch": 0.4587270708621721, "grad_norm": 0.5763674378395081, "learning_rate": 1.9829982580126e-05, "loss": 0.304, "mean_token_accuracy": 0.9078711196780205, "num_tokens": 64819745.0, "step": 29800 }, { "entropy": 0.2950340843759477, "epoch": 0.4588810061208507, "grad_norm": 0.5920685529708862, "learning_rate": 1.982965354949756e-05, "loss": 0.3048, "mean_token_accuracy": 0.9088689409196377, "num_tokens": 64887886.0, "step": 29810 }, { "entropy": 0.29925672747194765, "epoch": 0.4590349413795293, "grad_norm": 0.6929040551185608, "learning_rate": 1.9829324203529324e-05, "loss": 0.2997, "mean_token_accuracy": 0.9094645716249943, "num_tokens": 64952048.0, "step": 29820 }, { "entropy": 0.3130037005059421, "epoch": 0.4591888766382079, "grad_norm": 0.6346549391746521, "learning_rate": 1.9828994542231864e-05, "loss": 0.3044, "mean_token_accuracy": 0.9047832995653152, "num_tokens": 65019984.0, "step": 29830 }, { "entropy": 0.288733293954283, "epoch": 0.45934281189688647, "grad_norm": 0.6864654421806335, "learning_rate": 1.982866456561575e-05, "loss": 0.3065, "mean_token_accuracy": 0.9109384119510651, "num_tokens": 65079130.0, "step": 29840 }, { "entropy": 0.2839735187590122, "epoch": 0.45949674715556504, "grad_norm": 0.8237162232398987, "learning_rate": 1.9828334273691572e-05, "loss": 0.2973, "mean_token_accuracy": 0.912580319494009, "num_tokens": 65142831.0, "step": 29850 }, { "entropy": 0.30343748042359947, "epoch": 0.4596506824142436, "grad_norm": 0.703399121761322, "learning_rate": 1.9828003666469923e-05, "loss": 0.3007, "mean_token_accuracy": 0.9085575707256794, "num_tokens": 65215209.0, "step": 29860 }, { "entropy": 0.31781340287998316, "epoch": 0.4598046176729222, "grad_norm": 0.7215396761894226, "learning_rate": 1.982767274396141e-05, "loss": 0.3252, "mean_token_accuracy": 0.901151405274868, "num_tokens": 65283696.0, "step": 29870 }, { "entropy": 0.2979705319739878, "epoch": 0.4599585529316008, "grad_norm": 0.5989993810653687, "learning_rate": 1.9827341506176655e-05, "loss": 0.3125, "mean_token_accuracy": 0.9096525825560093, "num_tokens": 65354276.0, "step": 29880 }, { "entropy": 0.29098656475543977, "epoch": 0.4601124881902794, "grad_norm": 0.5741584897041321, "learning_rate": 1.9827009953126277e-05, "loss": 0.2966, "mean_token_accuracy": 0.9110781587660313, "num_tokens": 65416908.0, "step": 29890 }, { "entropy": 0.30100529063493014, "epoch": 0.46026642344895796, "grad_norm": 0.6329573392868042, "learning_rate": 1.9826678084820908e-05, "loss": 0.308, "mean_token_accuracy": 0.9070828765630722, "num_tokens": 65485243.0, "step": 29900 }, { "entropy": 0.3045049454085529, "epoch": 0.46042035870763653, "grad_norm": 0.5450248122215271, "learning_rate": 1.9826345901271208e-05, "loss": 0.3101, "mean_token_accuracy": 0.9065420985221863, "num_tokens": 65559892.0, "step": 29910 }, { "entropy": 0.2951405904255807, "epoch": 0.4605742939663151, "grad_norm": 0.5935929417610168, "learning_rate": 1.9826013402487825e-05, "loss": 0.2953, "mean_token_accuracy": 0.9090284019708633, "num_tokens": 65628445.0, "step": 29920 }, { "entropy": 0.3026703329756856, "epoch": 0.4607282292249937, "grad_norm": 0.6233932375907898, "learning_rate": 1.9825680588481426e-05, "loss": 0.3158, "mean_token_accuracy": 0.9076934732496739, "num_tokens": 65693428.0, "step": 29930 }, { "entropy": 0.2958178989589214, "epoch": 0.46088216448367225, "grad_norm": 0.7505480647087097, "learning_rate": 1.982534745926269e-05, "loss": 0.2869, "mean_token_accuracy": 0.9103815771639348, "num_tokens": 65758534.0, "step": 29940 }, { "entropy": 0.2950725669041276, "epoch": 0.4610360997423509, "grad_norm": 0.6315085887908936, "learning_rate": 1.9825014014842303e-05, "loss": 0.3024, "mean_token_accuracy": 0.9065883740782738, "num_tokens": 65826365.0, "step": 29950 }, { "entropy": 0.2975346979685128, "epoch": 0.46119003500102945, "grad_norm": 0.6505082845687866, "learning_rate": 1.982468025523096e-05, "loss": 0.3045, "mean_token_accuracy": 0.9088601671159268, "num_tokens": 65888659.0, "step": 29960 }, { "entropy": 0.29655126109719276, "epoch": 0.461343970259708, "grad_norm": 0.5935437679290771, "learning_rate": 1.9824346180439375e-05, "loss": 0.3, "mean_token_accuracy": 0.9086975947022438, "num_tokens": 65960572.0, "step": 29970 }, { "entropy": 0.2958206618204713, "epoch": 0.4614979055183866, "grad_norm": 0.7048397660255432, "learning_rate": 1.9824011790478253e-05, "loss": 0.3027, "mean_token_accuracy": 0.9102879844605922, "num_tokens": 66023082.0, "step": 29980 }, { "entropy": 0.3023723097518086, "epoch": 0.46165184077706517, "grad_norm": 0.5803673267364502, "learning_rate": 1.9823677085358334e-05, "loss": 0.3068, "mean_token_accuracy": 0.9073477126657963, "num_tokens": 66096767.0, "step": 29990 }, { "entropy": 0.3139727140776813, "epoch": 0.46180577603574374, "grad_norm": 0.5721963047981262, "learning_rate": 1.982334206509035e-05, "loss": 0.3196, "mean_token_accuracy": 0.9049571484327317, "num_tokens": 66159835.0, "step": 30000 }, { "epoch": 0.46180577603574374, "eval_entropy": 0.3057652737870312, "eval_loss": 0.30058613419532776, "eval_mean_token_accuracy": 0.9075549942751573, "eval_num_tokens": 66159835.0, "eval_runtime": 7811.0864, "eval_samples_per_second": 4.158, "eval_steps_per_second": 4.158, "step": 30000 }, { "entropy": 0.30673301182687285, "epoch": 0.46195971129442237, "grad_norm": 0.6071364879608154, "learning_rate": 1.982300672968505e-05, "loss": 0.2956, "mean_token_accuracy": 0.9075291194021702, "num_tokens": 66222553.0, "step": 30010 }, { "entropy": 0.2995689935050905, "epoch": 0.46211364655310094, "grad_norm": 0.5297085046768188, "learning_rate": 1.982267107915319e-05, "loss": 0.3044, "mean_token_accuracy": 0.9066752791404724, "num_tokens": 66287732.0, "step": 30020 }, { "entropy": 0.295447439327836, "epoch": 0.4622675818117795, "grad_norm": 0.5436554551124573, "learning_rate": 1.9822335113505536e-05, "loss": 0.2935, "mean_token_accuracy": 0.9093317851424217, "num_tokens": 66354969.0, "step": 30030 }, { "entropy": 0.2958836914971471, "epoch": 0.4624215170704581, "grad_norm": 0.9278629422187805, "learning_rate": 1.982199883275287e-05, "loss": 0.3032, "mean_token_accuracy": 0.9090538255870342, "num_tokens": 66416943.0, "step": 30040 }, { "entropy": 0.2975347269326448, "epoch": 0.46257545232913666, "grad_norm": 0.715113639831543, "learning_rate": 1.9821662236905977e-05, "loss": 0.3081, "mean_token_accuracy": 0.9096461817622185, "num_tokens": 66478616.0, "step": 30050 }, { "entropy": 0.32718521263450384, "epoch": 0.46272938758781523, "grad_norm": 0.754324734210968, "learning_rate": 1.9821325325975657e-05, "loss": 0.3122, "mean_token_accuracy": 0.9011578515172005, "num_tokens": 66543174.0, "step": 30060 }, { "entropy": 0.28607221636921165, "epoch": 0.46288332284649386, "grad_norm": 0.8830543160438538, "learning_rate": 1.9820988099972716e-05, "loss": 0.2886, "mean_token_accuracy": 0.9115344598889351, "num_tokens": 66611588.0, "step": 30070 }, { "entropy": 0.28600289607420565, "epoch": 0.46303725810517243, "grad_norm": 0.8703383803367615, "learning_rate": 1.982065055890798e-05, "loss": 0.2997, "mean_token_accuracy": 0.910519178956747, "num_tokens": 66683963.0, "step": 30080 }, { "entropy": 0.2932286445051432, "epoch": 0.463191193363851, "grad_norm": 0.5818918347358704, "learning_rate": 1.982031270279227e-05, "loss": 0.2997, "mean_token_accuracy": 0.9096867449581623, "num_tokens": 66760152.0, "step": 30090 }, { "entropy": 0.30388313038274645, "epoch": 0.4633451286225296, "grad_norm": 0.6479551792144775, "learning_rate": 1.981997453163642e-05, "loss": 0.3004, "mean_token_accuracy": 0.9086802072823048, "num_tokens": 66823338.0, "step": 30100 }, { "entropy": 0.3181132422760129, "epoch": 0.46349906388120815, "grad_norm": 0.6190195679664612, "learning_rate": 1.981963604545129e-05, "loss": 0.3048, "mean_token_accuracy": 0.901131883263588, "num_tokens": 66897598.0, "step": 30110 }, { "entropy": 0.2993926964700222, "epoch": 0.4636529991398867, "grad_norm": 0.6647181510925293, "learning_rate": 1.9819297244247734e-05, "loss": 0.3034, "mean_token_accuracy": 0.9053478203713894, "num_tokens": 66964900.0, "step": 30120 }, { "entropy": 0.3046038533560932, "epoch": 0.4638069343985653, "grad_norm": 0.5355188250541687, "learning_rate": 1.981895812803662e-05, "loss": 0.291, "mean_token_accuracy": 0.9080597810447216, "num_tokens": 67033182.0, "step": 30130 }, { "entropy": 0.30405068062245844, "epoch": 0.4639608696572439, "grad_norm": 0.5373644828796387, "learning_rate": 1.9818618696828828e-05, "loss": 0.3009, "mean_token_accuracy": 0.9067034229636193, "num_tokens": 67096916.0, "step": 30140 }, { "entropy": 0.2914829980581999, "epoch": 0.4641148049159225, "grad_norm": 0.6738107204437256, "learning_rate": 1.9818278950635247e-05, "loss": 0.2911, "mean_token_accuracy": 0.9123899295926094, "num_tokens": 67163172.0, "step": 30150 }, { "entropy": 0.2923617331311107, "epoch": 0.46426874017460107, "grad_norm": 0.937311589717865, "learning_rate": 1.9817938889466773e-05, "loss": 0.3089, "mean_token_accuracy": 0.9075548678636551, "num_tokens": 67234068.0, "step": 30160 }, { "entropy": 0.29511226024478676, "epoch": 0.46442267543327964, "grad_norm": 0.5066137313842773, "learning_rate": 1.9817598513334322e-05, "loss": 0.2941, "mean_token_accuracy": 0.910562677681446, "num_tokens": 67303505.0, "step": 30170 }, { "entropy": 0.2794829733669758, "epoch": 0.4645766106919582, "grad_norm": 0.6584742665290833, "learning_rate": 1.981725782224881e-05, "loss": 0.2837, "mean_token_accuracy": 0.9127273693680763, "num_tokens": 67367937.0, "step": 30180 }, { "entropy": 0.2826959489844739, "epoch": 0.4647305459506368, "grad_norm": 0.6764706373214722, "learning_rate": 1.9816916816221162e-05, "loss": 0.2773, "mean_token_accuracy": 0.9132754035294056, "num_tokens": 67429496.0, "step": 30190 }, { "entropy": 0.2797005877830088, "epoch": 0.4648844812093154, "grad_norm": 0.750261664390564, "learning_rate": 1.9816575495262323e-05, "loss": 0.2875, "mean_token_accuracy": 0.9131226070225239, "num_tokens": 67499269.0, "step": 30200 }, { "entropy": 0.3022101423703134, "epoch": 0.465038416467994, "grad_norm": 0.6757910251617432, "learning_rate": 1.9816233859383245e-05, "loss": 0.3062, "mean_token_accuracy": 0.907611008733511, "num_tokens": 67565443.0, "step": 30210 }, { "entropy": 0.29288495061919095, "epoch": 0.46519235172667256, "grad_norm": 0.5888442397117615, "learning_rate": 1.981589190859488e-05, "loss": 0.2882, "mean_token_accuracy": 0.9117404475808144, "num_tokens": 67633794.0, "step": 30220 }, { "entropy": 0.29473483497276903, "epoch": 0.46534628698535113, "grad_norm": 0.6080930829048157, "learning_rate": 1.9815549642908206e-05, "loss": 0.2998, "mean_token_accuracy": 0.9092975579202175, "num_tokens": 67697938.0, "step": 30230 }, { "entropy": 0.30256170574575664, "epoch": 0.4655002222440297, "grad_norm": 0.66880863904953, "learning_rate": 1.9815207062334197e-05, "loss": 0.3068, "mean_token_accuracy": 0.9084508188068867, "num_tokens": 67757591.0, "step": 30240 }, { "entropy": 0.31001311196014286, "epoch": 0.4656541575027083, "grad_norm": 0.6426119804382324, "learning_rate": 1.9814864166883848e-05, "loss": 0.3183, "mean_token_accuracy": 0.905578576028347, "num_tokens": 67816356.0, "step": 30250 }, { "entropy": 0.31507091242820023, "epoch": 0.4658080927613869, "grad_norm": 0.5616639256477356, "learning_rate": 1.9814520956568154e-05, "loss": 0.3154, "mean_token_accuracy": 0.9040072552859784, "num_tokens": 67893967.0, "step": 30260 }, { "entropy": 0.32732650795951485, "epoch": 0.4659620280200655, "grad_norm": 0.5811809301376343, "learning_rate": 1.981417743139813e-05, "loss": 0.3358, "mean_token_accuracy": 0.8979097813367843, "num_tokens": 67962192.0, "step": 30270 }, { "entropy": 0.3258471988141537, "epoch": 0.46611596327874405, "grad_norm": 0.5871037840843201, "learning_rate": 1.9813833591384794e-05, "loss": 0.3289, "mean_token_accuracy": 0.9029286801815033, "num_tokens": 68029393.0, "step": 30280 }, { "entropy": 0.2854473818093538, "epoch": 0.4662698985374226, "grad_norm": 0.6317195892333984, "learning_rate": 1.981348943653918e-05, "loss": 0.2963, "mean_token_accuracy": 0.9116249814629555, "num_tokens": 68097082.0, "step": 30290 }, { "entropy": 0.29743360839784144, "epoch": 0.4664238337961012, "grad_norm": 0.5487779378890991, "learning_rate": 1.9813144966872323e-05, "loss": 0.3048, "mean_token_accuracy": 0.9062067829072475, "num_tokens": 68174858.0, "step": 30300 }, { "entropy": 0.3018641355447471, "epoch": 0.46657776905477977, "grad_norm": 0.7482454776763916, "learning_rate": 1.981280018239527e-05, "loss": 0.31, "mean_token_accuracy": 0.906780569255352, "num_tokens": 68239698.0, "step": 30310 }, { "entropy": 0.2949280921369791, "epoch": 0.46673170431345834, "grad_norm": 0.7229771614074707, "learning_rate": 1.98124550831191e-05, "loss": 0.3044, "mean_token_accuracy": 0.9093301832675934, "num_tokens": 68305677.0, "step": 30320 }, { "entropy": 0.28669158536940814, "epoch": 0.46688563957213697, "grad_norm": 0.6822247505187988, "learning_rate": 1.9812109669054866e-05, "loss": 0.2814, "mean_token_accuracy": 0.9131094850599766, "num_tokens": 68371923.0, "step": 30330 }, { "entropy": 0.3087806172668934, "epoch": 0.46703957483081554, "grad_norm": 0.5279852747917175, "learning_rate": 1.9811763940213654e-05, "loss": 0.3043, "mean_token_accuracy": 0.9068300947546959, "num_tokens": 68441589.0, "step": 30340 }, { "entropy": 0.3105328609235585, "epoch": 0.4671935100894941, "grad_norm": 0.5749509334564209, "learning_rate": 1.981141789660656e-05, "loss": 0.3028, "mean_token_accuracy": 0.9060409449040889, "num_tokens": 68511044.0, "step": 30350 }, { "entropy": 0.2792367313988507, "epoch": 0.4673474453481727, "grad_norm": 0.6782712936401367, "learning_rate": 1.9811071538244682e-05, "loss": 0.2875, "mean_token_accuracy": 0.9107496663928032, "num_tokens": 68566794.0, "step": 30360 }, { "entropy": 0.30072609707713127, "epoch": 0.46750138060685126, "grad_norm": 0.6274827718734741, "learning_rate": 1.981072486513913e-05, "loss": 0.3179, "mean_token_accuracy": 0.9077781893312931, "num_tokens": 68636785.0, "step": 30370 }, { "entropy": 0.27670339969918134, "epoch": 0.46765531586552983, "grad_norm": 0.7390549778938293, "learning_rate": 1.9810377877301026e-05, "loss": 0.2875, "mean_token_accuracy": 0.9141889818012714, "num_tokens": 68697096.0, "step": 30380 }, { "entropy": 0.29266060376539826, "epoch": 0.46780925112420846, "grad_norm": 0.903878390789032, "learning_rate": 1.98100305747415e-05, "loss": 0.3054, "mean_token_accuracy": 0.9079248979687691, "num_tokens": 68759587.0, "step": 30390 }, { "entropy": 0.2928618622943759, "epoch": 0.46796318638288703, "grad_norm": 0.727123498916626, "learning_rate": 1.98096829574717e-05, "loss": 0.3011, "mean_token_accuracy": 0.9104389548301697, "num_tokens": 68836585.0, "step": 30400 }, { "entropy": 0.29995827935636044, "epoch": 0.4681171216415656, "grad_norm": 0.6874995231628418, "learning_rate": 1.980933502550277e-05, "loss": 0.3074, "mean_token_accuracy": 0.9089259400963783, "num_tokens": 68898074.0, "step": 30410 }, { "entropy": 0.3019661381840706, "epoch": 0.4682710569002442, "grad_norm": 0.6349387168884277, "learning_rate": 1.9808986778845875e-05, "loss": 0.306, "mean_token_accuracy": 0.9063345618546009, "num_tokens": 68975370.0, "step": 30420 }, { "entropy": 0.2980919098481536, "epoch": 0.46842499215892275, "grad_norm": 0.6690191626548767, "learning_rate": 1.9808638217512192e-05, "loss": 0.2906, "mean_token_accuracy": 0.9078684002161026, "num_tokens": 69039782.0, "step": 30430 }, { "entropy": 0.310481970384717, "epoch": 0.4685789274176013, "grad_norm": 0.5657799243927002, "learning_rate": 1.980828934151289e-05, "loss": 0.3076, "mean_token_accuracy": 0.9063140816986561, "num_tokens": 69107427.0, "step": 30440 }, { "entropy": 0.3065538133494556, "epoch": 0.46873286267627995, "grad_norm": 0.627435028553009, "learning_rate": 1.9807940150859177e-05, "loss": 0.3165, "mean_token_accuracy": 0.9042723141610622, "num_tokens": 69175273.0, "step": 30450 }, { "entropy": 0.2992474506609142, "epoch": 0.4688867979349585, "grad_norm": 0.5452767014503479, "learning_rate": 1.9807590645562243e-05, "loss": 0.3134, "mean_token_accuracy": 0.9088145293295383, "num_tokens": 69240426.0, "step": 30460 }, { "entropy": 0.30597799671813847, "epoch": 0.4690407331936371, "grad_norm": 0.5700526237487793, "learning_rate": 1.9807240825633305e-05, "loss": 0.3057, "mean_token_accuracy": 0.9072452515363694, "num_tokens": 69305012.0, "step": 30470 }, { "entropy": 0.29185603791847825, "epoch": 0.46919466845231567, "grad_norm": 0.8591843247413635, "learning_rate": 1.9806890691083588e-05, "loss": 0.3062, "mean_token_accuracy": 0.9093875601887703, "num_tokens": 69362208.0, "step": 30480 }, { "entropy": 0.32589598577469586, "epoch": 0.46934860371099424, "grad_norm": 0.5826118588447571, "learning_rate": 1.9806540241924316e-05, "loss": 0.3258, "mean_token_accuracy": 0.9017529182136059, "num_tokens": 69425917.0, "step": 30490 }, { "entropy": 0.31828123051673174, "epoch": 0.4695025389696728, "grad_norm": 0.8417780995368958, "learning_rate": 1.980618947816674e-05, "loss": 0.3268, "mean_token_accuracy": 0.9027716659009457, "num_tokens": 69488725.0, "step": 30500 }, { "entropy": 0.308566405531019, "epoch": 0.4696564742283514, "grad_norm": 0.6686740517616272, "learning_rate": 1.9805838399822113e-05, "loss": 0.305, "mean_token_accuracy": 0.9056864537298679, "num_tokens": 69552298.0, "step": 30510 }, { "entropy": 0.28661489225924014, "epoch": 0.46981040948703, "grad_norm": 0.8471134901046753, "learning_rate": 1.980548700690169e-05, "loss": 0.2919, "mean_token_accuracy": 0.9102503135800362, "num_tokens": 69622663.0, "step": 30520 }, { "entropy": 0.3164044372737408, "epoch": 0.4699643447457086, "grad_norm": 0.6037560105323792, "learning_rate": 1.980513529941675e-05, "loss": 0.3128, "mean_token_accuracy": 0.9053574807941913, "num_tokens": 69692652.0, "step": 30530 }, { "entropy": 0.2918076108209789, "epoch": 0.47011828000438716, "grad_norm": 0.7143975496292114, "learning_rate": 1.9804783277378574e-05, "loss": 0.2925, "mean_token_accuracy": 0.9098819203674793, "num_tokens": 69756948.0, "step": 30540 }, { "entropy": 0.31286478424444797, "epoch": 0.47027221526306573, "grad_norm": 0.6344603300094604, "learning_rate": 1.9804430940798456e-05, "loss": 0.3263, "mean_token_accuracy": 0.9017833672463894, "num_tokens": 69822634.0, "step": 30550 }, { "entropy": 0.3111942755058408, "epoch": 0.4704261505217443, "grad_norm": 0.5286200642585754, "learning_rate": 1.98040782896877e-05, "loss": 0.3074, "mean_token_accuracy": 0.9067686341702939, "num_tokens": 69889352.0, "step": 30560 }, { "entropy": 0.3117507860995829, "epoch": 0.4705800857804229, "grad_norm": 0.7722741961479187, "learning_rate": 1.9803725324057614e-05, "loss": 0.3078, "mean_token_accuracy": 0.9055950075387955, "num_tokens": 69952015.0, "step": 30570 }, { "entropy": 0.29838962452486156, "epoch": 0.4707340210391015, "grad_norm": 0.6518648862838745, "learning_rate": 1.9803372043919527e-05, "loss": 0.3118, "mean_token_accuracy": 0.9073400422930717, "num_tokens": 70012879.0, "step": 30580 }, { "entropy": 0.3000617567449808, "epoch": 0.4708879562977801, "grad_norm": 0.6482046842575073, "learning_rate": 1.980301844928477e-05, "loss": 0.3107, "mean_token_accuracy": 0.9076182462275029, "num_tokens": 70079556.0, "step": 30590 }, { "entropy": 0.2995977423153818, "epoch": 0.47104189155645865, "grad_norm": 0.6033259630203247, "learning_rate": 1.9802664540164685e-05, "loss": 0.2843, "mean_token_accuracy": 0.9098499171435833, "num_tokens": 70142932.0, "step": 30600 }, { "entropy": 0.3131449741311371, "epoch": 0.4711958268151372, "grad_norm": 0.586412787437439, "learning_rate": 1.980231031657063e-05, "loss": 0.313, "mean_token_accuracy": 0.9044383078813553, "num_tokens": 70205324.0, "step": 30610 }, { "entropy": 0.30418477123603227, "epoch": 0.4713497620738158, "grad_norm": 0.9157301783561707, "learning_rate": 1.9801955778513967e-05, "loss": 0.3097, "mean_token_accuracy": 0.9054685793817043, "num_tokens": 70260695.0, "step": 30620 }, { "entropy": 0.29141455218195916, "epoch": 0.47150369733249436, "grad_norm": 0.6839006543159485, "learning_rate": 1.9801600926006064e-05, "loss": 0.2988, "mean_token_accuracy": 0.9091611109673977, "num_tokens": 70326319.0, "step": 30630 }, { "entropy": 0.30082362573593857, "epoch": 0.471657632591173, "grad_norm": 0.7242483496665955, "learning_rate": 1.9801245759058316e-05, "loss": 0.31, "mean_token_accuracy": 0.9081172414124012, "num_tokens": 70400213.0, "step": 30640 }, { "entropy": 0.27035031374543905, "epoch": 0.47181156784985157, "grad_norm": 0.5651316046714783, "learning_rate": 1.9800890277682104e-05, "loss": 0.273, "mean_token_accuracy": 0.9167723499238492, "num_tokens": 70471303.0, "step": 30650 }, { "entropy": 0.29470478314906356, "epoch": 0.47196550310853014, "grad_norm": 0.9210216403007507, "learning_rate": 1.9800534481888845e-05, "loss": 0.295, "mean_token_accuracy": 0.9082841448485851, "num_tokens": 70535845.0, "step": 30660 }, { "entropy": 0.30652647987008097, "epoch": 0.4721194383672087, "grad_norm": 0.8084213733673096, "learning_rate": 1.9800178371689943e-05, "loss": 0.3079, "mean_token_accuracy": 0.9054871611297131, "num_tokens": 70604490.0, "step": 30670 }, { "entropy": 0.30030419789254664, "epoch": 0.4722733736258873, "grad_norm": 0.7378129959106445, "learning_rate": 1.9799821947096826e-05, "loss": 0.3091, "mean_token_accuracy": 0.9073244802653789, "num_tokens": 70666168.0, "step": 30680 }, { "entropy": 0.2907160449773073, "epoch": 0.47242730888456586, "grad_norm": 0.7387752532958984, "learning_rate": 1.9799465208120928e-05, "loss": 0.2961, "mean_token_accuracy": 0.9108680576086045, "num_tokens": 70732241.0, "step": 30690 }, { "entropy": 0.30662463335320356, "epoch": 0.4725812441432444, "grad_norm": 0.6416597366333008, "learning_rate": 1.9799108154773696e-05, "loss": 0.3084, "mean_token_accuracy": 0.9054799631237984, "num_tokens": 70797603.0, "step": 30700 }, { "entropy": 0.3109873847104609, "epoch": 0.47273517940192306, "grad_norm": 0.6166195869445801, "learning_rate": 1.979875078706658e-05, "loss": 0.3146, "mean_token_accuracy": 0.9047998435795307, "num_tokens": 70865900.0, "step": 30710 }, { "entropy": 0.3085407609120011, "epoch": 0.47288911466060163, "grad_norm": 0.5491024851799011, "learning_rate": 1.9798393105011046e-05, "loss": 0.3133, "mean_token_accuracy": 0.9063191115856171, "num_tokens": 70933650.0, "step": 30720 }, { "entropy": 0.3027939097955823, "epoch": 0.4730430499192802, "grad_norm": 0.5776382684707642, "learning_rate": 1.979803510861857e-05, "loss": 0.3111, "mean_token_accuracy": 0.9056638866662979, "num_tokens": 70998305.0, "step": 30730 }, { "entropy": 0.3314788807183504, "epoch": 0.4731969851779588, "grad_norm": 0.5173878073692322, "learning_rate": 1.9797676797900635e-05, "loss": 0.3184, "mean_token_accuracy": 0.8979891888797283, "num_tokens": 71063934.0, "step": 30740 }, { "entropy": 0.3048451183363795, "epoch": 0.47335092043663735, "grad_norm": 0.6925045251846313, "learning_rate": 1.9797318172868735e-05, "loss": 0.3012, "mean_token_accuracy": 0.9058886140584945, "num_tokens": 71130651.0, "step": 30750 }, { "entropy": 0.31793117001652715, "epoch": 0.4735048556953159, "grad_norm": 0.5019372701644897, "learning_rate": 1.9796959233534378e-05, "loss": 0.3017, "mean_token_accuracy": 0.9046200804412365, "num_tokens": 71198487.0, "step": 30760 }, { "entropy": 0.30604109950363634, "epoch": 0.47365879095399455, "grad_norm": 0.6379595398902893, "learning_rate": 1.9796599979909076e-05, "loss": 0.3137, "mean_token_accuracy": 0.9068449974060059, "num_tokens": 71260890.0, "step": 30770 }, { "entropy": 0.2790228377096355, "epoch": 0.4738127262126731, "grad_norm": 0.7239155173301697, "learning_rate": 1.9796240412004355e-05, "loss": 0.295, "mean_token_accuracy": 0.9150326244533062, "num_tokens": 71321361.0, "step": 30780 }, { "entropy": 0.29097872208803893, "epoch": 0.4739666614713517, "grad_norm": 0.8076112866401672, "learning_rate": 1.9795880529831753e-05, "loss": 0.2977, "mean_token_accuracy": 0.9107023656368256, "num_tokens": 71389515.0, "step": 30790 }, { "entropy": 0.27856299439445137, "epoch": 0.47412059673003026, "grad_norm": 0.8234375715255737, "learning_rate": 1.9795520333402812e-05, "loss": 0.2879, "mean_token_accuracy": 0.9122962467372417, "num_tokens": 71447060.0, "step": 30800 }, { "entropy": 0.29925196394324305, "epoch": 0.47427453198870884, "grad_norm": 0.6264486908912659, "learning_rate": 1.9795159822729086e-05, "loss": 0.3199, "mean_token_accuracy": 0.9066758006811142, "num_tokens": 71508469.0, "step": 30810 }, { "entropy": 0.2900668389163911, "epoch": 0.4744284672473874, "grad_norm": 0.6563543081283569, "learning_rate": 1.9794798997822142e-05, "loss": 0.2881, "mean_token_accuracy": 0.9117318011820317, "num_tokens": 71566964.0, "step": 30820 }, { "entropy": 0.28294653771445155, "epoch": 0.47458240250606604, "grad_norm": 0.7474122643470764, "learning_rate": 1.9794437858693558e-05, "loss": 0.2914, "mean_token_accuracy": 0.9126536525785923, "num_tokens": 71631731.0, "step": 30830 }, { "entropy": 0.3225956389680505, "epoch": 0.4747363377647446, "grad_norm": 0.5384257435798645, "learning_rate": 1.9794076405354917e-05, "loss": 0.3306, "mean_token_accuracy": 0.9008545890450478, "num_tokens": 71701702.0, "step": 30840 }, { "entropy": 0.2992765777744353, "epoch": 0.4748902730234232, "grad_norm": 0.679397463798523, "learning_rate": 1.979371463781781e-05, "loss": 0.3047, "mean_token_accuracy": 0.9096060208976269, "num_tokens": 71758222.0, "step": 30850 }, { "entropy": 0.27710575684905053, "epoch": 0.47504420828210175, "grad_norm": 0.569018542766571, "learning_rate": 1.979335255609385e-05, "loss": 0.2887, "mean_token_accuracy": 0.9145162604749203, "num_tokens": 71822371.0, "step": 30860 }, { "entropy": 0.3036770662292838, "epoch": 0.4751981435407803, "grad_norm": 0.578413188457489, "learning_rate": 1.9792990160194653e-05, "loss": 0.2978, "mean_token_accuracy": 0.9075410798192024, "num_tokens": 71892459.0, "step": 30870 }, { "entropy": 0.28639746382832526, "epoch": 0.4753520787994589, "grad_norm": 0.5823050141334534, "learning_rate": 1.979262745013184e-05, "loss": 0.2948, "mean_token_accuracy": 0.9085205972194672, "num_tokens": 71954646.0, "step": 30880 }, { "entropy": 0.2925685898400843, "epoch": 0.47550601405813747, "grad_norm": 0.628511905670166, "learning_rate": 1.9792264425917048e-05, "loss": 0.2957, "mean_token_accuracy": 0.9108254246413707, "num_tokens": 72021718.0, "step": 30890 }, { "entropy": 0.27747428566217425, "epoch": 0.4756599493168161, "grad_norm": 0.8067585229873657, "learning_rate": 1.9791901087561928e-05, "loss": 0.2809, "mean_token_accuracy": 0.9166324511170387, "num_tokens": 72089141.0, "step": 30900 }, { "entropy": 0.2905678759329021, "epoch": 0.47581388457549467, "grad_norm": 0.7599378228187561, "learning_rate": 1.9791537435078124e-05, "loss": 0.2965, "mean_token_accuracy": 0.9095364265143872, "num_tokens": 72155634.0, "step": 30910 }, { "entropy": 0.29209047444164754, "epoch": 0.47596781983417324, "grad_norm": 0.8333150148391724, "learning_rate": 1.9791173468477315e-05, "loss": 0.2929, "mean_token_accuracy": 0.9094309844076633, "num_tokens": 72219502.0, "step": 30920 }, { "entropy": 0.2935419620014727, "epoch": 0.4761217550928518, "grad_norm": 0.5462749600410461, "learning_rate": 1.9790809187771173e-05, "loss": 0.2932, "mean_token_accuracy": 0.9133860491216182, "num_tokens": 72287419.0, "step": 30930 }, { "entropy": 0.2836478278040886, "epoch": 0.4762756903515304, "grad_norm": 0.6975204348564148, "learning_rate": 1.9790444592971384e-05, "loss": 0.2985, "mean_token_accuracy": 0.9101765647530555, "num_tokens": 72353117.0, "step": 30940 }, { "entropy": 0.28945309640839695, "epoch": 0.47642962561020896, "grad_norm": 0.872575044631958, "learning_rate": 1.979007968408964e-05, "loss": 0.2961, "mean_token_accuracy": 0.9107264794409275, "num_tokens": 72420917.0, "step": 30950 }, { "entropy": 0.3108946179971099, "epoch": 0.4765835608688876, "grad_norm": 0.762204647064209, "learning_rate": 1.9789714461137655e-05, "loss": 0.3152, "mean_token_accuracy": 0.9047957755625248, "num_tokens": 72495990.0, "step": 30960 }, { "entropy": 0.31213482180610297, "epoch": 0.47673749612756616, "grad_norm": 1.1693711280822754, "learning_rate": 1.978934892412714e-05, "loss": 0.3087, "mean_token_accuracy": 0.9042526096105575, "num_tokens": 72563295.0, "step": 30970 }, { "entropy": 0.30144600104540586, "epoch": 0.47689143138624474, "grad_norm": 0.47756651043891907, "learning_rate": 1.9788983073069822e-05, "loss": 0.3035, "mean_token_accuracy": 0.9081070095300674, "num_tokens": 72636125.0, "step": 30980 }, { "entropy": 0.28684409726411103, "epoch": 0.4770453666449233, "grad_norm": 0.7758151888847351, "learning_rate": 1.978861690797744e-05, "loss": 0.2902, "mean_token_accuracy": 0.9115083195269108, "num_tokens": 72700210.0, "step": 30990 }, { "entropy": 0.29905897779390217, "epoch": 0.4771993019036019, "grad_norm": 0.7013385891914368, "learning_rate": 1.9788250428861744e-05, "loss": 0.2976, "mean_token_accuracy": 0.9085804857313633, "num_tokens": 72760475.0, "step": 31000 }, { "entropy": 0.2933857596479356, "epoch": 0.47735323716228045, "grad_norm": 0.5299777984619141, "learning_rate": 1.9787883635734484e-05, "loss": 0.2936, "mean_token_accuracy": 0.9137336201965809, "num_tokens": 72822797.0, "step": 31010 }, { "entropy": 0.2821795754134655, "epoch": 0.4775071724209591, "grad_norm": 0.7712678909301758, "learning_rate": 1.9787516528607428e-05, "loss": 0.2949, "mean_token_accuracy": 0.9132110111415386, "num_tokens": 72885879.0, "step": 31020 }, { "entropy": 0.29753669165074825, "epoch": 0.47766110767963765, "grad_norm": 0.593917727470398, "learning_rate": 1.9787149107492357e-05, "loss": 0.3027, "mean_token_accuracy": 0.9075130589306355, "num_tokens": 72958747.0, "step": 31030 }, { "entropy": 0.29875782383605837, "epoch": 0.4778150429383162, "grad_norm": 0.8747012615203857, "learning_rate": 1.9786781372401054e-05, "loss": 0.2966, "mean_token_accuracy": 0.907764382660389, "num_tokens": 73022824.0, "step": 31040 }, { "entropy": 0.30866564251482487, "epoch": 0.4779689781969948, "grad_norm": 0.756309986114502, "learning_rate": 1.9786413323345317e-05, "loss": 0.3011, "mean_token_accuracy": 0.9065751917660236, "num_tokens": 73085307.0, "step": 31050 }, { "entropy": 0.27463350519537927, "epoch": 0.47812291345567337, "grad_norm": 0.7216115593910217, "learning_rate": 1.9786044960336954e-05, "loss": 0.289, "mean_token_accuracy": 0.914846221357584, "num_tokens": 73149438.0, "step": 31060 }, { "entropy": 0.3072024563327432, "epoch": 0.47827684871435194, "grad_norm": 0.5880587697029114, "learning_rate": 1.9785676283387785e-05, "loss": 0.3033, "mean_token_accuracy": 0.9040549196302891, "num_tokens": 73218440.0, "step": 31070 }, { "entropy": 0.31549198515713217, "epoch": 0.4784307839730305, "grad_norm": 0.6135621666908264, "learning_rate": 1.9785307292509633e-05, "loss": 0.3097, "mean_token_accuracy": 0.9025891579687595, "num_tokens": 73289705.0, "step": 31080 }, { "entropy": 0.286838885396719, "epoch": 0.47858471923170914, "grad_norm": 0.5335896611213684, "learning_rate": 1.9784937987714334e-05, "loss": 0.2842, "mean_token_accuracy": 0.9116741172969342, "num_tokens": 73357559.0, "step": 31090 }, { "entropy": 0.3103651125915349, "epoch": 0.4787386544903877, "grad_norm": 0.5935391187667847, "learning_rate": 1.978456836901374e-05, "loss": 0.3108, "mean_token_accuracy": 0.9032561734318734, "num_tokens": 73433921.0, "step": 31100 }, { "entropy": 0.30134245231747625, "epoch": 0.4788925897490663, "grad_norm": 0.6589078903198242, "learning_rate": 1.9784198436419707e-05, "loss": 0.3094, "mean_token_accuracy": 0.9064878486096859, "num_tokens": 73503204.0, "step": 31110 }, { "entropy": 0.28452117647975683, "epoch": 0.47904652500774486, "grad_norm": 0.7168863415718079, "learning_rate": 1.9783828189944104e-05, "loss": 0.2828, "mean_token_accuracy": 0.9151526913046837, "num_tokens": 73565332.0, "step": 31120 }, { "entropy": 0.302536216750741, "epoch": 0.47920046026642343, "grad_norm": 0.9523488283157349, "learning_rate": 1.9783457629598807e-05, "loss": 0.3165, "mean_token_accuracy": 0.9059945367276668, "num_tokens": 73627348.0, "step": 31130 }, { "entropy": 0.31703553833067416, "epoch": 0.479354395525102, "grad_norm": 0.8222002387046814, "learning_rate": 1.9783086755395704e-05, "loss": 0.3033, "mean_token_accuracy": 0.9034945636987686, "num_tokens": 73694039.0, "step": 31140 }, { "entropy": 0.3149337649345398, "epoch": 0.47950833078378063, "grad_norm": 0.7373892664909363, "learning_rate": 1.978271556734669e-05, "loss": 0.3156, "mean_token_accuracy": 0.901565209031105, "num_tokens": 73767198.0, "step": 31150 }, { "entropy": 0.3034733789972961, "epoch": 0.4796622660424592, "grad_norm": 0.679521918296814, "learning_rate": 1.978234406546368e-05, "loss": 0.3125, "mean_token_accuracy": 0.9069591335952282, "num_tokens": 73825515.0, "step": 31160 }, { "entropy": 0.2993191949091852, "epoch": 0.4798162013011378, "grad_norm": 0.714363157749176, "learning_rate": 1.9781972249758582e-05, "loss": 0.2982, "mean_token_accuracy": 0.9108196996152401, "num_tokens": 73889088.0, "step": 31170 }, { "entropy": 0.31465929206460713, "epoch": 0.47997013655981635, "grad_norm": 0.5986987352371216, "learning_rate": 1.9781600120243337e-05, "loss": 0.3085, "mean_token_accuracy": 0.9019304141402245, "num_tokens": 73963361.0, "step": 31180 }, { "entropy": 0.2927928983233869, "epoch": 0.4801240718184949, "grad_norm": 0.5220542550086975, "learning_rate": 1.978122767692987e-05, "loss": 0.3036, "mean_token_accuracy": 0.910160755366087, "num_tokens": 74029555.0, "step": 31190 }, { "entropy": 0.3022027863189578, "epoch": 0.4802780070771735, "grad_norm": 0.6462368369102478, "learning_rate": 1.9780854919830133e-05, "loss": 0.2998, "mean_token_accuracy": 0.9087187379598618, "num_tokens": 74100378.0, "step": 31200 }, { "entropy": 0.3113927741535008, "epoch": 0.4804319423358521, "grad_norm": 0.7547155022621155, "learning_rate": 1.978048184895609e-05, "loss": 0.3127, "mean_token_accuracy": 0.903516848385334, "num_tokens": 74162372.0, "step": 31210 }, { "entropy": 0.30902351886034013, "epoch": 0.4805858775945307, "grad_norm": 0.5908698439598083, "learning_rate": 1.9780108464319707e-05, "loss": 0.3098, "mean_token_accuracy": 0.9041011162102223, "num_tokens": 74224818.0, "step": 31220 }, { "entropy": 0.3152106243185699, "epoch": 0.48073981285320927, "grad_norm": 0.6501317024230957, "learning_rate": 1.9779734765932957e-05, "loss": 0.3198, "mean_token_accuracy": 0.9003715358674527, "num_tokens": 74296394.0, "step": 31230 }, { "entropy": 0.31038403827697036, "epoch": 0.48089374811188784, "grad_norm": 0.6779321432113647, "learning_rate": 1.9779360753807833e-05, "loss": 0.3094, "mean_token_accuracy": 0.9066741093993187, "num_tokens": 74356594.0, "step": 31240 }, { "entropy": 0.3064192644320428, "epoch": 0.4810476833705664, "grad_norm": 0.5675197243690491, "learning_rate": 1.977898642795633e-05, "loss": 0.2982, "mean_token_accuracy": 0.9059866949915886, "num_tokens": 74417061.0, "step": 31250 }, { "entropy": 0.2804867638275027, "epoch": 0.481201618629245, "grad_norm": 0.7048936486244202, "learning_rate": 1.977861178839046e-05, "loss": 0.2853, "mean_token_accuracy": 0.914213628321886, "num_tokens": 74485937.0, "step": 31260 }, { "entropy": 0.26848951475694777, "epoch": 0.48135555388792356, "grad_norm": 0.7012819051742554, "learning_rate": 1.9778236835122246e-05, "loss": 0.2729, "mean_token_accuracy": 0.9185870066285133, "num_tokens": 74545785.0, "step": 31270 }, { "entropy": 0.27778489151969554, "epoch": 0.4815094891466022, "grad_norm": 0.7920451164245605, "learning_rate": 1.9777861568163706e-05, "loss": 0.2887, "mean_token_accuracy": 0.9130070313811303, "num_tokens": 74603677.0, "step": 31280 }, { "entropy": 0.30946353506296875, "epoch": 0.48166342440528076, "grad_norm": 0.7955124378204346, "learning_rate": 1.9777485987526884e-05, "loss": 0.306, "mean_token_accuracy": 0.9056861065328121, "num_tokens": 74667605.0, "step": 31290 }, { "entropy": 0.28566393507644533, "epoch": 0.48181735966395933, "grad_norm": 0.5052657723426819, "learning_rate": 1.977711009322383e-05, "loss": 0.2936, "mean_token_accuracy": 0.9114922240376473, "num_tokens": 74737478.0, "step": 31300 }, { "entropy": 0.29678639192134143, "epoch": 0.4819712949226379, "grad_norm": 0.6614365577697754, "learning_rate": 1.9776733885266605e-05, "loss": 0.2845, "mean_token_accuracy": 0.9091103196144104, "num_tokens": 74806294.0, "step": 31310 }, { "entropy": 0.30356158008798956, "epoch": 0.4821252301813165, "grad_norm": 0.7024741172790527, "learning_rate": 1.9776357363667272e-05, "loss": 0.2945, "mean_token_accuracy": 0.9057525873184205, "num_tokens": 74875898.0, "step": 31320 }, { "entropy": 0.31084894770756366, "epoch": 0.48227916543999505, "grad_norm": 0.5957913994789124, "learning_rate": 1.9775980528437913e-05, "loss": 0.3134, "mean_token_accuracy": 0.9049110256135464, "num_tokens": 74950050.0, "step": 31330 }, { "entropy": 0.3117949371226132, "epoch": 0.4824331006986737, "grad_norm": 0.5792920589447021, "learning_rate": 1.9775603379590618e-05, "loss": 0.3276, "mean_token_accuracy": 0.9045839712023735, "num_tokens": 75022532.0, "step": 31340 }, { "entropy": 0.290427675191313, "epoch": 0.48258703595735225, "grad_norm": 0.6160818934440613, "learning_rate": 1.9775225917137485e-05, "loss": 0.2922, "mean_token_accuracy": 0.9119254536926746, "num_tokens": 75086942.0, "step": 31350 }, { "entropy": 0.29420192530378697, "epoch": 0.4827409712160308, "grad_norm": 0.6368720531463623, "learning_rate": 1.977484814109062e-05, "loss": 0.2908, "mean_token_accuracy": 0.9098599903285504, "num_tokens": 75146832.0, "step": 31360 }, { "entropy": 0.31337706139311194, "epoch": 0.4828949064747094, "grad_norm": 0.5731521248817444, "learning_rate": 1.9774470051462144e-05, "loss": 0.3217, "mean_token_accuracy": 0.9033120386302471, "num_tokens": 75219816.0, "step": 31370 }, { "entropy": 0.2915662318468094, "epoch": 0.48304884173338797, "grad_norm": 0.5457898378372192, "learning_rate": 1.977409164826419e-05, "loss": 0.3047, "mean_token_accuracy": 0.9115547806024551, "num_tokens": 75285468.0, "step": 31380 }, { "entropy": 0.31589262187480927, "epoch": 0.48320277699206654, "grad_norm": 0.6765292882919312, "learning_rate": 1.9773712931508898e-05, "loss": 0.3321, "mean_token_accuracy": 0.9015226632356643, "num_tokens": 75351083.0, "step": 31390 }, { "entropy": 0.3160297895781696, "epoch": 0.48335671225074517, "grad_norm": 0.5155459046363831, "learning_rate": 1.977333390120841e-05, "loss": 0.3112, "mean_token_accuracy": 0.9049923725426197, "num_tokens": 75411076.0, "step": 31400 }, { "entropy": 0.2805600566789508, "epoch": 0.48351064750942374, "grad_norm": 0.7486717104911804, "learning_rate": 1.977295455737489e-05, "loss": 0.2968, "mean_token_accuracy": 0.9159695863723755, "num_tokens": 75471629.0, "step": 31410 }, { "entropy": 0.3001065094023943, "epoch": 0.4836645827681023, "grad_norm": 0.6691919565200806, "learning_rate": 1.977257490002051e-05, "loss": 0.3088, "mean_token_accuracy": 0.9070785149931908, "num_tokens": 75540737.0, "step": 31420 }, { "entropy": 0.3079907831735909, "epoch": 0.4838185180267809, "grad_norm": 0.6996206641197205, "learning_rate": 1.9772194929157447e-05, "loss": 0.3101, "mean_token_accuracy": 0.9049521751701832, "num_tokens": 75604778.0, "step": 31430 }, { "entropy": 0.30678361086174843, "epoch": 0.48397245328545946, "grad_norm": 0.6745657324790955, "learning_rate": 1.9771814644797886e-05, "loss": 0.3048, "mean_token_accuracy": 0.906885951757431, "num_tokens": 75671223.0, "step": 31440 }, { "entropy": 0.26306892270222304, "epoch": 0.48412638854413803, "grad_norm": 0.6495862007141113, "learning_rate": 1.9771434046954033e-05, "loss": 0.2729, "mean_token_accuracy": 0.9178179942071438, "num_tokens": 75737071.0, "step": 31450 }, { "entropy": 0.32812840212136507, "epoch": 0.4842803238028166, "grad_norm": 0.5792267918586731, "learning_rate": 1.97710531356381e-05, "loss": 0.3237, "mean_token_accuracy": 0.8987478651106358, "num_tokens": 75804550.0, "step": 31460 }, { "entropy": 0.30589659633114935, "epoch": 0.48443425906149523, "grad_norm": 0.7915158867835999, "learning_rate": 1.97706719108623e-05, "loss": 0.3161, "mean_token_accuracy": 0.904033263027668, "num_tokens": 75859578.0, "step": 31470 }, { "entropy": 0.29398836391046645, "epoch": 0.4845881943201738, "grad_norm": 0.6778084635734558, "learning_rate": 1.9770290372638862e-05, "loss": 0.2969, "mean_token_accuracy": 0.9091858342289925, "num_tokens": 75923076.0, "step": 31480 }, { "entropy": 0.2794883159920573, "epoch": 0.4847421295788524, "grad_norm": 0.7386062145233154, "learning_rate": 1.9769908520980036e-05, "loss": 0.2803, "mean_token_accuracy": 0.9154994763433933, "num_tokens": 75986463.0, "step": 31490 }, { "entropy": 0.2923842029646039, "epoch": 0.48489606483753095, "grad_norm": 0.6299817562103271, "learning_rate": 1.9769526355898063e-05, "loss": 0.2842, "mean_token_accuracy": 0.908072218298912, "num_tokens": 76058010.0, "step": 31500 }, { "entropy": 0.2949165577068925, "epoch": 0.4850500000962095, "grad_norm": 0.7885644435882568, "learning_rate": 1.9769143877405202e-05, "loss": 0.2919, "mean_token_accuracy": 0.9084190025925636, "num_tokens": 76127452.0, "step": 31510 }, { "entropy": 0.2864888543263078, "epoch": 0.4852039353548881, "grad_norm": 0.6088125109672546, "learning_rate": 1.976876108551373e-05, "loss": 0.2889, "mean_token_accuracy": 0.9116926990449429, "num_tokens": 76194204.0, "step": 31520 }, { "entropy": 0.3064729399047792, "epoch": 0.4853578706135667, "grad_norm": 0.6527391076087952, "learning_rate": 1.976837798023592e-05, "loss": 0.3153, "mean_token_accuracy": 0.9047854892909527, "num_tokens": 76262706.0, "step": 31530 }, { "entropy": 0.28522355742752553, "epoch": 0.4855118058722453, "grad_norm": 0.5465194582939148, "learning_rate": 1.9767994561584072e-05, "loss": 0.2993, "mean_token_accuracy": 0.9133022956550121, "num_tokens": 76325389.0, "step": 31540 }, { "entropy": 0.3111666182056069, "epoch": 0.48566574113092387, "grad_norm": 0.5966360569000244, "learning_rate": 1.976761082957048e-05, "loss": 0.3137, "mean_token_accuracy": 0.9029340513050557, "num_tokens": 76393841.0, "step": 31550 }, { "entropy": 0.30195199344307183, "epoch": 0.48581967638960244, "grad_norm": 0.6398252844810486, "learning_rate": 1.9767226784207447e-05, "loss": 0.3087, "mean_token_accuracy": 0.9060247898101806, "num_tokens": 76458361.0, "step": 31560 }, { "entropy": 0.28258998366072774, "epoch": 0.485973611648281, "grad_norm": 0.593867838382721, "learning_rate": 1.976684242550731e-05, "loss": 0.2826, "mean_token_accuracy": 0.9126312002539635, "num_tokens": 76529575.0, "step": 31570 }, { "entropy": 0.30532670859247446, "epoch": 0.4861275469069596, "grad_norm": 0.8375864028930664, "learning_rate": 1.9766457753482384e-05, "loss": 0.3161, "mean_token_accuracy": 0.9063399590551853, "num_tokens": 76590832.0, "step": 31580 }, { "entropy": 0.30112972157076, "epoch": 0.4862814821656382, "grad_norm": 0.593332052230835, "learning_rate": 1.976607276814502e-05, "loss": 0.3021, "mean_token_accuracy": 0.9076108433306217, "num_tokens": 76653589.0, "step": 31590 }, { "entropy": 0.31081880778074267, "epoch": 0.4864354174243168, "grad_norm": 0.6198974847793579, "learning_rate": 1.9765687469507558e-05, "loss": 0.3073, "mean_token_accuracy": 0.9045248813927174, "num_tokens": 76729412.0, "step": 31600 }, { "entropy": 0.2973948875442147, "epoch": 0.48658935268299536, "grad_norm": 0.6294840574264526, "learning_rate": 1.9765301857582368e-05, "loss": 0.2985, "mean_token_accuracy": 0.9080571793019772, "num_tokens": 76796168.0, "step": 31610 }, { "entropy": 0.307569341827184, "epoch": 0.48674328794167393, "grad_norm": 0.6398555636405945, "learning_rate": 1.9764915932381822e-05, "loss": 0.3162, "mean_token_accuracy": 0.9048799037933349, "num_tokens": 76854916.0, "step": 31620 }, { "entropy": 0.2711097731254995, "epoch": 0.4868972232003525, "grad_norm": 0.7442623972892761, "learning_rate": 1.9764529693918287e-05, "loss": 0.2707, "mean_token_accuracy": 0.9164354890584946, "num_tokens": 76918245.0, "step": 31630 }, { "entropy": 0.2979500228539109, "epoch": 0.4870511584590311, "grad_norm": 0.7426527738571167, "learning_rate": 1.976414314220417e-05, "loss": 0.303, "mean_token_accuracy": 0.9077129974961281, "num_tokens": 76984553.0, "step": 31640 }, { "entropy": 0.2994405314326286, "epoch": 0.48720509371770965, "grad_norm": 0.5191835165023804, "learning_rate": 1.9763756277251863e-05, "loss": 0.3085, "mean_token_accuracy": 0.9073222659528255, "num_tokens": 77047986.0, "step": 31650 }, { "entropy": 0.3098065788857639, "epoch": 0.4873590289763883, "grad_norm": 0.6442819237709045, "learning_rate": 1.9763369099073778e-05, "loss": 0.3073, "mean_token_accuracy": 0.9053798876702785, "num_tokens": 77116436.0, "step": 31660 }, { "entropy": 0.29513267492875456, "epoch": 0.48751296423506685, "grad_norm": 0.6538479328155518, "learning_rate": 1.9762981607682336e-05, "loss": 0.2907, "mean_token_accuracy": 0.9090252205729484, "num_tokens": 77182099.0, "step": 31670 }, { "entropy": 0.2976803855970502, "epoch": 0.4876668994937454, "grad_norm": 0.6013593673706055, "learning_rate": 1.9762593803089966e-05, "loss": 0.3025, "mean_token_accuracy": 0.9086287565529346, "num_tokens": 77246334.0, "step": 31680 }, { "entropy": 0.3188600867055357, "epoch": 0.487820834752424, "grad_norm": 0.9631410241127014, "learning_rate": 1.9762205685309115e-05, "loss": 0.3242, "mean_token_accuracy": 0.9023495949804783, "num_tokens": 77309498.0, "step": 31690 }, { "entropy": 0.309239418618381, "epoch": 0.48797477001110257, "grad_norm": 0.633745551109314, "learning_rate": 1.9761817254352228e-05, "loss": 0.3058, "mean_token_accuracy": 0.9063395678997039, "num_tokens": 77368435.0, "step": 31700 }, { "entropy": 0.3163295148871839, "epoch": 0.48812870526978114, "grad_norm": 0.5675190091133118, "learning_rate": 1.976142851023177e-05, "loss": 0.3145, "mean_token_accuracy": 0.9027910679578781, "num_tokens": 77437659.0, "step": 31710 }, { "entropy": 0.2961264862678945, "epoch": 0.48828264052845977, "grad_norm": 0.7411370277404785, "learning_rate": 1.9761039452960206e-05, "loss": 0.3015, "mean_token_accuracy": 0.9081579834222794, "num_tokens": 77510379.0, "step": 31720 }, { "entropy": 0.30152882896363736, "epoch": 0.48843657578713834, "grad_norm": 0.6583564877510071, "learning_rate": 1.9760650082550024e-05, "loss": 0.2997, "mean_token_accuracy": 0.9065994277596474, "num_tokens": 77572611.0, "step": 31730 }, { "entropy": 0.282260422129184, "epoch": 0.4885905110458169, "grad_norm": 0.6635822057723999, "learning_rate": 1.976026039901371e-05, "loss": 0.2894, "mean_token_accuracy": 0.9119985274970531, "num_tokens": 77639055.0, "step": 31740 }, { "entropy": 0.326417641621083, "epoch": 0.4887444463044955, "grad_norm": 0.7179363369941711, "learning_rate": 1.975987040236377e-05, "loss": 0.3201, "mean_token_accuracy": 0.9000595793128013, "num_tokens": 77705886.0, "step": 31750 }, { "entropy": 0.2963307950645685, "epoch": 0.48889838156317406, "grad_norm": 0.6868104338645935, "learning_rate": 1.9759480092612714e-05, "loss": 0.2964, "mean_token_accuracy": 0.9093134894967079, "num_tokens": 77764795.0, "step": 31760 }, { "entropy": 0.28333398746326566, "epoch": 0.48905231682185263, "grad_norm": 0.6383088231086731, "learning_rate": 1.975908946977306e-05, "loss": 0.2924, "mean_token_accuracy": 0.9135441236197949, "num_tokens": 77830449.0, "step": 31770 }, { "entropy": 0.3021201233379543, "epoch": 0.48920625208053126, "grad_norm": 0.6765360832214355, "learning_rate": 1.975869853385734e-05, "loss": 0.3116, "mean_token_accuracy": 0.9058905109763146, "num_tokens": 77898074.0, "step": 31780 }, { "entropy": 0.2879598317667842, "epoch": 0.48936018733920983, "grad_norm": 0.6487332582473755, "learning_rate": 1.97583072848781e-05, "loss": 0.2922, "mean_token_accuracy": 0.9113676227629185, "num_tokens": 77967485.0, "step": 31790 }, { "entropy": 0.29768965309485795, "epoch": 0.4895141225978884, "grad_norm": 0.6384400129318237, "learning_rate": 1.9757915722847885e-05, "loss": 0.3193, "mean_token_accuracy": 0.9086013093590737, "num_tokens": 78038710.0, "step": 31800 }, { "entropy": 0.2861637741327286, "epoch": 0.489668057856567, "grad_norm": 0.5734187960624695, "learning_rate": 1.9757523847779266e-05, "loss": 0.2827, "mean_token_accuracy": 0.9133756667375564, "num_tokens": 78110635.0, "step": 31810 }, { "entropy": 0.2951931992545724, "epoch": 0.48982199311524555, "grad_norm": 0.6792467832565308, "learning_rate": 1.9757131659684807e-05, "loss": 0.3073, "mean_token_accuracy": 0.9077052272856235, "num_tokens": 78174078.0, "step": 31820 }, { "entropy": 0.30504931351169945, "epoch": 0.4899759283739241, "grad_norm": 0.6399514079093933, "learning_rate": 1.975673915857709e-05, "loss": 0.3233, "mean_token_accuracy": 0.9056575857102871, "num_tokens": 78229320.0, "step": 31830 }, { "entropy": 0.275663932505995, "epoch": 0.4901298636326027, "grad_norm": 0.5713058114051819, "learning_rate": 1.9756346344468708e-05, "loss": 0.2838, "mean_token_accuracy": 0.9165105380117893, "num_tokens": 78300566.0, "step": 31840 }, { "entropy": 0.3190461846999824, "epoch": 0.4902837988912813, "grad_norm": 0.7214494943618774, "learning_rate": 1.975595321737226e-05, "loss": 0.3219, "mean_token_accuracy": 0.9009857952594758, "num_tokens": 78371326.0, "step": 31850 }, { "entropy": 0.29389236783608796, "epoch": 0.4904377341499599, "grad_norm": 0.524046778678894, "learning_rate": 1.9755559777300362e-05, "loss": 0.2983, "mean_token_accuracy": 0.9074928253889084, "num_tokens": 78443970.0, "step": 31860 }, { "entropy": 0.2975514124147594, "epoch": 0.49059166940863846, "grad_norm": 0.766657292842865, "learning_rate": 1.9755166024265634e-05, "loss": 0.2943, "mean_token_accuracy": 0.9090407103300094, "num_tokens": 78512825.0, "step": 31870 }, { "entropy": 0.2979589859955013, "epoch": 0.49074560466731704, "grad_norm": 0.6891859173774719, "learning_rate": 1.9754771958280706e-05, "loss": 0.3156, "mean_token_accuracy": 0.9079725846648217, "num_tokens": 78573119.0, "step": 31880 }, { "entropy": 0.3048254162073135, "epoch": 0.4908995399259956, "grad_norm": 0.5684322118759155, "learning_rate": 1.9754377579358222e-05, "loss": 0.3038, "mean_token_accuracy": 0.9069663919508457, "num_tokens": 78639520.0, "step": 31890 }, { "entropy": 0.3052556709386408, "epoch": 0.4910534751846742, "grad_norm": 0.8395780920982361, "learning_rate": 1.975398288751083e-05, "loss": 0.3101, "mean_token_accuracy": 0.9069618411362171, "num_tokens": 78702967.0, "step": 31900 }, { "entropy": 0.3316076582297683, "epoch": 0.4912074104433528, "grad_norm": 0.7204639315605164, "learning_rate": 1.97535878827512e-05, "loss": 0.3331, "mean_token_accuracy": 0.8969456307590008, "num_tokens": 78773612.0, "step": 31910 }, { "entropy": 0.30804799906909464, "epoch": 0.4913613457020314, "grad_norm": 0.7173897624015808, "learning_rate": 1.9753192565091997e-05, "loss": 0.322, "mean_token_accuracy": 0.9051939517259597, "num_tokens": 78836247.0, "step": 31920 }, { "entropy": 0.299314736481756, "epoch": 0.49151528096070995, "grad_norm": 0.787987232208252, "learning_rate": 1.975279693454591e-05, "loss": 0.303, "mean_token_accuracy": 0.9081769473850727, "num_tokens": 78901830.0, "step": 31930 }, { "entropy": 0.301290921214968, "epoch": 0.4916692162193885, "grad_norm": 0.6368324160575867, "learning_rate": 1.975240099112562e-05, "loss": 0.3059, "mean_token_accuracy": 0.9057503409683705, "num_tokens": 78968662.0, "step": 31940 }, { "entropy": 0.31321445405483245, "epoch": 0.4918231514780671, "grad_norm": 0.5580065846443176, "learning_rate": 1.9752004734843834e-05, "loss": 0.3015, "mean_token_accuracy": 0.9041055336594581, "num_tokens": 79042127.0, "step": 31950 }, { "entropy": 0.30389781575649977, "epoch": 0.4919770867367457, "grad_norm": 0.8651940822601318, "learning_rate": 1.975160816571327e-05, "loss": 0.3139, "mean_token_accuracy": 0.9073901355266571, "num_tokens": 79106921.0, "step": 31960 }, { "entropy": 0.30662075290456414, "epoch": 0.4921310219954243, "grad_norm": 0.5868505835533142, "learning_rate": 1.9751211283746644e-05, "loss": 0.3215, "mean_token_accuracy": 0.906045363843441, "num_tokens": 79178172.0, "step": 31970 }, { "entropy": 0.2828893790952861, "epoch": 0.4922849572541029, "grad_norm": 0.630025327205658, "learning_rate": 1.975081408895669e-05, "loss": 0.2705, "mean_token_accuracy": 0.9144006855785847, "num_tokens": 79240813.0, "step": 31980 }, { "entropy": 0.28526132917031644, "epoch": 0.49243889251278145, "grad_norm": 0.6046526432037354, "learning_rate": 1.9750416581356147e-05, "loss": 0.294, "mean_token_accuracy": 0.9132112234830856, "num_tokens": 79311319.0, "step": 31990 }, { "entropy": 0.29333533346652985, "epoch": 0.49259282777146, "grad_norm": 0.6542878150939941, "learning_rate": 1.9750018760957772e-05, "loss": 0.2972, "mean_token_accuracy": 0.9099179074168205, "num_tokens": 79372884.0, "step": 32000 }, { "entropy": 0.31179156750440595, "epoch": 0.4927467630301386, "grad_norm": 0.6449347734451294, "learning_rate": 1.9749620627774325e-05, "loss": 0.307, "mean_token_accuracy": 0.9031130239367485, "num_tokens": 79448056.0, "step": 32010 }, { "entropy": 0.29649228854104875, "epoch": 0.49290069828881716, "grad_norm": 0.5203647017478943, "learning_rate": 1.9749222181818577e-05, "loss": 0.2999, "mean_token_accuracy": 0.9088661253452301, "num_tokens": 79516654.0, "step": 32020 }, { "entropy": 0.2901449644938111, "epoch": 0.49305463354749574, "grad_norm": 0.7976870536804199, "learning_rate": 1.9748823423103314e-05, "loss": 0.2907, "mean_token_accuracy": 0.9088212855160236, "num_tokens": 79587798.0, "step": 32030 }, { "entropy": 0.3044712472707033, "epoch": 0.49320856880617436, "grad_norm": 0.7264875173568726, "learning_rate": 1.974842435164132e-05, "loss": 0.3222, "mean_token_accuracy": 0.9033329799771309, "num_tokens": 79649995.0, "step": 32040 }, { "entropy": 0.2970388900488615, "epoch": 0.49336250406485294, "grad_norm": 0.5787564516067505, "learning_rate": 1.9748024967445413e-05, "loss": 0.2871, "mean_token_accuracy": 0.9090837053954601, "num_tokens": 79726455.0, "step": 32050 }, { "entropy": 0.319395958725363, "epoch": 0.4935164393235315, "grad_norm": 0.5807921290397644, "learning_rate": 1.974762527052839e-05, "loss": 0.3149, "mean_token_accuracy": 0.9031398616731167, "num_tokens": 79793817.0, "step": 32060 }, { "entropy": 0.31247418755665424, "epoch": 0.4936703745822101, "grad_norm": 0.5987370610237122, "learning_rate": 1.974722526090308e-05, "loss": 0.3107, "mean_token_accuracy": 0.9023449450731278, "num_tokens": 79863246.0, "step": 32070 }, { "entropy": 0.3074499225243926, "epoch": 0.49382430984088865, "grad_norm": 0.8461235761642456, "learning_rate": 1.9746824938582316e-05, "loss": 0.2988, "mean_token_accuracy": 0.9079349108040333, "num_tokens": 79932469.0, "step": 32080 }, { "entropy": 0.2997991695068777, "epoch": 0.4939782450995672, "grad_norm": 0.6593592166900635, "learning_rate": 1.9746424303578936e-05, "loss": 0.2978, "mean_token_accuracy": 0.9082019567489624, "num_tokens": 79997700.0, "step": 32090 }, { "entropy": 0.29304337631911037, "epoch": 0.49413218035824585, "grad_norm": 0.7177358865737915, "learning_rate": 1.9746023355905802e-05, "loss": 0.3088, "mean_token_accuracy": 0.9094692438840866, "num_tokens": 80068828.0, "step": 32100 }, { "entropy": 0.3002401743084192, "epoch": 0.4942861156169244, "grad_norm": 1.0549544095993042, "learning_rate": 1.974562209557577e-05, "loss": 0.3078, "mean_token_accuracy": 0.9070707179605961, "num_tokens": 80142042.0, "step": 32110 }, { "entropy": 0.2900657569989562, "epoch": 0.494440050875603, "grad_norm": 0.6003379225730896, "learning_rate": 1.974522052260171e-05, "loss": 0.2917, "mean_token_accuracy": 0.909510949254036, "num_tokens": 80207775.0, "step": 32120 }, { "entropy": 0.32048858478665354, "epoch": 0.49459398613428157, "grad_norm": 0.7265807390213013, "learning_rate": 1.9744818636996505e-05, "loss": 0.3284, "mean_token_accuracy": 0.9055598303675652, "num_tokens": 80266136.0, "step": 32130 }, { "entropy": 0.30496578216552733, "epoch": 0.49474792139296014, "grad_norm": 0.5191331505775452, "learning_rate": 1.9744416438773056e-05, "loss": 0.2925, "mean_token_accuracy": 0.906709399819374, "num_tokens": 80332301.0, "step": 32140 }, { "entropy": 0.30070726461708547, "epoch": 0.4949018566516387, "grad_norm": 0.6259462237358093, "learning_rate": 1.9744013927944257e-05, "loss": 0.3067, "mean_token_accuracy": 0.9038940370082855, "num_tokens": 80396598.0, "step": 32150 }, { "entropy": 0.2848918122239411, "epoch": 0.49505579191031734, "grad_norm": 0.6631000638008118, "learning_rate": 1.9743611104523027e-05, "loss": 0.2948, "mean_token_accuracy": 0.9125963844358921, "num_tokens": 80466069.0, "step": 32160 }, { "entropy": 0.30779402498155833, "epoch": 0.4952097271689959, "grad_norm": 0.6623424291610718, "learning_rate": 1.9743207968522283e-05, "loss": 0.2994, "mean_token_accuracy": 0.9074318453669548, "num_tokens": 80532602.0, "step": 32170 }, { "entropy": 0.2987078906968236, "epoch": 0.4953636624276745, "grad_norm": 0.5602055191993713, "learning_rate": 1.974280451995496e-05, "loss": 0.2961, "mean_token_accuracy": 0.9096553221344947, "num_tokens": 80603264.0, "step": 32180 }, { "entropy": 0.30829601576551796, "epoch": 0.49551759768635306, "grad_norm": 0.6493914723396301, "learning_rate": 1.9742400758834e-05, "loss": 0.3006, "mean_token_accuracy": 0.9042993195354938, "num_tokens": 80666347.0, "step": 32190 }, { "entropy": 0.29450415838509797, "epoch": 0.49567153294503163, "grad_norm": 0.7841638922691345, "learning_rate": 1.974199668517236e-05, "loss": 0.3061, "mean_token_accuracy": 0.9102671861648559, "num_tokens": 80733886.0, "step": 32200 }, { "entropy": 0.28778498275205494, "epoch": 0.4958254682037102, "grad_norm": 0.5864385962486267, "learning_rate": 1.9741592298983002e-05, "loss": 0.294, "mean_token_accuracy": 0.9131804794073105, "num_tokens": 80804213.0, "step": 32210 }, { "entropy": 0.28306473698467016, "epoch": 0.4959794034623888, "grad_norm": 0.593404233455658, "learning_rate": 1.9741187600278895e-05, "loss": 0.2902, "mean_token_accuracy": 0.9110937289893627, "num_tokens": 80876604.0, "step": 32220 }, { "entropy": 0.3004055256955326, "epoch": 0.4961333387210674, "grad_norm": 0.7166145443916321, "learning_rate": 1.9740782589073024e-05, "loss": 0.3021, "mean_token_accuracy": 0.9082581296563148, "num_tokens": 80947308.0, "step": 32230 }, { "entropy": 0.31694960631430147, "epoch": 0.496287273979746, "grad_norm": 0.5848685503005981, "learning_rate": 1.9740377265378382e-05, "loss": 0.3182, "mean_token_accuracy": 0.9041415743529797, "num_tokens": 81013566.0, "step": 32240 }, { "entropy": 0.27586426567286254, "epoch": 0.49644120923842455, "grad_norm": 0.8806293606758118, "learning_rate": 1.9739971629207967e-05, "loss": 0.2942, "mean_token_accuracy": 0.9147315546870232, "num_tokens": 81072989.0, "step": 32250 }, { "entropy": 0.3108387101441622, "epoch": 0.4965951444971031, "grad_norm": 0.5125815868377686, "learning_rate": 1.9739565680574802e-05, "loss": 0.3157, "mean_token_accuracy": 0.9037122033536434, "num_tokens": 81140162.0, "step": 32260 }, { "entropy": 0.31597851319238546, "epoch": 0.4967490797557817, "grad_norm": 0.5384411811828613, "learning_rate": 1.9739159419491902e-05, "loss": 0.3089, "mean_token_accuracy": 0.9020816080272198, "num_tokens": 81215781.0, "step": 32270 }, { "entropy": 0.29718932379037144, "epoch": 0.49690301501446027, "grad_norm": 0.6042025685310364, "learning_rate": 1.9738752845972306e-05, "loss": 0.2949, "mean_token_accuracy": 0.9072249203920364, "num_tokens": 81282117.0, "step": 32280 }, { "entropy": 0.3078822050243616, "epoch": 0.4970569502731389, "grad_norm": 0.7069520354270935, "learning_rate": 1.973834596002905e-05, "loss": 0.3091, "mean_token_accuracy": 0.9057795979082585, "num_tokens": 81354063.0, "step": 32290 }, { "entropy": 0.27786972280591726, "epoch": 0.49721088553181747, "grad_norm": 0.8380268216133118, "learning_rate": 1.9737938761675194e-05, "loss": 0.2922, "mean_token_accuracy": 0.9132818557322026, "num_tokens": 81408644.0, "step": 32300 }, { "entropy": 0.3162259546108544, "epoch": 0.49736482079049604, "grad_norm": 0.6834720373153687, "learning_rate": 1.9737531250923797e-05, "loss": 0.3261, "mean_token_accuracy": 0.901980784535408, "num_tokens": 81477307.0, "step": 32310 }, { "entropy": 0.29596725264564155, "epoch": 0.4975187560491746, "grad_norm": 0.7146609425544739, "learning_rate": 1.9737123427787933e-05, "loss": 0.2987, "mean_token_accuracy": 0.9091281838715076, "num_tokens": 81540171.0, "step": 32320 }, { "entropy": 0.3177063062787056, "epoch": 0.4976726913078532, "grad_norm": 0.5528653264045715, "learning_rate": 1.9736715292280687e-05, "loss": 0.3146, "mean_token_accuracy": 0.902767962962389, "num_tokens": 81600982.0, "step": 32330 }, { "entropy": 0.3099085623398423, "epoch": 0.49782662656653176, "grad_norm": 0.5708902478218079, "learning_rate": 1.9736306844415147e-05, "loss": 0.3102, "mean_token_accuracy": 0.903819115459919, "num_tokens": 81667925.0, "step": 32340 }, { "entropy": 0.3135126055218279, "epoch": 0.4979805618252104, "grad_norm": 0.5548070073127747, "learning_rate": 1.9735898084204423e-05, "loss": 0.3135, "mean_token_accuracy": 0.9069768629968167, "num_tokens": 81727662.0, "step": 32350 }, { "entropy": 0.2913286787457764, "epoch": 0.49813449708388896, "grad_norm": 0.663645327091217, "learning_rate": 1.9735489011661625e-05, "loss": 0.305, "mean_token_accuracy": 0.9105754904448986, "num_tokens": 81785147.0, "step": 32360 }, { "entropy": 0.29880726858973505, "epoch": 0.49828843234256753, "grad_norm": 0.614680290222168, "learning_rate": 1.9735079626799874e-05, "loss": 0.2984, "mean_token_accuracy": 0.9083014704287052, "num_tokens": 81858593.0, "step": 32370 }, { "entropy": 0.2978056722320616, "epoch": 0.4984423676012461, "grad_norm": 0.7387822866439819, "learning_rate": 1.9734669929632305e-05, "loss": 0.3041, "mean_token_accuracy": 0.9061821170151234, "num_tokens": 81924479.0, "step": 32380 }, { "entropy": 0.28760726293548944, "epoch": 0.4985963028599247, "grad_norm": 0.5898032188415527, "learning_rate": 1.9734259920172063e-05, "loss": 0.2867, "mean_token_accuracy": 0.9134098917245865, "num_tokens": 81991124.0, "step": 32390 }, { "entropy": 0.2967043754644692, "epoch": 0.49875023811860325, "grad_norm": 0.5780078172683716, "learning_rate": 1.97338495984323e-05, "loss": 0.3024, "mean_token_accuracy": 0.908564256131649, "num_tokens": 82057021.0, "step": 32400 }, { "entropy": 0.3039437662810087, "epoch": 0.4989041733772818, "grad_norm": 0.7755940556526184, "learning_rate": 1.9733438964426178e-05, "loss": 0.308, "mean_token_accuracy": 0.9079201303422451, "num_tokens": 82118565.0, "step": 32410 }, { "entropy": 0.28816003631800413, "epoch": 0.49905810863596045, "grad_norm": 0.5067946314811707, "learning_rate": 1.9733028018166874e-05, "loss": 0.2954, "mean_token_accuracy": 0.9096809379756451, "num_tokens": 82190536.0, "step": 32420 }, { "entropy": 0.3131323577836156, "epoch": 0.499212043894639, "grad_norm": 0.8032708168029785, "learning_rate": 1.9732616759667565e-05, "loss": 0.3137, "mean_token_accuracy": 0.902522237598896, "num_tokens": 82260877.0, "step": 32430 }, { "entropy": 0.3074033910408616, "epoch": 0.4993659791533176, "grad_norm": 0.5475780367851257, "learning_rate": 1.9732205188941454e-05, "loss": 0.3105, "mean_token_accuracy": 0.9046415619552135, "num_tokens": 82334193.0, "step": 32440 }, { "entropy": 0.3164211521856487, "epoch": 0.49951991441199617, "grad_norm": 0.4884684979915619, "learning_rate": 1.9731793306001735e-05, "loss": 0.3132, "mean_token_accuracy": 0.9040664836764336, "num_tokens": 82406588.0, "step": 32450 }, { "entropy": 0.2963389358483255, "epoch": 0.49967384967067474, "grad_norm": 0.8145978450775146, "learning_rate": 1.9731381110861624e-05, "loss": 0.3038, "mean_token_accuracy": 0.9086631909012794, "num_tokens": 82473446.0, "step": 32460 }, { "entropy": 0.2963446039706469, "epoch": 0.4998277849293533, "grad_norm": 0.6925342679023743, "learning_rate": 1.973096860353435e-05, "loss": 0.2942, "mean_token_accuracy": 0.9110294267535209, "num_tokens": 82535745.0, "step": 32470 }, { "entropy": 0.2871373754926026, "epoch": 0.49998172018803194, "grad_norm": 0.6807143092155457, "learning_rate": 1.9730555784033138e-05, "loss": 0.3018, "mean_token_accuracy": 0.9105884522199631, "num_tokens": 82602637.0, "step": 32480 }, { "entropy": 0.30232819663360716, "epoch": 0.5001356554467105, "grad_norm": 0.6397662162780762, "learning_rate": 1.9730142652371235e-05, "loss": 0.3016, "mean_token_accuracy": 0.905908016115427, "num_tokens": 82673412.0, "step": 32490 }, { "entropy": 0.2918717105872929, "epoch": 0.500289590705389, "grad_norm": 0.5617779493331909, "learning_rate": 1.9729729208561895e-05, "loss": 0.2932, "mean_token_accuracy": 0.9111883968114853, "num_tokens": 82743423.0, "step": 32500 }, { "entropy": 0.3140604126267135, "epoch": 0.5004435259640677, "grad_norm": 0.8217074871063232, "learning_rate": 1.9729315452618386e-05, "loss": 0.3149, "mean_token_accuracy": 0.8995378777384758, "num_tokens": 82814185.0, "step": 32510 }, { "entropy": 0.3188564452342689, "epoch": 0.5005974612227463, "grad_norm": 0.7149631381034851, "learning_rate": 1.972890138455397e-05, "loss": 0.3111, "mean_token_accuracy": 0.9025591216981411, "num_tokens": 82881614.0, "step": 32520 }, { "entropy": 0.28795984396710994, "epoch": 0.5007513964814249, "grad_norm": 0.7921105027198792, "learning_rate": 1.9728487004381942e-05, "loss": 0.3004, "mean_token_accuracy": 0.9116830810904503, "num_tokens": 82951998.0, "step": 32530 }, { "entropy": 0.2821583139710128, "epoch": 0.5009053317401034, "grad_norm": 0.5959587097167969, "learning_rate": 1.972807231211559e-05, "loss": 0.2878, "mean_token_accuracy": 0.9146379821002484, "num_tokens": 83017011.0, "step": 32540 }, { "entropy": 0.2930244873277843, "epoch": 0.501059266998782, "grad_norm": 0.6771332025527954, "learning_rate": 1.972765730776822e-05, "loss": 0.3069, "mean_token_accuracy": 0.9085350856184959, "num_tokens": 83083837.0, "step": 32550 }, { "entropy": 0.2862073304131627, "epoch": 0.5012132022574606, "grad_norm": 0.7097522020339966, "learning_rate": 1.972724199135314e-05, "loss": 0.2949, "mean_token_accuracy": 0.9103779770433903, "num_tokens": 83145465.0, "step": 32560 }, { "entropy": 0.3171032796613872, "epoch": 0.5013671375161391, "grad_norm": 0.5741182565689087, "learning_rate": 1.9726826362883677e-05, "loss": 0.3116, "mean_token_accuracy": 0.9022349514067173, "num_tokens": 83206820.0, "step": 32570 }, { "entropy": 0.3004722342826426, "epoch": 0.5015210727748177, "grad_norm": 0.698379635810852, "learning_rate": 1.9726410422373168e-05, "loss": 0.3104, "mean_token_accuracy": 0.906218296289444, "num_tokens": 83274589.0, "step": 32580 }, { "entropy": 0.3018739590421319, "epoch": 0.5016750080334963, "grad_norm": 0.5927415490150452, "learning_rate": 1.9725994169834952e-05, "loss": 0.3023, "mean_token_accuracy": 0.9087419055402279, "num_tokens": 83343105.0, "step": 32590 }, { "entropy": 0.28224583761766553, "epoch": 0.5018289432921749, "grad_norm": 0.5919114947319031, "learning_rate": 1.9725577605282385e-05, "loss": 0.2868, "mean_token_accuracy": 0.9134611666202546, "num_tokens": 83415539.0, "step": 32600 }, { "entropy": 0.29755243435502055, "epoch": 0.5019828785508534, "grad_norm": 0.6496954560279846, "learning_rate": 1.9725160728728827e-05, "loss": 0.2973, "mean_token_accuracy": 0.9063939981162548, "num_tokens": 83487730.0, "step": 32610 }, { "entropy": 0.29786983206868173, "epoch": 0.502136813809532, "grad_norm": 0.8050012588500977, "learning_rate": 1.9724743540187658e-05, "loss": 0.3018, "mean_token_accuracy": 0.9075717762112617, "num_tokens": 83549942.0, "step": 32620 }, { "entropy": 0.308338781632483, "epoch": 0.5022907490682106, "grad_norm": 0.8049755692481995, "learning_rate": 1.9724326039672253e-05, "loss": 0.2952, "mean_token_accuracy": 0.9050166480243206, "num_tokens": 83616551.0, "step": 32630 }, { "entropy": 0.2970153110101819, "epoch": 0.5024446843268893, "grad_norm": 0.7213860750198364, "learning_rate": 1.9723908227196013e-05, "loss": 0.292, "mean_token_accuracy": 0.9100506201386451, "num_tokens": 83682528.0, "step": 32640 }, { "entropy": 0.3051547144539654, "epoch": 0.5025986195855678, "grad_norm": 0.5920039415359497, "learning_rate": 1.9723490102772338e-05, "loss": 0.3097, "mean_token_accuracy": 0.9053672574460506, "num_tokens": 83751420.0, "step": 32650 }, { "entropy": 0.30026386231184005, "epoch": 0.5027525548442464, "grad_norm": 0.8295782208442688, "learning_rate": 1.9723071666414645e-05, "loss": 0.289, "mean_token_accuracy": 0.9090550608932972, "num_tokens": 83812815.0, "step": 32660 }, { "entropy": 0.31585442116484047, "epoch": 0.502906490102925, "grad_norm": 0.6912811994552612, "learning_rate": 1.9722652918136354e-05, "loss": 0.3182, "mean_token_accuracy": 0.9041824050247669, "num_tokens": 83878730.0, "step": 32670 }, { "entropy": 0.3088791807182133, "epoch": 0.5030604253616036, "grad_norm": 0.567297637462616, "learning_rate": 1.97222338579509e-05, "loss": 0.309, "mean_token_accuracy": 0.9057932823896409, "num_tokens": 83944854.0, "step": 32680 }, { "entropy": 0.2906337640248239, "epoch": 0.5032143606202821, "grad_norm": 0.6485405564308167, "learning_rate": 1.9721814485871725e-05, "loss": 0.2969, "mean_token_accuracy": 0.9103106833994389, "num_tokens": 84006756.0, "step": 32690 }, { "entropy": 0.3015539297834039, "epoch": 0.5033682958789607, "grad_norm": 0.8752917051315308, "learning_rate": 1.972139480191229e-05, "loss": 0.3074, "mean_token_accuracy": 0.9063509978353977, "num_tokens": 84070437.0, "step": 32700 }, { "entropy": 0.3155734837986529, "epoch": 0.5035222311376393, "grad_norm": 0.6964156627655029, "learning_rate": 1.9720974806086045e-05, "loss": 0.3162, "mean_token_accuracy": 0.9043980285525322, "num_tokens": 84137844.0, "step": 32710 }, { "entropy": 0.3016783490777016, "epoch": 0.5036761663963178, "grad_norm": 0.5706114768981934, "learning_rate": 1.9720554498406478e-05, "loss": 0.2977, "mean_token_accuracy": 0.9109622970223427, "num_tokens": 84206631.0, "step": 32720 }, { "entropy": 0.29820771208032965, "epoch": 0.5038301016549964, "grad_norm": 0.5765863060951233, "learning_rate": 1.9720133878887063e-05, "loss": 0.3086, "mean_token_accuracy": 0.9083248905837535, "num_tokens": 84270381.0, "step": 32730 }, { "entropy": 0.29166211616247895, "epoch": 0.503984036913675, "grad_norm": 0.5254558324813843, "learning_rate": 1.9719712947541297e-05, "loss": 0.2927, "mean_token_accuracy": 0.9107589304447175, "num_tokens": 84344592.0, "step": 32740 }, { "entropy": 0.29491387503221633, "epoch": 0.5041379721723536, "grad_norm": 0.5056374669075012, "learning_rate": 1.9719291704382683e-05, "loss": 0.2933, "mean_token_accuracy": 0.9074094913899898, "num_tokens": 84408514.0, "step": 32750 }, { "entropy": 0.312148102093488, "epoch": 0.5042919074310322, "grad_norm": 0.7378531694412231, "learning_rate": 1.9718870149424733e-05, "loss": 0.314, "mean_token_accuracy": 0.9059046387672425, "num_tokens": 84460882.0, "step": 32760 }, { "entropy": 0.293112703692168, "epoch": 0.5044458426897108, "grad_norm": 0.7338842749595642, "learning_rate": 1.971844828268098e-05, "loss": 0.2925, "mean_token_accuracy": 0.9115520231425762, "num_tokens": 84518482.0, "step": 32770 }, { "entropy": 0.3031423197127879, "epoch": 0.5045997779483894, "grad_norm": 0.554044783115387, "learning_rate": 1.9718026104164945e-05, "loss": 0.3131, "mean_token_accuracy": 0.9041615910828114, "num_tokens": 84588883.0, "step": 32780 }, { "entropy": 0.30747972112149, "epoch": 0.504753713207068, "grad_norm": 0.6356152892112732, "learning_rate": 1.971760361389018e-05, "loss": 0.3018, "mean_token_accuracy": 0.9052698709070682, "num_tokens": 84658336.0, "step": 32790 }, { "entropy": 0.2919203539378941, "epoch": 0.5049076484657465, "grad_norm": 0.5975723266601562, "learning_rate": 1.971718081187023e-05, "loss": 0.3009, "mean_token_accuracy": 0.9096649542450905, "num_tokens": 84725392.0, "step": 32800 }, { "entropy": 0.28738866522908213, "epoch": 0.5050615837244251, "grad_norm": 0.6605074405670166, "learning_rate": 1.9716757698118673e-05, "loss": 0.2939, "mean_token_accuracy": 0.9120452314615249, "num_tokens": 84787410.0, "step": 32810 }, { "entropy": 0.29516154173761605, "epoch": 0.5052155189831037, "grad_norm": 0.5758609771728516, "learning_rate": 1.971633427264907e-05, "loss": 0.301, "mean_token_accuracy": 0.9074242144823075, "num_tokens": 84850143.0, "step": 32820 }, { "entropy": 0.2872396599501371, "epoch": 0.5053694542417823, "grad_norm": 0.6692937016487122, "learning_rate": 1.9715910535475008e-05, "loss": 0.2851, "mean_token_accuracy": 0.9118603184819222, "num_tokens": 84914379.0, "step": 32830 }, { "entropy": 0.2874593698419631, "epoch": 0.5055233895004608, "grad_norm": 0.6622369885444641, "learning_rate": 1.9715486486610084e-05, "loss": 0.297, "mean_token_accuracy": 0.9090963006019592, "num_tokens": 84982339.0, "step": 32840 }, { "entropy": 0.2863856891170144, "epoch": 0.5056773247591394, "grad_norm": 0.65252685546875, "learning_rate": 1.9715062126067898e-05, "loss": 0.287, "mean_token_accuracy": 0.9124788954854012, "num_tokens": 85054332.0, "step": 32850 }, { "entropy": 0.2889897650107741, "epoch": 0.505831260017818, "grad_norm": 0.6705595254898071, "learning_rate": 1.9714637453862063e-05, "loss": 0.2933, "mean_token_accuracy": 0.9108188420534133, "num_tokens": 85119567.0, "step": 32860 }, { "entropy": 0.29545629406347873, "epoch": 0.5059851952764965, "grad_norm": 0.749913215637207, "learning_rate": 1.971421247000621e-05, "loss": 0.3014, "mean_token_accuracy": 0.9073196746408939, "num_tokens": 85183088.0, "step": 32870 }, { "entropy": 0.29949123207479716, "epoch": 0.5061391305351751, "grad_norm": 0.615135669708252, "learning_rate": 1.9713787174513963e-05, "loss": 0.2981, "mean_token_accuracy": 0.9091683611273765, "num_tokens": 85240870.0, "step": 32880 }, { "entropy": 0.28948830217123034, "epoch": 0.5062930657938538, "grad_norm": 0.7620739936828613, "learning_rate": 1.971336156739897e-05, "loss": 0.2887, "mean_token_accuracy": 0.9116381131112575, "num_tokens": 85300740.0, "step": 32890 }, { "entropy": 0.28730483455583455, "epoch": 0.5064470010525324, "grad_norm": 0.6431682705879211, "learning_rate": 1.9712935648674888e-05, "loss": 0.3133, "mean_token_accuracy": 0.9100279085338115, "num_tokens": 85372098.0, "step": 32900 }, { "entropy": 0.2946350908838212, "epoch": 0.506600936311211, "grad_norm": 0.5856776833534241, "learning_rate": 1.9712509418355372e-05, "loss": 0.3107, "mean_token_accuracy": 0.9085524410009385, "num_tokens": 85454559.0, "step": 32910 }, { "entropy": 0.2869563166052103, "epoch": 0.5067548715698895, "grad_norm": 0.4926590919494629, "learning_rate": 1.9712082876454106e-05, "loss": 0.279, "mean_token_accuracy": 0.9119383230805397, "num_tokens": 85522041.0, "step": 32920 }, { "entropy": 0.2978821470402181, "epoch": 0.5069088068285681, "grad_norm": 0.48102253675460815, "learning_rate": 1.9711656022984767e-05, "loss": 0.2953, "mean_token_accuracy": 0.9071626149117946, "num_tokens": 85596708.0, "step": 32930 }, { "entropy": 0.29918333161622285, "epoch": 0.5070627420872467, "grad_norm": 0.6443530321121216, "learning_rate": 1.971122885796105e-05, "loss": 0.2993, "mean_token_accuracy": 0.9087916716933251, "num_tokens": 85671059.0, "step": 32940 }, { "entropy": 0.30332447150722147, "epoch": 0.5072166773459252, "grad_norm": 0.7072397470474243, "learning_rate": 1.971080138139666e-05, "loss": 0.3133, "mean_token_accuracy": 0.9047786325216294, "num_tokens": 85736467.0, "step": 32950 }, { "entropy": 0.291539357509464, "epoch": 0.5073706126046038, "grad_norm": 0.801414966583252, "learning_rate": 1.971037359330531e-05, "loss": 0.2816, "mean_token_accuracy": 0.9136413216590882, "num_tokens": 85802088.0, "step": 32960 }, { "entropy": 0.2990983411669731, "epoch": 0.5075245478632824, "grad_norm": 0.47815290093421936, "learning_rate": 1.970994549370072e-05, "loss": 0.3037, "mean_token_accuracy": 0.9079967990517617, "num_tokens": 85873339.0, "step": 32970 }, { "entropy": 0.29850703328847883, "epoch": 0.507678483121961, "grad_norm": 0.8128398060798645, "learning_rate": 1.970951708259663e-05, "loss": 0.3024, "mean_token_accuracy": 0.9092284999787807, "num_tokens": 85930204.0, "step": 32980 }, { "entropy": 0.29802806545048954, "epoch": 0.5078324183806395, "grad_norm": 0.6995387673377991, "learning_rate": 1.970908836000678e-05, "loss": 0.3036, "mean_token_accuracy": 0.9077672474086285, "num_tokens": 85998339.0, "step": 32990 }, { "entropy": 0.2947309762239456, "epoch": 0.5079863536393181, "grad_norm": 0.6924102306365967, "learning_rate": 1.9708659325944923e-05, "loss": 0.3079, "mean_token_accuracy": 0.9069246396422386, "num_tokens": 86064231.0, "step": 33000 }, { "entropy": 0.29748018067330123, "epoch": 0.5081402888979967, "grad_norm": 0.5502949357032776, "learning_rate": 1.9708229980424826e-05, "loss": 0.2944, "mean_token_accuracy": 0.9082814738154411, "num_tokens": 86137847.0, "step": 33010 }, { "entropy": 0.27144667943939566, "epoch": 0.5082942241566754, "grad_norm": 0.5563339591026306, "learning_rate": 1.970780032346026e-05, "loss": 0.2754, "mean_token_accuracy": 0.9167795851826668, "num_tokens": 86210679.0, "step": 33020 }, { "entropy": 0.31267990786582234, "epoch": 0.5084481594153539, "grad_norm": 0.6975290179252625, "learning_rate": 1.970737035506501e-05, "loss": 0.3221, "mean_token_accuracy": 0.9011103294789791, "num_tokens": 86268741.0, "step": 33030 }, { "entropy": 0.30562844602391126, "epoch": 0.5086020946740325, "grad_norm": 0.6735768914222717, "learning_rate": 1.9706940075252862e-05, "loss": 0.3089, "mean_token_accuracy": 0.90893145352602, "num_tokens": 86329886.0, "step": 33040 }, { "entropy": 0.2941249033436179, "epoch": 0.5087560299327111, "grad_norm": 0.7471871972084045, "learning_rate": 1.970650948403763e-05, "loss": 0.304, "mean_token_accuracy": 0.9082578741014004, "num_tokens": 86398691.0, "step": 33050 }, { "entropy": 0.3065161587670445, "epoch": 0.5089099651913896, "grad_norm": 0.6476220488548279, "learning_rate": 1.9706078581433126e-05, "loss": 0.3093, "mean_token_accuracy": 0.9060387261211872, "num_tokens": 86470009.0, "step": 33060 }, { "entropy": 0.2832556628622115, "epoch": 0.5090639004500682, "grad_norm": 0.5802522301673889, "learning_rate": 1.970564736745317e-05, "loss": 0.2807, "mean_token_accuracy": 0.9135899536311627, "num_tokens": 86533642.0, "step": 33070 }, { "entropy": 0.2878000861033797, "epoch": 0.5092178357087468, "grad_norm": 0.5990781188011169, "learning_rate": 1.9705215842111598e-05, "loss": 0.2935, "mean_token_accuracy": 0.9115748666226864, "num_tokens": 86595869.0, "step": 33080 }, { "entropy": 0.3096742697991431, "epoch": 0.5093717709674254, "grad_norm": 0.6958196759223938, "learning_rate": 1.9704784005422252e-05, "loss": 0.3218, "mean_token_accuracy": 0.9049523994326591, "num_tokens": 86665248.0, "step": 33090 }, { "entropy": 0.29127783915027977, "epoch": 0.5095257062261039, "grad_norm": 0.6986090540885925, "learning_rate": 1.9704351857398984e-05, "loss": 0.2916, "mean_token_accuracy": 0.9108592607080936, "num_tokens": 86728147.0, "step": 33100 }, { "entropy": 0.2787388294003904, "epoch": 0.5096796414847825, "grad_norm": 0.5635427832603455, "learning_rate": 1.9703919398055657e-05, "loss": 0.2828, "mean_token_accuracy": 0.9130657643079758, "num_tokens": 86792736.0, "step": 33110 }, { "entropy": 0.28152471547946334, "epoch": 0.5098335767434611, "grad_norm": 0.5926389098167419, "learning_rate": 1.970348662740615e-05, "loss": 0.2917, "mean_token_accuracy": 0.9131708070635796, "num_tokens": 86863131.0, "step": 33120 }, { "entropy": 0.3101689722388983, "epoch": 0.5099875120021397, "grad_norm": 0.665530800819397, "learning_rate": 1.9703053545464345e-05, "loss": 0.3013, "mean_token_accuracy": 0.9055334761738777, "num_tokens": 86937624.0, "step": 33130 }, { "entropy": 0.27777328658849, "epoch": 0.5101414472608182, "grad_norm": 0.6941455602645874, "learning_rate": 1.970262015224413e-05, "loss": 0.2809, "mean_token_accuracy": 0.9164224714040756, "num_tokens": 87009743.0, "step": 33140 }, { "entropy": 0.3024665812030435, "epoch": 0.5102953825194969, "grad_norm": 0.7671465277671814, "learning_rate": 1.9702186447759415e-05, "loss": 0.3126, "mean_token_accuracy": 0.9047562144696712, "num_tokens": 87080023.0, "step": 33150 }, { "entropy": 0.2898311184719205, "epoch": 0.5104493177781755, "grad_norm": 0.7620391845703125, "learning_rate": 1.970175243202411e-05, "loss": 0.2965, "mean_token_accuracy": 0.9118792653083801, "num_tokens": 87153901.0, "step": 33160 }, { "entropy": 0.28403970170766113, "epoch": 0.510603253036854, "grad_norm": 0.586436927318573, "learning_rate": 1.970131810505214e-05, "loss": 0.3078, "mean_token_accuracy": 0.9090007349848748, "num_tokens": 87230080.0, "step": 33170 }, { "entropy": 0.28403482753783466, "epoch": 0.5107571882955326, "grad_norm": 0.6028360724449158, "learning_rate": 1.9700883466857435e-05, "loss": 0.2929, "mean_token_accuracy": 0.9112773209810257, "num_tokens": 87293143.0, "step": 33180 }, { "entropy": 0.2885580902919173, "epoch": 0.5109111235542112, "grad_norm": 0.5053156614303589, "learning_rate": 1.9700448517453942e-05, "loss": 0.2929, "mean_token_accuracy": 0.9120991721749305, "num_tokens": 87362241.0, "step": 33190 }, { "entropy": 0.2953931702300906, "epoch": 0.5110650588128898, "grad_norm": 0.6473748087882996, "learning_rate": 1.9700013256855613e-05, "loss": 0.296, "mean_token_accuracy": 0.9103543907403946, "num_tokens": 87427981.0, "step": 33200 }, { "entropy": 0.2941796542145312, "epoch": 0.5112189940715683, "grad_norm": 0.6180082559585571, "learning_rate": 1.9699577685076413e-05, "loss": 0.3075, "mean_token_accuracy": 0.9071114607155323, "num_tokens": 87497604.0, "step": 33210 }, { "entropy": 0.2780396495014429, "epoch": 0.5113729293302469, "grad_norm": 0.6278762221336365, "learning_rate": 1.9699141802130312e-05, "loss": 0.2794, "mean_token_accuracy": 0.9157344564795494, "num_tokens": 87562661.0, "step": 33220 }, { "entropy": 0.29417483089491725, "epoch": 0.5115268645889255, "grad_norm": 0.6103235483169556, "learning_rate": 1.9698705608031297e-05, "loss": 0.291, "mean_token_accuracy": 0.9101440720260143, "num_tokens": 87624495.0, "step": 33230 }, { "entropy": 0.29937418634071944, "epoch": 0.5116807998476041, "grad_norm": 0.5361796021461487, "learning_rate": 1.969826910279336e-05, "loss": 0.308, "mean_token_accuracy": 0.9045122921466827, "num_tokens": 87688666.0, "step": 33240 }, { "entropy": 0.31263066874817014, "epoch": 0.5118347351062826, "grad_norm": 0.6641252636909485, "learning_rate": 1.9697832286430504e-05, "loss": 0.314, "mean_token_accuracy": 0.9034513637423516, "num_tokens": 87744692.0, "step": 33250 }, { "entropy": 0.29682299587875605, "epoch": 0.5119886703649612, "grad_norm": 0.6110656261444092, "learning_rate": 1.969739515895674e-05, "loss": 0.2909, "mean_token_accuracy": 0.9087763898074627, "num_tokens": 87815620.0, "step": 33260 }, { "entropy": 0.29050803147256377, "epoch": 0.5121426056236399, "grad_norm": 0.6827271580696106, "learning_rate": 1.9696957720386092e-05, "loss": 0.3032, "mean_token_accuracy": 0.9112838208675385, "num_tokens": 87881416.0, "step": 33270 }, { "entropy": 0.30534005695953964, "epoch": 0.5122965408823185, "grad_norm": 0.7443462610244751, "learning_rate": 1.96965199707326e-05, "loss": 0.3161, "mean_token_accuracy": 0.9054387502372265, "num_tokens": 87949238.0, "step": 33280 }, { "entropy": 0.2835673626512289, "epoch": 0.512450476140997, "grad_norm": 0.715048611164093, "learning_rate": 1.9696081910010298e-05, "loss": 0.2928, "mean_token_accuracy": 0.9117878898978233, "num_tokens": 88017005.0, "step": 33290 }, { "entropy": 0.2845455494709313, "epoch": 0.5126044113996756, "grad_norm": 0.6863804459571838, "learning_rate": 1.969564353823324e-05, "loss": 0.293, "mean_token_accuracy": 0.9140180967748165, "num_tokens": 88088009.0, "step": 33300 }, { "entropy": 0.29738859133794904, "epoch": 0.5127583466583542, "grad_norm": 0.7242123484611511, "learning_rate": 1.96952048554155e-05, "loss": 0.3007, "mean_token_accuracy": 0.9095641531050205, "num_tokens": 88144788.0, "step": 33310 }, { "entropy": 0.29107603384181857, "epoch": 0.5129122819170328, "grad_norm": 0.6378305554389954, "learning_rate": 1.969476586157114e-05, "loss": 0.2963, "mean_token_accuracy": 0.9098415210843086, "num_tokens": 88208797.0, "step": 33320 }, { "entropy": 0.28816219959408046, "epoch": 0.5130662171757113, "grad_norm": 0.6265912652015686, "learning_rate": 1.9694326556714244e-05, "loss": 0.2837, "mean_token_accuracy": 0.9121165767312049, "num_tokens": 88272704.0, "step": 33330 }, { "entropy": 0.30148734897375107, "epoch": 0.5132201524343899, "grad_norm": 0.7849416136741638, "learning_rate": 1.9693886940858912e-05, "loss": 0.307, "mean_token_accuracy": 0.9061935350298882, "num_tokens": 88339667.0, "step": 33340 }, { "entropy": 0.29289111914113164, "epoch": 0.5133740876930685, "grad_norm": 0.6315934658050537, "learning_rate": 1.9693447014019238e-05, "loss": 0.2986, "mean_token_accuracy": 0.9096973054111004, "num_tokens": 88404923.0, "step": 33350 }, { "entropy": 0.3009996720589697, "epoch": 0.513528022951747, "grad_norm": 0.6762884259223938, "learning_rate": 1.9693006776209344e-05, "loss": 0.2895, "mean_token_accuracy": 0.9095973692834377, "num_tokens": 88471256.0, "step": 33360 }, { "entropy": 0.2784295048564672, "epoch": 0.5136819582104256, "grad_norm": 0.5702849626541138, "learning_rate": 1.9692566227443344e-05, "loss": 0.2733, "mean_token_accuracy": 0.9134665280580521, "num_tokens": 88533400.0, "step": 33370 }, { "entropy": 0.26646328354254367, "epoch": 0.5138358934691042, "grad_norm": 0.5642054677009583, "learning_rate": 1.969212536773538e-05, "loss": 0.2795, "mean_token_accuracy": 0.9181524984538555, "num_tokens": 88600586.0, "step": 33380 }, { "entropy": 0.3068424778059125, "epoch": 0.5139898287277828, "grad_norm": 0.6208695769309998, "learning_rate": 1.969168419709959e-05, "loss": 0.3135, "mean_token_accuracy": 0.9067970521748066, "num_tokens": 88670142.0, "step": 33390 }, { "entropy": 0.29828449422493575, "epoch": 0.5141437639864614, "grad_norm": 0.7832105159759521, "learning_rate": 1.9691242715550127e-05, "loss": 0.3003, "mean_token_accuracy": 0.9075608342885971, "num_tokens": 88735668.0, "step": 33400 }, { "entropy": 0.2994774772785604, "epoch": 0.51429769924514, "grad_norm": 0.6668322682380676, "learning_rate": 1.9690800923101156e-05, "loss": 0.2996, "mean_token_accuracy": 0.9090031690895557, "num_tokens": 88800879.0, "step": 33410 }, { "entropy": 0.28978946395218375, "epoch": 0.5144516345038186, "grad_norm": 0.5787093043327332, "learning_rate": 1.9690358819766847e-05, "loss": 0.2889, "mean_token_accuracy": 0.9108759932219982, "num_tokens": 88859085.0, "step": 33420 }, { "entropy": 0.29614471541717646, "epoch": 0.5146055697624972, "grad_norm": 0.6922380328178406, "learning_rate": 1.968991640556138e-05, "loss": 0.292, "mean_token_accuracy": 0.9086831346154213, "num_tokens": 88918693.0, "step": 33430 }, { "entropy": 0.2751495331525803, "epoch": 0.5147595050211757, "grad_norm": 0.6608083844184875, "learning_rate": 1.968947368049896e-05, "loss": 0.289, "mean_token_accuracy": 0.9134875699877739, "num_tokens": 88988157.0, "step": 33440 }, { "entropy": 0.2960879964753985, "epoch": 0.5149134402798543, "grad_norm": 0.537852942943573, "learning_rate": 1.9689030644593777e-05, "loss": 0.3062, "mean_token_accuracy": 0.9062345169484616, "num_tokens": 89060034.0, "step": 33450 }, { "entropy": 0.2861039934679866, "epoch": 0.5150673755385329, "grad_norm": 0.6744629144668579, "learning_rate": 1.968858729786005e-05, "loss": 0.2846, "mean_token_accuracy": 0.914184931665659, "num_tokens": 89125377.0, "step": 33460 }, { "entropy": 0.3030968838371336, "epoch": 0.5152213107972115, "grad_norm": 0.5773418545722961, "learning_rate": 1.9688143640312e-05, "loss": 0.3022, "mean_token_accuracy": 0.9059939175844193, "num_tokens": 89193547.0, "step": 33470 }, { "entropy": 0.28177894102409484, "epoch": 0.51537524605589, "grad_norm": 0.8599927425384521, "learning_rate": 1.9687699671963863e-05, "loss": 0.2864, "mean_token_accuracy": 0.9127481110394001, "num_tokens": 89252030.0, "step": 33480 }, { "entropy": 0.2956911077722907, "epoch": 0.5155291813145686, "grad_norm": 0.7165298461914062, "learning_rate": 1.9687255392829877e-05, "loss": 0.3108, "mean_token_accuracy": 0.910194045305252, "num_tokens": 89320640.0, "step": 33490 }, { "entropy": 0.28707967484369873, "epoch": 0.5156831165732472, "grad_norm": 0.7553251385688782, "learning_rate": 1.96868108029243e-05, "loss": 0.298, "mean_token_accuracy": 0.9130868032574654, "num_tokens": 89381451.0, "step": 33500 }, { "entropy": 0.2983064591884613, "epoch": 0.5158370518319257, "grad_norm": 0.7828065156936646, "learning_rate": 1.968636590226139e-05, "loss": 0.3008, "mean_token_accuracy": 0.9083540193736553, "num_tokens": 89446936.0, "step": 33510 }, { "entropy": 0.29125298308208586, "epoch": 0.5159909870906043, "grad_norm": 0.6606245040893555, "learning_rate": 1.9685920690855415e-05, "loss": 0.3042, "mean_token_accuracy": 0.9096967533230782, "num_tokens": 89513904.0, "step": 33520 }, { "entropy": 0.29453496597707274, "epoch": 0.516144922349283, "grad_norm": 0.6075644493103027, "learning_rate": 1.968547516872067e-05, "loss": 0.3095, "mean_token_accuracy": 0.9102658689022064, "num_tokens": 89581234.0, "step": 33530 }, { "entropy": 0.2805327915586531, "epoch": 0.5162988576079616, "grad_norm": 0.7708632946014404, "learning_rate": 1.968502933587144e-05, "loss": 0.2851, "mean_token_accuracy": 0.9129125729203225, "num_tokens": 89643118.0, "step": 33540 }, { "entropy": 0.30228863693773744, "epoch": 0.5164527928666401, "grad_norm": 0.806365966796875, "learning_rate": 1.9684583192322027e-05, "loss": 0.3119, "mean_token_accuracy": 0.9070049606263637, "num_tokens": 89712285.0, "step": 33550 }, { "entropy": 0.299924963247031, "epoch": 0.5166067281253187, "grad_norm": 0.7963147759437561, "learning_rate": 1.9684136738086748e-05, "loss": 0.3044, "mean_token_accuracy": 0.9064434953033924, "num_tokens": 89778719.0, "step": 33560 }, { "entropy": 0.29844977175816895, "epoch": 0.5167606633839973, "grad_norm": 0.6771014928817749, "learning_rate": 1.9683689973179923e-05, "loss": 0.3002, "mean_token_accuracy": 0.9088641382753849, "num_tokens": 89846417.0, "step": 33570 }, { "entropy": 0.28732329327613115, "epoch": 0.5169145986426759, "grad_norm": 0.8529155254364014, "learning_rate": 1.968324289761588e-05, "loss": 0.2982, "mean_token_accuracy": 0.9111948877573013, "num_tokens": 89906199.0, "step": 33580 }, { "entropy": 0.29019391965121033, "epoch": 0.5170685339013544, "grad_norm": 0.6685404181480408, "learning_rate": 1.9682795511408966e-05, "loss": 0.298, "mean_token_accuracy": 0.9100745864212513, "num_tokens": 89969510.0, "step": 33590 }, { "entropy": 0.2976806082762778, "epoch": 0.517222469160033, "grad_norm": 0.5123539566993713, "learning_rate": 1.9682347814573535e-05, "loss": 0.2996, "mean_token_accuracy": 0.9078996866941452, "num_tokens": 90032628.0, "step": 33600 }, { "entropy": 0.2916402473114431, "epoch": 0.5173764044187116, "grad_norm": 0.534785807132721, "learning_rate": 1.968189980712395e-05, "loss": 0.2961, "mean_token_accuracy": 0.9124387450516224, "num_tokens": 90099346.0, "step": 33610 }, { "entropy": 0.29741398664191365, "epoch": 0.5175303396773901, "grad_norm": 0.606475293636322, "learning_rate": 1.9681451489074575e-05, "loss": 0.3046, "mean_token_accuracy": 0.9075750216841698, "num_tokens": 90175233.0, "step": 33620 }, { "entropy": 0.2996823811903596, "epoch": 0.5176842749360687, "grad_norm": 0.525810182094574, "learning_rate": 1.96810028604398e-05, "loss": 0.2945, "mean_token_accuracy": 0.9059168711304665, "num_tokens": 90239434.0, "step": 33630 }, { "entropy": 0.2876533719711006, "epoch": 0.5178382101947473, "grad_norm": 0.5465270280838013, "learning_rate": 1.9680553921234013e-05, "loss": 0.2856, "mean_token_accuracy": 0.9109185710549355, "num_tokens": 90305480.0, "step": 33640 }, { "entropy": 0.3051456896588206, "epoch": 0.517992145453426, "grad_norm": 0.6368846297264099, "learning_rate": 1.968010467147162e-05, "loss": 0.3034, "mean_token_accuracy": 0.9028539732098579, "num_tokens": 90371871.0, "step": 33650 }, { "entropy": 0.28469547871500256, "epoch": 0.5181460807121046, "grad_norm": 0.5870382189750671, "learning_rate": 1.9679655111167027e-05, "loss": 0.2885, "mean_token_accuracy": 0.9119637832045555, "num_tokens": 90444123.0, "step": 33660 }, { "entropy": 0.28460962157696484, "epoch": 0.5183000159707831, "grad_norm": 0.5692122578620911, "learning_rate": 1.9679205240334665e-05, "loss": 0.2986, "mean_token_accuracy": 0.9131367236375809, "num_tokens": 90508470.0, "step": 33670 }, { "entropy": 0.30567752281203864, "epoch": 0.5184539512294617, "grad_norm": 0.6174583435058594, "learning_rate": 1.9678755058988962e-05, "loss": 0.3088, "mean_token_accuracy": 0.9056475721299648, "num_tokens": 90568102.0, "step": 33680 }, { "entropy": 0.28171012587845323, "epoch": 0.5186078864881403, "grad_norm": 0.48217374086380005, "learning_rate": 1.9678304567144356e-05, "loss": 0.279, "mean_token_accuracy": 0.9125325664877891, "num_tokens": 90630398.0, "step": 33690 }, { "entropy": 0.2999285101890564, "epoch": 0.5187618217468188, "grad_norm": 0.639385461807251, "learning_rate": 1.9677853764815306e-05, "loss": 0.2988, "mean_token_accuracy": 0.9063641510903835, "num_tokens": 90698513.0, "step": 33700 }, { "entropy": 0.3124455322511494, "epoch": 0.5189157570054974, "grad_norm": 0.7690107226371765, "learning_rate": 1.9677402652016267e-05, "loss": 0.3206, "mean_token_accuracy": 0.9029177904129029, "num_tokens": 90755696.0, "step": 33710 }, { "entropy": 0.3014462011866271, "epoch": 0.519069692264176, "grad_norm": 0.5043014883995056, "learning_rate": 1.9676951228761716e-05, "loss": 0.2954, "mean_token_accuracy": 0.9097486712038517, "num_tokens": 90820792.0, "step": 33720 }, { "entropy": 0.29857654720544813, "epoch": 0.5192236275228546, "grad_norm": 0.7189611196517944, "learning_rate": 1.9676499495066134e-05, "loss": 0.3092, "mean_token_accuracy": 0.90634640827775, "num_tokens": 90887114.0, "step": 33730 }, { "entropy": 0.3001519342884421, "epoch": 0.5193775627815331, "grad_norm": 0.6503850817680359, "learning_rate": 1.9676047450944008e-05, "loss": 0.3005, "mean_token_accuracy": 0.9074426114559173, "num_tokens": 90950032.0, "step": 33740 }, { "entropy": 0.2865695645101368, "epoch": 0.5195314980402117, "grad_norm": 0.6826472878456116, "learning_rate": 1.9675595096409847e-05, "loss": 0.2896, "mean_token_accuracy": 0.911699378490448, "num_tokens": 91021866.0, "step": 33750 }, { "entropy": 0.27124735368415714, "epoch": 0.5196854332988903, "grad_norm": 0.6580582857131958, "learning_rate": 1.967514243147816e-05, "loss": 0.2755, "mean_token_accuracy": 0.9184569776058197, "num_tokens": 91088158.0, "step": 33760 }, { "entropy": 0.2940813432447612, "epoch": 0.5198393685575688, "grad_norm": 0.6047825217247009, "learning_rate": 1.9674689456163464e-05, "loss": 0.3183, "mean_token_accuracy": 0.9095403149724006, "num_tokens": 91157853.0, "step": 33770 }, { "entropy": 0.2756730977445841, "epoch": 0.5199933038162475, "grad_norm": 1.0140879154205322, "learning_rate": 1.96742361704803e-05, "loss": 0.2825, "mean_token_accuracy": 0.9141954071819782, "num_tokens": 91224065.0, "step": 33780 }, { "entropy": 0.3035905722528696, "epoch": 0.5201472390749261, "grad_norm": 0.804072380065918, "learning_rate": 1.9673782574443198e-05, "loss": 0.3014, "mean_token_accuracy": 0.9069585368037224, "num_tokens": 91291959.0, "step": 33790 }, { "entropy": 0.29308580728247763, "epoch": 0.5203011743336047, "grad_norm": 0.553142249584198, "learning_rate": 1.9673328668066724e-05, "loss": 0.2924, "mean_token_accuracy": 0.9113749861717224, "num_tokens": 91372200.0, "step": 33800 }, { "entropy": 0.2841148723848164, "epoch": 0.5204551095922832, "grad_norm": 0.731942892074585, "learning_rate": 1.9672874451365424e-05, "loss": 0.2888, "mean_token_accuracy": 0.9104142218828202, "num_tokens": 91430463.0, "step": 33810 }, { "entropy": 0.2924607008695602, "epoch": 0.5206090448509618, "grad_norm": 0.6426456570625305, "learning_rate": 1.967241992435388e-05, "loss": 0.2997, "mean_token_accuracy": 0.9078279331326484, "num_tokens": 91504766.0, "step": 33820 }, { "entropy": 0.27088245619088414, "epoch": 0.5207629801096404, "grad_norm": 0.7884749174118042, "learning_rate": 1.967196508704667e-05, "loss": 0.2755, "mean_token_accuracy": 0.9179328083992004, "num_tokens": 91569083.0, "step": 33830 }, { "entropy": 0.28556596040725707, "epoch": 0.520916915368319, "grad_norm": 0.5155747532844543, "learning_rate": 1.9671509939458388e-05, "loss": 0.2988, "mean_token_accuracy": 0.9088393881917, "num_tokens": 91639671.0, "step": 33840 }, { "entropy": 0.2774844212457538, "epoch": 0.5210708506269975, "grad_norm": 0.7039178013801575, "learning_rate": 1.967105448160363e-05, "loss": 0.2872, "mean_token_accuracy": 0.913224146515131, "num_tokens": 91712510.0, "step": 33850 }, { "entropy": 0.3011332067660987, "epoch": 0.5212247858856761, "grad_norm": 0.6003684997558594, "learning_rate": 1.9670598713497013e-05, "loss": 0.3042, "mean_token_accuracy": 0.9090269953012466, "num_tokens": 91779374.0, "step": 33860 }, { "entropy": 0.3022316242568195, "epoch": 0.5213787211443547, "grad_norm": 0.6232831478118896, "learning_rate": 1.9670142635153153e-05, "loss": 0.3048, "mean_token_accuracy": 0.9053365729749203, "num_tokens": 91850003.0, "step": 33870 }, { "entropy": 0.3058416111394763, "epoch": 0.5215326564030333, "grad_norm": 0.6400650143623352, "learning_rate": 1.9669686246586684e-05, "loss": 0.3036, "mean_token_accuracy": 0.9067971475422383, "num_tokens": 91916921.0, "step": 33880 }, { "entropy": 0.2956146943382919, "epoch": 0.5216865916617118, "grad_norm": 0.5364951491355896, "learning_rate": 1.966922954781225e-05, "loss": 0.2885, "mean_token_accuracy": 0.9087028712034225, "num_tokens": 91990214.0, "step": 33890 }, { "entropy": 0.278635904379189, "epoch": 0.5218405269203904, "grad_norm": 0.5394244194030762, "learning_rate": 1.96687725388445e-05, "loss": 0.2847, "mean_token_accuracy": 0.914625308662653, "num_tokens": 92064084.0, "step": 33900 }, { "entropy": 0.30951080629602074, "epoch": 0.5219944621790691, "grad_norm": 0.5747436881065369, "learning_rate": 1.966831521969809e-05, "loss": 0.3115, "mean_token_accuracy": 0.9052174098789691, "num_tokens": 92132200.0, "step": 33910 }, { "entropy": 0.2867465021088719, "epoch": 0.5221483974377477, "grad_norm": 0.6756865382194519, "learning_rate": 1.9667857590387694e-05, "loss": 0.2891, "mean_token_accuracy": 0.9095613911747933, "num_tokens": 92193711.0, "step": 33920 }, { "entropy": 0.27768154088407754, "epoch": 0.5223023326964262, "grad_norm": 0.49703025817871094, "learning_rate": 1.9667399650928e-05, "loss": 0.2761, "mean_token_accuracy": 0.9154910892248154, "num_tokens": 92254944.0, "step": 33930 }, { "entropy": 0.29194907993078234, "epoch": 0.5224562679551048, "grad_norm": 0.6500468850135803, "learning_rate": 1.966694140133369e-05, "loss": 0.2884, "mean_token_accuracy": 0.910361135005951, "num_tokens": 92324603.0, "step": 33940 }, { "entropy": 0.27408957220613955, "epoch": 0.5226102032137834, "grad_norm": 0.599069356918335, "learning_rate": 1.9666482841619468e-05, "loss": 0.2843, "mean_token_accuracy": 0.9153124257922173, "num_tokens": 92390368.0, "step": 33950 }, { "entropy": 0.30248220190405845, "epoch": 0.522764138472462, "grad_norm": 0.5591315627098083, "learning_rate": 1.9666023971800044e-05, "loss": 0.3001, "mean_token_accuracy": 0.9093102030456066, "num_tokens": 92458319.0, "step": 33960 }, { "entropy": 0.27439945144578815, "epoch": 0.5229180737311405, "grad_norm": 0.903423547744751, "learning_rate": 1.966556479189014e-05, "loss": 0.2846, "mean_token_accuracy": 0.9142396189272404, "num_tokens": 92520954.0, "step": 33970 }, { "entropy": 0.28541688453406094, "epoch": 0.5230720089898191, "grad_norm": 0.7127604484558105, "learning_rate": 1.9665105301904487e-05, "loss": 0.2833, "mean_token_accuracy": 0.9117923870682716, "num_tokens": 92585849.0, "step": 33980 }, { "entropy": 0.3122310611885041, "epoch": 0.5232259442484977, "grad_norm": 0.7837318181991577, "learning_rate": 1.966464550185782e-05, "loss": 0.3066, "mean_token_accuracy": 0.902982097864151, "num_tokens": 92645309.0, "step": 33990 }, { "entropy": 0.29685370437800884, "epoch": 0.5233798795071762, "grad_norm": 0.4682348370552063, "learning_rate": 1.9664185391764904e-05, "loss": 0.3089, "mean_token_accuracy": 0.9091867037117481, "num_tokens": 92719377.0, "step": 34000 }, { "entropy": 0.2901741058565676, "epoch": 0.5235338147658548, "grad_norm": 0.5963210463523865, "learning_rate": 1.966372497164048e-05, "loss": 0.2844, "mean_token_accuracy": 0.9104810684919358, "num_tokens": 92784475.0, "step": 34010 }, { "entropy": 0.30959066851064565, "epoch": 0.5236877500245334, "grad_norm": 0.683404803276062, "learning_rate": 1.9663264241499336e-05, "loss": 0.3103, "mean_token_accuracy": 0.9052508994936943, "num_tokens": 92850222.0, "step": 34020 }, { "entropy": 0.3014103353954852, "epoch": 0.5238416852832121, "grad_norm": 0.8133298754692078, "learning_rate": 1.966280320135624e-05, "loss": 0.3056, "mean_token_accuracy": 0.9063719742000103, "num_tokens": 92917961.0, "step": 34030 }, { "entropy": 0.2921520763076842, "epoch": 0.5239956205418906, "grad_norm": 0.6461925506591797, "learning_rate": 1.966234185122599e-05, "loss": 0.2919, "mean_token_accuracy": 0.9107955887913703, "num_tokens": 92982831.0, "step": 34040 }, { "entropy": 0.2859082884155214, "epoch": 0.5241495558005692, "grad_norm": 0.8923617005348206, "learning_rate": 1.9661880191123383e-05, "loss": 0.2916, "mean_token_accuracy": 0.9121386878192425, "num_tokens": 93046861.0, "step": 34050 }, { "entropy": 0.30099404798820617, "epoch": 0.5243034910592478, "grad_norm": 0.8909616470336914, "learning_rate": 1.966141822106323e-05, "loss": 0.3123, "mean_token_accuracy": 0.9072880715131759, "num_tokens": 93107774.0, "step": 34060 }, { "entropy": 0.30339549444615843, "epoch": 0.5244574263179264, "grad_norm": 0.6678545475006104, "learning_rate": 1.9660955941060347e-05, "loss": 0.2976, "mean_token_accuracy": 0.9060826733708381, "num_tokens": 93169535.0, "step": 34070 }, { "entropy": 0.27561022257432344, "epoch": 0.5246113615766049, "grad_norm": 0.7735971212387085, "learning_rate": 1.9660493351129573e-05, "loss": 0.2874, "mean_token_accuracy": 0.9147443406283855, "num_tokens": 93225909.0, "step": 34080 }, { "entropy": 0.3027921081520617, "epoch": 0.5247652968352835, "grad_norm": 0.5892338752746582, "learning_rate": 1.9660030451285744e-05, "loss": 0.2905, "mean_token_accuracy": 0.9075765229761601, "num_tokens": 93289133.0, "step": 34090 }, { "entropy": 0.3021124929189682, "epoch": 0.5249192320939621, "grad_norm": 0.6343271732330322, "learning_rate": 1.9659567241543707e-05, "loss": 0.3078, "mean_token_accuracy": 0.905148558318615, "num_tokens": 93353089.0, "step": 34100 }, { "entropy": 0.2983418951742351, "epoch": 0.5250731673526406, "grad_norm": 0.6377505660057068, "learning_rate": 1.965910372191833e-05, "loss": 0.3025, "mean_token_accuracy": 0.9064981617033482, "num_tokens": 93421466.0, "step": 34110 }, { "entropy": 0.2925023876130581, "epoch": 0.5252271026113192, "grad_norm": 0.7396999001502991, "learning_rate": 1.9658639892424466e-05, "loss": 0.2992, "mean_token_accuracy": 0.9105267204344273, "num_tokens": 93488662.0, "step": 34120 }, { "entropy": 0.29259963277727363, "epoch": 0.5253810378699978, "grad_norm": 0.5072888135910034, "learning_rate": 1.9658175753077013e-05, "loss": 0.2868, "mean_token_accuracy": 0.9110489502549172, "num_tokens": 93560080.0, "step": 34130 }, { "entropy": 0.2897521645762026, "epoch": 0.5255349731286764, "grad_norm": 0.5110901594161987, "learning_rate": 1.9657711303890854e-05, "loss": 0.2991, "mean_token_accuracy": 0.90896285623312, "num_tokens": 93626548.0, "step": 34140 }, { "entropy": 0.2703201790340245, "epoch": 0.5256889083873549, "grad_norm": 0.48317527770996094, "learning_rate": 1.9657246544880887e-05, "loss": 0.2736, "mean_token_accuracy": 0.9160087957978249, "num_tokens": 93692422.0, "step": 34150 }, { "entropy": 0.32302039954811335, "epoch": 0.5258428436460336, "grad_norm": 0.5728574991226196, "learning_rate": 1.9656781476062026e-05, "loss": 0.3206, "mean_token_accuracy": 0.8993430718779564, "num_tokens": 93762964.0, "step": 34160 }, { "entropy": 0.3031371371820569, "epoch": 0.5259967789047122, "grad_norm": 0.5537545680999756, "learning_rate": 1.9656316097449183e-05, "loss": 0.2954, "mean_token_accuracy": 0.9076611921191216, "num_tokens": 93829279.0, "step": 34170 }, { "entropy": 0.2984643251635134, "epoch": 0.5261507141633908, "grad_norm": 0.6325329542160034, "learning_rate": 1.96558504090573e-05, "loss": 0.2995, "mean_token_accuracy": 0.9065529994666577, "num_tokens": 93901175.0, "step": 34180 }, { "entropy": 0.29048589188605545, "epoch": 0.5263046494220693, "grad_norm": 0.6068358421325684, "learning_rate": 1.9655384410901305e-05, "loss": 0.2875, "mean_token_accuracy": 0.9090205430984497, "num_tokens": 93969082.0, "step": 34190 }, { "entropy": 0.28896715193986894, "epoch": 0.5264585846807479, "grad_norm": 0.5242764949798584, "learning_rate": 1.9654918102996147e-05, "loss": 0.2845, "mean_token_accuracy": 0.9115757197141647, "num_tokens": 94039652.0, "step": 34200 }, { "entropy": 0.29751774212345483, "epoch": 0.5266125199394265, "grad_norm": 0.7136447429656982, "learning_rate": 1.9654451485356796e-05, "loss": 0.3199, "mean_token_accuracy": 0.9076425291597843, "num_tokens": 94105492.0, "step": 34210 }, { "entropy": 0.2989205080550164, "epoch": 0.526766455198105, "grad_norm": 0.6582279801368713, "learning_rate": 1.9653984557998212e-05, "loss": 0.3144, "mean_token_accuracy": 0.908204910159111, "num_tokens": 94167353.0, "step": 34220 }, { "entropy": 0.26403871197253465, "epoch": 0.5269203904567836, "grad_norm": 0.6988164782524109, "learning_rate": 1.965351732093538e-05, "loss": 0.2713, "mean_token_accuracy": 0.9179055772721767, "num_tokens": 94225601.0, "step": 34230 }, { "entropy": 0.2873790076933801, "epoch": 0.5270743257154622, "grad_norm": 0.5530125498771667, "learning_rate": 1.9653049774183283e-05, "loss": 0.2853, "mean_token_accuracy": 0.9105676352977753, "num_tokens": 94292548.0, "step": 34240 }, { "entropy": 0.2984679350629449, "epoch": 0.5272282609741408, "grad_norm": 0.7631990313529968, "learning_rate": 1.965258191775693e-05, "loss": 0.2974, "mean_token_accuracy": 0.9047696344554425, "num_tokens": 94351376.0, "step": 34250 }, { "entropy": 0.2867036104202271, "epoch": 0.5273821962328193, "grad_norm": 0.5321787595748901, "learning_rate": 1.965211375167132e-05, "loss": 0.2935, "mean_token_accuracy": 0.9117669098079204, "num_tokens": 94420455.0, "step": 34260 }, { "entropy": 0.293227955698967, "epoch": 0.5275361314914979, "grad_norm": 0.5407249927520752, "learning_rate": 1.9651645275941474e-05, "loss": 0.2839, "mean_token_accuracy": 0.9112817265093327, "num_tokens": 94495303.0, "step": 34270 }, { "entropy": 0.3001565098762512, "epoch": 0.5276900667501765, "grad_norm": 0.48298242688179016, "learning_rate": 1.965117649058242e-05, "loss": 0.2993, "mean_token_accuracy": 0.9075087115168572, "num_tokens": 94567797.0, "step": 34280 }, { "entropy": 0.2993870127014816, "epoch": 0.5278440020088552, "grad_norm": 0.5517644286155701, "learning_rate": 1.9650707395609205e-05, "loss": 0.3073, "mean_token_accuracy": 0.9069628298282624, "num_tokens": 94638258.0, "step": 34290 }, { "entropy": 0.29640673007816076, "epoch": 0.5279979372675337, "grad_norm": 0.895453155040741, "learning_rate": 1.9650237991036875e-05, "loss": 0.3052, "mean_token_accuracy": 0.9075728319585323, "num_tokens": 94704037.0, "step": 34300 }, { "entropy": 0.3065510708838701, "epoch": 0.5281518725262123, "grad_norm": 0.7398391962051392, "learning_rate": 1.964976827688048e-05, "loss": 0.3192, "mean_token_accuracy": 0.906526581197977, "num_tokens": 94774123.0, "step": 34310 }, { "entropy": 0.2973533251322806, "epoch": 0.5283058077848909, "grad_norm": 0.5951509475708008, "learning_rate": 1.9649298253155095e-05, "loss": 0.3062, "mean_token_accuracy": 0.9088899753987789, "num_tokens": 94839059.0, "step": 34320 }, { "entropy": 0.2788669762201607, "epoch": 0.5284597430435695, "grad_norm": 0.6506094336509705, "learning_rate": 1.96488279198758e-05, "loss": 0.2953, "mean_token_accuracy": 0.9146290808916092, "num_tokens": 94899861.0, "step": 34330 }, { "entropy": 0.2698430922813714, "epoch": 0.528613678302248, "grad_norm": 0.6662431955337524, "learning_rate": 1.9648357277057683e-05, "loss": 0.2735, "mean_token_accuracy": 0.9166142053902149, "num_tokens": 94965024.0, "step": 34340 }, { "entropy": 0.31095573576167224, "epoch": 0.5287676135609266, "grad_norm": 0.7558030486106873, "learning_rate": 1.9647886324715837e-05, "loss": 0.3087, "mean_token_accuracy": 0.9053192004561424, "num_tokens": 95021893.0, "step": 34350 }, { "entropy": 0.28499276768416165, "epoch": 0.5289215488196052, "grad_norm": 0.5880144834518433, "learning_rate": 1.9647415062865378e-05, "loss": 0.2949, "mean_token_accuracy": 0.9134049974381924, "num_tokens": 95083993.0, "step": 34360 }, { "entropy": 0.3101393095217645, "epoch": 0.5290754840782838, "grad_norm": 0.690140962600708, "learning_rate": 1.9646943491521416e-05, "loss": 0.3101, "mean_token_accuracy": 0.9053356848657131, "num_tokens": 95148696.0, "step": 34370 }, { "entropy": 0.2955204799771309, "epoch": 0.5292294193369623, "grad_norm": 0.6090550422668457, "learning_rate": 1.964647161069909e-05, "loss": 0.299, "mean_token_accuracy": 0.9097919031977654, "num_tokens": 95213437.0, "step": 34380 }, { "entropy": 0.27083368180319667, "epoch": 0.5293833545956409, "grad_norm": 0.45414772629737854, "learning_rate": 1.964599942041353e-05, "loss": 0.2854, "mean_token_accuracy": 0.9172089554369449, "num_tokens": 95281239.0, "step": 34390 }, { "entropy": 0.2772831214591861, "epoch": 0.5295372898543195, "grad_norm": 0.5492344498634338, "learning_rate": 1.964552692067989e-05, "loss": 0.2875, "mean_token_accuracy": 0.9137888438999653, "num_tokens": 95343154.0, "step": 34400 }, { "entropy": 0.3001628487370908, "epoch": 0.5296912251129982, "grad_norm": 0.6638262271881104, "learning_rate": 1.9645054111513317e-05, "loss": 0.3007, "mean_token_accuracy": 0.9059304751455783, "num_tokens": 95410925.0, "step": 34410 }, { "entropy": 0.2935884103178978, "epoch": 0.5298451603716767, "grad_norm": 0.6004320979118347, "learning_rate": 1.964458099292899e-05, "loss": 0.2968, "mean_token_accuracy": 0.911239442974329, "num_tokens": 95488859.0, "step": 34420 }, { "entropy": 0.29442435279488566, "epoch": 0.5299990956303553, "grad_norm": 0.6100964546203613, "learning_rate": 1.964410756494208e-05, "loss": 0.3007, "mean_token_accuracy": 0.9097100704908371, "num_tokens": 95565287.0, "step": 34430 }, { "entropy": 0.30084970127791166, "epoch": 0.5301530308890339, "grad_norm": 0.7009586691856384, "learning_rate": 1.9643633827567784e-05, "loss": 0.296, "mean_token_accuracy": 0.9058727152645588, "num_tokens": 95619408.0, "step": 34440 }, { "entropy": 0.2857831752859056, "epoch": 0.5303069661477124, "grad_norm": 0.5866560339927673, "learning_rate": 1.964315978082129e-05, "loss": 0.2998, "mean_token_accuracy": 0.9115692287683487, "num_tokens": 95688362.0, "step": 34450 }, { "entropy": 0.2753293951041996, "epoch": 0.530460901406391, "grad_norm": 0.8032995462417603, "learning_rate": 1.9642685424717807e-05, "loss": 0.2844, "mean_token_accuracy": 0.9146569088101387, "num_tokens": 95741242.0, "step": 34460 }, { "entropy": 0.31630489686504004, "epoch": 0.5306148366650696, "grad_norm": 0.6153771281242371, "learning_rate": 1.964221075927256e-05, "loss": 0.3017, "mean_token_accuracy": 0.902856171131134, "num_tokens": 95806974.0, "step": 34470 }, { "entropy": 0.2897560589015484, "epoch": 0.5307687719237482, "grad_norm": 0.5262749791145325, "learning_rate": 1.9641735784500765e-05, "loss": 0.3117, "mean_token_accuracy": 0.9096544310450554, "num_tokens": 95876166.0, "step": 34480 }, { "entropy": 0.31911794999614357, "epoch": 0.5309227071824267, "grad_norm": 0.591693103313446, "learning_rate": 1.9641260500417672e-05, "loss": 0.3168, "mean_token_accuracy": 0.9001423805952072, "num_tokens": 95950901.0, "step": 34490 }, { "entropy": 0.3011593378148973, "epoch": 0.5310766424411053, "grad_norm": 0.5557816028594971, "learning_rate": 1.9640784907038516e-05, "loss": 0.292, "mean_token_accuracy": 0.9075036309659481, "num_tokens": 96012030.0, "step": 34500 }, { "entropy": 0.28037310782819985, "epoch": 0.5312305776997839, "grad_norm": 0.5140681266784668, "learning_rate": 1.9640309004378568e-05, "loss": 0.2876, "mean_token_accuracy": 0.9137207731604576, "num_tokens": 96083186.0, "step": 34510 }, { "entropy": 0.2732238441705704, "epoch": 0.5313845129584625, "grad_norm": 0.620695948600769, "learning_rate": 1.963983279245308e-05, "loss": 0.2766, "mean_token_accuracy": 0.9159085370600224, "num_tokens": 96156841.0, "step": 34520 }, { "entropy": 0.2977639142423868, "epoch": 0.531538448217141, "grad_norm": 0.681490421295166, "learning_rate": 1.9639356271277338e-05, "loss": 0.2967, "mean_token_accuracy": 0.9084554836153984, "num_tokens": 96220125.0, "step": 34530 }, { "entropy": 0.3059673852287233, "epoch": 0.5316923834758197, "grad_norm": 0.7297816872596741, "learning_rate": 1.963887944086663e-05, "loss": 0.3057, "mean_token_accuracy": 0.9080189980566502, "num_tokens": 96292824.0, "step": 34540 }, { "entropy": 0.294598304387182, "epoch": 0.5318463187344983, "grad_norm": 0.7471526265144348, "learning_rate": 1.9638402301236246e-05, "loss": 0.3041, "mean_token_accuracy": 0.9098600529134273, "num_tokens": 96366246.0, "step": 34550 }, { "entropy": 0.2874886502511799, "epoch": 0.5320002539931769, "grad_norm": 0.7338495850563049, "learning_rate": 1.9637924852401503e-05, "loss": 0.2961, "mean_token_accuracy": 0.9118246704339981, "num_tokens": 96427379.0, "step": 34560 }, { "entropy": 0.28715494405478237, "epoch": 0.5321541892518554, "grad_norm": 0.6170453429222107, "learning_rate": 1.963744709437771e-05, "loss": 0.3039, "mean_token_accuracy": 0.9105248838663101, "num_tokens": 96490641.0, "step": 34570 }, { "entropy": 0.310654569696635, "epoch": 0.532308124510534, "grad_norm": 0.7044759392738342, "learning_rate": 1.9636969027180193e-05, "loss": 0.3184, "mean_token_accuracy": 0.9011164054274559, "num_tokens": 96567188.0, "step": 34580 }, { "entropy": 0.31677942620590327, "epoch": 0.5324620597692126, "grad_norm": 0.4878321886062622, "learning_rate": 1.9636490650824297e-05, "loss": 0.3054, "mean_token_accuracy": 0.9013214826583862, "num_tokens": 96638672.0, "step": 34590 }, { "entropy": 0.28646567948162555, "epoch": 0.5326159950278911, "grad_norm": 0.7357041239738464, "learning_rate": 1.963601196532536e-05, "loss": 0.291, "mean_token_accuracy": 0.9112574547529221, "num_tokens": 96704520.0, "step": 34600 }, { "entropy": 0.28747115237638354, "epoch": 0.5327699302865697, "grad_norm": 0.5957870483398438, "learning_rate": 1.9635532970698737e-05, "loss": 0.2937, "mean_token_accuracy": 0.9086569055914879, "num_tokens": 96771860.0, "step": 34610 }, { "entropy": 0.3138426776975393, "epoch": 0.5329238655452483, "grad_norm": 0.8329657316207886, "learning_rate": 1.9635053666959803e-05, "loss": 0.3032, "mean_token_accuracy": 0.9044525034725666, "num_tokens": 96834219.0, "step": 34620 }, { "entropy": 0.289807032328099, "epoch": 0.5330778008039269, "grad_norm": 0.7674151062965393, "learning_rate": 1.963457405412393e-05, "loss": 0.2876, "mean_token_accuracy": 0.9100162111222744, "num_tokens": 96898698.0, "step": 34630 }, { "entropy": 0.2587489550933242, "epoch": 0.5332317360626054, "grad_norm": 0.6857740879058838, "learning_rate": 1.9634094132206508e-05, "loss": 0.2743, "mean_token_accuracy": 0.9181892544031143, "num_tokens": 96967001.0, "step": 34640 }, { "entropy": 0.29879840202629565, "epoch": 0.533385671321284, "grad_norm": 0.8616179823875427, "learning_rate": 1.9633613901222924e-05, "loss": 0.3074, "mean_token_accuracy": 0.9078557692468167, "num_tokens": 97027040.0, "step": 34650 }, { "entropy": 0.2979915709234774, "epoch": 0.5335396065799626, "grad_norm": 0.5982269048690796, "learning_rate": 1.963313336118859e-05, "loss": 0.3005, "mean_token_accuracy": 0.9085839711129665, "num_tokens": 97089175.0, "step": 34660 }, { "entropy": 0.2941703487187624, "epoch": 0.5336935418386413, "grad_norm": 0.5090850591659546, "learning_rate": 1.9632652512118923e-05, "loss": 0.2938, "mean_token_accuracy": 0.9096934571862221, "num_tokens": 97155135.0, "step": 34670 }, { "entropy": 0.2896920698694885, "epoch": 0.5338474770973198, "grad_norm": 0.630466103553772, "learning_rate": 1.963217135402935e-05, "loss": 0.3037, "mean_token_accuracy": 0.910168357938528, "num_tokens": 97225491.0, "step": 34680 }, { "entropy": 0.297336929384619, "epoch": 0.5340014123559984, "grad_norm": 0.714320719242096, "learning_rate": 1.9631689886935298e-05, "loss": 0.292, "mean_token_accuracy": 0.9109470963478088, "num_tokens": 97293972.0, "step": 34690 }, { "entropy": 0.3078794308938086, "epoch": 0.534155347614677, "grad_norm": 0.6140061020851135, "learning_rate": 1.9631208110852223e-05, "loss": 0.3107, "mean_token_accuracy": 0.9056806996464729, "num_tokens": 97365918.0, "step": 34700 }, { "entropy": 0.3049267092719674, "epoch": 0.5343092828733556, "grad_norm": 0.61519455909729, "learning_rate": 1.963072602579557e-05, "loss": 0.2972, "mean_token_accuracy": 0.9067368142306804, "num_tokens": 97434631.0, "step": 34710 }, { "entropy": 0.2864338587038219, "epoch": 0.5344632181320341, "grad_norm": 0.7230631113052368, "learning_rate": 1.963024363178082e-05, "loss": 0.2855, "mean_token_accuracy": 0.909969649463892, "num_tokens": 97504331.0, "step": 34720 }, { "entropy": 0.2843684478662908, "epoch": 0.5346171533907127, "grad_norm": 0.6863176226615906, "learning_rate": 1.962976092882343e-05, "loss": 0.2911, "mean_token_accuracy": 0.9136567942798137, "num_tokens": 97566133.0, "step": 34730 }, { "entropy": 0.29021417275071143, "epoch": 0.5347710886493913, "grad_norm": 0.5313068628311157, "learning_rate": 1.9629277916938904e-05, "loss": 0.2938, "mean_token_accuracy": 0.9105573736131192, "num_tokens": 97631295.0, "step": 34740 }, { "entropy": 0.28199363546445966, "epoch": 0.5349250239080698, "grad_norm": 0.5715615153312683, "learning_rate": 1.962879459614272e-05, "loss": 0.2903, "mean_token_accuracy": 0.9117464832961559, "num_tokens": 97698313.0, "step": 34750 }, { "entropy": 0.2851268616504967, "epoch": 0.5350789591667484, "grad_norm": 0.5662885904312134, "learning_rate": 1.9628310966450394e-05, "loss": 0.2853, "mean_token_accuracy": 0.9124861478805542, "num_tokens": 97760336.0, "step": 34760 }, { "entropy": 0.28435173397883773, "epoch": 0.535232894425427, "grad_norm": 0.5566797256469727, "learning_rate": 1.9627827027877436e-05, "loss": 0.2827, "mean_token_accuracy": 0.9118518382310867, "num_tokens": 97828224.0, "step": 34770 }, { "entropy": 0.2799868139438331, "epoch": 0.5353868296841056, "grad_norm": 0.7288780808448792, "learning_rate": 1.9627342780439375e-05, "loss": 0.2854, "mean_token_accuracy": 0.9124121747910976, "num_tokens": 97886705.0, "step": 34780 }, { "entropy": 0.2980767936445773, "epoch": 0.5355407649427842, "grad_norm": 0.639087438583374, "learning_rate": 1.9626858224151744e-05, "loss": 0.3034, "mean_token_accuracy": 0.9079510852694511, "num_tokens": 97957161.0, "step": 34790 }, { "entropy": 0.3069039098918438, "epoch": 0.5356947002014628, "grad_norm": 0.6258012652397156, "learning_rate": 1.9626373359030085e-05, "loss": 0.3279, "mean_token_accuracy": 0.9058007307350635, "num_tokens": 98024934.0, "step": 34800 }, { "entropy": 0.3101541683077812, "epoch": 0.5358486354601414, "grad_norm": 0.7090270519256592, "learning_rate": 1.9625888185089962e-05, "loss": 0.2966, "mean_token_accuracy": 0.906077367067337, "num_tokens": 98088939.0, "step": 34810 }, { "entropy": 0.2929620981216431, "epoch": 0.53600257071882, "grad_norm": 0.6014565229415894, "learning_rate": 1.9625402702346925e-05, "loss": 0.2836, "mean_token_accuracy": 0.9107839696109294, "num_tokens": 98156912.0, "step": 34820 }, { "entropy": 0.28566007325425746, "epoch": 0.5361565059774985, "grad_norm": 0.6548582315444946, "learning_rate": 1.962491691081656e-05, "loss": 0.2998, "mean_token_accuracy": 0.9097890481352806, "num_tokens": 98220734.0, "step": 34830 }, { "entropy": 0.27027447801083326, "epoch": 0.5363104412361771, "grad_norm": 0.7643510103225708, "learning_rate": 1.9624430810514447e-05, "loss": 0.2813, "mean_token_accuracy": 0.9161987513303756, "num_tokens": 98285340.0, "step": 34840 }, { "entropy": 0.3084267540834844, "epoch": 0.5364643764948557, "grad_norm": 0.7477254867553711, "learning_rate": 1.9623944401456182e-05, "loss": 0.3155, "mean_token_accuracy": 0.9055305019021034, "num_tokens": 98348996.0, "step": 34850 }, { "entropy": 0.3157481672242284, "epoch": 0.5366183117535342, "grad_norm": 0.7509663105010986, "learning_rate": 1.962345768365737e-05, "loss": 0.3039, "mean_token_accuracy": 0.9060395203530789, "num_tokens": 98414191.0, "step": 34860 }, { "entropy": 0.27758893007412555, "epoch": 0.5367722470122128, "grad_norm": 0.5257643461227417, "learning_rate": 1.962297065713362e-05, "loss": 0.2826, "mean_token_accuracy": 0.9142881527543067, "num_tokens": 98489368.0, "step": 34870 }, { "entropy": 0.2929554605856538, "epoch": 0.5369261822708914, "grad_norm": 0.6944231390953064, "learning_rate": 1.962248332190056e-05, "loss": 0.2906, "mean_token_accuracy": 0.9089701138436794, "num_tokens": 98551538.0, "step": 34880 }, { "entropy": 0.29060957496985795, "epoch": 0.53708011752957, "grad_norm": 0.6961785554885864, "learning_rate": 1.9621995677973827e-05, "loss": 0.2964, "mean_token_accuracy": 0.9093332678079605, "num_tokens": 98617748.0, "step": 34890 }, { "entropy": 0.294522218964994, "epoch": 0.5372340527882485, "grad_norm": 0.8165434002876282, "learning_rate": 1.962150772536906e-05, "loss": 0.2946, "mean_token_accuracy": 0.9093096233904362, "num_tokens": 98678511.0, "step": 34900 }, { "entropy": 0.2953226465731859, "epoch": 0.5373879880469271, "grad_norm": 0.6704986095428467, "learning_rate": 1.9621019464101908e-05, "loss": 0.2996, "mean_token_accuracy": 0.9085000284016133, "num_tokens": 98745958.0, "step": 34910 }, { "entropy": 0.2858789819292724, "epoch": 0.5375419233056058, "grad_norm": 0.6083013415336609, "learning_rate": 1.962053089418805e-05, "loss": 0.2902, "mean_token_accuracy": 0.9125435188412666, "num_tokens": 98809465.0, "step": 34920 }, { "entropy": 0.2837666883133352, "epoch": 0.5376958585642844, "grad_norm": 0.5470143556594849, "learning_rate": 1.9620042015643143e-05, "loss": 0.2888, "mean_token_accuracy": 0.9130016699433326, "num_tokens": 98887460.0, "step": 34930 }, { "entropy": 0.2999925293028355, "epoch": 0.5378497938229629, "grad_norm": 0.6074014902114868, "learning_rate": 1.961955282848288e-05, "loss": 0.2999, "mean_token_accuracy": 0.9044359423220157, "num_tokens": 98952815.0, "step": 34940 }, { "entropy": 0.29341230932623147, "epoch": 0.5380037290816415, "grad_norm": 0.5329830646514893, "learning_rate": 1.9619063332722953e-05, "loss": 0.2904, "mean_token_accuracy": 0.9099331356585025, "num_tokens": 99024104.0, "step": 34950 }, { "entropy": 0.30619917148724196, "epoch": 0.5381576643403201, "grad_norm": 0.7496148347854614, "learning_rate": 1.9618573528379066e-05, "loss": 0.3285, "mean_token_accuracy": 0.904637398570776, "num_tokens": 99090477.0, "step": 34960 }, { "entropy": 0.29359291326254605, "epoch": 0.5383115995989987, "grad_norm": 0.6374704837799072, "learning_rate": 1.9618083415466928e-05, "loss": 0.2952, "mean_token_accuracy": 0.9110894933342933, "num_tokens": 99155756.0, "step": 34970 }, { "entropy": 0.29433381631970407, "epoch": 0.5384655348576772, "grad_norm": 0.5981258749961853, "learning_rate": 1.9617592994002262e-05, "loss": 0.2999, "mean_token_accuracy": 0.9080653913319111, "num_tokens": 99220952.0, "step": 34980 }, { "entropy": 0.2947474463842809, "epoch": 0.5386194701163558, "grad_norm": 0.6693444848060608, "learning_rate": 1.9617102264000808e-05, "loss": 0.2967, "mean_token_accuracy": 0.908451497554779, "num_tokens": 99290290.0, "step": 34990 }, { "entropy": 0.28767710784450173, "epoch": 0.5387734053750344, "grad_norm": 0.7826620936393738, "learning_rate": 1.9616611225478305e-05, "loss": 0.2974, "mean_token_accuracy": 0.912030003219843, "num_tokens": 99352083.0, "step": 35000 }, { "entropy": 0.2800241463817656, "epoch": 0.538927340633713, "grad_norm": 0.6450188159942627, "learning_rate": 1.9616119878450503e-05, "loss": 0.2942, "mean_token_accuracy": 0.9123711332678794, "num_tokens": 99420429.0, "step": 35010 }, { "entropy": 0.295021790266037, "epoch": 0.5390812758923915, "grad_norm": 0.5482524633407593, "learning_rate": 1.9615628222933165e-05, "loss": 0.3011, "mean_token_accuracy": 0.9089790165424347, "num_tokens": 99492167.0, "step": 35020 }, { "entropy": 0.29043366564437745, "epoch": 0.5392352111510701, "grad_norm": 0.6085675954818726, "learning_rate": 1.9615136258942068e-05, "loss": 0.2817, "mean_token_accuracy": 0.9119406491518021, "num_tokens": 99567481.0, "step": 35030 }, { "entropy": 0.2864604831673205, "epoch": 0.5393891464097487, "grad_norm": 0.8055798411369324, "learning_rate": 1.961464398649299e-05, "loss": 0.2954, "mean_token_accuracy": 0.9103079363703728, "num_tokens": 99627184.0, "step": 35040 }, { "entropy": 0.2855006348341703, "epoch": 0.5395430816684273, "grad_norm": 0.8388733863830566, "learning_rate": 1.9614151405601725e-05, "loss": 0.2954, "mean_token_accuracy": 0.9121615596115589, "num_tokens": 99693609.0, "step": 35050 }, { "entropy": 0.2956028853543103, "epoch": 0.5396970169271059, "grad_norm": 1.0914736986160278, "learning_rate": 1.9613658516284074e-05, "loss": 0.2933, "mean_token_accuracy": 0.9075728602707386, "num_tokens": 99757881.0, "step": 35060 }, { "entropy": 0.2748233683407307, "epoch": 0.5398509521857845, "grad_norm": 0.5449804663658142, "learning_rate": 1.9613165318555852e-05, "loss": 0.2829, "mean_token_accuracy": 0.9138264052569867, "num_tokens": 99832680.0, "step": 35070 }, { "entropy": 0.28439770843833684, "epoch": 0.5400048874444631, "grad_norm": 0.5662392973899841, "learning_rate": 1.9612671812432878e-05, "loss": 0.2923, "mean_token_accuracy": 0.9121702440083027, "num_tokens": 99900557.0, "step": 35080 }, { "entropy": 0.28945426382124423, "epoch": 0.5401588227031416, "grad_norm": 0.5480408072471619, "learning_rate": 1.9612177997930988e-05, "loss": 0.2961, "mean_token_accuracy": 0.9118339881300926, "num_tokens": 99967968.0, "step": 35090 }, { "entropy": 0.2946443661116064, "epoch": 0.5403127579618202, "grad_norm": 0.6608086824417114, "learning_rate": 1.961168387506602e-05, "loss": 0.2933, "mean_token_accuracy": 0.9088820412755012, "num_tokens": 100035692.0, "step": 35100 }, { "entropy": 0.28446776028722526, "epoch": 0.5404666932204988, "grad_norm": 0.566707193851471, "learning_rate": 1.9611189443853826e-05, "loss": 0.2948, "mean_token_accuracy": 0.9096706323325634, "num_tokens": 100100946.0, "step": 35110 }, { "entropy": 0.28429010035470126, "epoch": 0.5406206284791774, "grad_norm": 0.6355764269828796, "learning_rate": 1.9610694704310266e-05, "loss": 0.2801, "mean_token_accuracy": 0.9146771863102913, "num_tokens": 100165616.0, "step": 35120 }, { "entropy": 0.28794082179665564, "epoch": 0.5407745637378559, "grad_norm": 0.7047946453094482, "learning_rate": 1.9610199656451216e-05, "loss": 0.2953, "mean_token_accuracy": 0.9097133487462997, "num_tokens": 100229110.0, "step": 35130 }, { "entropy": 0.2974625333212316, "epoch": 0.5409284989965345, "grad_norm": 0.662771463394165, "learning_rate": 1.9609704300292557e-05, "loss": 0.292, "mean_token_accuracy": 0.907311424612999, "num_tokens": 100284150.0, "step": 35140 }, { "entropy": 0.30827772663906217, "epoch": 0.5410824342552131, "grad_norm": 0.6176130771636963, "learning_rate": 1.9609208635850175e-05, "loss": 0.3016, "mean_token_accuracy": 0.9060839019715786, "num_tokens": 100365528.0, "step": 35150 }, { "entropy": 0.3009809358045459, "epoch": 0.5412363695138916, "grad_norm": 0.6385419964790344, "learning_rate": 1.9608712663139977e-05, "loss": 0.3079, "mean_token_accuracy": 0.9073697932064533, "num_tokens": 100428507.0, "step": 35160 }, { "entropy": 0.2954742692410946, "epoch": 0.5413903047725703, "grad_norm": 0.6633898615837097, "learning_rate": 1.9608216382177868e-05, "loss": 0.3023, "mean_token_accuracy": 0.9087821438908577, "num_tokens": 100492330.0, "step": 35170 }, { "entropy": 0.3031747380271554, "epoch": 0.5415442400312489, "grad_norm": 0.676865816116333, "learning_rate": 1.9607719792979774e-05, "loss": 0.2967, "mean_token_accuracy": 0.9100617550313472, "num_tokens": 100553977.0, "step": 35180 }, { "entropy": 0.28787080487236383, "epoch": 0.5416981752899275, "grad_norm": 0.5737813711166382, "learning_rate": 1.9607222895561628e-05, "loss": 0.2925, "mean_token_accuracy": 0.91147475913167, "num_tokens": 100626140.0, "step": 35190 }, { "entropy": 0.2932240589521825, "epoch": 0.541852110548606, "grad_norm": 0.6541439294815063, "learning_rate": 1.9606725689939362e-05, "loss": 0.2983, "mean_token_accuracy": 0.9068645216524601, "num_tokens": 100693587.0, "step": 35200 }, { "entropy": 0.298886816482991, "epoch": 0.5420060458072846, "grad_norm": 0.5200814008712769, "learning_rate": 1.9606228176128933e-05, "loss": 0.3072, "mean_token_accuracy": 0.908596520870924, "num_tokens": 100760642.0, "step": 35210 }, { "entropy": 0.29575373269617555, "epoch": 0.5421599810659632, "grad_norm": 0.6382042765617371, "learning_rate": 1.96057303541463e-05, "loss": 0.2919, "mean_token_accuracy": 0.9110948830842972, "num_tokens": 100826385.0, "step": 35220 }, { "entropy": 0.2597132162190974, "epoch": 0.5423139163246418, "grad_norm": 0.6820361018180847, "learning_rate": 1.9605232224007435e-05, "loss": 0.2658, "mean_token_accuracy": 0.9202033273875714, "num_tokens": 100891790.0, "step": 35230 }, { "entropy": 0.2836816248483956, "epoch": 0.5424678515833203, "grad_norm": 0.6098989248275757, "learning_rate": 1.9604733785728317e-05, "loss": 0.286, "mean_token_accuracy": 0.9109104685485363, "num_tokens": 100951944.0, "step": 35240 }, { "entropy": 0.2951169817708433, "epoch": 0.5426217868419989, "grad_norm": 0.5726596117019653, "learning_rate": 1.9604235039324935e-05, "loss": 0.3021, "mean_token_accuracy": 0.9102476976811886, "num_tokens": 101016544.0, "step": 35250 }, { "entropy": 0.29040515432134273, "epoch": 0.5427757221006775, "grad_norm": 0.5589771866798401, "learning_rate": 1.9603735984813287e-05, "loss": 0.2923, "mean_token_accuracy": 0.9120114669203758, "num_tokens": 101087958.0, "step": 35260 }, { "entropy": 0.2840397091582417, "epoch": 0.542929657359356, "grad_norm": 0.5216221213340759, "learning_rate": 1.960323662220939e-05, "loss": 0.2883, "mean_token_accuracy": 0.913145562261343, "num_tokens": 101163876.0, "step": 35270 }, { "entropy": 0.29398500081151724, "epoch": 0.5430835926180346, "grad_norm": 0.616969108581543, "learning_rate": 1.9602736951529257e-05, "loss": 0.3, "mean_token_accuracy": 0.9069524258375168, "num_tokens": 101228441.0, "step": 35280 }, { "entropy": 0.3040966482833028, "epoch": 0.5432375278767132, "grad_norm": 0.6850166916847229, "learning_rate": 1.9602236972788922e-05, "loss": 0.3008, "mean_token_accuracy": 0.9090179145336151, "num_tokens": 101295461.0, "step": 35290 }, { "entropy": 0.28272564243525267, "epoch": 0.5433914631353919, "grad_norm": 0.5384455919265747, "learning_rate": 1.960173668600442e-05, "loss": 0.2714, "mean_token_accuracy": 0.9142475508153438, "num_tokens": 101362609.0, "step": 35300 }, { "entropy": 0.29263490391895175, "epoch": 0.5435453983940705, "grad_norm": 0.586248517036438, "learning_rate": 1.9601236091191804e-05, "loss": 0.2932, "mean_token_accuracy": 0.9104364983737468, "num_tokens": 101428798.0, "step": 35310 }, { "entropy": 0.2960131700150669, "epoch": 0.543699333652749, "grad_norm": 0.6773260235786438, "learning_rate": 1.9600735188367133e-05, "loss": 0.3138, "mean_token_accuracy": 0.9077392801642418, "num_tokens": 101491244.0, "step": 35320 }, { "entropy": 0.31150474939495326, "epoch": 0.5438532689114276, "grad_norm": 0.777612030506134, "learning_rate": 1.9600233977546473e-05, "loss": 0.3015, "mean_token_accuracy": 0.9071386180818081, "num_tokens": 101555608.0, "step": 35330 }, { "entropy": 0.28053127629682423, "epoch": 0.5440072041701062, "grad_norm": 0.6003282070159912, "learning_rate": 1.9599732458745908e-05, "loss": 0.2943, "mean_token_accuracy": 0.9138140238821506, "num_tokens": 101619213.0, "step": 35340 }, { "entropy": 0.2754382467828691, "epoch": 0.5441611394287847, "grad_norm": 0.7420060038566589, "learning_rate": 1.9599230631981527e-05, "loss": 0.29, "mean_token_accuracy": 0.9147229634225369, "num_tokens": 101683016.0, "step": 35350 }, { "entropy": 0.30819274773821237, "epoch": 0.5443150746874633, "grad_norm": 0.7527722120285034, "learning_rate": 1.959872849726942e-05, "loss": 0.3125, "mean_token_accuracy": 0.9047104634344578, "num_tokens": 101752894.0, "step": 35360 }, { "entropy": 0.3027434332296252, "epoch": 0.5444690099461419, "grad_norm": 0.609845757484436, "learning_rate": 1.959822605462571e-05, "loss": 0.3068, "mean_token_accuracy": 0.9083687670528888, "num_tokens": 101818909.0, "step": 35370 }, { "entropy": 0.2905788941308856, "epoch": 0.5446229452048205, "grad_norm": 0.4688611924648285, "learning_rate": 1.95977233040665e-05, "loss": 0.2962, "mean_token_accuracy": 0.9099339596927166, "num_tokens": 101889580.0, "step": 35380 }, { "entropy": 0.2773247599601746, "epoch": 0.544776880463499, "grad_norm": 0.48634642362594604, "learning_rate": 1.959722024560793e-05, "loss": 0.291, "mean_token_accuracy": 0.9142061978578567, "num_tokens": 101960240.0, "step": 35390 }, { "entropy": 0.3025761551223695, "epoch": 0.5449308157221776, "grad_norm": 0.7049952149391174, "learning_rate": 1.9596716879266134e-05, "loss": 0.3065, "mean_token_accuracy": 0.9064342111349106, "num_tokens": 102024583.0, "step": 35400 }, { "entropy": 0.2898585067130625, "epoch": 0.5450847509808562, "grad_norm": 0.7866857647895813, "learning_rate": 1.959621320505726e-05, "loss": 0.3103, "mean_token_accuracy": 0.9113070987164974, "num_tokens": 102087794.0, "step": 35410 }, { "entropy": 0.28495890982449057, "epoch": 0.5452386862395348, "grad_norm": 0.5336520075798035, "learning_rate": 1.959570922299747e-05, "loss": 0.2826, "mean_token_accuracy": 0.9122020848095417, "num_tokens": 102152604.0, "step": 35420 }, { "entropy": 0.279819958563894, "epoch": 0.5453926214982134, "grad_norm": 0.607160747051239, "learning_rate": 1.9595204933102926e-05, "loss": 0.2838, "mean_token_accuracy": 0.9142019256949425, "num_tokens": 102217095.0, "step": 35430 }, { "entropy": 0.292634972371161, "epoch": 0.545546556756892, "grad_norm": 0.5876541137695312, "learning_rate": 1.959470033538981e-05, "loss": 0.3013, "mean_token_accuracy": 0.9112312294542789, "num_tokens": 102293032.0, "step": 35440 }, { "entropy": 0.3016113396733999, "epoch": 0.5457004920155706, "grad_norm": 0.537751317024231, "learning_rate": 1.959419542987431e-05, "loss": 0.3021, "mean_token_accuracy": 0.90912069901824, "num_tokens": 102362992.0, "step": 35450 }, { "entropy": 0.2833183747716248, "epoch": 0.5458544272742492, "grad_norm": 0.6044763326644897, "learning_rate": 1.9593690216572617e-05, "loss": 0.3038, "mean_token_accuracy": 0.9106795713305473, "num_tokens": 102424914.0, "step": 35460 }, { "entropy": 0.28630760703235864, "epoch": 0.5460083625329277, "grad_norm": 0.5856043100357056, "learning_rate": 1.959318469550095e-05, "loss": 0.3029, "mean_token_accuracy": 0.9115915454924106, "num_tokens": 102489585.0, "step": 35470 }, { "entropy": 0.3267705438658595, "epoch": 0.5461622977916063, "grad_norm": 0.6538934707641602, "learning_rate": 1.9592678866675515e-05, "loss": 0.321, "mean_token_accuracy": 0.9010336801409722, "num_tokens": 102555862.0, "step": 35480 }, { "entropy": 0.2890149686485529, "epoch": 0.5463162330502849, "grad_norm": 0.6703861951828003, "learning_rate": 1.9592172730112543e-05, "loss": 0.2894, "mean_token_accuracy": 0.9104680053889751, "num_tokens": 102626068.0, "step": 35490 }, { "entropy": 0.28389401491731403, "epoch": 0.5464701683089634, "grad_norm": 0.6455228328704834, "learning_rate": 1.9591666285828277e-05, "loss": 0.29, "mean_token_accuracy": 0.9101331770420075, "num_tokens": 102698056.0, "step": 35500 }, { "entropy": 0.3006344364956021, "epoch": 0.546624103567642, "grad_norm": 0.5496712923049927, "learning_rate": 1.959115953383896e-05, "loss": 0.2979, "mean_token_accuracy": 0.9058312736451626, "num_tokens": 102772227.0, "step": 35510 }, { "entropy": 0.29830635953694584, "epoch": 0.5467780388263206, "grad_norm": 0.7277551293373108, "learning_rate": 1.9590652474160842e-05, "loss": 0.2903, "mean_token_accuracy": 0.9081680715084076, "num_tokens": 102836533.0, "step": 35520 }, { "entropy": 0.29550360087305305, "epoch": 0.5469319740849992, "grad_norm": 0.681649386882782, "learning_rate": 1.9590145106810198e-05, "loss": 0.2961, "mean_token_accuracy": 0.909964307397604, "num_tokens": 102900125.0, "step": 35530 }, { "entropy": 0.28065694132819774, "epoch": 0.5470859093436777, "grad_norm": 0.5538848638534546, "learning_rate": 1.9589637431803303e-05, "loss": 0.2902, "mean_token_accuracy": 0.9135785937309265, "num_tokens": 102964416.0, "step": 35540 }, { "entropy": 0.30045952908694745, "epoch": 0.5472398446023564, "grad_norm": 0.5561969876289368, "learning_rate": 1.9589129449156443e-05, "loss": 0.2984, "mean_token_accuracy": 0.9076206959784031, "num_tokens": 103029469.0, "step": 35550 }, { "entropy": 0.2708427687175572, "epoch": 0.547393779861035, "grad_norm": 0.6061958074569702, "learning_rate": 1.9588621158885913e-05, "loss": 0.2815, "mean_token_accuracy": 0.9165163114666939, "num_tokens": 103094516.0, "step": 35560 }, { "entropy": 0.2873483876697719, "epoch": 0.5475477151197136, "grad_norm": 0.5879287719726562, "learning_rate": 1.958811256100802e-05, "loss": 0.2827, "mean_token_accuracy": 0.9089079827070237, "num_tokens": 103156915.0, "step": 35570 }, { "entropy": 0.314765580650419, "epoch": 0.5477016503783921, "grad_norm": 0.9927390813827515, "learning_rate": 1.958760365553908e-05, "loss": 0.311, "mean_token_accuracy": 0.9034561201930046, "num_tokens": 103221010.0, "step": 35580 }, { "entropy": 0.28748520761728286, "epoch": 0.5478555856370707, "grad_norm": 0.8776090741157532, "learning_rate": 1.958709444249542e-05, "loss": 0.2953, "mean_token_accuracy": 0.9085667669773102, "num_tokens": 103285643.0, "step": 35590 }, { "entropy": 0.2820776626467705, "epoch": 0.5480095208957493, "grad_norm": 0.6850734353065491, "learning_rate": 1.9586584921893376e-05, "loss": 0.3019, "mean_token_accuracy": 0.9133209109306335, "num_tokens": 103352001.0, "step": 35600 }, { "entropy": 0.3105759798549116, "epoch": 0.5481634561544279, "grad_norm": 0.6558756828308105, "learning_rate": 1.9586075093749292e-05, "loss": 0.3111, "mean_token_accuracy": 0.9038511194288731, "num_tokens": 103420361.0, "step": 35610 }, { "entropy": 0.2855307414196432, "epoch": 0.5483173914131064, "grad_norm": 0.5400463342666626, "learning_rate": 1.9585564958079525e-05, "loss": 0.2856, "mean_token_accuracy": 0.9116763912141324, "num_tokens": 103482508.0, "step": 35620 }, { "entropy": 0.2985068096779287, "epoch": 0.548471326671785, "grad_norm": 0.6228495836257935, "learning_rate": 1.9585054514900437e-05, "loss": 0.2915, "mean_token_accuracy": 0.9098201304674148, "num_tokens": 103551829.0, "step": 35630 }, { "entropy": 0.2868843277916312, "epoch": 0.5486252619304636, "grad_norm": 0.8229027390480042, "learning_rate": 1.958454376422841e-05, "loss": 0.3017, "mean_token_accuracy": 0.9101906701922416, "num_tokens": 103615996.0, "step": 35640 }, { "entropy": 0.30014402121305467, "epoch": 0.5487791971891421, "grad_norm": 0.5792244672775269, "learning_rate": 1.958403270607982e-05, "loss": 0.3063, "mean_token_accuracy": 0.9069485060870648, "num_tokens": 103680909.0, "step": 35650 }, { "entropy": 0.27774482751265167, "epoch": 0.5489331324478207, "grad_norm": 0.6689913868904114, "learning_rate": 1.9583521340471067e-05, "loss": 0.2905, "mean_token_accuracy": 0.9138735018670558, "num_tokens": 103751982.0, "step": 35660 }, { "entropy": 0.30353743163868785, "epoch": 0.5490870677064993, "grad_norm": 0.6281141042709351, "learning_rate": 1.9583009667418557e-05, "loss": 0.2963, "mean_token_accuracy": 0.90654711201787, "num_tokens": 103815115.0, "step": 35670 }, { "entropy": 0.26849410105496646, "epoch": 0.549241002965178, "grad_norm": 1.1897172927856445, "learning_rate": 1.9582497686938705e-05, "loss": 0.2765, "mean_token_accuracy": 0.9163573078811169, "num_tokens": 103876741.0, "step": 35680 }, { "entropy": 0.2798436532728374, "epoch": 0.5493949382238565, "grad_norm": 0.5380126237869263, "learning_rate": 1.9581985399047934e-05, "loss": 0.2803, "mean_token_accuracy": 0.9134359873831273, "num_tokens": 103945533.0, "step": 35690 }, { "entropy": 0.27417830722406505, "epoch": 0.5495488734825351, "grad_norm": 0.6271054148674011, "learning_rate": 1.9581472803762675e-05, "loss": 0.2743, "mean_token_accuracy": 0.9159369185566902, "num_tokens": 104011226.0, "step": 35700 }, { "entropy": 0.3101641663350165, "epoch": 0.5497028087412137, "grad_norm": 0.5712934732437134, "learning_rate": 1.9580959901099378e-05, "loss": 0.312, "mean_token_accuracy": 0.9014848820865154, "num_tokens": 104081338.0, "step": 35710 }, { "entropy": 0.27870033849030734, "epoch": 0.5498567439998923, "grad_norm": 0.5743352174758911, "learning_rate": 1.95804466910745e-05, "loss": 0.2823, "mean_token_accuracy": 0.9121940203011036, "num_tokens": 104138771.0, "step": 35720 }, { "entropy": 0.28157549872994425, "epoch": 0.5500106792585708, "grad_norm": 0.6139850616455078, "learning_rate": 1.957993317370449e-05, "loss": 0.2865, "mean_token_accuracy": 0.9134449310600757, "num_tokens": 104209136.0, "step": 35730 }, { "entropy": 0.2831763378344476, "epoch": 0.5501646145172494, "grad_norm": 0.651115894317627, "learning_rate": 1.9579419349005838e-05, "loss": 0.2823, "mean_token_accuracy": 0.912734616547823, "num_tokens": 104273825.0, "step": 35740 }, { "entropy": 0.3063353061676025, "epoch": 0.550318549775928, "grad_norm": 0.6135551929473877, "learning_rate": 1.9578905216995018e-05, "loss": 0.3056, "mean_token_accuracy": 0.9062212437391282, "num_tokens": 104341689.0, "step": 35750 }, { "entropy": 0.2797217502258718, "epoch": 0.5504724850346066, "grad_norm": 0.5461230278015137, "learning_rate": 1.9578390777688533e-05, "loss": 0.2983, "mean_token_accuracy": 0.9127732627093792, "num_tokens": 104409885.0, "step": 35760 }, { "entropy": 0.28740727435797453, "epoch": 0.5506264202932851, "grad_norm": 0.6101832985877991, "learning_rate": 1.9577876031102872e-05, "loss": 0.2937, "mean_token_accuracy": 0.913452410697937, "num_tokens": 104477564.0, "step": 35770 }, { "entropy": 0.27776170521974564, "epoch": 0.5507803555519637, "grad_norm": 0.5858318209648132, "learning_rate": 1.9577360977254563e-05, "loss": 0.282, "mean_token_accuracy": 0.9145398572087288, "num_tokens": 104544520.0, "step": 35780 }, { "entropy": 0.2995044873096049, "epoch": 0.5509342908106423, "grad_norm": 0.7226455807685852, "learning_rate": 1.957684561616012e-05, "loss": 0.2852, "mean_token_accuracy": 0.9062506973743438, "num_tokens": 104626707.0, "step": 35790 }, { "entropy": 0.3105140863917768, "epoch": 0.5510882260693208, "grad_norm": 0.691646933555603, "learning_rate": 1.9576329947836082e-05, "loss": 0.3112, "mean_token_accuracy": 0.9040626257658004, "num_tokens": 104693923.0, "step": 35800 }, { "entropy": 0.280409318767488, "epoch": 0.5512421613279995, "grad_norm": 0.7105718851089478, "learning_rate": 1.957581397229899e-05, "loss": 0.2818, "mean_token_accuracy": 0.9134919494390488, "num_tokens": 104762868.0, "step": 35810 }, { "entropy": 0.29140623109415176, "epoch": 0.5513960965866781, "grad_norm": 0.6236815452575684, "learning_rate": 1.9575297689565393e-05, "loss": 0.293, "mean_token_accuracy": 0.910142133384943, "num_tokens": 104828206.0, "step": 35820 }, { "entropy": 0.3060458546504378, "epoch": 0.5515500318453567, "grad_norm": 0.5797079205513, "learning_rate": 1.9574781099651853e-05, "loss": 0.3061, "mean_token_accuracy": 0.9041307099163532, "num_tokens": 104897544.0, "step": 35830 }, { "entropy": 0.2970426191575825, "epoch": 0.5517039671040352, "grad_norm": 0.9058693647384644, "learning_rate": 1.957426420257495e-05, "loss": 0.3037, "mean_token_accuracy": 0.908926347643137, "num_tokens": 104952289.0, "step": 35840 }, { "entropy": 0.3050597165711224, "epoch": 0.5518579023627138, "grad_norm": 0.6601883172988892, "learning_rate": 1.957374699835126e-05, "loss": 0.3099, "mean_token_accuracy": 0.9068135850131511, "num_tokens": 105015292.0, "step": 35850 }, { "entropy": 0.28587752431631086, "epoch": 0.5520118376213924, "grad_norm": 0.532274603843689, "learning_rate": 1.957322948699738e-05, "loss": 0.2767, "mean_token_accuracy": 0.9121852435171605, "num_tokens": 105083124.0, "step": 35860 }, { "entropy": 0.27798564322292807, "epoch": 0.552165772880071, "grad_norm": 0.6161877512931824, "learning_rate": 1.9572711668529905e-05, "loss": 0.2935, "mean_token_accuracy": 0.9120308570563793, "num_tokens": 105141896.0, "step": 35870 }, { "entropy": 0.2918962951749563, "epoch": 0.5523197081387495, "grad_norm": 0.7705641984939575, "learning_rate": 1.9572193542965452e-05, "loss": 0.2961, "mean_token_accuracy": 0.9116690002381802, "num_tokens": 105205230.0, "step": 35880 }, { "entropy": 0.31529578268527986, "epoch": 0.5524736433974281, "grad_norm": 0.7728886008262634, "learning_rate": 1.9571675110320643e-05, "loss": 0.3273, "mean_token_accuracy": 0.9033935017883777, "num_tokens": 105269978.0, "step": 35890 }, { "entropy": 0.29005241496488454, "epoch": 0.5526275786561067, "grad_norm": 0.6019885540008545, "learning_rate": 1.9571156370612106e-05, "loss": 0.2888, "mean_token_accuracy": 0.9116592168807983, "num_tokens": 105335258.0, "step": 35900 }, { "entropy": 0.29062107503414153, "epoch": 0.5527815139147852, "grad_norm": 0.9119720458984375, "learning_rate": 1.957063732385649e-05, "loss": 0.2996, "mean_token_accuracy": 0.9093862541019917, "num_tokens": 105395103.0, "step": 35910 }, { "entropy": 0.29656040463596584, "epoch": 0.5529354491734638, "grad_norm": 0.5563232898712158, "learning_rate": 1.9570117970070437e-05, "loss": 0.3113, "mean_token_accuracy": 0.908315546810627, "num_tokens": 105462739.0, "step": 35920 }, { "entropy": 0.2842743509449065, "epoch": 0.5530893844321425, "grad_norm": 0.6996023654937744, "learning_rate": 1.956959830927061e-05, "loss": 0.3009, "mean_token_accuracy": 0.9132235981523991, "num_tokens": 105518555.0, "step": 35930 }, { "entropy": 0.27832107730209826, "epoch": 0.5532433196908211, "grad_norm": 0.932039201259613, "learning_rate": 1.9569078341473684e-05, "loss": 0.282, "mean_token_accuracy": 0.9156088992953301, "num_tokens": 105582643.0, "step": 35940 }, { "entropy": 0.2803800476714969, "epoch": 0.5533972549494997, "grad_norm": 0.5972434282302856, "learning_rate": 1.956855806669634e-05, "loss": 0.2993, "mean_token_accuracy": 0.9103773660957813, "num_tokens": 105646767.0, "step": 35950 }, { "entropy": 0.28566415132954714, "epoch": 0.5535511902081782, "grad_norm": 0.5583222508430481, "learning_rate": 1.9568037484955264e-05, "loss": 0.2903, "mean_token_accuracy": 0.911376103758812, "num_tokens": 105718304.0, "step": 35960 }, { "entropy": 0.30165809392929077, "epoch": 0.5537051254668568, "grad_norm": 0.7719779014587402, "learning_rate": 1.956751659626716e-05, "loss": 0.298, "mean_token_accuracy": 0.90456418171525, "num_tokens": 105787331.0, "step": 35970 }, { "entropy": 0.2929904765449464, "epoch": 0.5538590607255354, "grad_norm": 0.6320728063583374, "learning_rate": 1.9566995400648733e-05, "loss": 0.2938, "mean_token_accuracy": 0.9108279332518577, "num_tokens": 105847531.0, "step": 35980 }, { "entropy": 0.2800653981976211, "epoch": 0.5540129959842139, "grad_norm": 0.493561714887619, "learning_rate": 1.9566473898116714e-05, "loss": 0.2828, "mean_token_accuracy": 0.9117098033428193, "num_tokens": 105921078.0, "step": 35990 }, { "entropy": 0.3044974334537983, "epoch": 0.5541669312428925, "grad_norm": 0.8501036763191223, "learning_rate": 1.956595208868782e-05, "loss": 0.3084, "mean_token_accuracy": 0.9039261557161808, "num_tokens": 105981797.0, "step": 36000 }, { "entropy": 0.275227889418602, "epoch": 0.5543208665015711, "grad_norm": 0.6367060542106628, "learning_rate": 1.95654299723788e-05, "loss": 0.2834, "mean_token_accuracy": 0.9163248717784882, "num_tokens": 106051243.0, "step": 36010 }, { "entropy": 0.27761797681450845, "epoch": 0.5544748017602497, "grad_norm": 0.5813433527946472, "learning_rate": 1.9564907549206404e-05, "loss": 0.2882, "mean_token_accuracy": 0.9114011354744435, "num_tokens": 106115171.0, "step": 36020 }, { "entropy": 0.2906686871312559, "epoch": 0.5546287370189282, "grad_norm": 0.579474687576294, "learning_rate": 1.9564384819187388e-05, "loss": 0.2899, "mean_token_accuracy": 0.909580347687006, "num_tokens": 106183478.0, "step": 36030 }, { "entropy": 0.30722976410761477, "epoch": 0.5547826722776068, "grad_norm": 0.7314605712890625, "learning_rate": 1.956386178233852e-05, "loss": 0.3183, "mean_token_accuracy": 0.9046873487532139, "num_tokens": 106250188.0, "step": 36040 }, { "entropy": 0.29211108256131413, "epoch": 0.5549366075362854, "grad_norm": 0.5387395620346069, "learning_rate": 1.956333843867658e-05, "loss": 0.2918, "mean_token_accuracy": 0.9114996433258057, "num_tokens": 106313668.0, "step": 36050 }, { "entropy": 0.28948224438354375, "epoch": 0.5550905427949641, "grad_norm": 0.7365321516990662, "learning_rate": 1.956281478821836e-05, "loss": 0.2997, "mean_token_accuracy": 0.9106793656945229, "num_tokens": 106375563.0, "step": 36060 }, { "entropy": 0.3001919365487993, "epoch": 0.5552444780536426, "grad_norm": 0.6113272905349731, "learning_rate": 1.956229083098066e-05, "loss": 0.3065, "mean_token_accuracy": 0.9073448918759823, "num_tokens": 106438311.0, "step": 36070 }, { "entropy": 0.3032921524718404, "epoch": 0.5553984133123212, "grad_norm": 0.8487753868103027, "learning_rate": 1.9561766566980284e-05, "loss": 0.3109, "mean_token_accuracy": 0.9071657374501229, "num_tokens": 106502882.0, "step": 36080 }, { "entropy": 0.29941190741956236, "epoch": 0.5555523485709998, "grad_norm": 0.5237177014350891, "learning_rate": 1.956124199623405e-05, "loss": 0.2992, "mean_token_accuracy": 0.9090417623519897, "num_tokens": 106567757.0, "step": 36090 }, { "entropy": 0.2925551953725517, "epoch": 0.5557062838296783, "grad_norm": 0.6408127546310425, "learning_rate": 1.9560717118758794e-05, "loss": 0.2927, "mean_token_accuracy": 0.9102263234555721, "num_tokens": 106637169.0, "step": 36100 }, { "entropy": 0.2761906853877008, "epoch": 0.5558602190883569, "grad_norm": 0.5350357294082642, "learning_rate": 1.9560191934571348e-05, "loss": 0.2857, "mean_token_accuracy": 0.9118722833693027, "num_tokens": 106708960.0, "step": 36110 }, { "entropy": 0.3002636290155351, "epoch": 0.5560141543470355, "grad_norm": 0.5839075446128845, "learning_rate": 1.9559666443688562e-05, "loss": 0.3055, "mean_token_accuracy": 0.9108301207423211, "num_tokens": 106771025.0, "step": 36120 }, { "entropy": 0.2801809534430504, "epoch": 0.5561680896057141, "grad_norm": 0.576659083366394, "learning_rate": 1.955914064612729e-05, "loss": 0.2842, "mean_token_accuracy": 0.9138025112450123, "num_tokens": 106849344.0, "step": 36130 }, { "entropy": 0.2862000958994031, "epoch": 0.5563220248643926, "grad_norm": 0.7718620896339417, "learning_rate": 1.955861454190441e-05, "loss": 0.2903, "mean_token_accuracy": 0.9098510220646858, "num_tokens": 106911150.0, "step": 36140 }, { "entropy": 0.28842987390235064, "epoch": 0.5564759601230712, "grad_norm": 0.7943069934844971, "learning_rate": 1.955808813103679e-05, "loss": 0.2829, "mean_token_accuracy": 0.9109243154525757, "num_tokens": 106972489.0, "step": 36150 }, { "entropy": 0.291287290584296, "epoch": 0.5566298953817498, "grad_norm": 0.5394967198371887, "learning_rate": 1.9557561413541316e-05, "loss": 0.3024, "mean_token_accuracy": 0.9106141857802867, "num_tokens": 107036291.0, "step": 36160 }, { "entropy": 0.3145131738856435, "epoch": 0.5567838306404284, "grad_norm": 0.6149186491966248, "learning_rate": 1.95570343894349e-05, "loss": 0.314, "mean_token_accuracy": 0.9037752211093902, "num_tokens": 107114007.0, "step": 36170 }, { "entropy": 0.2982744690030813, "epoch": 0.5569377658991069, "grad_norm": 0.7042515277862549, "learning_rate": 1.9556507058734434e-05, "loss": 0.3004, "mean_token_accuracy": 0.9096989341080188, "num_tokens": 107179708.0, "step": 36180 }, { "entropy": 0.27655803998932244, "epoch": 0.5570917011577856, "grad_norm": 0.7394461631774902, "learning_rate": 1.9555979421456838e-05, "loss": 0.2941, "mean_token_accuracy": 0.9130698435008526, "num_tokens": 107234946.0, "step": 36190 }, { "entropy": 0.2898248896934092, "epoch": 0.5572456364164642, "grad_norm": 0.5887390375137329, "learning_rate": 1.955545147761904e-05, "loss": 0.2908, "mean_token_accuracy": 0.910197801887989, "num_tokens": 107299991.0, "step": 36200 }, { "entropy": 0.306549262907356, "epoch": 0.5573995716751428, "grad_norm": 0.6359957456588745, "learning_rate": 1.9554923227237985e-05, "loss": 0.3059, "mean_token_accuracy": 0.9057110235095024, "num_tokens": 107375055.0, "step": 36210 }, { "entropy": 0.3002840818837285, "epoch": 0.5575535069338213, "grad_norm": 0.6398302316665649, "learning_rate": 1.9554394670330606e-05, "loss": 0.3047, "mean_token_accuracy": 0.9068161740899086, "num_tokens": 107442831.0, "step": 36220 }, { "entropy": 0.29113969709724186, "epoch": 0.5577074421924999, "grad_norm": 0.6163185834884644, "learning_rate": 1.955386580691387e-05, "loss": 0.2814, "mean_token_accuracy": 0.9110508397221565, "num_tokens": 107514176.0, "step": 36230 }, { "entropy": 0.29925884753465654, "epoch": 0.5578613774511785, "grad_norm": 0.5993199348449707, "learning_rate": 1.9553336637004738e-05, "loss": 0.2932, "mean_token_accuracy": 0.9092118747532367, "num_tokens": 107581201.0, "step": 36240 }, { "entropy": 0.28635796485468745, "epoch": 0.558015312709857, "grad_norm": 0.7634015679359436, "learning_rate": 1.9552807160620184e-05, "loss": 0.282, "mean_token_accuracy": 0.9110889695584774, "num_tokens": 107652106.0, "step": 36250 }, { "entropy": 0.27910331171005964, "epoch": 0.5581692479685356, "grad_norm": 0.6111934781074524, "learning_rate": 1.95522773777772e-05, "loss": 0.2874, "mean_token_accuracy": 0.914662217348814, "num_tokens": 107714958.0, "step": 36260 }, { "entropy": 0.29437997760251167, "epoch": 0.5583231832272142, "grad_norm": 0.6053867340087891, "learning_rate": 1.9551747288492774e-05, "loss": 0.2964, "mean_token_accuracy": 0.9090319126844406, "num_tokens": 107783250.0, "step": 36270 }, { "entropy": 0.29698745273053645, "epoch": 0.5584771184858928, "grad_norm": 0.6571268439292908, "learning_rate": 1.955121689278392e-05, "loss": 0.3099, "mean_token_accuracy": 0.9065494507551193, "num_tokens": 107848956.0, "step": 36280 }, { "entropy": 0.28862088788300755, "epoch": 0.5586310537445713, "grad_norm": 0.6670976281166077, "learning_rate": 1.955068619066765e-05, "loss": 0.2871, "mean_token_accuracy": 0.9103763192892075, "num_tokens": 107914919.0, "step": 36290 }, { "entropy": 0.2975349240936339, "epoch": 0.5587849890032499, "grad_norm": 0.6720621585845947, "learning_rate": 1.9550155182160983e-05, "loss": 0.2905, "mean_token_accuracy": 0.9094587609171867, "num_tokens": 107979786.0, "step": 36300 }, { "entropy": 0.2892061237245798, "epoch": 0.5589389242619286, "grad_norm": 0.5608441829681396, "learning_rate": 1.9549623867280964e-05, "loss": 0.2886, "mean_token_accuracy": 0.9113853670656681, "num_tokens": 108045468.0, "step": 36310 }, { "entropy": 0.3027749292552471, "epoch": 0.5590928595206072, "grad_norm": 0.7036764025688171, "learning_rate": 1.954909224604463e-05, "loss": 0.2959, "mean_token_accuracy": 0.9066360048949719, "num_tokens": 108112714.0, "step": 36320 }, { "entropy": 0.30453932797536254, "epoch": 0.5592467947792857, "grad_norm": 0.7427670359611511, "learning_rate": 1.9548560318469042e-05, "loss": 0.312, "mean_token_accuracy": 0.9075164034962654, "num_tokens": 108183744.0, "step": 36330 }, { "entropy": 0.26925253169611096, "epoch": 0.5594007300379643, "grad_norm": 0.5971004962921143, "learning_rate": 1.954802808457126e-05, "loss": 0.2739, "mean_token_accuracy": 0.9176491402089596, "num_tokens": 108248849.0, "step": 36340 }, { "entropy": 0.2841953566297889, "epoch": 0.5595546652966429, "grad_norm": 0.9223271012306213, "learning_rate": 1.954749554436836e-05, "loss": 0.2952, "mean_token_accuracy": 0.9113171584904194, "num_tokens": 108312576.0, "step": 36350 }, { "entropy": 0.289356211014092, "epoch": 0.5597086005553215, "grad_norm": 0.6696433424949646, "learning_rate": 1.9546962697877425e-05, "loss": 0.2874, "mean_token_accuracy": 0.9125031985342502, "num_tokens": 108373585.0, "step": 36360 }, { "entropy": 0.29213843336328865, "epoch": 0.559862535814, "grad_norm": 0.5328192710876465, "learning_rate": 1.954642954511555e-05, "loss": 0.2966, "mean_token_accuracy": 0.9095866680145264, "num_tokens": 108439104.0, "step": 36370 }, { "entropy": 0.276940299756825, "epoch": 0.5600164710726786, "grad_norm": 0.4960581660270691, "learning_rate": 1.954589608609984e-05, "loss": 0.281, "mean_token_accuracy": 0.9136816844344139, "num_tokens": 108502524.0, "step": 36380 }, { "entropy": 0.2850562876090407, "epoch": 0.5601704063313572, "grad_norm": 0.6868286728858948, "learning_rate": 1.9545362320847404e-05, "loss": 0.28, "mean_token_accuracy": 0.9126864962279797, "num_tokens": 108562444.0, "step": 36390 }, { "entropy": 0.293749225884676, "epoch": 0.5603243415900357, "grad_norm": 0.9692115187644958, "learning_rate": 1.9544828249375373e-05, "loss": 0.3101, "mean_token_accuracy": 0.908904080837965, "num_tokens": 108633843.0, "step": 36400 }, { "entropy": 0.2791544203646481, "epoch": 0.5604782768487143, "grad_norm": 0.84964919090271, "learning_rate": 1.954429387170087e-05, "loss": 0.2864, "mean_token_accuracy": 0.9156945541501045, "num_tokens": 108693921.0, "step": 36410 }, { "entropy": 0.27798299100250007, "epoch": 0.5606322121073929, "grad_norm": 0.4892924427986145, "learning_rate": 1.9543759187841047e-05, "loss": 0.2862, "mean_token_accuracy": 0.914737218618393, "num_tokens": 108762615.0, "step": 36420 }, { "entropy": 0.29806943209841846, "epoch": 0.5607861473660715, "grad_norm": 0.4817202091217041, "learning_rate": 1.9543224197813054e-05, "loss": 0.3032, "mean_token_accuracy": 0.9075657606124878, "num_tokens": 108831291.0, "step": 36430 }, { "entropy": 0.2827323509380221, "epoch": 0.5609400826247501, "grad_norm": 0.5535596013069153, "learning_rate": 1.9542688901634052e-05, "loss": 0.2883, "mean_token_accuracy": 0.9143310599029064, "num_tokens": 108892193.0, "step": 36440 }, { "entropy": 0.2736834155395627, "epoch": 0.5610940178834287, "grad_norm": 0.6820367574691772, "learning_rate": 1.954215329932122e-05, "loss": 0.278, "mean_token_accuracy": 0.9148912876844406, "num_tokens": 108962273.0, "step": 36450 }, { "entropy": 0.2852375714108348, "epoch": 0.5612479531421073, "grad_norm": 0.5306121706962585, "learning_rate": 1.954161739089173e-05, "loss": 0.2955, "mean_token_accuracy": 0.9114483281970024, "num_tokens": 109024115.0, "step": 36460 }, { "entropy": 0.25839940505102277, "epoch": 0.5614018884007859, "grad_norm": 0.6133040189743042, "learning_rate": 1.9541081176362783e-05, "loss": 0.2687, "mean_token_accuracy": 0.9183049879968166, "num_tokens": 109079620.0, "step": 36470 }, { "entropy": 0.30884548388421534, "epoch": 0.5615558236594644, "grad_norm": 0.5403277277946472, "learning_rate": 1.9540544655751573e-05, "loss": 0.2982, "mean_token_accuracy": 0.9051696971058846, "num_tokens": 109156041.0, "step": 36480 }, { "entropy": 0.29521121010184287, "epoch": 0.561709758918143, "grad_norm": 0.5474687814712524, "learning_rate": 1.9540007829075322e-05, "loss": 0.3108, "mean_token_accuracy": 0.9076200135052204, "num_tokens": 109224944.0, "step": 36490 }, { "entropy": 0.2956853903830051, "epoch": 0.5618636941768216, "grad_norm": 0.6068357825279236, "learning_rate": 1.9539470696351244e-05, "loss": 0.286, "mean_token_accuracy": 0.9095495149493218, "num_tokens": 109287106.0, "step": 36500 }, { "entropy": 0.29802684802561996, "epoch": 0.5620176294355002, "grad_norm": 0.6329134702682495, "learning_rate": 1.953893325759657e-05, "loss": 0.3027, "mean_token_accuracy": 0.9076251447200775, "num_tokens": 109346693.0, "step": 36510 }, { "entropy": 0.2955785904079676, "epoch": 0.5621715646941787, "grad_norm": 1.0742098093032837, "learning_rate": 1.9538395512828548e-05, "loss": 0.2943, "mean_token_accuracy": 0.9060393616557121, "num_tokens": 109399459.0, "step": 36520 }, { "entropy": 0.31309030912816527, "epoch": 0.5623254999528573, "grad_norm": 0.7339901924133301, "learning_rate": 1.9537857462064422e-05, "loss": 0.3016, "mean_token_accuracy": 0.9054211936891079, "num_tokens": 109467637.0, "step": 36530 }, { "entropy": 0.2982602007687092, "epoch": 0.5624794352115359, "grad_norm": 0.6354732513427734, "learning_rate": 1.9537319105321458e-05, "loss": 0.2906, "mean_token_accuracy": 0.9082027778029442, "num_tokens": 109535362.0, "step": 36540 }, { "entropy": 0.2924905691295862, "epoch": 0.5626333704702144, "grad_norm": 0.5571328997612, "learning_rate": 1.9536780442616922e-05, "loss": 0.2966, "mean_token_accuracy": 0.9117012299597264, "num_tokens": 109594528.0, "step": 36550 }, { "entropy": 0.28998121712356806, "epoch": 0.562787305728893, "grad_norm": 0.6077293753623962, "learning_rate": 1.9536241473968097e-05, "loss": 0.2916, "mean_token_accuracy": 0.9083847090601921, "num_tokens": 109662795.0, "step": 36560 }, { "entropy": 0.2842296415939927, "epoch": 0.5629412409875717, "grad_norm": 0.5771117210388184, "learning_rate": 1.9535702199392276e-05, "loss": 0.2832, "mean_token_accuracy": 0.9122479975223541, "num_tokens": 109728197.0, "step": 36570 }, { "entropy": 0.2793910690583289, "epoch": 0.5630951762462503, "grad_norm": 0.5848639011383057, "learning_rate": 1.9535162618906756e-05, "loss": 0.283, "mean_token_accuracy": 0.9143567122519016, "num_tokens": 109798638.0, "step": 36580 }, { "entropy": 0.28241594610735776, "epoch": 0.5632491115049288, "grad_norm": 0.6888241767883301, "learning_rate": 1.953462273252885e-05, "loss": 0.2794, "mean_token_accuracy": 0.9112820632755756, "num_tokens": 109867460.0, "step": 36590 }, { "entropy": 0.28716467302292586, "epoch": 0.5634030467636074, "grad_norm": 0.7355326414108276, "learning_rate": 1.9534082540275873e-05, "loss": 0.291, "mean_token_accuracy": 0.9113353677093983, "num_tokens": 109932652.0, "step": 36600 }, { "entropy": 0.2845652062445879, "epoch": 0.563556982022286, "grad_norm": 0.5972771644592285, "learning_rate": 1.9533542042165155e-05, "loss": 0.2799, "mean_token_accuracy": 0.9114854946732521, "num_tokens": 109999762.0, "step": 36610 }, { "entropy": 0.27955517200753094, "epoch": 0.5637109172809646, "grad_norm": 0.8676490187644958, "learning_rate": 1.953300123821404e-05, "loss": 0.2938, "mean_token_accuracy": 0.911307618767023, "num_tokens": 110066548.0, "step": 36620 }, { "entropy": 0.2887376746162772, "epoch": 0.5638648525396431, "grad_norm": 0.6157773733139038, "learning_rate": 1.9532460128439875e-05, "loss": 0.3063, "mean_token_accuracy": 0.9116196103394032, "num_tokens": 110133527.0, "step": 36630 }, { "entropy": 0.2952212809585035, "epoch": 0.5640187877983217, "grad_norm": 0.8619616031646729, "learning_rate": 1.9531918712860016e-05, "loss": 0.2874, "mean_token_accuracy": 0.9089381583034992, "num_tokens": 110197408.0, "step": 36640 }, { "entropy": 0.2909796852618456, "epoch": 0.5641727230570003, "grad_norm": 0.685164749622345, "learning_rate": 1.953137699149184e-05, "loss": 0.2953, "mean_token_accuracy": 0.9111698269844055, "num_tokens": 110258917.0, "step": 36650 }, { "entropy": 0.2818409857340157, "epoch": 0.5643266583156789, "grad_norm": 0.5387259125709534, "learning_rate": 1.9530834964352715e-05, "loss": 0.2919, "mean_token_accuracy": 0.9113919325172901, "num_tokens": 110332323.0, "step": 36660 }, { "entropy": 0.2955139484256506, "epoch": 0.5644805935743574, "grad_norm": 0.5385759472846985, "learning_rate": 1.9530292631460036e-05, "loss": 0.2937, "mean_token_accuracy": 0.9085373736917972, "num_tokens": 110396160.0, "step": 36670 }, { "entropy": 0.28489135364070534, "epoch": 0.564634528833036, "grad_norm": 0.7210793495178223, "learning_rate": 1.95297499928312e-05, "loss": 0.2921, "mean_token_accuracy": 0.9128208577632904, "num_tokens": 110454659.0, "step": 36680 }, { "entropy": 0.28876354834064843, "epoch": 0.5647884640917147, "grad_norm": 0.8143342733383179, "learning_rate": 1.952920704848362e-05, "loss": 0.2953, "mean_token_accuracy": 0.9098890118300915, "num_tokens": 110512985.0, "step": 36690 }, { "entropy": 0.28980019241571425, "epoch": 0.5649423993503933, "grad_norm": 0.7670053839683533, "learning_rate": 1.9528663798434703e-05, "loss": 0.2897, "mean_token_accuracy": 0.9112382680177689, "num_tokens": 110583605.0, "step": 36700 }, { "entropy": 0.28402344500645993, "epoch": 0.5650963346090718, "grad_norm": 0.6093108057975769, "learning_rate": 1.9528120242701883e-05, "loss": 0.2806, "mean_token_accuracy": 0.9144080065190792, "num_tokens": 110650286.0, "step": 36710 }, { "entropy": 0.2843106960877776, "epoch": 0.5652502698677504, "grad_norm": 0.8618741035461426, "learning_rate": 1.9527576381302605e-05, "loss": 0.2961, "mean_token_accuracy": 0.9125444039702415, "num_tokens": 110709572.0, "step": 36720 }, { "entropy": 0.28895770413801075, "epoch": 0.565404205126429, "grad_norm": 0.610744297504425, "learning_rate": 1.9527032214254303e-05, "loss": 0.3033, "mean_token_accuracy": 0.9116525806486606, "num_tokens": 110771249.0, "step": 36730 }, { "entropy": 0.30443105231970546, "epoch": 0.5655581403851075, "grad_norm": 0.5748558640480042, "learning_rate": 1.952648774157444e-05, "loss": 0.2983, "mean_token_accuracy": 0.9061632096767426, "num_tokens": 110843535.0, "step": 36740 }, { "entropy": 0.28930548266507683, "epoch": 0.5657120756437861, "grad_norm": 0.5078638792037964, "learning_rate": 1.952594296328048e-05, "loss": 0.3031, "mean_token_accuracy": 0.9118295751512051, "num_tokens": 110907603.0, "step": 36750 }, { "entropy": 0.3025862482376397, "epoch": 0.5658660109024647, "grad_norm": 0.5892511010169983, "learning_rate": 1.9525397879389904e-05, "loss": 0.3089, "mean_token_accuracy": 0.9046698942780494, "num_tokens": 110971061.0, "step": 36760 }, { "entropy": 0.278075338806957, "epoch": 0.5660199461611433, "grad_norm": 0.6217501759529114, "learning_rate": 1.95248524899202e-05, "loss": 0.2855, "mean_token_accuracy": 0.9132081210613251, "num_tokens": 111040948.0, "step": 36770 }, { "entropy": 0.29672068525105716, "epoch": 0.5661738814198218, "grad_norm": 0.561393678188324, "learning_rate": 1.952430679488886e-05, "loss": 0.2944, "mean_token_accuracy": 0.9085307657718659, "num_tokens": 111105784.0, "step": 36780 }, { "entropy": 0.28322089137509465, "epoch": 0.5663278166785004, "grad_norm": 0.6685360074043274, "learning_rate": 1.952376079431339e-05, "loss": 0.2921, "mean_token_accuracy": 0.9117575533688068, "num_tokens": 111169358.0, "step": 36790 }, { "entropy": 0.3071655359119177, "epoch": 0.566481751937179, "grad_norm": 0.7701115608215332, "learning_rate": 1.952321448821131e-05, "loss": 0.3073, "mean_token_accuracy": 0.9056446932256221, "num_tokens": 111236317.0, "step": 36800 }, { "entropy": 0.26852561701089145, "epoch": 0.5666356871958576, "grad_norm": 0.6617115139961243, "learning_rate": 1.9522667876600142e-05, "loss": 0.2844, "mean_token_accuracy": 0.9157200247049332, "num_tokens": 111302679.0, "step": 36810 }, { "entropy": 0.29010720839723947, "epoch": 0.5667896224545362, "grad_norm": 0.6435924768447876, "learning_rate": 1.952212095949742e-05, "loss": 0.2992, "mean_token_accuracy": 0.9104262232780457, "num_tokens": 111363940.0, "step": 36820 }, { "entropy": 0.27808599835261705, "epoch": 0.5669435577132148, "grad_norm": 0.6079960465431213, "learning_rate": 1.9521573736920696e-05, "loss": 0.284, "mean_token_accuracy": 0.9150475218892098, "num_tokens": 111432793.0, "step": 36830 }, { "entropy": 0.29330156836658716, "epoch": 0.5670974929718934, "grad_norm": 0.5336328744888306, "learning_rate": 1.9521026208887516e-05, "loss": 0.2917, "mean_token_accuracy": 0.9088428594172001, "num_tokens": 111504293.0, "step": 36840 }, { "entropy": 0.28166524348780514, "epoch": 0.567251428230572, "grad_norm": 0.7984206080436707, "learning_rate": 1.9520478375415455e-05, "loss": 0.2935, "mean_token_accuracy": 0.9121942736208439, "num_tokens": 111570519.0, "step": 36850 }, { "entropy": 0.28470653723925354, "epoch": 0.5674053634892505, "grad_norm": 0.8325445055961609, "learning_rate": 1.9519930236522083e-05, "loss": 0.289, "mean_token_accuracy": 0.9119689017534256, "num_tokens": 111636146.0, "step": 36860 }, { "entropy": 0.28027195343747735, "epoch": 0.5675592987479291, "grad_norm": 0.5395911931991577, "learning_rate": 1.9519381792224982e-05, "loss": 0.296, "mean_token_accuracy": 0.913027023524046, "num_tokens": 111706365.0, "step": 36870 }, { "entropy": 0.27767023546621206, "epoch": 0.5677132340066077, "grad_norm": 0.4993591904640198, "learning_rate": 1.951883304254175e-05, "loss": 0.288, "mean_token_accuracy": 0.913290137797594, "num_tokens": 111784092.0, "step": 36880 }, { "entropy": 0.2925655216909945, "epoch": 0.5678671692652862, "grad_norm": 0.6380429863929749, "learning_rate": 1.951828398748999e-05, "loss": 0.2933, "mean_token_accuracy": 0.9122245244681835, "num_tokens": 111846397.0, "step": 36890 }, { "entropy": 0.2907050454989076, "epoch": 0.5680211045239648, "grad_norm": 0.637654721736908, "learning_rate": 1.9517734627087318e-05, "loss": 0.3036, "mean_token_accuracy": 0.9083958975970745, "num_tokens": 111915917.0, "step": 36900 }, { "entropy": 0.3051985607482493, "epoch": 0.5681750397826434, "grad_norm": 0.6193662285804749, "learning_rate": 1.9517184961351353e-05, "loss": 0.3012, "mean_token_accuracy": 0.9052084438502789, "num_tokens": 111983564.0, "step": 36910 }, { "entropy": 0.2891208314336836, "epoch": 0.568328975041322, "grad_norm": 0.6182016730308533, "learning_rate": 1.951663499029973e-05, "loss": 0.2953, "mean_token_accuracy": 0.9127513639628887, "num_tokens": 112059227.0, "step": 36920 }, { "entropy": 0.2718375545926392, "epoch": 0.5684829103000005, "grad_norm": 0.5501700639724731, "learning_rate": 1.9516084713950096e-05, "loss": 0.2822, "mean_token_accuracy": 0.9156343169510365, "num_tokens": 112127993.0, "step": 36930 }, { "entropy": 0.2928925732150674, "epoch": 0.5686368455586791, "grad_norm": 0.6638200879096985, "learning_rate": 1.95155341323201e-05, "loss": 0.2942, "mean_token_accuracy": 0.9089538492262363, "num_tokens": 112192599.0, "step": 36940 }, { "entropy": 0.29099737014621496, "epoch": 0.5687907808173578, "grad_norm": 0.5957184433937073, "learning_rate": 1.951498324542741e-05, "loss": 0.2848, "mean_token_accuracy": 0.9113623410463333, "num_tokens": 112253608.0, "step": 36950 }, { "entropy": 0.29812668152153493, "epoch": 0.5689447160760364, "grad_norm": 0.5155591368675232, "learning_rate": 1.951443205328969e-05, "loss": 0.2961, "mean_token_accuracy": 0.9069598183035851, "num_tokens": 112334838.0, "step": 36960 }, { "entropy": 0.270058820117265, "epoch": 0.5690986513347149, "grad_norm": 0.611397922039032, "learning_rate": 1.9513880555924628e-05, "loss": 0.2785, "mean_token_accuracy": 0.9152816653251648, "num_tokens": 112397072.0, "step": 36970 }, { "entropy": 0.28740600049495696, "epoch": 0.5692525865933935, "grad_norm": 0.5952185392379761, "learning_rate": 1.951332875334992e-05, "loss": 0.2942, "mean_token_accuracy": 0.9113309107720852, "num_tokens": 112458173.0, "step": 36980 }, { "entropy": 0.2750930259935558, "epoch": 0.5694065218520721, "grad_norm": 0.567246675491333, "learning_rate": 1.9512776645583266e-05, "loss": 0.2784, "mean_token_accuracy": 0.9157406397163868, "num_tokens": 112524646.0, "step": 36990 }, { "entropy": 0.29360212115570905, "epoch": 0.5695604571107507, "grad_norm": 0.5308836698532104, "learning_rate": 1.951222423264237e-05, "loss": 0.3011, "mean_token_accuracy": 0.9092645518481731, "num_tokens": 112595156.0, "step": 37000 }, { "entropy": 0.3137119787745178, "epoch": 0.5697143923694292, "grad_norm": 0.6121848225593567, "learning_rate": 1.9511671514544962e-05, "loss": 0.3147, "mean_token_accuracy": 0.9036274626851082, "num_tokens": 112667532.0, "step": 37010 }, { "entropy": 0.2861791592091322, "epoch": 0.5698683276281078, "grad_norm": 0.7486041784286499, "learning_rate": 1.951111849130877e-05, "loss": 0.2873, "mean_token_accuracy": 0.9129540152847767, "num_tokens": 112731084.0, "step": 37020 }, { "entropy": 0.2748860871419311, "epoch": 0.5700222628867864, "grad_norm": 0.7500281929969788, "learning_rate": 1.9510565162951538e-05, "loss": 0.282, "mean_token_accuracy": 0.9125697761774063, "num_tokens": 112793424.0, "step": 37030 }, { "entropy": 0.3200145438313484, "epoch": 0.5701761981454649, "grad_norm": 0.5467698574066162, "learning_rate": 1.9510011529491013e-05, "loss": 0.3134, "mean_token_accuracy": 0.9027153931558132, "num_tokens": 112861523.0, "step": 37040 }, { "entropy": 0.291383424308151, "epoch": 0.5703301334041435, "grad_norm": 0.5785459280014038, "learning_rate": 1.9509457590944962e-05, "loss": 0.2986, "mean_token_accuracy": 0.910470712184906, "num_tokens": 112925750.0, "step": 37050 }, { "entropy": 0.3013216136023402, "epoch": 0.5704840686628221, "grad_norm": 0.5989560484886169, "learning_rate": 1.950890334733115e-05, "loss": 0.2997, "mean_token_accuracy": 0.9067268334329128, "num_tokens": 112990567.0, "step": 37060 }, { "entropy": 0.30896682683378457, "epoch": 0.5706380039215008, "grad_norm": 0.6413235664367676, "learning_rate": 1.9508348798667355e-05, "loss": 0.2998, "mean_token_accuracy": 0.9047558002173901, "num_tokens": 113053873.0, "step": 37070 }, { "entropy": 0.29501158772036434, "epoch": 0.5707919391801793, "grad_norm": 0.6390897631645203, "learning_rate": 1.9507793944971376e-05, "loss": 0.296, "mean_token_accuracy": 0.9095269799232483, "num_tokens": 113119397.0, "step": 37080 }, { "entropy": 0.30546115469187496, "epoch": 0.5709458744388579, "grad_norm": 0.6606222987174988, "learning_rate": 1.950723878626101e-05, "loss": 0.2977, "mean_token_accuracy": 0.904750120639801, "num_tokens": 113200720.0, "step": 37090 }, { "entropy": 0.30093725863844156, "epoch": 0.5710998096975365, "grad_norm": 0.5772372484207153, "learning_rate": 1.950668332255406e-05, "loss": 0.3067, "mean_token_accuracy": 0.9082447424530983, "num_tokens": 113268119.0, "step": 37100 }, { "entropy": 0.30533154485747216, "epoch": 0.5712537449562151, "grad_norm": 0.6526159048080444, "learning_rate": 1.9506127553868356e-05, "loss": 0.3019, "mean_token_accuracy": 0.9054891884326934, "num_tokens": 113330924.0, "step": 37110 }, { "entropy": 0.2806809534318745, "epoch": 0.5714076802148936, "grad_norm": 0.6615791320800781, "learning_rate": 1.9505571480221713e-05, "loss": 0.2748, "mean_token_accuracy": 0.913319093734026, "num_tokens": 113401546.0, "step": 37120 }, { "entropy": 0.2813155702315271, "epoch": 0.5715616154735722, "grad_norm": 0.7523384094238281, "learning_rate": 1.9505015101631988e-05, "loss": 0.2771, "mean_token_accuracy": 0.9117581255733966, "num_tokens": 113462738.0, "step": 37130 }, { "entropy": 0.2870593781583011, "epoch": 0.5717155507322508, "grad_norm": 0.693594753742218, "learning_rate": 1.9504458418117016e-05, "loss": 0.2896, "mean_token_accuracy": 0.9091579891741276, "num_tokens": 113526629.0, "step": 37140 }, { "entropy": 0.28293189182877543, "epoch": 0.5718694859909293, "grad_norm": 0.6697568297386169, "learning_rate": 1.950390142969466e-05, "loss": 0.2848, "mean_token_accuracy": 0.9122418835759163, "num_tokens": 113596179.0, "step": 37150 }, { "entropy": 0.277377547044307, "epoch": 0.5720234212496079, "grad_norm": 0.4652498662471771, "learning_rate": 1.9503344136382795e-05, "loss": 0.2807, "mean_token_accuracy": 0.9139702886343002, "num_tokens": 113670563.0, "step": 37160 }, { "entropy": 0.28805197700858115, "epoch": 0.5721773565082865, "grad_norm": 0.6515666842460632, "learning_rate": 1.950278653819929e-05, "loss": 0.2851, "mean_token_accuracy": 0.9116563074290752, "num_tokens": 113739493.0, "step": 37170 }, { "entropy": 0.3036094038747251, "epoch": 0.5723312917669651, "grad_norm": 0.721257746219635, "learning_rate": 1.9502228635162032e-05, "loss": 0.3109, "mean_token_accuracy": 0.9069693781435489, "num_tokens": 113812567.0, "step": 37180 }, { "entropy": 0.2815349789336324, "epoch": 0.5724852270256436, "grad_norm": 0.6967387199401855, "learning_rate": 1.950167042728893e-05, "loss": 0.2926, "mean_token_accuracy": 0.9131901033222676, "num_tokens": 113876347.0, "step": 37190 }, { "entropy": 0.27290188213810324, "epoch": 0.5726391622843223, "grad_norm": 0.5349797010421753, "learning_rate": 1.9501111914597878e-05, "loss": 0.2766, "mean_token_accuracy": 0.9163598351180553, "num_tokens": 113943469.0, "step": 37200 }, { "entropy": 0.2793090142775327, "epoch": 0.5727930975430009, "grad_norm": 0.6682000756263733, "learning_rate": 1.9500553097106804e-05, "loss": 0.2777, "mean_token_accuracy": 0.914003549516201, "num_tokens": 114006560.0, "step": 37210 }, { "entropy": 0.29759387504309415, "epoch": 0.5729470328016795, "grad_norm": 0.5989261865615845, "learning_rate": 1.9499993974833628e-05, "loss": 0.3128, "mean_token_accuracy": 0.9093449831008911, "num_tokens": 114075845.0, "step": 37220 }, { "entropy": 0.2995797802694142, "epoch": 0.573100968060358, "grad_norm": 0.5956131219863892, "learning_rate": 1.9499434547796293e-05, "loss": 0.294, "mean_token_accuracy": 0.9083019219338894, "num_tokens": 114148871.0, "step": 37230 }, { "entropy": 0.2854191939346492, "epoch": 0.5732549033190366, "grad_norm": 0.5264977812767029, "learning_rate": 1.949887481601274e-05, "loss": 0.288, "mean_token_accuracy": 0.910671093314886, "num_tokens": 114219504.0, "step": 37240 }, { "entropy": 0.28879986619576814, "epoch": 0.5734088385777152, "grad_norm": 0.6553689241409302, "learning_rate": 1.9498314779500932e-05, "loss": 0.2859, "mean_token_accuracy": 0.9107181757688523, "num_tokens": 114279688.0, "step": 37250 }, { "entropy": 0.2894633638672531, "epoch": 0.5735627738363938, "grad_norm": 0.5594285726547241, "learning_rate": 1.9497754438278827e-05, "loss": 0.3021, "mean_token_accuracy": 0.9108983501791954, "num_tokens": 114342804.0, "step": 37260 }, { "entropy": 0.29570840299129486, "epoch": 0.5737167090950723, "grad_norm": 0.6979062557220459, "learning_rate": 1.9497193792364405e-05, "loss": 0.302, "mean_token_accuracy": 0.9092586271464824, "num_tokens": 114411682.0, "step": 37270 }, { "entropy": 0.27750099040567877, "epoch": 0.5738706443537509, "grad_norm": 0.6460676789283752, "learning_rate": 1.9496632841775654e-05, "loss": 0.286, "mean_token_accuracy": 0.9137863084673882, "num_tokens": 114472016.0, "step": 37280 }, { "entropy": 0.2862785396166146, "epoch": 0.5740245796124295, "grad_norm": 0.8560227155685425, "learning_rate": 1.949607158653057e-05, "loss": 0.2875, "mean_token_accuracy": 0.9113840237259865, "num_tokens": 114539420.0, "step": 37290 }, { "entropy": 0.3032057502306998, "epoch": 0.574178514871108, "grad_norm": 0.7055937647819519, "learning_rate": 1.9495510026647152e-05, "loss": 0.3046, "mean_token_accuracy": 0.9062136344611644, "num_tokens": 114607551.0, "step": 37300 }, { "entropy": 0.3012864312157035, "epoch": 0.5743324501297866, "grad_norm": 0.6277341842651367, "learning_rate": 1.949494816214342e-05, "loss": 0.2925, "mean_token_accuracy": 0.9097641758620739, "num_tokens": 114664462.0, "step": 37310 }, { "entropy": 0.30520568899810313, "epoch": 0.5744863853884652, "grad_norm": 0.6836447715759277, "learning_rate": 1.94943859930374e-05, "loss": 0.3124, "mean_token_accuracy": 0.9074530132114887, "num_tokens": 114730725.0, "step": 37320 }, { "entropy": 0.2966282984241843, "epoch": 0.5746403206471439, "grad_norm": 0.7445089817047119, "learning_rate": 1.949382351934712e-05, "loss": 0.2952, "mean_token_accuracy": 0.9078993976116181, "num_tokens": 114795812.0, "step": 37330 }, { "entropy": 0.27937975628301503, "epoch": 0.5747942559058224, "grad_norm": 0.6437022089958191, "learning_rate": 1.9493260741090633e-05, "loss": 0.2869, "mean_token_accuracy": 0.9131102122366428, "num_tokens": 114860678.0, "step": 37340 }, { "entropy": 0.28411452313885094, "epoch": 0.574948191164501, "grad_norm": 0.6376534104347229, "learning_rate": 1.9492697658285986e-05, "loss": 0.2865, "mean_token_accuracy": 0.9121372237801552, "num_tokens": 114928589.0, "step": 37350 }, { "entropy": 0.2855631113052368, "epoch": 0.5751021264231796, "grad_norm": 0.574272096157074, "learning_rate": 1.949213427095125e-05, "loss": 0.2919, "mean_token_accuracy": 0.9145797491073608, "num_tokens": 115001505.0, "step": 37360 }, { "entropy": 0.29628875507041813, "epoch": 0.5752560616818582, "grad_norm": 0.6764696836471558, "learning_rate": 1.949157057910449e-05, "loss": 0.2983, "mean_token_accuracy": 0.9087736546993256, "num_tokens": 115067466.0, "step": 37370 }, { "entropy": 0.29706004103645683, "epoch": 0.5754099969405367, "grad_norm": 0.6028088331222534, "learning_rate": 1.9491006582763797e-05, "loss": 0.307, "mean_token_accuracy": 0.9089762911200523, "num_tokens": 115137480.0, "step": 37380 }, { "entropy": 0.2877296349965036, "epoch": 0.5755639321992153, "grad_norm": 0.564656138420105, "learning_rate": 1.9490442281947263e-05, "loss": 0.2995, "mean_token_accuracy": 0.9114254012703895, "num_tokens": 115203730.0, "step": 37390 }, { "entropy": 0.2929312936961651, "epoch": 0.5757178674578939, "grad_norm": 0.5370362997055054, "learning_rate": 1.948987767667299e-05, "loss": 0.2875, "mean_token_accuracy": 0.9111482255160809, "num_tokens": 115275864.0, "step": 37400 }, { "entropy": 0.2875267186202109, "epoch": 0.5758718027165725, "grad_norm": 0.6494754552841187, "learning_rate": 1.9489312766959087e-05, "loss": 0.2921, "mean_token_accuracy": 0.9074379809200763, "num_tokens": 115337356.0, "step": 37410 }, { "entropy": 0.29486233880743384, "epoch": 0.576025737975251, "grad_norm": 0.8794815540313721, "learning_rate": 1.948874755282368e-05, "loss": 0.2928, "mean_token_accuracy": 0.9078986436128617, "num_tokens": 115399221.0, "step": 37420 }, { "entropy": 0.28249186407774685, "epoch": 0.5761796732339296, "grad_norm": 0.619581937789917, "learning_rate": 1.9488182034284904e-05, "loss": 0.2885, "mean_token_accuracy": 0.9103047668933868, "num_tokens": 115473107.0, "step": 37430 }, { "entropy": 0.28787054102867843, "epoch": 0.5763336084926082, "grad_norm": 0.6462114453315735, "learning_rate": 1.9487616211360897e-05, "loss": 0.2795, "mean_token_accuracy": 0.9102754101157189, "num_tokens": 115533877.0, "step": 37440 }, { "entropy": 0.28888137713074685, "epoch": 0.5764875437512869, "grad_norm": 0.6420550346374512, "learning_rate": 1.9487050084069814e-05, "loss": 0.2847, "mean_token_accuracy": 0.9114627577364445, "num_tokens": 115587256.0, "step": 37450 }, { "entropy": 0.27823731023818254, "epoch": 0.5766414790099654, "grad_norm": 0.7436168193817139, "learning_rate": 1.9486483652429812e-05, "loss": 0.2725, "mean_token_accuracy": 0.9127557344734669, "num_tokens": 115649028.0, "step": 37460 }, { "entropy": 0.29039759151637556, "epoch": 0.576795414268644, "grad_norm": 0.673064112663269, "learning_rate": 1.9485916916459064e-05, "loss": 0.2944, "mean_token_accuracy": 0.9103914275765419, "num_tokens": 115710659.0, "step": 37470 }, { "entropy": 0.2838738231919706, "epoch": 0.5769493495273226, "grad_norm": 0.5586153864860535, "learning_rate": 1.9485349876175755e-05, "loss": 0.2952, "mean_token_accuracy": 0.9130788214504719, "num_tokens": 115785061.0, "step": 37480 }, { "entropy": 0.30268556885421277, "epoch": 0.5771032847860011, "grad_norm": 0.578583300113678, "learning_rate": 1.9484782531598075e-05, "loss": 0.3111, "mean_token_accuracy": 0.9061085000634194, "num_tokens": 115849420.0, "step": 37490 }, { "entropy": 0.2947486197575927, "epoch": 0.5772572200446797, "grad_norm": 0.5279681086540222, "learning_rate": 1.9484214882744223e-05, "loss": 0.3049, "mean_token_accuracy": 0.9087418720126152, "num_tokens": 115915434.0, "step": 37500 }, { "entropy": 0.28068300066515806, "epoch": 0.5774111553033583, "grad_norm": 0.7981143593788147, "learning_rate": 1.9483646929632405e-05, "loss": 0.2778, "mean_token_accuracy": 0.9130816102027893, "num_tokens": 115980720.0, "step": 37510 }, { "entropy": 0.2961070341989398, "epoch": 0.5775650905620369, "grad_norm": 0.6279017925262451, "learning_rate": 1.9483078672280848e-05, "loss": 0.3076, "mean_token_accuracy": 0.909561176598072, "num_tokens": 116047086.0, "step": 37520 }, { "entropy": 0.29322096053510904, "epoch": 0.5777190258207154, "grad_norm": 0.6808649301528931, "learning_rate": 1.9482510110707778e-05, "loss": 0.292, "mean_token_accuracy": 0.9109172210097313, "num_tokens": 116106213.0, "step": 37530 }, { "entropy": 0.2783852989785373, "epoch": 0.577872961079394, "grad_norm": 0.818078875541687, "learning_rate": 1.9481941244931438e-05, "loss": 0.2834, "mean_token_accuracy": 0.90985262170434, "num_tokens": 116172060.0, "step": 37540 }, { "entropy": 0.31154611203819516, "epoch": 0.5780268963380726, "grad_norm": 0.7264761328697205, "learning_rate": 1.9481372074970076e-05, "loss": 0.3116, "mean_token_accuracy": 0.905917576700449, "num_tokens": 116243066.0, "step": 37550 }, { "entropy": 0.29841599743813274, "epoch": 0.5781808315967512, "grad_norm": 0.5668660402297974, "learning_rate": 1.9480802600841954e-05, "loss": 0.3032, "mean_token_accuracy": 0.9104308724403382, "num_tokens": 116315502.0, "step": 37560 }, { "entropy": 0.29868229953572156, "epoch": 0.5783347668554297, "grad_norm": 0.6770445108413696, "learning_rate": 1.948023282256533e-05, "loss": 0.3096, "mean_token_accuracy": 0.9065957345068455, "num_tokens": 116380162.0, "step": 37570 }, { "entropy": 0.3056383829563856, "epoch": 0.5784887021141084, "grad_norm": 0.5481095314025879, "learning_rate": 1.94796627401585e-05, "loss": 0.3057, "mean_token_accuracy": 0.9076859027147293, "num_tokens": 116450117.0, "step": 37580 }, { "entropy": 0.28614425361156465, "epoch": 0.578642637372787, "grad_norm": 0.5899983048439026, "learning_rate": 1.9479092353639734e-05, "loss": 0.2927, "mean_token_accuracy": 0.9138260826468467, "num_tokens": 116507721.0, "step": 37590 }, { "entropy": 0.28045619940385225, "epoch": 0.5787965726314656, "grad_norm": 0.5631558895111084, "learning_rate": 1.9478521663027346e-05, "loss": 0.291, "mean_token_accuracy": 0.9129706591367721, "num_tokens": 116573877.0, "step": 37600 }, { "entropy": 0.29762435890734196, "epoch": 0.5789505078901441, "grad_norm": 0.5886217951774597, "learning_rate": 1.9477950668339636e-05, "loss": 0.3157, "mean_token_accuracy": 0.9082422561943531, "num_tokens": 116635916.0, "step": 37610 }, { "entropy": 0.30001126434653996, "epoch": 0.5791044431488227, "grad_norm": 0.680921733379364, "learning_rate": 1.9477379369594922e-05, "loss": 0.2944, "mean_token_accuracy": 0.9082329519093036, "num_tokens": 116703684.0, "step": 37620 }, { "entropy": 0.285330007225275, "epoch": 0.5792583784075013, "grad_norm": 0.8868697285652161, "learning_rate": 1.9476807766811533e-05, "loss": 0.2884, "mean_token_accuracy": 0.9114242501556873, "num_tokens": 116769111.0, "step": 37630 }, { "entropy": 0.2851103304885328, "epoch": 0.5794123136661798, "grad_norm": 0.9189745783805847, "learning_rate": 1.9476235860007805e-05, "loss": 0.2952, "mean_token_accuracy": 0.9123147934675216, "num_tokens": 116835675.0, "step": 37640 }, { "entropy": 0.2855176498182118, "epoch": 0.5795662489248584, "grad_norm": 0.6667695641517639, "learning_rate": 1.947566364920209e-05, "loss": 0.2847, "mean_token_accuracy": 0.9129228204488754, "num_tokens": 116901439.0, "step": 37650 }, { "entropy": 0.29687861120328307, "epoch": 0.579720184183537, "grad_norm": 0.5037972927093506, "learning_rate": 1.947509113441274e-05, "loss": 0.3109, "mean_token_accuracy": 0.9074893243610859, "num_tokens": 116973289.0, "step": 37660 }, { "entropy": 0.30969967572018503, "epoch": 0.5798741194422156, "grad_norm": 0.7719970345497131, "learning_rate": 1.947451831565812e-05, "loss": 0.311, "mean_token_accuracy": 0.9019806019961834, "num_tokens": 117038943.0, "step": 37670 }, { "entropy": 0.29760569613426924, "epoch": 0.5800280547008941, "grad_norm": 0.8411520719528198, "learning_rate": 1.947394519295661e-05, "loss": 0.3048, "mean_token_accuracy": 0.9072484858334064, "num_tokens": 117103899.0, "step": 37680 }, { "entropy": 0.29248906699940563, "epoch": 0.5801819899595727, "grad_norm": 0.5953316688537598, "learning_rate": 1.9473371766326594e-05, "loss": 0.2949, "mean_token_accuracy": 0.9132071770727634, "num_tokens": 117173718.0, "step": 37690 }, { "entropy": 0.2767122224904597, "epoch": 0.5803359252182513, "grad_norm": 0.49370235204696655, "learning_rate": 1.947279803578647e-05, "loss": 0.2822, "mean_token_accuracy": 0.9155486971139908, "num_tokens": 117248448.0, "step": 37700 }, { "entropy": 0.2830576327629387, "epoch": 0.58048986047693, "grad_norm": 0.5443357229232788, "learning_rate": 1.947222400135464e-05, "loss": 0.2875, "mean_token_accuracy": 0.913065405189991, "num_tokens": 117310278.0, "step": 37710 }, { "entropy": 0.2874010564759374, "epoch": 0.5806437957356085, "grad_norm": 0.6341981291770935, "learning_rate": 1.947164966304952e-05, "loss": 0.3004, "mean_token_accuracy": 0.9097245156764984, "num_tokens": 117381216.0, "step": 37720 }, { "entropy": 0.3113473686389625, "epoch": 0.5807977309942871, "grad_norm": 0.7791240215301514, "learning_rate": 1.947107502088954e-05, "loss": 0.3046, "mean_token_accuracy": 0.9062362484633922, "num_tokens": 117447902.0, "step": 37730 }, { "entropy": 0.28528160648420453, "epoch": 0.5809516662529657, "grad_norm": 0.7394787073135376, "learning_rate": 1.947050007489313e-05, "loss": 0.2904, "mean_token_accuracy": 0.911184198409319, "num_tokens": 117507535.0, "step": 37740 }, { "entropy": 0.2675049496814609, "epoch": 0.5811056015116443, "grad_norm": 0.6682214140892029, "learning_rate": 1.9469924825078738e-05, "loss": 0.2752, "mean_token_accuracy": 0.9171147130429744, "num_tokens": 117577178.0, "step": 37750 }, { "entropy": 0.3046274116262794, "epoch": 0.5812595367703228, "grad_norm": 0.7872171998023987, "learning_rate": 1.946934927146481e-05, "loss": 0.3025, "mean_token_accuracy": 0.9055767983198166, "num_tokens": 117640055.0, "step": 37760 }, { "entropy": 0.2838647922500968, "epoch": 0.5814134720290014, "grad_norm": 0.6457483172416687, "learning_rate": 1.9468773414069822e-05, "loss": 0.286, "mean_token_accuracy": 0.9148573257029057, "num_tokens": 117699358.0, "step": 37770 }, { "entropy": 0.28251188741996885, "epoch": 0.58156740728768, "grad_norm": 0.7421318888664246, "learning_rate": 1.9468197252912243e-05, "loss": 0.2858, "mean_token_accuracy": 0.9134503565728664, "num_tokens": 117763205.0, "step": 37780 }, { "entropy": 0.28636887595057486, "epoch": 0.5817213425463585, "grad_norm": 0.6790886521339417, "learning_rate": 1.9467620788010547e-05, "loss": 0.2967, "mean_token_accuracy": 0.9101063050329685, "num_tokens": 117821734.0, "step": 37790 }, { "entropy": 0.2890741332434118, "epoch": 0.5818752778050371, "grad_norm": 0.8673421144485474, "learning_rate": 1.9467044019383243e-05, "loss": 0.3009, "mean_token_accuracy": 0.9096365071833133, "num_tokens": 117885844.0, "step": 37800 }, { "entropy": 0.29591266391798854, "epoch": 0.5820292130637157, "grad_norm": 0.691364586353302, "learning_rate": 1.9466466947048827e-05, "loss": 0.2878, "mean_token_accuracy": 0.9105870179831982, "num_tokens": 117951367.0, "step": 37810 }, { "entropy": 0.29815801735967395, "epoch": 0.5821831483223943, "grad_norm": 0.6200420260429382, "learning_rate": 1.9465889571025807e-05, "loss": 0.2979, "mean_token_accuracy": 0.9099051780998707, "num_tokens": 118013533.0, "step": 37820 }, { "entropy": 0.3027414009906352, "epoch": 0.582337083581073, "grad_norm": 0.5501805543899536, "learning_rate": 1.946531189133271e-05, "loss": 0.3154, "mean_token_accuracy": 0.9061034798622132, "num_tokens": 118076814.0, "step": 37830 }, { "entropy": 0.3168647399172187, "epoch": 0.5824910188397515, "grad_norm": 0.6498293280601501, "learning_rate": 1.9464733907988068e-05, "loss": 0.3263, "mean_token_accuracy": 0.9017957381904125, "num_tokens": 118139133.0, "step": 37840 }, { "entropy": 0.29781890828162433, "epoch": 0.5826449540984301, "grad_norm": 0.7118014693260193, "learning_rate": 1.9464155621010428e-05, "loss": 0.284, "mean_token_accuracy": 0.9084117539227009, "num_tokens": 118199727.0, "step": 37850 }, { "entropy": 0.2947593081742525, "epoch": 0.5827988893571087, "grad_norm": 0.623193085193634, "learning_rate": 1.9463577030418334e-05, "loss": 0.3005, "mean_token_accuracy": 0.9107650056481361, "num_tokens": 118271113.0, "step": 37860 }, { "entropy": 0.2687114577740431, "epoch": 0.5829528246157872, "grad_norm": 0.9947728514671326, "learning_rate": 1.9462998136230346e-05, "loss": 0.2581, "mean_token_accuracy": 0.9169127821922303, "num_tokens": 118332478.0, "step": 37870 }, { "entropy": 0.28317591873928905, "epoch": 0.5831067598744658, "grad_norm": 0.7361688017845154, "learning_rate": 1.9462418938465042e-05, "loss": 0.2917, "mean_token_accuracy": 0.9116386584937572, "num_tokens": 118401301.0, "step": 37880 }, { "entropy": 0.2837916719727218, "epoch": 0.5832606951331444, "grad_norm": 0.6051737666130066, "learning_rate": 1.9461839437141003e-05, "loss": 0.2865, "mean_token_accuracy": 0.91297577470541, "num_tokens": 118471595.0, "step": 37890 }, { "entropy": 0.27732846392318605, "epoch": 0.583414630391823, "grad_norm": 0.6048546433448792, "learning_rate": 1.9461259632276814e-05, "loss": 0.283, "mean_token_accuracy": 0.913791460543871, "num_tokens": 118538178.0, "step": 37900 }, { "entropy": 0.27707173600792884, "epoch": 0.5835685656505015, "grad_norm": 0.6844649910926819, "learning_rate": 1.9460679523891078e-05, "loss": 0.291, "mean_token_accuracy": 0.9156978122889996, "num_tokens": 118601404.0, "step": 37910 }, { "entropy": 0.29359627785161135, "epoch": 0.5837225009091801, "grad_norm": 0.6209313869476318, "learning_rate": 1.946009911200241e-05, "loss": 0.2949, "mean_token_accuracy": 0.9084200873970986, "num_tokens": 118666946.0, "step": 37920 }, { "entropy": 0.3211615465581417, "epoch": 0.5838764361678587, "grad_norm": 0.723471999168396, "learning_rate": 1.945951839662942e-05, "loss": 0.3144, "mean_token_accuracy": 0.9022588022053242, "num_tokens": 118733100.0, "step": 37930 }, { "entropy": 0.29160611899569633, "epoch": 0.5840303714265372, "grad_norm": 0.6839172840118408, "learning_rate": 1.9458937377790744e-05, "loss": 0.2762, "mean_token_accuracy": 0.9106926828622818, "num_tokens": 118800320.0, "step": 37940 }, { "entropy": 0.2984732104465365, "epoch": 0.5841843066852158, "grad_norm": 0.5201835036277771, "learning_rate": 1.945835605550502e-05, "loss": 0.2974, "mean_token_accuracy": 0.9057530537247658, "num_tokens": 118867050.0, "step": 37950 }, { "entropy": 0.2686125909443945, "epoch": 0.5843382419438945, "grad_norm": 0.7388797998428345, "learning_rate": 1.94577744297909e-05, "loss": 0.2663, "mean_token_accuracy": 0.9186613582074642, "num_tokens": 118932040.0, "step": 37960 }, { "entropy": 0.2830048973672092, "epoch": 0.5844921772025731, "grad_norm": 0.6465365290641785, "learning_rate": 1.945719250066704e-05, "loss": 0.2863, "mean_token_accuracy": 0.9116877965629101, "num_tokens": 118994139.0, "step": 37970 }, { "entropy": 0.29811706906184554, "epoch": 0.5846461124612516, "grad_norm": 0.6093432307243347, "learning_rate": 1.9456610268152108e-05, "loss": 0.3018, "mean_token_accuracy": 0.9080816321074963, "num_tokens": 119068382.0, "step": 37980 }, { "entropy": 0.28428283454850317, "epoch": 0.5848000477199302, "grad_norm": 0.8288683295249939, "learning_rate": 1.9456027732264782e-05, "loss": 0.2866, "mean_token_accuracy": 0.9099437475204468, "num_tokens": 119134266.0, "step": 37990 }, { "entropy": 0.2859497210942209, "epoch": 0.5849539829786088, "grad_norm": 0.5652445554733276, "learning_rate": 1.9455444893023754e-05, "loss": 0.2867, "mean_token_accuracy": 0.9117250621318818, "num_tokens": 119205499.0, "step": 38000 }, { "entropy": 0.27801266256719825, "epoch": 0.5851079182372874, "grad_norm": 0.5044861435890198, "learning_rate": 1.9454861750447714e-05, "loss": 0.2766, "mean_token_accuracy": 0.9155188210308551, "num_tokens": 119274582.0, "step": 38010 }, { "entropy": 0.3035778651945293, "epoch": 0.5852618534959659, "grad_norm": 0.5734131932258606, "learning_rate": 1.945427830455538e-05, "loss": 0.3047, "mean_token_accuracy": 0.906736122071743, "num_tokens": 119342412.0, "step": 38020 }, { "entropy": 0.30298642739653586, "epoch": 0.5854157887546445, "grad_norm": 0.6254779696464539, "learning_rate": 1.945369455536546e-05, "loss": 0.309, "mean_token_accuracy": 0.9077628374099731, "num_tokens": 119404489.0, "step": 38030 }, { "entropy": 0.30263306288979946, "epoch": 0.5855697240133231, "grad_norm": 0.5811045169830322, "learning_rate": 1.9453110502896687e-05, "loss": 0.3025, "mean_token_accuracy": 0.9063902728259563, "num_tokens": 119473551.0, "step": 38040 }, { "entropy": 0.28575572837144136, "epoch": 0.5857236592720017, "grad_norm": 0.6470403671264648, "learning_rate": 1.9452526147167795e-05, "loss": 0.2767, "mean_token_accuracy": 0.9127218931913376, "num_tokens": 119535736.0, "step": 38050 }, { "entropy": 0.2692881668917835, "epoch": 0.5858775945306802, "grad_norm": 0.5927481055259705, "learning_rate": 1.9451941488197527e-05, "loss": 0.2805, "mean_token_accuracy": 0.9158396534621716, "num_tokens": 119607659.0, "step": 38060 }, { "entropy": 0.27668205900117754, "epoch": 0.5860315297893588, "grad_norm": 0.7149688601493835, "learning_rate": 1.9451356526004646e-05, "loss": 0.2823, "mean_token_accuracy": 0.9122813440859318, "num_tokens": 119676817.0, "step": 38070 }, { "entropy": 0.3064639816991985, "epoch": 0.5861854650480374, "grad_norm": 0.5445702075958252, "learning_rate": 1.9450771260607912e-05, "loss": 0.31, "mean_token_accuracy": 0.9054105415940285, "num_tokens": 119730611.0, "step": 38080 }, { "entropy": 0.29548922944813966, "epoch": 0.586339400306716, "grad_norm": 0.7846742272377014, "learning_rate": 1.9450185692026105e-05, "loss": 0.2929, "mean_token_accuracy": 0.9099145494401455, "num_tokens": 119795670.0, "step": 38090 }, { "entropy": 0.2914104393683374, "epoch": 0.5864933355653946, "grad_norm": 0.790316104888916, "learning_rate": 1.944959982027801e-05, "loss": 0.2919, "mean_token_accuracy": 0.9115936242043972, "num_tokens": 119861577.0, "step": 38100 }, { "entropy": 0.26613625036552546, "epoch": 0.5866472708240732, "grad_norm": 0.7292237281799316, "learning_rate": 1.9449013645382417e-05, "loss": 0.2772, "mean_token_accuracy": 0.9168544210493564, "num_tokens": 119922427.0, "step": 38110 }, { "entropy": 0.30850235614925625, "epoch": 0.5868012060827518, "grad_norm": 0.6937001943588257, "learning_rate": 1.9448427167358134e-05, "loss": 0.3114, "mean_token_accuracy": 0.9055410079658032, "num_tokens": 119995162.0, "step": 38120 }, { "entropy": 0.30233182962983846, "epoch": 0.5869551413414303, "grad_norm": 0.5812699198722839, "learning_rate": 1.944784038622398e-05, "loss": 0.2999, "mean_token_accuracy": 0.9062766611576081, "num_tokens": 120072659.0, "step": 38130 }, { "entropy": 0.2522984479088336, "epoch": 0.5871090766001089, "grad_norm": 0.5522423386573792, "learning_rate": 1.9447253301998773e-05, "loss": 0.2591, "mean_token_accuracy": 0.919579491019249, "num_tokens": 120143511.0, "step": 38140 }, { "entropy": 0.2903908696956933, "epoch": 0.5872630118587875, "grad_norm": 0.5521197319030762, "learning_rate": 1.9446665914701344e-05, "loss": 0.2961, "mean_token_accuracy": 0.9102292686700821, "num_tokens": 120210935.0, "step": 38150 }, { "entropy": 0.27823789659887554, "epoch": 0.5874169471174661, "grad_norm": 0.6049338579177856, "learning_rate": 1.9446078224350547e-05, "loss": 0.2735, "mean_token_accuracy": 0.9142727315425873, "num_tokens": 120280127.0, "step": 38160 }, { "entropy": 0.27999556316062807, "epoch": 0.5875708823761446, "grad_norm": 0.6881234049797058, "learning_rate": 1.944549023096523e-05, "loss": 0.2818, "mean_token_accuracy": 0.9115422077476978, "num_tokens": 120347126.0, "step": 38170 }, { "entropy": 0.294406412076205, "epoch": 0.5877248176348232, "grad_norm": 0.5835028290748596, "learning_rate": 1.9444901934564255e-05, "loss": 0.2928, "mean_token_accuracy": 0.9101785391569137, "num_tokens": 120414932.0, "step": 38180 }, { "entropy": 0.30815640445798637, "epoch": 0.5878787528935018, "grad_norm": 0.6414505243301392, "learning_rate": 1.9444313335166497e-05, "loss": 0.3131, "mean_token_accuracy": 0.905425027757883, "num_tokens": 120487652.0, "step": 38190 }, { "entropy": 0.28630732456222174, "epoch": 0.5880326881521803, "grad_norm": 0.5572633743286133, "learning_rate": 1.9443724432790834e-05, "loss": 0.2907, "mean_token_accuracy": 0.9114728562533856, "num_tokens": 120552776.0, "step": 38200 }, { "entropy": 0.297471883893013, "epoch": 0.588186623410859, "grad_norm": 0.5131746530532837, "learning_rate": 1.9443135227456166e-05, "loss": 0.3018, "mean_token_accuracy": 0.9060763701796531, "num_tokens": 120624389.0, "step": 38210 }, { "entropy": 0.293511800095439, "epoch": 0.5883405586695376, "grad_norm": 0.6678810119628906, "learning_rate": 1.9442545719181386e-05, "loss": 0.2969, "mean_token_accuracy": 0.910335361212492, "num_tokens": 120682038.0, "step": 38220 }, { "entropy": 0.2678772771731019, "epoch": 0.5884944939282162, "grad_norm": 0.6935123205184937, "learning_rate": 1.944195590798541e-05, "loss": 0.2566, "mean_token_accuracy": 0.9169098056852818, "num_tokens": 120757686.0, "step": 38230 }, { "entropy": 0.28494768114760516, "epoch": 0.5886484291868948, "grad_norm": 0.6007286906242371, "learning_rate": 1.9441365793887163e-05, "loss": 0.3068, "mean_token_accuracy": 0.9107564836740494, "num_tokens": 120825990.0, "step": 38240 }, { "entropy": 0.27564480947330594, "epoch": 0.5888023644455733, "grad_norm": 0.6909257769584656, "learning_rate": 1.9440775376905572e-05, "loss": 0.28, "mean_token_accuracy": 0.9151189066469669, "num_tokens": 120890952.0, "step": 38250 }, { "entropy": 0.2722304807975888, "epoch": 0.5889562997042519, "grad_norm": 0.6395936608314514, "learning_rate": 1.9440184657059578e-05, "loss": 0.2877, "mean_token_accuracy": 0.9163083367049694, "num_tokens": 120946740.0, "step": 38260 }, { "entropy": 0.2876410967670381, "epoch": 0.5891102349629305, "grad_norm": 0.5605814456939697, "learning_rate": 1.943959363436813e-05, "loss": 0.2913, "mean_token_accuracy": 0.9097180470824242, "num_tokens": 121016951.0, "step": 38270 }, { "entropy": 0.26389468191191556, "epoch": 0.589264170221609, "grad_norm": 0.5659781694412231, "learning_rate": 1.94390023088502e-05, "loss": 0.2823, "mean_token_accuracy": 0.9173361591994762, "num_tokens": 121090136.0, "step": 38280 }, { "entropy": 0.28377116983756423, "epoch": 0.5894181054802876, "grad_norm": 0.7243486642837524, "learning_rate": 1.943841068052474e-05, "loss": 0.2786, "mean_token_accuracy": 0.9134757310152054, "num_tokens": 121161623.0, "step": 38290 }, { "entropy": 0.28661793572828176, "epoch": 0.5895720407389662, "grad_norm": 0.5200279355049133, "learning_rate": 1.9437818749410738e-05, "loss": 0.2891, "mean_token_accuracy": 0.9104512862861156, "num_tokens": 121236392.0, "step": 38300 }, { "entropy": 0.28449363354593515, "epoch": 0.5897259759976448, "grad_norm": 0.577932596206665, "learning_rate": 1.9437226515527188e-05, "loss": 0.2895, "mean_token_accuracy": 0.9105257287621498, "num_tokens": 121314208.0, "step": 38310 }, { "entropy": 0.304955524019897, "epoch": 0.5898799112563233, "grad_norm": 0.7047997713088989, "learning_rate": 1.943663397889308e-05, "loss": 0.3068, "mean_token_accuracy": 0.9064931444823742, "num_tokens": 121379877.0, "step": 38320 }, { "entropy": 0.3320522184483707, "epoch": 0.5900338465150019, "grad_norm": 0.7038030624389648, "learning_rate": 1.943604113952743e-05, "loss": 0.3244, "mean_token_accuracy": 0.9004431910812855, "num_tokens": 121440911.0, "step": 38330 }, { "entropy": 0.27843058267608284, "epoch": 0.5901877817736806, "grad_norm": 0.6828165054321289, "learning_rate": 1.9435447997449253e-05, "loss": 0.2871, "mean_token_accuracy": 0.9140984907746315, "num_tokens": 121508113.0, "step": 38340 }, { "entropy": 0.2886190082877874, "epoch": 0.5903417170323592, "grad_norm": 0.5717393159866333, "learning_rate": 1.9434854552677584e-05, "loss": 0.2938, "mean_token_accuracy": 0.9110691636800766, "num_tokens": 121570059.0, "step": 38350 }, { "entropy": 0.29623543759807947, "epoch": 0.5904956522910377, "grad_norm": 0.5876076817512512, "learning_rate": 1.943426080523145e-05, "loss": 0.3003, "mean_token_accuracy": 0.9091927610337734, "num_tokens": 121631873.0, "step": 38360 }, { "entropy": 0.28861056743189695, "epoch": 0.5906495875497163, "grad_norm": 0.6206479072570801, "learning_rate": 1.9433666755129903e-05, "loss": 0.2771, "mean_token_accuracy": 0.9108193255960941, "num_tokens": 121700686.0, "step": 38370 }, { "entropy": 0.2952474934048951, "epoch": 0.5908035228083949, "grad_norm": 0.7560595870018005, "learning_rate": 1.9433072402392003e-05, "loss": 0.2876, "mean_token_accuracy": 0.9079841531813144, "num_tokens": 121767429.0, "step": 38380 }, { "entropy": 0.26775067383423445, "epoch": 0.5909574580670734, "grad_norm": 0.5359091758728027, "learning_rate": 1.9432477747036816e-05, "loss": 0.2822, "mean_token_accuracy": 0.9144719809293747, "num_tokens": 121846405.0, "step": 38390 }, { "entropy": 0.2876062604598701, "epoch": 0.591111393325752, "grad_norm": 0.6889899969100952, "learning_rate": 1.9431882789083422e-05, "loss": 0.3058, "mean_token_accuracy": 0.9111338250339032, "num_tokens": 121911388.0, "step": 38400 }, { "entropy": 0.2966715857386589, "epoch": 0.5912653285844306, "grad_norm": 0.5403153896331787, "learning_rate": 1.9431287528550896e-05, "loss": 0.2855, "mean_token_accuracy": 0.9107509531080723, "num_tokens": 121985579.0, "step": 38410 }, { "entropy": 0.28408765103667977, "epoch": 0.5914192638431092, "grad_norm": 0.6882910132408142, "learning_rate": 1.9430691965458347e-05, "loss": 0.298, "mean_token_accuracy": 0.9108279131352901, "num_tokens": 122059222.0, "step": 38420 }, { "entropy": 0.30829087952151896, "epoch": 0.5915731991017877, "grad_norm": 0.580696702003479, "learning_rate": 1.9430096099824877e-05, "loss": 0.3087, "mean_token_accuracy": 0.9049819961190224, "num_tokens": 122132495.0, "step": 38430 }, { "entropy": 0.27994651068001986, "epoch": 0.5917271343604663, "grad_norm": 0.6962777376174927, "learning_rate": 1.94294999316696e-05, "loss": 0.298, "mean_token_accuracy": 0.9131641775369644, "num_tokens": 122190455.0, "step": 38440 }, { "entropy": 0.2724752487614751, "epoch": 0.5918810696191449, "grad_norm": 0.7079670429229736, "learning_rate": 1.9428903461011642e-05, "loss": 0.2813, "mean_token_accuracy": 0.9146144449710846, "num_tokens": 122254449.0, "step": 38450 }, { "entropy": 0.28211307618767023, "epoch": 0.5920350048778235, "grad_norm": 0.5706033706665039, "learning_rate": 1.942830668787014e-05, "loss": 0.2842, "mean_token_accuracy": 0.9113453842699528, "num_tokens": 122327150.0, "step": 38460 }, { "entropy": 0.2840691052377224, "epoch": 0.5921889401365021, "grad_norm": 0.595199704170227, "learning_rate": 1.9427709612264235e-05, "loss": 0.2846, "mean_token_accuracy": 0.9132887288928032, "num_tokens": 122392029.0, "step": 38470 }, { "entropy": 0.2927996365353465, "epoch": 0.5923428753951807, "grad_norm": 0.6410272121429443, "learning_rate": 1.9427112234213082e-05, "loss": 0.29, "mean_token_accuracy": 0.9079915694892406, "num_tokens": 122455783.0, "step": 38480 }, { "entropy": 0.2849935840815306, "epoch": 0.5924968106538593, "grad_norm": 0.6700944304466248, "learning_rate": 1.942651455373585e-05, "loss": 0.2865, "mean_token_accuracy": 0.9121615342795849, "num_tokens": 122515387.0, "step": 38490 }, { "entropy": 0.2766516711562872, "epoch": 0.5926507459125379, "grad_norm": 0.6256654858589172, "learning_rate": 1.9425916570851706e-05, "loss": 0.2757, "mean_token_accuracy": 0.9132049947977066, "num_tokens": 122582284.0, "step": 38500 }, { "entropy": 0.26621872326359153, "epoch": 0.5928046811712164, "grad_norm": 0.5696317553520203, "learning_rate": 1.9425318285579836e-05, "loss": 0.2792, "mean_token_accuracy": 0.9172633364796638, "num_tokens": 122649072.0, "step": 38510 }, { "entropy": 0.28502482604235413, "epoch": 0.592958616429895, "grad_norm": 0.9536358118057251, "learning_rate": 1.9424719697939437e-05, "loss": 0.2903, "mean_token_accuracy": 0.911523899435997, "num_tokens": 122711854.0, "step": 38520 }, { "entropy": 0.2857400613836944, "epoch": 0.5931125516885736, "grad_norm": 0.6871974468231201, "learning_rate": 1.942412080794971e-05, "loss": 0.2803, "mean_token_accuracy": 0.9120467238128185, "num_tokens": 122777882.0, "step": 38530 }, { "entropy": 0.2837906998582184, "epoch": 0.5932664869472521, "grad_norm": 0.8472070693969727, "learning_rate": 1.9423521615629864e-05, "loss": 0.2896, "mean_token_accuracy": 0.9105157546699048, "num_tokens": 122848254.0, "step": 38540 }, { "entropy": 0.2922023140825331, "epoch": 0.5934204222059307, "grad_norm": 0.5600532293319702, "learning_rate": 1.9422922120999124e-05, "loss": 0.3029, "mean_token_accuracy": 0.9095430009067058, "num_tokens": 122923954.0, "step": 38550 }, { "entropy": 0.27224849900230763, "epoch": 0.5935743574646093, "grad_norm": 0.6332675218582153, "learning_rate": 1.9422322324076726e-05, "loss": 0.2763, "mean_token_accuracy": 0.916772436350584, "num_tokens": 122995707.0, "step": 38560 }, { "entropy": 0.2871088248677552, "epoch": 0.5937282927232879, "grad_norm": 0.6882993578910828, "learning_rate": 1.942172222488191e-05, "loss": 0.2836, "mean_token_accuracy": 0.9115588568150997, "num_tokens": 123054096.0, "step": 38570 }, { "entropy": 0.28388828514143827, "epoch": 0.5938822279819664, "grad_norm": 0.7024440169334412, "learning_rate": 1.942112182343392e-05, "loss": 0.2879, "mean_token_accuracy": 0.9106521107256412, "num_tokens": 123117991.0, "step": 38580 }, { "entropy": 0.279187220800668, "epoch": 0.5940361632406451, "grad_norm": 0.4970123767852783, "learning_rate": 1.9420521119752023e-05, "loss": 0.2915, "mean_token_accuracy": 0.9123311080038548, "num_tokens": 123189246.0, "step": 38590 }, { "entropy": 0.30042759869247676, "epoch": 0.5941900984993237, "grad_norm": 0.6737905740737915, "learning_rate": 1.941992011385549e-05, "loss": 0.2981, "mean_token_accuracy": 0.9082325220108032, "num_tokens": 123256955.0, "step": 38600 }, { "entropy": 0.27204283624887465, "epoch": 0.5943440337580023, "grad_norm": 0.5800544023513794, "learning_rate": 1.9419318805763603e-05, "loss": 0.2701, "mean_token_accuracy": 0.91655662804842, "num_tokens": 123320536.0, "step": 38610 }, { "entropy": 0.29350590957328676, "epoch": 0.5944979690166808, "grad_norm": 0.5189716219902039, "learning_rate": 1.941871719549565e-05, "loss": 0.3076, "mean_token_accuracy": 0.908731198310852, "num_tokens": 123388264.0, "step": 38620 }, { "entropy": 0.27583510959520935, "epoch": 0.5946519042753594, "grad_norm": 0.7122189402580261, "learning_rate": 1.9418115283070927e-05, "loss": 0.2855, "mean_token_accuracy": 0.9141564227640628, "num_tokens": 123450024.0, "step": 38630 }, { "entropy": 0.2987645329907537, "epoch": 0.594805839534038, "grad_norm": 0.6505634784698486, "learning_rate": 1.9417513068508754e-05, "loss": 0.2986, "mean_token_accuracy": 0.9074797093868255, "num_tokens": 123512971.0, "step": 38640 }, { "entropy": 0.2898190434090793, "epoch": 0.5949597747927166, "grad_norm": 0.6347748637199402, "learning_rate": 1.9416910551828444e-05, "loss": 0.2892, "mean_token_accuracy": 0.9119581572711468, "num_tokens": 123576095.0, "step": 38650 }, { "entropy": 0.2887229911051691, "epoch": 0.5951137100513951, "grad_norm": 0.6476402878761292, "learning_rate": 1.941630773304932e-05, "loss": 0.297, "mean_token_accuracy": 0.9135364532470703, "num_tokens": 123641933.0, "step": 38660 }, { "entropy": 0.306198475509882, "epoch": 0.5952676453100737, "grad_norm": 0.685903012752533, "learning_rate": 1.9415704612190734e-05, "loss": 0.3093, "mean_token_accuracy": 0.9048223420977592, "num_tokens": 123709616.0, "step": 38670 }, { "entropy": 0.28891458036378026, "epoch": 0.5954215805687523, "grad_norm": 0.6712910532951355, "learning_rate": 1.9415101189272022e-05, "loss": 0.2966, "mean_token_accuracy": 0.9102346614003182, "num_tokens": 123761402.0, "step": 38680 }, { "entropy": 0.2892449714243412, "epoch": 0.5955755158274308, "grad_norm": 0.5798468589782715, "learning_rate": 1.941449746431255e-05, "loss": 0.2845, "mean_token_accuracy": 0.9137683011591434, "num_tokens": 123832194.0, "step": 38690 }, { "entropy": 0.2820197786204517, "epoch": 0.5957294510861094, "grad_norm": 0.7747431993484497, "learning_rate": 1.9413893437331684e-05, "loss": 0.3009, "mean_token_accuracy": 0.9136417441070079, "num_tokens": 123896700.0, "step": 38700 }, { "entropy": 0.2949078665114939, "epoch": 0.595883386344788, "grad_norm": 0.4666909873485565, "learning_rate": 1.9413289108348795e-05, "loss": 0.2941, "mean_token_accuracy": 0.907558698952198, "num_tokens": 123967989.0, "step": 38710 }, { "entropy": 0.30674334689974786, "epoch": 0.5960373216034667, "grad_norm": 0.5924844741821289, "learning_rate": 1.9412684477383284e-05, "loss": 0.3131, "mean_token_accuracy": 0.9065271750092506, "num_tokens": 124033619.0, "step": 38720 }, { "entropy": 0.2927540016360581, "epoch": 0.5961912568621452, "grad_norm": 0.5081960558891296, "learning_rate": 1.9412079544454533e-05, "loss": 0.2909, "mean_token_accuracy": 0.9108942478895188, "num_tokens": 124097235.0, "step": 38730 }, { "entropy": 0.284145332314074, "epoch": 0.5963451921208238, "grad_norm": 0.5699634552001953, "learning_rate": 1.9411474309581958e-05, "loss": 0.3008, "mean_token_accuracy": 0.9096285477280617, "num_tokens": 124159271.0, "step": 38740 }, { "entropy": 0.27987341657280923, "epoch": 0.5964991273795024, "grad_norm": 0.6382817625999451, "learning_rate": 1.9410868772784973e-05, "loss": 0.2848, "mean_token_accuracy": 0.9144647791981697, "num_tokens": 124225832.0, "step": 38750 }, { "entropy": 0.28886804506182673, "epoch": 0.596653062638181, "grad_norm": 0.6453297138214111, "learning_rate": 1.9410262934083003e-05, "loss": 0.2956, "mean_token_accuracy": 0.9117851540446281, "num_tokens": 124288017.0, "step": 38760 }, { "entropy": 0.29290464939549565, "epoch": 0.5968069978968595, "grad_norm": 0.5520327687263489, "learning_rate": 1.9409656793495482e-05, "loss": 0.2878, "mean_token_accuracy": 0.9089155927300453, "num_tokens": 124358777.0, "step": 38770 }, { "entropy": 0.3108167524449527, "epoch": 0.5969609331555381, "grad_norm": 0.605430006980896, "learning_rate": 1.9409050351041858e-05, "loss": 0.3146, "mean_token_accuracy": 0.9058993861079216, "num_tokens": 124422943.0, "step": 38780 }, { "entropy": 0.2702074883505702, "epoch": 0.5971148684142167, "grad_norm": 0.7182388305664062, "learning_rate": 1.9408443606741585e-05, "loss": 0.2715, "mean_token_accuracy": 0.9164241150021553, "num_tokens": 124482994.0, "step": 38790 }, { "entropy": 0.28482846776023507, "epoch": 0.5972688036728953, "grad_norm": 0.7276037931442261, "learning_rate": 1.9407836560614126e-05, "loss": 0.2867, "mean_token_accuracy": 0.912012492865324, "num_tokens": 124542382.0, "step": 38800 }, { "entropy": 0.2943683641962707, "epoch": 0.5974227389315738, "grad_norm": 0.5470726490020752, "learning_rate": 1.940722921267896e-05, "loss": 0.3154, "mean_token_accuracy": 0.9093693234026432, "num_tokens": 124609762.0, "step": 38810 }, { "entropy": 0.29461370250210167, "epoch": 0.5975766741902524, "grad_norm": 0.6418203711509705, "learning_rate": 1.9406621562955568e-05, "loss": 0.3117, "mean_token_accuracy": 0.9113640151917934, "num_tokens": 124672049.0, "step": 38820 }, { "entropy": 0.2706081504933536, "epoch": 0.597730609448931, "grad_norm": 0.6054796576499939, "learning_rate": 1.940601361146344e-05, "loss": 0.2853, "mean_token_accuracy": 0.9164394691586495, "num_tokens": 124729324.0, "step": 38830 }, { "entropy": 0.2761873395182192, "epoch": 0.5978845447076095, "grad_norm": 0.7396766543388367, "learning_rate": 1.9405405358222088e-05, "loss": 0.2784, "mean_token_accuracy": 0.9136770620942116, "num_tokens": 124784288.0, "step": 38840 }, { "entropy": 0.28571867011487484, "epoch": 0.5980384799662882, "grad_norm": 0.5531111359596252, "learning_rate": 1.9404796803251017e-05, "loss": 0.2952, "mean_token_accuracy": 0.9122462071478367, "num_tokens": 124858327.0, "step": 38850 }, { "entropy": 0.29376718113198874, "epoch": 0.5981924152249668, "grad_norm": 0.5118190050125122, "learning_rate": 1.940418794656975e-05, "loss": 0.2924, "mean_token_accuracy": 0.9121388860046864, "num_tokens": 124924484.0, "step": 38860 }, { "entropy": 0.2696869336999953, "epoch": 0.5983463504836454, "grad_norm": 0.637765109539032, "learning_rate": 1.940357878819783e-05, "loss": 0.2748, "mean_token_accuracy": 0.9172845646739006, "num_tokens": 124992238.0, "step": 38870 }, { "entropy": 0.27005093479529024, "epoch": 0.598500285742324, "grad_norm": 0.6180473566055298, "learning_rate": 1.9402969328154786e-05, "loss": 0.2844, "mean_token_accuracy": 0.9151889115571976, "num_tokens": 125047750.0, "step": 38880 }, { "entropy": 0.2789674921892583, "epoch": 0.5986542210010025, "grad_norm": 0.7591472268104553, "learning_rate": 1.9402359566460175e-05, "loss": 0.2861, "mean_token_accuracy": 0.9129276633262634, "num_tokens": 125107959.0, "step": 38890 }, { "entropy": 0.29893692592158916, "epoch": 0.5988081562596811, "grad_norm": 0.8093998432159424, "learning_rate": 1.940174950313356e-05, "loss": 0.2975, "mean_token_accuracy": 0.9070475667715072, "num_tokens": 125169482.0, "step": 38900 }, { "entropy": 0.2850018413737416, "epoch": 0.5989620915183597, "grad_norm": 0.8213830590248108, "learning_rate": 1.9401139138194513e-05, "loss": 0.2815, "mean_token_accuracy": 0.9122270837426185, "num_tokens": 125224641.0, "step": 38910 }, { "entropy": 0.2892366706393659, "epoch": 0.5991160267770382, "grad_norm": 0.48894360661506653, "learning_rate": 1.9400528471662604e-05, "loss": 0.3005, "mean_token_accuracy": 0.9104686990380287, "num_tokens": 125294031.0, "step": 38920 }, { "entropy": 0.26668570870533587, "epoch": 0.5992699620357168, "grad_norm": 0.5871398448944092, "learning_rate": 1.939991750355744e-05, "loss": 0.2679, "mean_token_accuracy": 0.9178432986140251, "num_tokens": 125362821.0, "step": 38930 }, { "entropy": 0.2870370536111295, "epoch": 0.5994238972943954, "grad_norm": 0.508257269859314, "learning_rate": 1.9399306233898613e-05, "loss": 0.3014, "mean_token_accuracy": 0.9106844983994961, "num_tokens": 125422304.0, "step": 38940 }, { "entropy": 0.28268210906535385, "epoch": 0.599577832553074, "grad_norm": 0.5466177463531494, "learning_rate": 1.9398694662705728e-05, "loss": 0.2788, "mean_token_accuracy": 0.9137662880122661, "num_tokens": 125491659.0, "step": 38950 }, { "entropy": 0.2876384721137583, "epoch": 0.5997317678117525, "grad_norm": 0.8809535503387451, "learning_rate": 1.9398082789998408e-05, "loss": 0.2929, "mean_token_accuracy": 0.9093930684030056, "num_tokens": 125555805.0, "step": 38960 }, { "entropy": 0.2835082496516407, "epoch": 0.5998857030704312, "grad_norm": 0.5488160252571106, "learning_rate": 1.9397470615796286e-05, "loss": 0.2879, "mean_token_accuracy": 0.9134703643620015, "num_tokens": 125622085.0, "step": 38970 }, { "entropy": 0.2708982914686203, "epoch": 0.6000396383291098, "grad_norm": 0.7258061766624451, "learning_rate": 1.9396858140119e-05, "loss": 0.2673, "mean_token_accuracy": 0.9162754513323307, "num_tokens": 125681239.0, "step": 38980 }, { "entropy": 0.2997819953598082, "epoch": 0.6001935735877884, "grad_norm": 0.5644104480743408, "learning_rate": 1.9396245362986197e-05, "loss": 0.2867, "mean_token_accuracy": 0.9056954823434353, "num_tokens": 125757296.0, "step": 38990 }, { "entropy": 0.2891509497538209, "epoch": 0.6003475088464669, "grad_norm": 0.594848096370697, "learning_rate": 1.9395632284417535e-05, "loss": 0.2944, "mean_token_accuracy": 0.9089191667735577, "num_tokens": 125820425.0, "step": 39000 }, { "entropy": 0.2876337864436209, "epoch": 0.6005014441051455, "grad_norm": 0.7020294666290283, "learning_rate": 1.939501890443268e-05, "loss": 0.2905, "mean_token_accuracy": 0.9113807499408721, "num_tokens": 125877315.0, "step": 39010 }, { "entropy": 0.27712064553052185, "epoch": 0.6006553793638241, "grad_norm": 0.6024906635284424, "learning_rate": 1.9394405223051313e-05, "loss": 0.2777, "mean_token_accuracy": 0.9145071551203727, "num_tokens": 125943764.0, "step": 39020 }, { "entropy": 0.300984738022089, "epoch": 0.6008093146225026, "grad_norm": 0.5808442831039429, "learning_rate": 1.9393791240293116e-05, "loss": 0.297, "mean_token_accuracy": 0.9070745378732681, "num_tokens": 126010731.0, "step": 39030 }, { "entropy": 0.2835416253656149, "epoch": 0.6009632498811812, "grad_norm": 0.5429584980010986, "learning_rate": 1.939317695617779e-05, "loss": 0.2885, "mean_token_accuracy": 0.9106586664915085, "num_tokens": 126072137.0, "step": 39040 }, { "entropy": 0.31342499386519196, "epoch": 0.6011171851398598, "grad_norm": 0.59637451171875, "learning_rate": 1.9392562370725044e-05, "loss": 0.3163, "mean_token_accuracy": 0.9038565658032894, "num_tokens": 126147679.0, "step": 39050 }, { "entropy": 0.2933554662391543, "epoch": 0.6012711203985384, "grad_norm": 0.6178944110870361, "learning_rate": 1.939194748395459e-05, "loss": 0.2953, "mean_token_accuracy": 0.9096938587725163, "num_tokens": 126218191.0, "step": 39060 }, { "entropy": 0.289877806045115, "epoch": 0.6014250556572169, "grad_norm": 0.5930234789848328, "learning_rate": 1.9391332295886155e-05, "loss": 0.2882, "mean_token_accuracy": 0.9104660913348198, "num_tokens": 126276226.0, "step": 39070 }, { "entropy": 0.28127780500799415, "epoch": 0.6015789909158955, "grad_norm": 0.5111605525016785, "learning_rate": 1.9390716806539474e-05, "loss": 0.294, "mean_token_accuracy": 0.9124434903264046, "num_tokens": 126340415.0, "step": 39080 }, { "entropy": 0.28022841196507214, "epoch": 0.6017329261745741, "grad_norm": 0.6004422903060913, "learning_rate": 1.939010101593429e-05, "loss": 0.2871, "mean_token_accuracy": 0.9136137664318085, "num_tokens": 126399097.0, "step": 39090 }, { "entropy": 0.28324994519352914, "epoch": 0.6018868614332528, "grad_norm": 0.5662945508956909, "learning_rate": 1.938948492409036e-05, "loss": 0.2965, "mean_token_accuracy": 0.912203711271286, "num_tokens": 126464097.0, "step": 39100 }, { "entropy": 0.29024170422926543, "epoch": 0.6020407966919313, "grad_norm": 0.7991623878479004, "learning_rate": 1.9388868531027458e-05, "loss": 0.2976, "mean_token_accuracy": 0.9087364464998245, "num_tokens": 126526256.0, "step": 39110 }, { "entropy": 0.2759373095817864, "epoch": 0.6021947319506099, "grad_norm": 0.5695711970329285, "learning_rate": 1.938825183676534e-05, "loss": 0.2805, "mean_token_accuracy": 0.9176666468381882, "num_tokens": 126592510.0, "step": 39120 }, { "entropy": 0.2732806605286896, "epoch": 0.6023486672092885, "grad_norm": 0.6234866380691528, "learning_rate": 1.93876348413238e-05, "loss": 0.2772, "mean_token_accuracy": 0.9141364939510822, "num_tokens": 126656677.0, "step": 39130 }, { "entropy": 0.2891701995395124, "epoch": 0.602502602467967, "grad_norm": 0.6440828442573547, "learning_rate": 1.9387017544722632e-05, "loss": 0.293, "mean_token_accuracy": 0.9084146596491337, "num_tokens": 126726156.0, "step": 39140 }, { "entropy": 0.2754191057756543, "epoch": 0.6026565377266456, "grad_norm": 0.5132788419723511, "learning_rate": 1.9386399946981638e-05, "loss": 0.2751, "mean_token_accuracy": 0.9159563668072224, "num_tokens": 126790406.0, "step": 39150 }, { "entropy": 0.2989913820289075, "epoch": 0.6028104729853242, "grad_norm": 0.6046749353408813, "learning_rate": 1.938578204812063e-05, "loss": 0.3003, "mean_token_accuracy": 0.9083828948438167, "num_tokens": 126851276.0, "step": 39160 }, { "entropy": 0.29372527459636333, "epoch": 0.6029644082440028, "grad_norm": 0.8101642727851868, "learning_rate": 1.938516384815943e-05, "loss": 0.3001, "mean_token_accuracy": 0.9103326328098774, "num_tokens": 126915704.0, "step": 39170 }, { "entropy": 0.2902566549368203, "epoch": 0.6031183435026813, "grad_norm": 0.643403172492981, "learning_rate": 1.938454534711787e-05, "loss": 0.301, "mean_token_accuracy": 0.9101841047406196, "num_tokens": 126979305.0, "step": 39180 }, { "entropy": 0.3002389275468886, "epoch": 0.6032722787613599, "grad_norm": 0.5405078530311584, "learning_rate": 1.9383926545015798e-05, "loss": 0.2945, "mean_token_accuracy": 0.909904246032238, "num_tokens": 127040498.0, "step": 39190 }, { "entropy": 0.2898299758322537, "epoch": 0.6034262140200385, "grad_norm": 0.5981183052062988, "learning_rate": 1.9383307441873054e-05, "loss": 0.2945, "mean_token_accuracy": 0.9080812521278858, "num_tokens": 127107334.0, "step": 39200 }, { "entropy": 0.28488463955000043, "epoch": 0.6035801492787171, "grad_norm": 0.6748288869857788, "learning_rate": 1.938268803770951e-05, "loss": 0.2833, "mean_token_accuracy": 0.912563169002533, "num_tokens": 127172692.0, "step": 39210 }, { "entropy": 0.2739122981205583, "epoch": 0.6037340845373956, "grad_norm": 0.6224254965782166, "learning_rate": 1.938206833254503e-05, "loss": 0.2875, "mean_token_accuracy": 0.9148685052990914, "num_tokens": 127237126.0, "step": 39220 }, { "entropy": 0.2897231478244066, "epoch": 0.6038880197960743, "grad_norm": 0.5825873017311096, "learning_rate": 1.9381448326399496e-05, "loss": 0.2829, "mean_token_accuracy": 0.9095207326114177, "num_tokens": 127311808.0, "step": 39230 }, { "entropy": 0.28835327811539174, "epoch": 0.6040419550547529, "grad_norm": 0.773512601852417, "learning_rate": 1.9380828019292797e-05, "loss": 0.2919, "mean_token_accuracy": 0.9096506595611572, "num_tokens": 127375680.0, "step": 39240 }, { "entropy": 0.2786993649788201, "epoch": 0.6041958903134315, "grad_norm": 0.650534987449646, "learning_rate": 1.938020741124484e-05, "loss": 0.2824, "mean_token_accuracy": 0.9125532537698746, "num_tokens": 127455417.0, "step": 39250 }, { "entropy": 0.28742216909304263, "epoch": 0.60434982557211, "grad_norm": 0.49506527185440063, "learning_rate": 1.9379586502275523e-05, "loss": 0.2828, "mean_token_accuracy": 0.9112144850194455, "num_tokens": 127522585.0, "step": 39260 }, { "entropy": 0.2774578968062997, "epoch": 0.6045037608307886, "grad_norm": 0.5554907321929932, "learning_rate": 1.9378965292404775e-05, "loss": 0.276, "mean_token_accuracy": 0.912755124270916, "num_tokens": 127591097.0, "step": 39270 }, { "entropy": 0.2868816224858165, "epoch": 0.6046576960894672, "grad_norm": 0.5893166065216064, "learning_rate": 1.937834378165252e-05, "loss": 0.305, "mean_token_accuracy": 0.9122161991894245, "num_tokens": 127649347.0, "step": 39280 }, { "entropy": 0.3021311822347343, "epoch": 0.6048116313481458, "grad_norm": 0.593578577041626, "learning_rate": 1.9377721970038696e-05, "loss": 0.308, "mean_token_accuracy": 0.9065051347017288, "num_tokens": 127712611.0, "step": 39290 }, { "entropy": 0.2809598666615784, "epoch": 0.6049655666068243, "grad_norm": 0.6698401570320129, "learning_rate": 1.937709985758325e-05, "loss": 0.2823, "mean_token_accuracy": 0.9142944552004337, "num_tokens": 127775564.0, "step": 39300 }, { "entropy": 0.28922465797513724, "epoch": 0.6051195018655029, "grad_norm": 0.5507311224937439, "learning_rate": 1.9376477444306142e-05, "loss": 0.292, "mean_token_accuracy": 0.9122150912880898, "num_tokens": 127843610.0, "step": 39310 }, { "entropy": 0.269012560416013, "epoch": 0.6052734371241815, "grad_norm": 0.5834651589393616, "learning_rate": 1.937585473022734e-05, "loss": 0.274, "mean_token_accuracy": 0.9149912878870964, "num_tokens": 127909354.0, "step": 39320 }, { "entropy": 0.2787372024729848, "epoch": 0.60542737238286, "grad_norm": 0.5986599922180176, "learning_rate": 1.937523171536682e-05, "loss": 0.2948, "mean_token_accuracy": 0.9111977733671666, "num_tokens": 127972953.0, "step": 39330 }, { "entropy": 0.2858866319060326, "epoch": 0.6055813076415386, "grad_norm": 0.604000449180603, "learning_rate": 1.937460839974457e-05, "loss": 0.2984, "mean_token_accuracy": 0.9139298975467682, "num_tokens": 128041808.0, "step": 39340 }, { "entropy": 0.2873558425344527, "epoch": 0.6057352429002172, "grad_norm": 0.6069461703300476, "learning_rate": 1.937398478338058e-05, "loss": 0.2937, "mean_token_accuracy": 0.9130343422293663, "num_tokens": 128105210.0, "step": 39350 }, { "entropy": 0.27220661751925945, "epoch": 0.6058891781588959, "grad_norm": 0.736070990562439, "learning_rate": 1.9373360866294867e-05, "loss": 0.2953, "mean_token_accuracy": 0.9141553066670894, "num_tokens": 128171267.0, "step": 39360 }, { "entropy": 0.293258582893759, "epoch": 0.6060431134175744, "grad_norm": 0.5470393896102905, "learning_rate": 1.9372736648507435e-05, "loss": 0.3012, "mean_token_accuracy": 0.9069432206451893, "num_tokens": 128239212.0, "step": 39370 }, { "entropy": 0.28810622477903963, "epoch": 0.606197048676253, "grad_norm": 0.5824304223060608, "learning_rate": 1.9372112130038317e-05, "loss": 0.296, "mean_token_accuracy": 0.9115229696035385, "num_tokens": 128301009.0, "step": 39380 }, { "entropy": 0.29865648606792095, "epoch": 0.6063509839349316, "grad_norm": 0.6638112664222717, "learning_rate": 1.9371487310907543e-05, "loss": 0.307, "mean_token_accuracy": 0.9080639556050301, "num_tokens": 128359555.0, "step": 39390 }, { "entropy": 0.27149093700572846, "epoch": 0.6065049191936102, "grad_norm": 0.5598527789115906, "learning_rate": 1.937086219113516e-05, "loss": 0.2853, "mean_token_accuracy": 0.9165777139365673, "num_tokens": 128422333.0, "step": 39400 }, { "entropy": 0.28772289464250206, "epoch": 0.6066588544522887, "grad_norm": 0.5384035110473633, "learning_rate": 1.9370236770741223e-05, "loss": 0.2823, "mean_token_accuracy": 0.9110025092959404, "num_tokens": 128489605.0, "step": 39410 }, { "entropy": 0.295170880574733, "epoch": 0.6068127897109673, "grad_norm": 0.6609432101249695, "learning_rate": 1.9369611049745793e-05, "loss": 0.3004, "mean_token_accuracy": 0.9093361124396324, "num_tokens": 128551603.0, "step": 39420 }, { "entropy": 0.27815560046583415, "epoch": 0.6069667249696459, "grad_norm": 0.7460977435112, "learning_rate": 1.9368985028168944e-05, "loss": 0.2855, "mean_token_accuracy": 0.9141050353646278, "num_tokens": 128613840.0, "step": 39430 }, { "entropy": 0.26824197694659235, "epoch": 0.6071206602283244, "grad_norm": 0.694247841835022, "learning_rate": 1.936835870603076e-05, "loss": 0.272, "mean_token_accuracy": 0.9163487687706947, "num_tokens": 128670778.0, "step": 39440 }, { "entropy": 0.2740464506670833, "epoch": 0.607274595487003, "grad_norm": 0.7340484857559204, "learning_rate": 1.9367732083351337e-05, "loss": 0.2876, "mean_token_accuracy": 0.9129238776862622, "num_tokens": 128732773.0, "step": 39450 }, { "entropy": 0.2842222181148827, "epoch": 0.6074285307456816, "grad_norm": 0.5565941333770752, "learning_rate": 1.9367105160150772e-05, "loss": 0.2862, "mean_token_accuracy": 0.9129315994679927, "num_tokens": 128798132.0, "step": 39460 }, { "entropy": 0.27677080063149334, "epoch": 0.6075824660043602, "grad_norm": 0.529208779335022, "learning_rate": 1.9366477936449183e-05, "loss": 0.2811, "mean_token_accuracy": 0.9131040260195732, "num_tokens": 128868695.0, "step": 39470 }, { "entropy": 0.2726364478468895, "epoch": 0.6077364012630389, "grad_norm": 0.6262661218643188, "learning_rate": 1.936585041226668e-05, "loss": 0.2763, "mean_token_accuracy": 0.9147849433124066, "num_tokens": 128933805.0, "step": 39480 }, { "entropy": 0.27497453754767776, "epoch": 0.6078903365217174, "grad_norm": 0.6541654467582703, "learning_rate": 1.9365222587623407e-05, "loss": 0.2785, "mean_token_accuracy": 0.9155890472233296, "num_tokens": 129004092.0, "step": 39490 }, { "entropy": 0.30148314824327826, "epoch": 0.608044271780396, "grad_norm": 0.5801261067390442, "learning_rate": 1.93645944625395e-05, "loss": 0.3014, "mean_token_accuracy": 0.9073308750987052, "num_tokens": 129069312.0, "step": 39500 }, { "entropy": 0.2910453211516142, "epoch": 0.6081982070390746, "grad_norm": 0.5508652925491333, "learning_rate": 1.9363966037035106e-05, "loss": 0.2891, "mean_token_accuracy": 0.9099022686481476, "num_tokens": 129137121.0, "step": 39510 }, { "entropy": 0.2768260852433741, "epoch": 0.6083521422977531, "grad_norm": 0.5863245129585266, "learning_rate": 1.936333731113039e-05, "loss": 0.2946, "mean_token_accuracy": 0.9140108235180378, "num_tokens": 129205277.0, "step": 39520 }, { "entropy": 0.3019986803643405, "epoch": 0.6085060775564317, "grad_norm": 0.6035866737365723, "learning_rate": 1.936270828484552e-05, "loss": 0.3079, "mean_token_accuracy": 0.9072033531963826, "num_tokens": 129271106.0, "step": 39530 }, { "entropy": 0.2829225524328649, "epoch": 0.6086600128151103, "grad_norm": 0.6022166609764099, "learning_rate": 1.9362078958200677e-05, "loss": 0.2694, "mean_token_accuracy": 0.9127974525094033, "num_tokens": 129335791.0, "step": 39540 }, { "entropy": 0.2859885597601533, "epoch": 0.6088139480737889, "grad_norm": 0.8332986235618591, "learning_rate": 1.936144933121605e-05, "loss": 0.2998, "mean_token_accuracy": 0.9097121901810169, "num_tokens": 129407154.0, "step": 39550 }, { "entropy": 0.2849348844960332, "epoch": 0.6089678833324674, "grad_norm": 0.6571502089500427, "learning_rate": 1.9360819403911833e-05, "loss": 0.2937, "mean_token_accuracy": 0.9135128296911716, "num_tokens": 129473009.0, "step": 39560 }, { "entropy": 0.27197138080373406, "epoch": 0.609121818591146, "grad_norm": 0.5444139242172241, "learning_rate": 1.9360189176308242e-05, "loss": 0.2791, "mean_token_accuracy": 0.9168578960001469, "num_tokens": 129539014.0, "step": 39570 }, { "entropy": 0.292810351587832, "epoch": 0.6092757538498246, "grad_norm": 0.5867481231689453, "learning_rate": 1.935955864842549e-05, "loss": 0.301, "mean_token_accuracy": 0.9099662363529205, "num_tokens": 129608312.0, "step": 39580 }, { "entropy": 0.2757065614685416, "epoch": 0.6094296891085031, "grad_norm": 0.524853527545929, "learning_rate": 1.9358927820283802e-05, "loss": 0.2903, "mean_token_accuracy": 0.9142273738980293, "num_tokens": 129673391.0, "step": 39590 }, { "entropy": 0.2868512145243585, "epoch": 0.6095836243671817, "grad_norm": 0.5398345589637756, "learning_rate": 1.935829669190342e-05, "loss": 0.2861, "mean_token_accuracy": 0.9112593226134778, "num_tokens": 129748120.0, "step": 39600 }, { "entropy": 0.28507137382403014, "epoch": 0.6097375596258604, "grad_norm": 0.6095168590545654, "learning_rate": 1.935766526330459e-05, "loss": 0.2839, "mean_token_accuracy": 0.9111604809761047, "num_tokens": 129823940.0, "step": 39610 }, { "entropy": 0.29473692141473296, "epoch": 0.609891494884539, "grad_norm": 0.6038127541542053, "learning_rate": 1.935703353450757e-05, "loss": 0.2957, "mean_token_accuracy": 0.9082280166447163, "num_tokens": 129894978.0, "step": 39620 }, { "entropy": 0.27725549871101973, "epoch": 0.6100454301432175, "grad_norm": 0.6091604828834534, "learning_rate": 1.9356401505532623e-05, "loss": 0.2871, "mean_token_accuracy": 0.9122544676065445, "num_tokens": 129954776.0, "step": 39630 }, { "entropy": 0.2730435681529343, "epoch": 0.6101993654018961, "grad_norm": 0.6963739395141602, "learning_rate": 1.9355769176400026e-05, "loss": 0.2729, "mean_token_accuracy": 0.9154108978807927, "num_tokens": 130016656.0, "step": 39640 }, { "entropy": 0.2718174153007567, "epoch": 0.6103533006605747, "grad_norm": 0.7391349077224731, "learning_rate": 1.9355136547130065e-05, "loss": 0.2736, "mean_token_accuracy": 0.91557776927948, "num_tokens": 130077343.0, "step": 39650 }, { "entropy": 0.2895274739712477, "epoch": 0.6105072359192533, "grad_norm": 0.6479403376579285, "learning_rate": 1.9354503617743033e-05, "loss": 0.2932, "mean_token_accuracy": 0.9118661791086197, "num_tokens": 130135192.0, "step": 39660 }, { "entropy": 0.29422583123669027, "epoch": 0.6106611711779318, "grad_norm": 0.5661783218383789, "learning_rate": 1.9353870388259238e-05, "loss": 0.2946, "mean_token_accuracy": 0.909159454703331, "num_tokens": 130208906.0, "step": 39670 }, { "entropy": 0.27542929369956254, "epoch": 0.6108151064366104, "grad_norm": 0.5790853500366211, "learning_rate": 1.9353236858698995e-05, "loss": 0.278, "mean_token_accuracy": 0.9142395660281182, "num_tokens": 130271990.0, "step": 39680 }, { "entropy": 0.26054770657792686, "epoch": 0.610969041695289, "grad_norm": 0.5899028778076172, "learning_rate": 1.9352603029082622e-05, "loss": 0.2682, "mean_token_accuracy": 0.9169323235750199, "num_tokens": 130337739.0, "step": 39690 }, { "entropy": 0.27794360658153894, "epoch": 0.6111229769539676, "grad_norm": 0.5964627265930176, "learning_rate": 1.9351968899430456e-05, "loss": 0.2842, "mean_token_accuracy": 0.9138721339404583, "num_tokens": 130402964.0, "step": 39700 }, { "entropy": 0.2878259291872382, "epoch": 0.6112769122126461, "grad_norm": 0.7078076601028442, "learning_rate": 1.935133446976284e-05, "loss": 0.2908, "mean_token_accuracy": 0.9129759080708026, "num_tokens": 130462350.0, "step": 39710 }, { "entropy": 0.2885641956701875, "epoch": 0.6114308474713247, "grad_norm": 0.6684649586677551, "learning_rate": 1.935069974010013e-05, "loss": 0.294, "mean_token_accuracy": 0.9104833520948887, "num_tokens": 130524832.0, "step": 39720 }, { "entropy": 0.26442456990480423, "epoch": 0.6115847827300033, "grad_norm": 0.7681618928909302, "learning_rate": 1.9350064710462682e-05, "loss": 0.2788, "mean_token_accuracy": 0.9181375719606877, "num_tokens": 130586443.0, "step": 39730 }, { "entropy": 0.29077385636046527, "epoch": 0.611738717988682, "grad_norm": 0.6589037179946899, "learning_rate": 1.9349429380870873e-05, "loss": 0.2918, "mean_token_accuracy": 0.9084858037531376, "num_tokens": 130649821.0, "step": 39740 }, { "entropy": 0.28827540306374433, "epoch": 0.6118926532473605, "grad_norm": 0.7377372980117798, "learning_rate": 1.9348793751345086e-05, "loss": 0.2992, "mean_token_accuracy": 0.9111886665225029, "num_tokens": 130714462.0, "step": 39750 }, { "entropy": 0.2855952437035739, "epoch": 0.6120465885060391, "grad_norm": 0.7015329003334045, "learning_rate": 1.9348157821905703e-05, "loss": 0.2937, "mean_token_accuracy": 0.9127640493214131, "num_tokens": 130779862.0, "step": 39760 }, { "entropy": 0.276936352904886, "epoch": 0.6122005237647177, "grad_norm": 0.7752102017402649, "learning_rate": 1.9347521592573138e-05, "loss": 0.289, "mean_token_accuracy": 0.9156614065170288, "num_tokens": 130840393.0, "step": 39770 }, { "entropy": 0.2676732323132455, "epoch": 0.6123544590233962, "grad_norm": 0.8012641072273254, "learning_rate": 1.934688506336779e-05, "loss": 0.278, "mean_token_accuracy": 0.9177679724991321, "num_tokens": 130898417.0, "step": 39780 }, { "entropy": 0.28543085400015117, "epoch": 0.6125083942820748, "grad_norm": 1.0316020250320435, "learning_rate": 1.9346248234310085e-05, "loss": 0.2983, "mean_token_accuracy": 0.9119775794446469, "num_tokens": 130973443.0, "step": 39790 }, { "entropy": 0.2876907678321004, "epoch": 0.6126623295407534, "grad_norm": 0.7338789105415344, "learning_rate": 1.9345611105420453e-05, "loss": 0.2934, "mean_token_accuracy": 0.9119055233895779, "num_tokens": 131035473.0, "step": 39800 }, { "entropy": 0.3057675588876009, "epoch": 0.612816264799432, "grad_norm": 0.8106244802474976, "learning_rate": 1.934497367671933e-05, "loss": 0.3088, "mean_token_accuracy": 0.904668889194727, "num_tokens": 131095710.0, "step": 39810 }, { "entropy": 0.2905081405304372, "epoch": 0.6129702000581105, "grad_norm": 0.50346839427948, "learning_rate": 1.934433594822717e-05, "loss": 0.2795, "mean_token_accuracy": 0.9108511812984943, "num_tokens": 131163024.0, "step": 39820 }, { "entropy": 0.31353768249973657, "epoch": 0.6131241353167891, "grad_norm": 0.6514941453933716, "learning_rate": 1.9343697919964426e-05, "loss": 0.3097, "mean_token_accuracy": 0.9016607500612736, "num_tokens": 131229244.0, "step": 39830 }, { "entropy": 0.286209846008569, "epoch": 0.6132780705754677, "grad_norm": 0.8093122243881226, "learning_rate": 1.934305959195157e-05, "loss": 0.2889, "mean_token_accuracy": 0.9120633259415627, "num_tokens": 131293566.0, "step": 39840 }, { "entropy": 0.2974687110632658, "epoch": 0.6134320058341463, "grad_norm": 0.546248197555542, "learning_rate": 1.934242096420908e-05, "loss": 0.2955, "mean_token_accuracy": 0.9082489684224129, "num_tokens": 131360176.0, "step": 39850 }, { "entropy": 0.29126889957115054, "epoch": 0.6135859410928249, "grad_norm": 0.6575856804847717, "learning_rate": 1.934178203675744e-05, "loss": 0.2998, "mean_token_accuracy": 0.908876559138298, "num_tokens": 131426141.0, "step": 39860 }, { "entropy": 0.2735102507285774, "epoch": 0.6137398763515035, "grad_norm": 0.8354252576828003, "learning_rate": 1.934114280961715e-05, "loss": 0.2788, "mean_token_accuracy": 0.9146740771830082, "num_tokens": 131493293.0, "step": 39870 }, { "entropy": 0.26734127178788186, "epoch": 0.6138938116101821, "grad_norm": 0.5714274644851685, "learning_rate": 1.934050328280872e-05, "loss": 0.2627, "mean_token_accuracy": 0.915371859818697, "num_tokens": 131557429.0, "step": 39880 }, { "entropy": 0.2709987549111247, "epoch": 0.6140477468688607, "grad_norm": 0.6898828148841858, "learning_rate": 1.9339863456352658e-05, "loss": 0.2793, "mean_token_accuracy": 0.915360376983881, "num_tokens": 131623116.0, "step": 39890 }, { "entropy": 0.29920547595247626, "epoch": 0.6142016821275392, "grad_norm": 0.6070495247840881, "learning_rate": 1.9339223330269495e-05, "loss": 0.2972, "mean_token_accuracy": 0.9068963848054409, "num_tokens": 131689623.0, "step": 39900 }, { "entropy": 0.2955111279152334, "epoch": 0.6143556173862178, "grad_norm": 0.786957859992981, "learning_rate": 1.9338582904579768e-05, "loss": 0.2919, "mean_token_accuracy": 0.9103858828544616, "num_tokens": 131754458.0, "step": 39910 }, { "entropy": 0.28466829312965275, "epoch": 0.6145095526448964, "grad_norm": 0.5534308552742004, "learning_rate": 1.9337942179304015e-05, "loss": 0.286, "mean_token_accuracy": 0.9104105897247792, "num_tokens": 131814455.0, "step": 39920 }, { "entropy": 0.2935789915733039, "epoch": 0.614663487903575, "grad_norm": 0.7240470051765442, "learning_rate": 1.9337301154462802e-05, "loss": 0.2794, "mean_token_accuracy": 0.9089046157896519, "num_tokens": 131876620.0, "step": 39930 }, { "entropy": 0.27097635697573424, "epoch": 0.6148174231622535, "grad_norm": 0.522350549697876, "learning_rate": 1.933665983007668e-05, "loss": 0.2854, "mean_token_accuracy": 0.9145860478281975, "num_tokens": 131935353.0, "step": 39940 }, { "entropy": 0.2958191353827715, "epoch": 0.6149713584209321, "grad_norm": 0.5832711458206177, "learning_rate": 1.933601820616624e-05, "loss": 0.3075, "mean_token_accuracy": 0.9107205025851727, "num_tokens": 132004949.0, "step": 39950 }, { "entropy": 0.2870202537626028, "epoch": 0.6151252936796107, "grad_norm": 0.5844823718070984, "learning_rate": 1.9335376282752048e-05, "loss": 0.2924, "mean_token_accuracy": 0.911402078717947, "num_tokens": 132066431.0, "step": 39960 }, { "entropy": 0.3024038043804467, "epoch": 0.6152792289382892, "grad_norm": 0.629599928855896, "learning_rate": 1.933473405985471e-05, "loss": 0.3093, "mean_token_accuracy": 0.9073541291058064, "num_tokens": 132138649.0, "step": 39970 }, { "entropy": 0.2871047121472657, "epoch": 0.6154331641969678, "grad_norm": 0.682495653629303, "learning_rate": 1.933409153749482e-05, "loss": 0.2875, "mean_token_accuracy": 0.9119155086576939, "num_tokens": 132206866.0, "step": 39980 }, { "entropy": 0.29020096827298403, "epoch": 0.6155870994556465, "grad_norm": 0.6252926588058472, "learning_rate": 1.9333448715692996e-05, "loss": 0.3018, "mean_token_accuracy": 0.9100996106863022, "num_tokens": 132272412.0, "step": 39990 }, { "entropy": 0.2863960159011185, "epoch": 0.6157410347143251, "grad_norm": 0.5504569411277771, "learning_rate": 1.9332805594469856e-05, "loss": 0.2829, "mean_token_accuracy": 0.9119042448699475, "num_tokens": 132332494.0, "step": 40000 }, { "epoch": 0.6157410347143251, "eval_entropy": 0.2911869712514791, "eval_loss": 0.2852184772491455, "eval_mean_token_accuracy": 0.9116166967082718, "eval_num_tokens": 132332494.0, "eval_runtime": 7809.858, "eval_samples_per_second": 4.159, "eval_steps_per_second": 4.159, "step": 40000 }, { "entropy": 0.30046789860352874, "epoch": 0.6158949699730036, "grad_norm": 0.5784236788749695, "learning_rate": 1.9332162173846035e-05, "loss": 0.2948, "mean_token_accuracy": 0.9088501736521721, "num_tokens": 132403594.0, "step": 40010 }, { "entropy": 0.2715789289213717, "epoch": 0.6160489052316822, "grad_norm": 0.5929123163223267, "learning_rate": 1.9331518453842173e-05, "loss": 0.2777, "mean_token_accuracy": 0.9166614644229412, "num_tokens": 132468617.0, "step": 40020 }, { "entropy": 0.2731598294340074, "epoch": 0.6162028404903608, "grad_norm": 0.7709828615188599, "learning_rate": 1.933087443447892e-05, "loss": 0.2836, "mean_token_accuracy": 0.9144693933427334, "num_tokens": 132529052.0, "step": 40030 }, { "entropy": 0.2620743768289685, "epoch": 0.6163567757490394, "grad_norm": 0.5774185061454773, "learning_rate": 1.933023011577694e-05, "loss": 0.2733, "mean_token_accuracy": 0.916525711864233, "num_tokens": 132590970.0, "step": 40040 }, { "entropy": 0.2807546317577362, "epoch": 0.6165107110077179, "grad_norm": 0.6779410243034363, "learning_rate": 1.9329585497756897e-05, "loss": 0.2788, "mean_token_accuracy": 0.9135002464056015, "num_tokens": 132648229.0, "step": 40050 }, { "entropy": 0.2647203686647117, "epoch": 0.6166646462663965, "grad_norm": 0.842797040939331, "learning_rate": 1.9328940580439478e-05, "loss": 0.2659, "mean_token_accuracy": 0.9186688765883446, "num_tokens": 132716725.0, "step": 40060 }, { "entropy": 0.25807273807004094, "epoch": 0.6168185815250751, "grad_norm": 0.6862563490867615, "learning_rate": 1.9328295363845363e-05, "loss": 0.2766, "mean_token_accuracy": 0.9174627155065537, "num_tokens": 132785348.0, "step": 40070 }, { "entropy": 0.28748180912807586, "epoch": 0.6169725167837536, "grad_norm": 0.6476529240608215, "learning_rate": 1.932764984799526e-05, "loss": 0.294, "mean_token_accuracy": 0.9108391188085079, "num_tokens": 132853800.0, "step": 40080 }, { "entropy": 0.30771015528589485, "epoch": 0.6171264520424322, "grad_norm": 0.5029317736625671, "learning_rate": 1.9327004032909873e-05, "loss": 0.2888, "mean_token_accuracy": 0.905886510014534, "num_tokens": 132919141.0, "step": 40090 }, { "entropy": 0.2781276860274374, "epoch": 0.6172803873011108, "grad_norm": 0.4980087876319885, "learning_rate": 1.932635791860992e-05, "loss": 0.2823, "mean_token_accuracy": 0.9122856214642525, "num_tokens": 132982819.0, "step": 40100 }, { "entropy": 0.2824342601001263, "epoch": 0.6174343225597894, "grad_norm": 0.6258975267410278, "learning_rate": 1.9325711505116128e-05, "loss": 0.2962, "mean_token_accuracy": 0.9123968861997127, "num_tokens": 133052126.0, "step": 40110 }, { "entropy": 0.2623897647485137, "epoch": 0.617588257818468, "grad_norm": 0.5230681300163269, "learning_rate": 1.9325064792449237e-05, "loss": 0.2776, "mean_token_accuracy": 0.9182923816144466, "num_tokens": 133122695.0, "step": 40120 }, { "entropy": 0.28693404952064155, "epoch": 0.6177421930771466, "grad_norm": 0.5468260645866394, "learning_rate": 1.9324417780629993e-05, "loss": 0.3032, "mean_token_accuracy": 0.9113880105316639, "num_tokens": 133184948.0, "step": 40130 }, { "entropy": 0.2792577173560858, "epoch": 0.6178961283358252, "grad_norm": 0.6056135296821594, "learning_rate": 1.932377046967915e-05, "loss": 0.2846, "mean_token_accuracy": 0.9151111416518688, "num_tokens": 133248825.0, "step": 40140 }, { "entropy": 0.2923952630721033, "epoch": 0.6180500635945038, "grad_norm": 0.5425509214401245, "learning_rate": 1.9323122859617473e-05, "loss": 0.3045, "mean_token_accuracy": 0.9108342103660106, "num_tokens": 133314139.0, "step": 40150 }, { "entropy": 0.26659211041405795, "epoch": 0.6182039988531823, "grad_norm": 0.615999698638916, "learning_rate": 1.9322474950465745e-05, "loss": 0.2753, "mean_token_accuracy": 0.9172979459166527, "num_tokens": 133373059.0, "step": 40160 }, { "entropy": 0.2588866732083261, "epoch": 0.6183579341118609, "grad_norm": 0.7300843000411987, "learning_rate": 1.9321826742244746e-05, "loss": 0.2602, "mean_token_accuracy": 0.9188394755125046, "num_tokens": 133433023.0, "step": 40170 }, { "entropy": 0.3076372142881155, "epoch": 0.6185118693705395, "grad_norm": 0.6430777311325073, "learning_rate": 1.932117823497527e-05, "loss": 0.3081, "mean_token_accuracy": 0.904069472849369, "num_tokens": 133500359.0, "step": 40180 }, { "entropy": 0.28192567108199, "epoch": 0.618665804629218, "grad_norm": 0.46186432242393494, "learning_rate": 1.9320529428678125e-05, "loss": 0.2741, "mean_token_accuracy": 0.9118500195443631, "num_tokens": 133568047.0, "step": 40190 }, { "entropy": 0.2759274092502892, "epoch": 0.6188197398878966, "grad_norm": 0.6300408840179443, "learning_rate": 1.931988032337412e-05, "loss": 0.2754, "mean_token_accuracy": 0.9186961412429809, "num_tokens": 133632493.0, "step": 40200 }, { "entropy": 0.3129672700539231, "epoch": 0.6189736751465752, "grad_norm": 0.7460319399833679, "learning_rate": 1.9319230919084082e-05, "loss": 0.309, "mean_token_accuracy": 0.9033969081938267, "num_tokens": 133695276.0, "step": 40210 }, { "entropy": 0.28378785997629163, "epoch": 0.6191276104052538, "grad_norm": 0.7594212889671326, "learning_rate": 1.9318581215828843e-05, "loss": 0.2979, "mean_token_accuracy": 0.9122375950217247, "num_tokens": 133760011.0, "step": 40220 }, { "entropy": 0.28092665784060955, "epoch": 0.6192815456639323, "grad_norm": 0.6379871368408203, "learning_rate": 1.931793121362925e-05, "loss": 0.2976, "mean_token_accuracy": 0.9149101063609123, "num_tokens": 133826343.0, "step": 40230 }, { "entropy": 0.29480644334107636, "epoch": 0.619435480922611, "grad_norm": 0.5900607109069824, "learning_rate": 1.931728091250615e-05, "loss": 0.3008, "mean_token_accuracy": 0.9081575736403465, "num_tokens": 133893452.0, "step": 40240 }, { "entropy": 0.3114850215613842, "epoch": 0.6195894161812896, "grad_norm": 0.548327624797821, "learning_rate": 1.9316630312480405e-05, "loss": 0.3044, "mean_token_accuracy": 0.9032936967909336, "num_tokens": 133963314.0, "step": 40250 }, { "entropy": 0.27476769210770724, "epoch": 0.6197433514399682, "grad_norm": 0.5937341451644897, "learning_rate": 1.9315979413572888e-05, "loss": 0.274, "mean_token_accuracy": 0.9160853646695614, "num_tokens": 134023345.0, "step": 40260 }, { "entropy": 0.2893003994598985, "epoch": 0.6198972866986467, "grad_norm": 0.6108443140983582, "learning_rate": 1.931532821580448e-05, "loss": 0.2863, "mean_token_accuracy": 0.9094359025359153, "num_tokens": 134095474.0, "step": 40270 }, { "entropy": 0.2803208563476801, "epoch": 0.6200512219573253, "grad_norm": 0.5432729125022888, "learning_rate": 1.9314676719196076e-05, "loss": 0.2758, "mean_token_accuracy": 0.9138463981449604, "num_tokens": 134166404.0, "step": 40280 }, { "entropy": 0.2794767655432224, "epoch": 0.6202051572160039, "grad_norm": 0.6432532072067261, "learning_rate": 1.931402492376857e-05, "loss": 0.2902, "mean_token_accuracy": 0.9121351785957813, "num_tokens": 134234045.0, "step": 40290 }, { "entropy": 0.28973386632278564, "epoch": 0.6203590924746825, "grad_norm": 0.5491120219230652, "learning_rate": 1.9313372829542873e-05, "loss": 0.2949, "mean_token_accuracy": 0.9071142889559269, "num_tokens": 134300396.0, "step": 40300 }, { "entropy": 0.2845471129752696, "epoch": 0.620513027733361, "grad_norm": 0.5294365286827087, "learning_rate": 1.931272043653991e-05, "loss": 0.2822, "mean_token_accuracy": 0.9136860243976116, "num_tokens": 134371828.0, "step": 40310 }, { "entropy": 0.25965975574217737, "epoch": 0.6206669629920396, "grad_norm": 0.551866352558136, "learning_rate": 1.93120677447806e-05, "loss": 0.2624, "mean_token_accuracy": 0.9202211849391461, "num_tokens": 134435997.0, "step": 40320 }, { "entropy": 0.26380246412009, "epoch": 0.6208208982507182, "grad_norm": 0.7342532277107239, "learning_rate": 1.9311414754285893e-05, "loss": 0.2643, "mean_token_accuracy": 0.9182764738798141, "num_tokens": 134496006.0, "step": 40330 }, { "entropy": 0.28165199449285866, "epoch": 0.6209748335093968, "grad_norm": 0.7617347836494446, "learning_rate": 1.931076146507673e-05, "loss": 0.2791, "mean_token_accuracy": 0.9136710703372956, "num_tokens": 134565312.0, "step": 40340 }, { "entropy": 0.29239860028028486, "epoch": 0.6211287687680753, "grad_norm": 0.6188693046569824, "learning_rate": 1.931010787717407e-05, "loss": 0.2881, "mean_token_accuracy": 0.9093491017818451, "num_tokens": 134626723.0, "step": 40350 }, { "entropy": 0.2758285226300359, "epoch": 0.6212827040267539, "grad_norm": 0.8372313380241394, "learning_rate": 1.930945399059888e-05, "loss": 0.2793, "mean_token_accuracy": 0.9140428066253662, "num_tokens": 134688420.0, "step": 40360 }, { "entropy": 0.3076136106625199, "epoch": 0.6214366392854326, "grad_norm": 0.6253185272216797, "learning_rate": 1.9308799805372138e-05, "loss": 0.3106, "mean_token_accuracy": 0.9043849267065525, "num_tokens": 134751983.0, "step": 40370 }, { "entropy": 0.2770120401866734, "epoch": 0.6215905745441112, "grad_norm": 0.6016418933868408, "learning_rate": 1.9308145321514834e-05, "loss": 0.2895, "mean_token_accuracy": 0.9135588280856609, "num_tokens": 134824111.0, "step": 40380 }, { "entropy": 0.31207400457933543, "epoch": 0.6217445098027897, "grad_norm": 0.6142170429229736, "learning_rate": 1.9307490539047956e-05, "loss": 0.3028, "mean_token_accuracy": 0.9050642572343349, "num_tokens": 134891168.0, "step": 40390 }, { "entropy": 0.2829204543493688, "epoch": 0.6218984450614683, "grad_norm": 0.7876229286193848, "learning_rate": 1.9306835457992516e-05, "loss": 0.2804, "mean_token_accuracy": 0.9121823586523533, "num_tokens": 134953730.0, "step": 40400 }, { "entropy": 0.2756702000275254, "epoch": 0.6220523803201469, "grad_norm": 0.571122944355011, "learning_rate": 1.9306180078369528e-05, "loss": 0.2863, "mean_token_accuracy": 0.9156790971755981, "num_tokens": 135017928.0, "step": 40410 }, { "entropy": 0.2840680481866002, "epoch": 0.6222063155788254, "grad_norm": 0.6876442432403564, "learning_rate": 1.9305524400200017e-05, "loss": 0.2815, "mean_token_accuracy": 0.9108219653367996, "num_tokens": 135083442.0, "step": 40420 }, { "entropy": 0.2639019095338881, "epoch": 0.622360250837504, "grad_norm": 0.515900731086731, "learning_rate": 1.9304868423505016e-05, "loss": 0.2621, "mean_token_accuracy": 0.9199608959257602, "num_tokens": 135150817.0, "step": 40430 }, { "entropy": 0.2987519778311253, "epoch": 0.6225141860961826, "grad_norm": 0.6663203835487366, "learning_rate": 1.9304212148305572e-05, "loss": 0.3054, "mean_token_accuracy": 0.9079473622143268, "num_tokens": 135209359.0, "step": 40440 }, { "entropy": 0.2775415325537324, "epoch": 0.6226681213548612, "grad_norm": 0.5676693320274353, "learning_rate": 1.9303555574622733e-05, "loss": 0.2856, "mean_token_accuracy": 0.9148469924926758, "num_tokens": 135282734.0, "step": 40450 }, { "entropy": 0.28362281490117314, "epoch": 0.6228220566135397, "grad_norm": 0.619151771068573, "learning_rate": 1.930289870247757e-05, "loss": 0.2857, "mean_token_accuracy": 0.914788980036974, "num_tokens": 135348956.0, "step": 40460 }, { "entropy": 0.27484915340319277, "epoch": 0.6229759918722183, "grad_norm": 0.6427295804023743, "learning_rate": 1.930224153189115e-05, "loss": 0.2806, "mean_token_accuracy": 0.9137948684394359, "num_tokens": 135411470.0, "step": 40470 }, { "entropy": 0.28297987999394536, "epoch": 0.6231299271308969, "grad_norm": 0.5342233777046204, "learning_rate": 1.9301584062884554e-05, "loss": 0.2778, "mean_token_accuracy": 0.9130141973495484, "num_tokens": 135476902.0, "step": 40480 }, { "entropy": 0.3120063048787415, "epoch": 0.6232838623895754, "grad_norm": 0.6318883895874023, "learning_rate": 1.9300926295478883e-05, "loss": 0.3186, "mean_token_accuracy": 0.9039338596165181, "num_tokens": 135540034.0, "step": 40490 }, { "entropy": 0.3005062703974545, "epoch": 0.6234377976482541, "grad_norm": 0.48818087577819824, "learning_rate": 1.9300268229695228e-05, "loss": 0.3038, "mean_token_accuracy": 0.9073501780629158, "num_tokens": 135619129.0, "step": 40500 }, { "entropy": 0.2937313444912434, "epoch": 0.6235917329069327, "grad_norm": 0.7424091696739197, "learning_rate": 1.9299609865554704e-05, "loss": 0.295, "mean_token_accuracy": 0.9093649201095104, "num_tokens": 135689257.0, "step": 40510 }, { "entropy": 0.29149589212611315, "epoch": 0.6237456681656113, "grad_norm": 0.6629044413566589, "learning_rate": 1.9298951203078435e-05, "loss": 0.2976, "mean_token_accuracy": 0.9072472512722015, "num_tokens": 135762989.0, "step": 40520 }, { "entropy": 0.287790465913713, "epoch": 0.6238996034242899, "grad_norm": 0.7512009739875793, "learning_rate": 1.9298292242287546e-05, "loss": 0.2931, "mean_token_accuracy": 0.9115586511790752, "num_tokens": 135826002.0, "step": 40530 }, { "entropy": 0.2793130049481988, "epoch": 0.6240535386829684, "grad_norm": 0.6569163203239441, "learning_rate": 1.9297632983203177e-05, "loss": 0.2805, "mean_token_accuracy": 0.9148730210959911, "num_tokens": 135890285.0, "step": 40540 }, { "entropy": 0.27865413641557096, "epoch": 0.624207473941647, "grad_norm": 0.763248860836029, "learning_rate": 1.9296973425846483e-05, "loss": 0.2901, "mean_token_accuracy": 0.9098876506090164, "num_tokens": 135953280.0, "step": 40550 }, { "entropy": 0.26341277938336133, "epoch": 0.6243614092003256, "grad_norm": 0.7043796181678772, "learning_rate": 1.929631357023862e-05, "loss": 0.2878, "mean_token_accuracy": 0.9177167005836964, "num_tokens": 136017595.0, "step": 40560 }, { "entropy": 0.3014309453777969, "epoch": 0.6245153444590041, "grad_norm": 0.5618266463279724, "learning_rate": 1.9295653416400753e-05, "loss": 0.2971, "mean_token_accuracy": 0.9095845766365528, "num_tokens": 136081375.0, "step": 40570 }, { "entropy": 0.27599600832909343, "epoch": 0.6246692797176827, "grad_norm": 0.712902307510376, "learning_rate": 1.9294992964354063e-05, "loss": 0.2778, "mean_token_accuracy": 0.9117116875946522, "num_tokens": 136145596.0, "step": 40580 }, { "entropy": 0.28135765278711916, "epoch": 0.6248232149763613, "grad_norm": 0.5238603353500366, "learning_rate": 1.9294332214119738e-05, "loss": 0.2877, "mean_token_accuracy": 0.9141554020345211, "num_tokens": 136211532.0, "step": 40590 }, { "entropy": 0.274145526625216, "epoch": 0.6249771502350399, "grad_norm": 0.6207101941108704, "learning_rate": 1.9293671165718973e-05, "loss": 0.2733, "mean_token_accuracy": 0.9151353754103184, "num_tokens": 136287115.0, "step": 40600 }, { "entropy": 0.28374044634401796, "epoch": 0.6251310854937184, "grad_norm": 0.6980747580528259, "learning_rate": 1.9293009819172978e-05, "loss": 0.2847, "mean_token_accuracy": 0.9134989604353905, "num_tokens": 136352176.0, "step": 40610 }, { "entropy": 0.27222417630255225, "epoch": 0.6252850207523971, "grad_norm": 0.6921433806419373, "learning_rate": 1.9292348174502966e-05, "loss": 0.2815, "mean_token_accuracy": 0.9140041269361973, "num_tokens": 136420486.0, "step": 40620 }, { "entropy": 0.29705912712961435, "epoch": 0.6254389560110757, "grad_norm": 0.5414215922355652, "learning_rate": 1.9291686231730164e-05, "loss": 0.3018, "mean_token_accuracy": 0.9077325679361821, "num_tokens": 136488865.0, "step": 40630 }, { "entropy": 0.28020985005423427, "epoch": 0.6255928912697543, "grad_norm": 0.5321645736694336, "learning_rate": 1.9291023990875806e-05, "loss": 0.2811, "mean_token_accuracy": 0.9147709690034389, "num_tokens": 136559224.0, "step": 40640 }, { "entropy": 0.3012808125466108, "epoch": 0.6257468265284328, "grad_norm": 0.8107784986495972, "learning_rate": 1.9290361451961145e-05, "loss": 0.3053, "mean_token_accuracy": 0.9066559135913849, "num_tokens": 136617826.0, "step": 40650 }, { "entropy": 0.2993659595027566, "epoch": 0.6259007617871114, "grad_norm": 0.7290443778038025, "learning_rate": 1.9289698615007425e-05, "loss": 0.2987, "mean_token_accuracy": 0.9093154057860374, "num_tokens": 136678937.0, "step": 40660 }, { "entropy": 0.28769472548738123, "epoch": 0.62605469704579, "grad_norm": 0.7141662836074829, "learning_rate": 1.9289035480035916e-05, "loss": 0.2892, "mean_token_accuracy": 0.909380491822958, "num_tokens": 136744474.0, "step": 40670 }, { "entropy": 0.2862282557412982, "epoch": 0.6262086323044685, "grad_norm": 0.6347970366477966, "learning_rate": 1.928837204706789e-05, "loss": 0.2955, "mean_token_accuracy": 0.9101095914840698, "num_tokens": 136809988.0, "step": 40680 }, { "entropy": 0.29776236405596135, "epoch": 0.6263625675631471, "grad_norm": 0.5387595295906067, "learning_rate": 1.928770831612463e-05, "loss": 0.2999, "mean_token_accuracy": 0.9086283907294274, "num_tokens": 136884135.0, "step": 40690 }, { "entropy": 0.2915337197482586, "epoch": 0.6265165028218257, "grad_norm": 0.544022262096405, "learning_rate": 1.9287044287227426e-05, "loss": 0.2936, "mean_token_accuracy": 0.9089840427041054, "num_tokens": 136952434.0, "step": 40700 }, { "entropy": 0.285848274640739, "epoch": 0.6266704380805043, "grad_norm": 0.6931750178337097, "learning_rate": 1.928637996039759e-05, "loss": 0.2915, "mean_token_accuracy": 0.9130677364766597, "num_tokens": 137006876.0, "step": 40710 }, { "entropy": 0.27849708665162326, "epoch": 0.6268243733391828, "grad_norm": 0.5593206286430359, "learning_rate": 1.9285715335656424e-05, "loss": 0.2833, "mean_token_accuracy": 0.914091981202364, "num_tokens": 137070071.0, "step": 40720 }, { "entropy": 0.2818996824324131, "epoch": 0.6269783085978614, "grad_norm": 0.49667754769325256, "learning_rate": 1.928505041302525e-05, "loss": 0.2816, "mean_token_accuracy": 0.9115407593548298, "num_tokens": 137142484.0, "step": 40730 }, { "entropy": 0.28614278407767413, "epoch": 0.62713224385654, "grad_norm": 0.6127164363861084, "learning_rate": 1.9284385192525405e-05, "loss": 0.2951, "mean_token_accuracy": 0.9104786485433578, "num_tokens": 137205869.0, "step": 40740 }, { "entropy": 0.27954102149233223, "epoch": 0.6272861791152187, "grad_norm": 0.6995141506195068, "learning_rate": 1.9283719674178227e-05, "loss": 0.2851, "mean_token_accuracy": 0.913648221641779, "num_tokens": 137272622.0, "step": 40750 }, { "entropy": 0.29507623594254256, "epoch": 0.6274401143738972, "grad_norm": 0.6275798678398132, "learning_rate": 1.9283053858005066e-05, "loss": 0.3027, "mean_token_accuracy": 0.9086962386965751, "num_tokens": 137344393.0, "step": 40760 }, { "entropy": 0.278873036429286, "epoch": 0.6275940496325758, "grad_norm": 0.6665669679641724, "learning_rate": 1.9282387744027278e-05, "loss": 0.2988, "mean_token_accuracy": 0.9145800389349461, "num_tokens": 137412113.0, "step": 40770 }, { "entropy": 0.2860538702458143, "epoch": 0.6277479848912544, "grad_norm": 0.5589609146118164, "learning_rate": 1.928172133226624e-05, "loss": 0.2999, "mean_token_accuracy": 0.9089405566453934, "num_tokens": 137480804.0, "step": 40780 }, { "entropy": 0.30322738327085974, "epoch": 0.627901920149933, "grad_norm": 0.7458589673042297, "learning_rate": 1.9281054622743324e-05, "loss": 0.3144, "mean_token_accuracy": 0.9069742262363434, "num_tokens": 137545062.0, "step": 40790 }, { "entropy": 0.2858154675923288, "epoch": 0.6280558554086115, "grad_norm": 0.5829207897186279, "learning_rate": 1.928038761547992e-05, "loss": 0.2882, "mean_token_accuracy": 0.9116419091820717, "num_tokens": 137612672.0, "step": 40800 }, { "entropy": 0.2768090701662004, "epoch": 0.6282097906672901, "grad_norm": 0.6600169539451599, "learning_rate": 1.9279720310497426e-05, "loss": 0.2813, "mean_token_accuracy": 0.9149488434195518, "num_tokens": 137683207.0, "step": 40810 }, { "entropy": 0.2735976932570338, "epoch": 0.6283637259259687, "grad_norm": 0.7543452382087708, "learning_rate": 1.9279052707817253e-05, "loss": 0.2818, "mean_token_accuracy": 0.9136270433664322, "num_tokens": 137745679.0, "step": 40820 }, { "entropy": 0.27541688941419123, "epoch": 0.6285176611846472, "grad_norm": 0.5629929900169373, "learning_rate": 1.9278384807460813e-05, "loss": 0.2862, "mean_token_accuracy": 0.9131689615547657, "num_tokens": 137824521.0, "step": 40830 }, { "entropy": 0.295237819198519, "epoch": 0.6286715964433258, "grad_norm": 0.6124274730682373, "learning_rate": 1.9277716609449534e-05, "loss": 0.287, "mean_token_accuracy": 0.9111767143011094, "num_tokens": 137882897.0, "step": 40840 }, { "entropy": 0.2898249187506735, "epoch": 0.6288255317020044, "grad_norm": 0.5153648257255554, "learning_rate": 1.9277048113804856e-05, "loss": 0.2863, "mean_token_accuracy": 0.9072437904775142, "num_tokens": 137953982.0, "step": 40850 }, { "entropy": 0.2981351401656866, "epoch": 0.628979466960683, "grad_norm": 0.6627551317214966, "learning_rate": 1.9276379320548218e-05, "loss": 0.296, "mean_token_accuracy": 0.9082454234361649, "num_tokens": 138021105.0, "step": 40860 }, { "entropy": 0.29496375024318694, "epoch": 0.6291334022193615, "grad_norm": 0.621447741985321, "learning_rate": 1.9275710229701078e-05, "loss": 0.2938, "mean_token_accuracy": 0.9102335222065449, "num_tokens": 138095011.0, "step": 40870 }, { "entropy": 0.29800152527168394, "epoch": 0.6292873374780402, "grad_norm": 0.7079433798789978, "learning_rate": 1.9275040841284904e-05, "loss": 0.3147, "mean_token_accuracy": 0.9079749122262001, "num_tokens": 138157711.0, "step": 40880 }, { "entropy": 0.2962869394570589, "epoch": 0.6294412727367188, "grad_norm": 0.561737596988678, "learning_rate": 1.9274371155321167e-05, "loss": 0.3058, "mean_token_accuracy": 0.9091135956346988, "num_tokens": 138225065.0, "step": 40890 }, { "entropy": 0.28349256981164217, "epoch": 0.6295952079953974, "grad_norm": 0.5481238961219788, "learning_rate": 1.927370117183135e-05, "loss": 0.2819, "mean_token_accuracy": 0.9096583239734173, "num_tokens": 138286081.0, "step": 40900 }, { "entropy": 0.27946242559701207, "epoch": 0.6297491432540759, "grad_norm": 0.6812970638275146, "learning_rate": 1.9273030890836946e-05, "loss": 0.2798, "mean_token_accuracy": 0.912185026705265, "num_tokens": 138350860.0, "step": 40910 }, { "entropy": 0.27642950797453525, "epoch": 0.6299030785127545, "grad_norm": 0.6474122405052185, "learning_rate": 1.9272360312359465e-05, "loss": 0.2797, "mean_token_accuracy": 0.9143131509423256, "num_tokens": 138422313.0, "step": 40920 }, { "entropy": 0.26813012091442945, "epoch": 0.6300570137714331, "grad_norm": 0.4515659511089325, "learning_rate": 1.927168943642041e-05, "loss": 0.2776, "mean_token_accuracy": 0.9153808325529098, "num_tokens": 138485107.0, "step": 40930 }, { "entropy": 0.296719737444073, "epoch": 0.6302109490301117, "grad_norm": 0.8489612340927124, "learning_rate": 1.927101826304131e-05, "loss": 0.3087, "mean_token_accuracy": 0.9073884941637516, "num_tokens": 138559657.0, "step": 40940 }, { "entropy": 0.2939278126694262, "epoch": 0.6303648842887902, "grad_norm": 0.47043144702911377, "learning_rate": 1.927034679224369e-05, "loss": 0.2928, "mean_token_accuracy": 0.9105780221521854, "num_tokens": 138631087.0, "step": 40950 }, { "entropy": 0.2704919165931642, "epoch": 0.6305188195474688, "grad_norm": 0.9047435522079468, "learning_rate": 1.9269675024049096e-05, "loss": 0.2759, "mean_token_accuracy": 0.9171354532241821, "num_tokens": 138688729.0, "step": 40960 }, { "entropy": 0.27631860645487905, "epoch": 0.6306727548061474, "grad_norm": 0.7164669036865234, "learning_rate": 1.9269002958479078e-05, "loss": 0.2928, "mean_token_accuracy": 0.9118654139339923, "num_tokens": 138758263.0, "step": 40970 }, { "entropy": 0.25870846761390565, "epoch": 0.630826690064826, "grad_norm": 0.5561733245849609, "learning_rate": 1.92683305955552e-05, "loss": 0.2653, "mean_token_accuracy": 0.9199291571974755, "num_tokens": 138824002.0, "step": 40980 }, { "entropy": 0.2988218491896987, "epoch": 0.6309806253235045, "grad_norm": 0.8101004362106323, "learning_rate": 1.926765793529902e-05, "loss": 0.3126, "mean_token_accuracy": 0.9077602989971638, "num_tokens": 138893540.0, "step": 40990 }, { "entropy": 0.28572304733097553, "epoch": 0.6311345605821832, "grad_norm": 0.6429336667060852, "learning_rate": 1.926698497773213e-05, "loss": 0.2851, "mean_token_accuracy": 0.9130571544170379, "num_tokens": 138967897.0, "step": 41000 }, { "entropy": 0.2696552780456841, "epoch": 0.6312884958408618, "grad_norm": 0.5374010801315308, "learning_rate": 1.926631172287611e-05, "loss": 0.2691, "mean_token_accuracy": 0.9165196016430854, "num_tokens": 139039997.0, "step": 41010 }, { "entropy": 0.28081821743398905, "epoch": 0.6314424310995403, "grad_norm": 0.5854422450065613, "learning_rate": 1.9265638170752563e-05, "loss": 0.2795, "mean_token_accuracy": 0.912059522420168, "num_tokens": 139107410.0, "step": 41020 }, { "entropy": 0.27263215128332374, "epoch": 0.6315963663582189, "grad_norm": 0.6655133962631226, "learning_rate": 1.9264964321383095e-05, "loss": 0.2779, "mean_token_accuracy": 0.9140349358320237, "num_tokens": 139171787.0, "step": 41030 }, { "entropy": 0.2641391203738749, "epoch": 0.6317503016168975, "grad_norm": 0.47252118587493896, "learning_rate": 1.9264290174789325e-05, "loss": 0.277, "mean_token_accuracy": 0.9185385517776012, "num_tokens": 139236690.0, "step": 41040 }, { "entropy": 0.29199625421315434, "epoch": 0.6319042368755761, "grad_norm": 0.7308881282806396, "learning_rate": 1.9263615730992877e-05, "loss": 0.3004, "mean_token_accuracy": 0.9108171842992305, "num_tokens": 139307464.0, "step": 41050 }, { "entropy": 0.30950444331392646, "epoch": 0.6320581721342546, "grad_norm": 0.6772857904434204, "learning_rate": 1.926294099001539e-05, "loss": 0.3102, "mean_token_accuracy": 0.9042258657515049, "num_tokens": 139380086.0, "step": 41060 }, { "entropy": 0.277648591902107, "epoch": 0.6322121073929332, "grad_norm": 0.556607186794281, "learning_rate": 1.926226595187851e-05, "loss": 0.284, "mean_token_accuracy": 0.9138243667781353, "num_tokens": 139444325.0, "step": 41070 }, { "entropy": 0.26944533148780464, "epoch": 0.6323660426516118, "grad_norm": 0.6092591881752014, "learning_rate": 1.9261590616603893e-05, "loss": 0.2849, "mean_token_accuracy": 0.9148857958614827, "num_tokens": 139504181.0, "step": 41080 }, { "entropy": 0.2931858662515879, "epoch": 0.6325199779102904, "grad_norm": 0.6701981425285339, "learning_rate": 1.9260914984213205e-05, "loss": 0.2772, "mean_token_accuracy": 0.9108386851847172, "num_tokens": 139566519.0, "step": 41090 }, { "entropy": 0.27906403662636875, "epoch": 0.6326739131689689, "grad_norm": 0.8222400546073914, "learning_rate": 1.9260239054728112e-05, "loss": 0.2833, "mean_token_accuracy": 0.9126859463751316, "num_tokens": 139626989.0, "step": 41100 }, { "entropy": 0.2950974987819791, "epoch": 0.6328278484276475, "grad_norm": 0.6747414469718933, "learning_rate": 1.925956282817031e-05, "loss": 0.2939, "mean_token_accuracy": 0.909475139528513, "num_tokens": 139692617.0, "step": 41110 }, { "entropy": 0.29046980338171124, "epoch": 0.6329817836863261, "grad_norm": 0.6146323084831238, "learning_rate": 1.9258886304561485e-05, "loss": 0.2909, "mean_token_accuracy": 0.9091234877705574, "num_tokens": 139756336.0, "step": 41120 }, { "entropy": 0.28731636293232443, "epoch": 0.6331357189450048, "grad_norm": 0.5548815727233887, "learning_rate": 1.925820948392334e-05, "loss": 0.3, "mean_token_accuracy": 0.912565091252327, "num_tokens": 139826952.0, "step": 41130 }, { "entropy": 0.28234331756830217, "epoch": 0.6332896542036833, "grad_norm": 0.6920479536056519, "learning_rate": 1.9257532366277593e-05, "loss": 0.2921, "mean_token_accuracy": 0.913581719994545, "num_tokens": 139896662.0, "step": 41140 }, { "entropy": 0.2774469285272062, "epoch": 0.6334435894623619, "grad_norm": 0.5371641516685486, "learning_rate": 1.9256854951645964e-05, "loss": 0.2815, "mean_token_accuracy": 0.9152339465916157, "num_tokens": 139966586.0, "step": 41150 }, { "entropy": 0.2910074068233371, "epoch": 0.6335975247210405, "grad_norm": 0.618833065032959, "learning_rate": 1.925617724005018e-05, "loss": 0.3002, "mean_token_accuracy": 0.9101914294064045, "num_tokens": 140044011.0, "step": 41160 }, { "entropy": 0.28683745888993145, "epoch": 0.633751459979719, "grad_norm": 0.8012226819992065, "learning_rate": 1.9255499231511993e-05, "loss": 0.2884, "mean_token_accuracy": 0.9105373024940491, "num_tokens": 140100389.0, "step": 41170 }, { "entropy": 0.2955420380458236, "epoch": 0.6339053952383976, "grad_norm": 0.6045354008674622, "learning_rate": 1.9254820926053144e-05, "loss": 0.2866, "mean_token_accuracy": 0.9077781200408935, "num_tokens": 140165217.0, "step": 41180 }, { "entropy": 0.2757881938479841, "epoch": 0.6340593304970762, "grad_norm": 0.7983649969100952, "learning_rate": 1.9254142323695397e-05, "loss": 0.2801, "mean_token_accuracy": 0.9150994002819062, "num_tokens": 140228099.0, "step": 41190 }, { "entropy": 0.28252982115373015, "epoch": 0.6342132657557548, "grad_norm": 0.6922416090965271, "learning_rate": 1.9253463424460522e-05, "loss": 0.2854, "mean_token_accuracy": 0.9140629172325134, "num_tokens": 140292021.0, "step": 41200 }, { "entropy": 0.28407245948910714, "epoch": 0.6343672010144333, "grad_norm": 0.6093047261238098, "learning_rate": 1.92527842283703e-05, "loss": 0.2928, "mean_token_accuracy": 0.9118080049753189, "num_tokens": 140347108.0, "step": 41210 }, { "entropy": 0.26822895454242823, "epoch": 0.6345211362731119, "grad_norm": 0.625362753868103, "learning_rate": 1.9252104735446517e-05, "loss": 0.2729, "mean_token_accuracy": 0.916461818665266, "num_tokens": 140406028.0, "step": 41220 }, { "entropy": 0.26810893397778274, "epoch": 0.6346750715317905, "grad_norm": 0.6041412949562073, "learning_rate": 1.9251424945710967e-05, "loss": 0.2773, "mean_token_accuracy": 0.9167957186698914, "num_tokens": 140474548.0, "step": 41230 }, { "entropy": 0.2772668644785881, "epoch": 0.634829006790469, "grad_norm": 0.685687780380249, "learning_rate": 1.925074485918547e-05, "loss": 0.2805, "mean_token_accuracy": 0.914788157492876, "num_tokens": 140541360.0, "step": 41240 }, { "entropy": 0.2870303335599601, "epoch": 0.6349829420491476, "grad_norm": 0.5767706632614136, "learning_rate": 1.9250064475891833e-05, "loss": 0.2841, "mean_token_accuracy": 0.9119962237775325, "num_tokens": 140598816.0, "step": 41250 }, { "entropy": 0.2732712178491056, "epoch": 0.6351368773078263, "grad_norm": 0.5513277053833008, "learning_rate": 1.924938379585189e-05, "loss": 0.2769, "mean_token_accuracy": 0.9159359112381935, "num_tokens": 140666908.0, "step": 41260 }, { "entropy": 0.2671667380258441, "epoch": 0.6352908125665049, "grad_norm": 0.6355390548706055, "learning_rate": 1.924870281908747e-05, "loss": 0.276, "mean_token_accuracy": 0.9169454164803028, "num_tokens": 140733875.0, "step": 41270 }, { "entropy": 0.2595922944135964, "epoch": 0.6354447478251835, "grad_norm": 0.7443574070930481, "learning_rate": 1.9248021545620427e-05, "loss": 0.2683, "mean_token_accuracy": 0.9180263951420784, "num_tokens": 140799500.0, "step": 41280 }, { "entropy": 0.2693312741816044, "epoch": 0.635598683083862, "grad_norm": 0.654975175857544, "learning_rate": 1.924733997547261e-05, "loss": 0.276, "mean_token_accuracy": 0.9173731960356235, "num_tokens": 140867230.0, "step": 41290 }, { "entropy": 0.29173525534570216, "epoch": 0.6357526183425406, "grad_norm": 0.5712435245513916, "learning_rate": 1.924665810866589e-05, "loss": 0.288, "mean_token_accuracy": 0.9073385253548623, "num_tokens": 140935494.0, "step": 41300 }, { "entropy": 0.2905060337856412, "epoch": 0.6359065536012192, "grad_norm": 0.5324926972389221, "learning_rate": 1.9245975945222135e-05, "loss": 0.3103, "mean_token_accuracy": 0.9093462035059929, "num_tokens": 141004313.0, "step": 41310 }, { "entropy": 0.2888725289143622, "epoch": 0.6360604888598977, "grad_norm": 0.6800596117973328, "learning_rate": 1.9245293485163238e-05, "loss": 0.2898, "mean_token_accuracy": 0.9121307902038097, "num_tokens": 141070751.0, "step": 41320 }, { "entropy": 0.2963935170322657, "epoch": 0.6362144241185763, "grad_norm": 0.6148959398269653, "learning_rate": 1.9244610728511082e-05, "loss": 0.2931, "mean_token_accuracy": 0.9083046756684781, "num_tokens": 141134288.0, "step": 41330 }, { "entropy": 0.2752077508717775, "epoch": 0.6363683593772549, "grad_norm": 0.6294364929199219, "learning_rate": 1.9243927675287576e-05, "loss": 0.2773, "mean_token_accuracy": 0.9131072282791137, "num_tokens": 141190458.0, "step": 41340 }, { "entropy": 0.27913438323885204, "epoch": 0.6365222946359335, "grad_norm": 0.6186771392822266, "learning_rate": 1.9243244325514636e-05, "loss": 0.288, "mean_token_accuracy": 0.9137358106672764, "num_tokens": 141256456.0, "step": 41350 }, { "entropy": 0.282477901596576, "epoch": 0.636676229894612, "grad_norm": 0.5530544519424438, "learning_rate": 1.9242560679214177e-05, "loss": 0.272, "mean_token_accuracy": 0.913517390191555, "num_tokens": 141326121.0, "step": 41360 }, { "entropy": 0.26037506125867366, "epoch": 0.6368301651532906, "grad_norm": 0.633636474609375, "learning_rate": 1.9241876736408138e-05, "loss": 0.2619, "mean_token_accuracy": 0.9181235201656819, "num_tokens": 141398279.0, "step": 41370 }, { "entropy": 0.27631447091698647, "epoch": 0.6369841004119693, "grad_norm": 0.6291542053222656, "learning_rate": 1.9241192497118452e-05, "loss": 0.2862, "mean_token_accuracy": 0.9114912152290344, "num_tokens": 141463026.0, "step": 41380 }, { "entropy": 0.2877786302939057, "epoch": 0.6371380356706479, "grad_norm": 0.7975897192955017, "learning_rate": 1.9240507961367076e-05, "loss": 0.2941, "mean_token_accuracy": 0.9112943686544895, "num_tokens": 141523323.0, "step": 41390 }, { "entropy": 0.2897195599973202, "epoch": 0.6372919709293264, "grad_norm": 0.6042283177375793, "learning_rate": 1.9239823129175968e-05, "loss": 0.3001, "mean_token_accuracy": 0.9107287883758545, "num_tokens": 141586621.0, "step": 41400 }, { "entropy": 0.29965619258582593, "epoch": 0.637445906188005, "grad_norm": 0.613291323184967, "learning_rate": 1.9239138000567096e-05, "loss": 0.3004, "mean_token_accuracy": 0.9071613743901252, "num_tokens": 141662316.0, "step": 41410 }, { "entropy": 0.29024718957953155, "epoch": 0.6375998414466836, "grad_norm": 0.6619861125946045, "learning_rate": 1.9238452575562446e-05, "loss": 0.299, "mean_token_accuracy": 0.9116519339382648, "num_tokens": 141731750.0, "step": 41420 }, { "entropy": 0.2717816931195557, "epoch": 0.6377537767053622, "grad_norm": 0.6463592648506165, "learning_rate": 1.9237766854183997e-05, "loss": 0.279, "mean_token_accuracy": 0.9129775702953339, "num_tokens": 141809497.0, "step": 41430 }, { "entropy": 0.31306717088446023, "epoch": 0.6379077119640407, "grad_norm": 0.7956434488296509, "learning_rate": 1.923708083645375e-05, "loss": 0.3066, "mean_token_accuracy": 0.9023684561252594, "num_tokens": 141865397.0, "step": 41440 }, { "entropy": 0.2788052189163864, "epoch": 0.6380616472227193, "grad_norm": 0.7570144534111023, "learning_rate": 1.923639452239372e-05, "loss": 0.2837, "mean_token_accuracy": 0.913200044631958, "num_tokens": 141924548.0, "step": 41450 }, { "entropy": 0.29697232535108925, "epoch": 0.6382155824813979, "grad_norm": 0.7918006181716919, "learning_rate": 1.923570791202592e-05, "loss": 0.3145, "mean_token_accuracy": 0.9077438995242119, "num_tokens": 141988759.0, "step": 41460 }, { "entropy": 0.28033556044101715, "epoch": 0.6383695177400764, "grad_norm": 0.5316072106361389, "learning_rate": 1.9235021005372373e-05, "loss": 0.2774, "mean_token_accuracy": 0.9125706031918526, "num_tokens": 142055409.0, "step": 41470 }, { "entropy": 0.28431461183354256, "epoch": 0.638523452998755, "grad_norm": 0.6291187405586243, "learning_rate": 1.923433380245512e-05, "loss": 0.2891, "mean_token_accuracy": 0.9117767550051212, "num_tokens": 142116475.0, "step": 41480 }, { "entropy": 0.2722673387266695, "epoch": 0.6386773882574336, "grad_norm": 0.8005866408348083, "learning_rate": 1.9233646303296204e-05, "loss": 0.2807, "mean_token_accuracy": 0.9140409886837005, "num_tokens": 142182015.0, "step": 41490 }, { "entropy": 0.28325820257887246, "epoch": 0.6388313235161122, "grad_norm": 0.6451913118362427, "learning_rate": 1.923295850791768e-05, "loss": 0.2855, "mean_token_accuracy": 0.9124609567224979, "num_tokens": 142251346.0, "step": 41500 }, { "entropy": 0.30550245214253663, "epoch": 0.6389852587747908, "grad_norm": 0.6722967624664307, "learning_rate": 1.923227041634162e-05, "loss": 0.2996, "mean_token_accuracy": 0.9069139778614044, "num_tokens": 142317362.0, "step": 41510 }, { "entropy": 0.285578840598464, "epoch": 0.6391391940334694, "grad_norm": 0.5310318470001221, "learning_rate": 1.9231582028590084e-05, "loss": 0.2919, "mean_token_accuracy": 0.9133469641208649, "num_tokens": 142383219.0, "step": 41520 }, { "entropy": 0.2860008435323834, "epoch": 0.639293129292148, "grad_norm": 0.6039324402809143, "learning_rate": 1.923089334468517e-05, "loss": 0.2985, "mean_token_accuracy": 0.9120110616087913, "num_tokens": 142453824.0, "step": 41530 }, { "entropy": 0.30949476258829234, "epoch": 0.6394470645508266, "grad_norm": 0.7662457227706909, "learning_rate": 1.9230204364648963e-05, "loss": 0.307, "mean_token_accuracy": 0.9032005935907363, "num_tokens": 142526438.0, "step": 41540 }, { "entropy": 0.28056102050468323, "epoch": 0.6396009998095051, "grad_norm": 0.6411665081977844, "learning_rate": 1.922951508850357e-05, "loss": 0.2857, "mean_token_accuracy": 0.9138198494911194, "num_tokens": 142598240.0, "step": 41550 }, { "entropy": 0.2886726719327271, "epoch": 0.6397549350681837, "grad_norm": 0.6887714862823486, "learning_rate": 1.9228825516271097e-05, "loss": 0.2807, "mean_token_accuracy": 0.9123256102204322, "num_tokens": 142666261.0, "step": 41560 }, { "entropy": 0.27767931912094357, "epoch": 0.6399088703268623, "grad_norm": 0.6277850270271301, "learning_rate": 1.9228135647973678e-05, "loss": 0.284, "mean_token_accuracy": 0.9122422859072685, "num_tokens": 142732712.0, "step": 41570 }, { "entropy": 0.27989910496398807, "epoch": 0.6400628055855409, "grad_norm": 0.5409116744995117, "learning_rate": 1.9227445483633432e-05, "loss": 0.2979, "mean_token_accuracy": 0.9106674574315547, "num_tokens": 142804897.0, "step": 41580 }, { "entropy": 0.3000862653367221, "epoch": 0.6402167408442194, "grad_norm": 0.6008951663970947, "learning_rate": 1.9226755023272505e-05, "loss": 0.2904, "mean_token_accuracy": 0.9094907194375992, "num_tokens": 142869153.0, "step": 41590 }, { "entropy": 0.27954227151349187, "epoch": 0.640370676102898, "grad_norm": 0.5360778570175171, "learning_rate": 1.922606426691305e-05, "loss": 0.2864, "mean_token_accuracy": 0.9114035464823246, "num_tokens": 142939261.0, "step": 41600 }, { "entropy": 0.2743183341808617, "epoch": 0.6405246113615766, "grad_norm": 0.6357720494270325, "learning_rate": 1.9225373214577217e-05, "loss": 0.2949, "mean_token_accuracy": 0.9131850220263005, "num_tokens": 143008715.0, "step": 41610 }, { "entropy": 0.28832163466140626, "epoch": 0.6406785466202551, "grad_norm": 0.7275717258453369, "learning_rate": 1.9224681866287182e-05, "loss": 0.2955, "mean_token_accuracy": 0.9131002888083458, "num_tokens": 143067295.0, "step": 41620 }, { "entropy": 0.2691517185419798, "epoch": 0.6408324818789337, "grad_norm": 0.7801373600959778, "learning_rate": 1.9223990222065127e-05, "loss": 0.2819, "mean_token_accuracy": 0.9141323827207088, "num_tokens": 143129605.0, "step": 41630 }, { "entropy": 0.2905734313651919, "epoch": 0.6409864171376124, "grad_norm": 0.5634872913360596, "learning_rate": 1.9223298281933237e-05, "loss": 0.3004, "mean_token_accuracy": 0.9114622235298157, "num_tokens": 143195134.0, "step": 41640 }, { "entropy": 0.2825833545066416, "epoch": 0.641140352396291, "grad_norm": 1.3157631158828735, "learning_rate": 1.922260604591371e-05, "loss": 0.2885, "mean_token_accuracy": 0.9125481225550175, "num_tokens": 143251431.0, "step": 41650 }, { "entropy": 0.3109057155437768, "epoch": 0.6412942876549695, "grad_norm": 0.7496021389961243, "learning_rate": 1.9221913514028753e-05, "loss": 0.3202, "mean_token_accuracy": 0.9024020984768868, "num_tokens": 143305388.0, "step": 41660 }, { "entropy": 0.2915331685915589, "epoch": 0.6414482229136481, "grad_norm": 0.7113876342773438, "learning_rate": 1.922122068630058e-05, "loss": 0.3039, "mean_token_accuracy": 0.9089736424386501, "num_tokens": 143369094.0, "step": 41670 }, { "entropy": 0.2852957827039063, "epoch": 0.6416021581723267, "grad_norm": 0.48009464144706726, "learning_rate": 1.922052756275142e-05, "loss": 0.2779, "mean_token_accuracy": 0.9132191762328148, "num_tokens": 143438702.0, "step": 41680 }, { "entropy": 0.2997613525018096, "epoch": 0.6417560934310053, "grad_norm": 0.8749350309371948, "learning_rate": 1.921983414340351e-05, "loss": 0.288, "mean_token_accuracy": 0.906439645588398, "num_tokens": 143506913.0, "step": 41690 }, { "entropy": 0.2798427616246045, "epoch": 0.6419100286896838, "grad_norm": 0.6601565480232239, "learning_rate": 1.9219140428279096e-05, "loss": 0.2713, "mean_token_accuracy": 0.9151912547647953, "num_tokens": 143563116.0, "step": 41700 }, { "entropy": 0.2727745701558888, "epoch": 0.6420639639483624, "grad_norm": 0.5784587860107422, "learning_rate": 1.9218446417400426e-05, "loss": 0.2851, "mean_token_accuracy": 0.9125590190291405, "num_tokens": 143627171.0, "step": 41710 }, { "entropy": 0.25518086617812513, "epoch": 0.642217899207041, "grad_norm": 0.5700269937515259, "learning_rate": 1.921775211078977e-05, "loss": 0.2756, "mean_token_accuracy": 0.9220411591231823, "num_tokens": 143693745.0, "step": 41720 }, { "entropy": 0.2631002421490848, "epoch": 0.6423718344657195, "grad_norm": 0.7557549476623535, "learning_rate": 1.92170575084694e-05, "loss": 0.2698, "mean_token_accuracy": 0.9197390452027321, "num_tokens": 143759505.0, "step": 41730 }, { "entropy": 0.2838632402010262, "epoch": 0.6425257697243981, "grad_norm": 0.6737226247787476, "learning_rate": 1.9216362610461603e-05, "loss": 0.2883, "mean_token_accuracy": 0.9095488846302032, "num_tokens": 143828215.0, "step": 41740 }, { "entropy": 0.28157495995983484, "epoch": 0.6426797049830767, "grad_norm": 0.5899105072021484, "learning_rate": 1.9215667416788663e-05, "loss": 0.2874, "mean_token_accuracy": 0.9114444211125374, "num_tokens": 143898014.0, "step": 41750 }, { "entropy": 0.2895121899433434, "epoch": 0.6428336402417554, "grad_norm": 0.7070789933204651, "learning_rate": 1.921497192747289e-05, "loss": 0.2955, "mean_token_accuracy": 0.9095191217958927, "num_tokens": 143970692.0, "step": 41760 }, { "entropy": 0.281880166195333, "epoch": 0.642987575500434, "grad_norm": 0.7323887348175049, "learning_rate": 1.921427614253659e-05, "loss": 0.2835, "mean_token_accuracy": 0.9130202822387219, "num_tokens": 144046512.0, "step": 41770 }, { "entropy": 0.27118222415447235, "epoch": 0.6431415107591125, "grad_norm": 0.6618849039077759, "learning_rate": 1.921358006200209e-05, "loss": 0.2796, "mean_token_accuracy": 0.9158447913825511, "num_tokens": 144111475.0, "step": 41780 }, { "entropy": 0.2879595853388309, "epoch": 0.6432954460177911, "grad_norm": 0.6504032611846924, "learning_rate": 1.921288368589172e-05, "loss": 0.3097, "mean_token_accuracy": 0.9114247716963291, "num_tokens": 144179129.0, "step": 41790 }, { "entropy": 0.2813371457159519, "epoch": 0.6434493812764697, "grad_norm": 0.5610096454620361, "learning_rate": 1.9212187014227816e-05, "loss": 0.2881, "mean_token_accuracy": 0.9142175823450088, "num_tokens": 144249359.0, "step": 41800 }, { "entropy": 0.30312735149636866, "epoch": 0.6436033165351482, "grad_norm": 0.54507976770401, "learning_rate": 1.9211490047032727e-05, "loss": 0.3091, "mean_token_accuracy": 0.9070638909935951, "num_tokens": 144308662.0, "step": 41810 }, { "entropy": 0.28239255100488664, "epoch": 0.6437572517938268, "grad_norm": 0.6363852620124817, "learning_rate": 1.9210792784328813e-05, "loss": 0.287, "mean_token_accuracy": 0.9125432915985584, "num_tokens": 144374605.0, "step": 41820 }, { "entropy": 0.2845182426273823, "epoch": 0.6439111870525054, "grad_norm": 0.5923203229904175, "learning_rate": 1.9210095226138445e-05, "loss": 0.2938, "mean_token_accuracy": 0.910140971839428, "num_tokens": 144440309.0, "step": 41830 }, { "entropy": 0.27335790228098633, "epoch": 0.644065122311184, "grad_norm": 0.5556699633598328, "learning_rate": 1.9209397372484e-05, "loss": 0.2755, "mean_token_accuracy": 0.9148866921663285, "num_tokens": 144506751.0, "step": 41840 }, { "entropy": 0.2794735672883689, "epoch": 0.6442190575698625, "grad_norm": 0.6253672242164612, "learning_rate": 1.9208699223387866e-05, "loss": 0.2863, "mean_token_accuracy": 0.9120206810534001, "num_tokens": 144576715.0, "step": 41850 }, { "entropy": 0.293465047609061, "epoch": 0.6443729928285411, "grad_norm": 0.6746427416801453, "learning_rate": 1.9208000778872436e-05, "loss": 0.2904, "mean_token_accuracy": 0.9084980838000775, "num_tokens": 144641010.0, "step": 41860 }, { "entropy": 0.2818126159720123, "epoch": 0.6445269280872197, "grad_norm": 0.4888513684272766, "learning_rate": 1.9207302038960124e-05, "loss": 0.293, "mean_token_accuracy": 0.9108847640454769, "num_tokens": 144707718.0, "step": 41870 }, { "entropy": 0.2758179232478142, "epoch": 0.6446808633458982, "grad_norm": 0.5979014039039612, "learning_rate": 1.920660300367334e-05, "loss": 0.2772, "mean_token_accuracy": 0.9160699784755707, "num_tokens": 144772916.0, "step": 41880 }, { "entropy": 0.2813174143433571, "epoch": 0.6448347986045769, "grad_norm": 0.7170819640159607, "learning_rate": 1.920590367303451e-05, "loss": 0.2816, "mean_token_accuracy": 0.9102744638919831, "num_tokens": 144839666.0, "step": 41890 }, { "entropy": 0.2871979239396751, "epoch": 0.6449887338632555, "grad_norm": 0.5785829424858093, "learning_rate": 1.9205204047066068e-05, "loss": 0.2955, "mean_token_accuracy": 0.910474619269371, "num_tokens": 144900403.0, "step": 41900 }, { "entropy": 0.28122162418439983, "epoch": 0.6451426691219341, "grad_norm": 0.5407142639160156, "learning_rate": 1.920450412579046e-05, "loss": 0.2954, "mean_token_accuracy": 0.9128542542457581, "num_tokens": 144972821.0, "step": 41910 }, { "entropy": 0.28022474246099593, "epoch": 0.6452966043806126, "grad_norm": 0.6567382216453552, "learning_rate": 1.9203803909230142e-05, "loss": 0.2864, "mean_token_accuracy": 0.913635665923357, "num_tokens": 145035409.0, "step": 41920 }, { "entropy": 0.276838268712163, "epoch": 0.6454505396392912, "grad_norm": 0.6882944703102112, "learning_rate": 1.9203103397407575e-05, "loss": 0.2861, "mean_token_accuracy": 0.9142864711582661, "num_tokens": 145096076.0, "step": 41930 }, { "entropy": 0.2761364026926458, "epoch": 0.6456044748979698, "grad_norm": 0.5645323991775513, "learning_rate": 1.920240259034523e-05, "loss": 0.3075, "mean_token_accuracy": 0.9132276698946953, "num_tokens": 145156636.0, "step": 41940 }, { "entropy": 0.2748144189827144, "epoch": 0.6457584101566484, "grad_norm": 0.6111136078834534, "learning_rate": 1.9201701488065594e-05, "loss": 0.2712, "mean_token_accuracy": 0.9156742170453072, "num_tokens": 145225660.0, "step": 41950 }, { "entropy": 0.27340021133422854, "epoch": 0.6459123454153269, "grad_norm": 0.8677094578742981, "learning_rate": 1.9201000090591154e-05, "loss": 0.2926, "mean_token_accuracy": 0.9135947853326798, "num_tokens": 145292805.0, "step": 41960 }, { "entropy": 0.28830465888604523, "epoch": 0.6460662806740055, "grad_norm": 0.5443442463874817, "learning_rate": 1.920029839794441e-05, "loss": 0.2826, "mean_token_accuracy": 0.9133005537092685, "num_tokens": 145358377.0, "step": 41970 }, { "entropy": 0.2726177849806845, "epoch": 0.6462202159326841, "grad_norm": 0.7551463842391968, "learning_rate": 1.9199596410147882e-05, "loss": 0.2897, "mean_token_accuracy": 0.9116165116429329, "num_tokens": 145423206.0, "step": 41980 }, { "entropy": 0.27040471192449334, "epoch": 0.6463741511913627, "grad_norm": 0.7313582897186279, "learning_rate": 1.9198894127224075e-05, "loss": 0.2818, "mean_token_accuracy": 0.9154700554907322, "num_tokens": 145488013.0, "step": 41990 }, { "entropy": 0.260863615386188, "epoch": 0.6465280864500412, "grad_norm": 0.7928231954574585, "learning_rate": 1.9198191549195533e-05, "loss": 0.2693, "mean_token_accuracy": 0.9190851762890816, "num_tokens": 145547313.0, "step": 42000 }, { "entropy": 0.28200349444523454, "epoch": 0.6466820217087198, "grad_norm": 0.7038399577140808, "learning_rate": 1.919748867608479e-05, "loss": 0.2915, "mean_token_accuracy": 0.9125045605003834, "num_tokens": 145611507.0, "step": 42010 }, { "entropy": 0.26978253908455374, "epoch": 0.6468359569673985, "grad_norm": 0.5627861618995667, "learning_rate": 1.9196785507914387e-05, "loss": 0.2761, "mean_token_accuracy": 0.9146513231098652, "num_tokens": 145677700.0, "step": 42020 }, { "entropy": 0.29130422407761214, "epoch": 0.6469898922260771, "grad_norm": 0.8504630923271179, "learning_rate": 1.9196082044706892e-05, "loss": 0.3027, "mean_token_accuracy": 0.9083004951477051, "num_tokens": 145739828.0, "step": 42030 }, { "entropy": 0.2891805526800454, "epoch": 0.6471438274847556, "grad_norm": 0.7919933795928955, "learning_rate": 1.9195378286484868e-05, "loss": 0.2859, "mean_token_accuracy": 0.9097086019814015, "num_tokens": 145810725.0, "step": 42040 }, { "entropy": 0.30123648066073655, "epoch": 0.6472977627434342, "grad_norm": 0.7215845584869385, "learning_rate": 1.9194674233270895e-05, "loss": 0.3148, "mean_token_accuracy": 0.9071518316864967, "num_tokens": 145874264.0, "step": 42050 }, { "entropy": 0.2859546019695699, "epoch": 0.6474516980021128, "grad_norm": 0.6203641891479492, "learning_rate": 1.9193969885087557e-05, "loss": 0.2867, "mean_token_accuracy": 0.9132275335490704, "num_tokens": 145940721.0, "step": 42060 }, { "entropy": 0.27138912556692957, "epoch": 0.6476056332607913, "grad_norm": 0.594194769859314, "learning_rate": 1.919326524195745e-05, "loss": 0.2763, "mean_token_accuracy": 0.9167255073785782, "num_tokens": 146001858.0, "step": 42070 }, { "entropy": 0.2917221702635288, "epoch": 0.6477595685194699, "grad_norm": 0.8888543844223022, "learning_rate": 1.9192560303903177e-05, "loss": 0.3006, "mean_token_accuracy": 0.908775332570076, "num_tokens": 146070786.0, "step": 42080 }, { "entropy": 0.2901315879076719, "epoch": 0.6479135037781485, "grad_norm": 0.5376140475273132, "learning_rate": 1.919185507094736e-05, "loss": 0.2887, "mean_token_accuracy": 0.9109434902667999, "num_tokens": 146135804.0, "step": 42090 }, { "entropy": 0.2812975517474115, "epoch": 0.6480674390368271, "grad_norm": 0.6243920922279358, "learning_rate": 1.9191149543112614e-05, "loss": 0.2808, "mean_token_accuracy": 0.9110221244394779, "num_tokens": 146199578.0, "step": 42100 }, { "entropy": 0.2681259266100824, "epoch": 0.6482213742955056, "grad_norm": 0.6809049248695374, "learning_rate": 1.9190443720421575e-05, "loss": 0.2832, "mean_token_accuracy": 0.9160612098872661, "num_tokens": 146269756.0, "step": 42110 }, { "entropy": 0.2904717813245952, "epoch": 0.6483753095541842, "grad_norm": 0.652571976184845, "learning_rate": 1.9189737602896894e-05, "loss": 0.2968, "mean_token_accuracy": 0.9119175605475902, "num_tokens": 146329666.0, "step": 42120 }, { "entropy": 0.29104127921164036, "epoch": 0.6485292448128628, "grad_norm": 0.9235507845878601, "learning_rate": 1.9189031190561214e-05, "loss": 0.2985, "mean_token_accuracy": 0.907935656607151, "num_tokens": 146393499.0, "step": 42130 }, { "entropy": 0.2748615300282836, "epoch": 0.6486831800715415, "grad_norm": 0.6619914770126343, "learning_rate": 1.9188324483437203e-05, "loss": 0.2832, "mean_token_accuracy": 0.9148399241268634, "num_tokens": 146461690.0, "step": 42140 }, { "entropy": 0.2715385306626558, "epoch": 0.64883711533022, "grad_norm": 0.6831164360046387, "learning_rate": 1.9187617481547523e-05, "loss": 0.2709, "mean_token_accuracy": 0.9166660986840725, "num_tokens": 146524482.0, "step": 42150 }, { "entropy": 0.27738738991320133, "epoch": 0.6489910505888986, "grad_norm": 0.5542553663253784, "learning_rate": 1.918691018491487e-05, "loss": 0.284, "mean_token_accuracy": 0.9132160015404225, "num_tokens": 146594835.0, "step": 42160 }, { "entropy": 0.279387994389981, "epoch": 0.6491449858475772, "grad_norm": 0.5899127721786499, "learning_rate": 1.918620259356192e-05, "loss": 0.2801, "mean_token_accuracy": 0.9122387908399106, "num_tokens": 146652282.0, "step": 42170 }, { "entropy": 0.26466086972504854, "epoch": 0.6492989211062558, "grad_norm": 0.6500657796859741, "learning_rate": 1.9185494707511386e-05, "loss": 0.2724, "mean_token_accuracy": 0.9172455444931984, "num_tokens": 146712426.0, "step": 42180 }, { "entropy": 0.29824695121496914, "epoch": 0.6494528563649343, "grad_norm": 0.6769059300422668, "learning_rate": 1.9184786526785966e-05, "loss": 0.2949, "mean_token_accuracy": 0.9075395323336124, "num_tokens": 146783335.0, "step": 42190 }, { "entropy": 0.28749287966638803, "epoch": 0.6496067916236129, "grad_norm": 0.5535836219787598, "learning_rate": 1.9184078051408382e-05, "loss": 0.2898, "mean_token_accuracy": 0.9111426457762718, "num_tokens": 146854912.0, "step": 42200 }, { "entropy": 0.29847226860001685, "epoch": 0.6497607268822915, "grad_norm": 0.7969227433204651, "learning_rate": 1.918336928140137e-05, "loss": 0.2919, "mean_token_accuracy": 0.9074209950864315, "num_tokens": 146922886.0, "step": 42210 }, { "entropy": 0.2994084060192108, "epoch": 0.64991466214097, "grad_norm": 0.7363086342811584, "learning_rate": 1.9182660216787657e-05, "loss": 0.2962, "mean_token_accuracy": 0.9060364179313183, "num_tokens": 146985091.0, "step": 42220 }, { "entropy": 0.29173507522791625, "epoch": 0.6500685973996486, "grad_norm": 0.6598227620124817, "learning_rate": 1.9181950857589993e-05, "loss": 0.2943, "mean_token_accuracy": 0.910878948867321, "num_tokens": 147041877.0, "step": 42230 }, { "entropy": 0.26867998465895654, "epoch": 0.6502225326583272, "grad_norm": 0.6838886737823486, "learning_rate": 1.9181241203831137e-05, "loss": 0.2765, "mean_token_accuracy": 0.9157295443117619, "num_tokens": 147098847.0, "step": 42240 }, { "entropy": 0.2918458332307637, "epoch": 0.6503764679170058, "grad_norm": 0.6323718428611755, "learning_rate": 1.9180531255533857e-05, "loss": 0.2974, "mean_token_accuracy": 0.9083765715360641, "num_tokens": 147160707.0, "step": 42250 }, { "entropy": 0.28290432719513775, "epoch": 0.6505304031756843, "grad_norm": 0.8434930443763733, "learning_rate": 1.917982101272092e-05, "loss": 0.2885, "mean_token_accuracy": 0.9137374348938465, "num_tokens": 147223816.0, "step": 42260 }, { "entropy": 0.27567701824009416, "epoch": 0.650684338434363, "grad_norm": 0.6779711246490479, "learning_rate": 1.9179110475415122e-05, "loss": 0.2836, "mean_token_accuracy": 0.9145562171936035, "num_tokens": 147286920.0, "step": 42270 }, { "entropy": 0.26586421560496093, "epoch": 0.6508382736930416, "grad_norm": 0.43307825922966003, "learning_rate": 1.917839964363925e-05, "loss": 0.2842, "mean_token_accuracy": 0.9164765551686287, "num_tokens": 147361609.0, "step": 42280 }, { "entropy": 0.2909004079177976, "epoch": 0.6509922089517202, "grad_norm": 0.6014167070388794, "learning_rate": 1.9177688517416105e-05, "loss": 0.2855, "mean_token_accuracy": 0.908222284168005, "num_tokens": 147431337.0, "step": 42290 }, { "entropy": 0.28575607035309075, "epoch": 0.6511461442103987, "grad_norm": 0.6309880018234253, "learning_rate": 1.917697709676851e-05, "loss": 0.2868, "mean_token_accuracy": 0.9119038425385952, "num_tokens": 147498804.0, "step": 42300 }, { "entropy": 0.27379797399044037, "epoch": 0.6513000794690773, "grad_norm": 0.7221308946609497, "learning_rate": 1.917626538171928e-05, "loss": 0.2807, "mean_token_accuracy": 0.914222889393568, "num_tokens": 147562694.0, "step": 42310 }, { "entropy": 0.29473520861938596, "epoch": 0.6514540147277559, "grad_norm": 0.6297355890274048, "learning_rate": 1.917555337229125e-05, "loss": 0.293, "mean_token_accuracy": 0.9069551169872284, "num_tokens": 147626921.0, "step": 42320 }, { "entropy": 0.3032667074352503, "epoch": 0.6516079499864345, "grad_norm": 0.7071203589439392, "learning_rate": 1.9174841068507263e-05, "loss": 0.3118, "mean_token_accuracy": 0.9064290173351764, "num_tokens": 147695306.0, "step": 42330 }, { "entropy": 0.26539784781634806, "epoch": 0.651761885245113, "grad_norm": 0.5787762403488159, "learning_rate": 1.9174128470390165e-05, "loss": 0.2742, "mean_token_accuracy": 0.9172234460711479, "num_tokens": 147755136.0, "step": 42340 }, { "entropy": 0.28272991850972173, "epoch": 0.6519158205037916, "grad_norm": 0.5996992588043213, "learning_rate": 1.917341557796282e-05, "loss": 0.2896, "mean_token_accuracy": 0.9111430667340755, "num_tokens": 147826161.0, "step": 42350 }, { "entropy": 0.29517256319522855, "epoch": 0.6520697557624702, "grad_norm": 0.613377034664154, "learning_rate": 1.91727023912481e-05, "loss": 0.2936, "mean_token_accuracy": 0.9075689062476158, "num_tokens": 147890968.0, "step": 42360 }, { "entropy": 0.2803575058467686, "epoch": 0.6522236910211487, "grad_norm": 0.6338340640068054, "learning_rate": 1.917198891026888e-05, "loss": 0.2949, "mean_token_accuracy": 0.9124989047646522, "num_tokens": 147959512.0, "step": 42370 }, { "entropy": 0.3019345941953361, "epoch": 0.6523776262798273, "grad_norm": 0.7163568139076233, "learning_rate": 1.9171275135048052e-05, "loss": 0.3005, "mean_token_accuracy": 0.9080400958657264, "num_tokens": 148028423.0, "step": 42380 }, { "entropy": 0.3058581267483532, "epoch": 0.6525315615385059, "grad_norm": 0.5339142680168152, "learning_rate": 1.917056106560851e-05, "loss": 0.3118, "mean_token_accuracy": 0.9063052237033844, "num_tokens": 148089764.0, "step": 42390 }, { "entropy": 0.3116034594364464, "epoch": 0.6526854967971846, "grad_norm": 0.6146937012672424, "learning_rate": 1.9169846701973168e-05, "loss": 0.3021, "mean_token_accuracy": 0.9067191429436207, "num_tokens": 148147694.0, "step": 42400 }, { "entropy": 0.2926231987774372, "epoch": 0.6528394320558631, "grad_norm": 0.6385365724563599, "learning_rate": 1.9169132044164938e-05, "loss": 0.2981, "mean_token_accuracy": 0.908379715681076, "num_tokens": 148218803.0, "step": 42410 }, { "entropy": 0.277612501103431, "epoch": 0.6529933673145417, "grad_norm": 0.5130679607391357, "learning_rate": 1.916841709220675e-05, "loss": 0.2905, "mean_token_accuracy": 0.9115144610404968, "num_tokens": 148284264.0, "step": 42420 }, { "entropy": 0.28665779791772367, "epoch": 0.6531473025732203, "grad_norm": 0.5578317642211914, "learning_rate": 1.916770184612154e-05, "loss": 0.2947, "mean_token_accuracy": 0.9115042865276337, "num_tokens": 148350217.0, "step": 42430 }, { "entropy": 0.29261385966092346, "epoch": 0.6533012378318989, "grad_norm": 0.5936428308486938, "learning_rate": 1.9166986305932247e-05, "loss": 0.2952, "mean_token_accuracy": 0.9062805235385895, "num_tokens": 148408842.0, "step": 42440 }, { "entropy": 0.2826479642651975, "epoch": 0.6534551730905774, "grad_norm": 0.6412975192070007, "learning_rate": 1.916627047166183e-05, "loss": 0.2908, "mean_token_accuracy": 0.9122454069554806, "num_tokens": 148472727.0, "step": 42450 }, { "entropy": 0.31384426755830647, "epoch": 0.653609108349256, "grad_norm": 0.5772783160209656, "learning_rate": 1.916555434333326e-05, "loss": 0.3009, "mean_token_accuracy": 0.9040808886289596, "num_tokens": 148543791.0, "step": 42460 }, { "entropy": 0.2655341310892254, "epoch": 0.6537630436079346, "grad_norm": 0.5876541137695312, "learning_rate": 1.9164837920969502e-05, "loss": 0.2722, "mean_token_accuracy": 0.9176220245659351, "num_tokens": 148617727.0, "step": 42470 }, { "entropy": 0.2932851417921484, "epoch": 0.6539169788666132, "grad_norm": 0.617912232875824, "learning_rate": 1.916412120459354e-05, "loss": 0.288, "mean_token_accuracy": 0.9085036061704159, "num_tokens": 148695118.0, "step": 42480 }, { "entropy": 0.2935022863559425, "epoch": 0.6540709141252917, "grad_norm": 0.5813345313072205, "learning_rate": 1.916340419422837e-05, "loss": 0.3015, "mean_token_accuracy": 0.9111945882439614, "num_tokens": 148769422.0, "step": 42490 }, { "entropy": 0.2788412936963141, "epoch": 0.6542248493839703, "grad_norm": 0.69084632396698, "learning_rate": 1.9162686889896992e-05, "loss": 0.2796, "mean_token_accuracy": 0.9139296233654022, "num_tokens": 148841020.0, "step": 42500 }, { "entropy": 0.2735921325162053, "epoch": 0.6543787846426489, "grad_norm": 0.4615926146507263, "learning_rate": 1.9161969291622418e-05, "loss": 0.2842, "mean_token_accuracy": 0.9141362212598324, "num_tokens": 148920496.0, "step": 42510 }, { "entropy": 0.2931715621612966, "epoch": 0.6545327199013276, "grad_norm": 0.6940825581550598, "learning_rate": 1.9161251399427668e-05, "loss": 0.3061, "mean_token_accuracy": 0.9106914982199669, "num_tokens": 148983895.0, "step": 42520 }, { "entropy": 0.2818272072821856, "epoch": 0.6546866551600061, "grad_norm": 0.6347377896308899, "learning_rate": 1.9160533213335772e-05, "loss": 0.2953, "mean_token_accuracy": 0.9131944447755813, "num_tokens": 149040073.0, "step": 42530 }, { "entropy": 0.26747934324666855, "epoch": 0.6548405904186847, "grad_norm": 0.6789999604225159, "learning_rate": 1.9159814733369773e-05, "loss": 0.2778, "mean_token_accuracy": 0.91794174015522, "num_tokens": 149107083.0, "step": 42540 }, { "entropy": 0.2661647724919021, "epoch": 0.6549945256773633, "grad_norm": 0.7856611609458923, "learning_rate": 1.9159095959552716e-05, "loss": 0.2612, "mean_token_accuracy": 0.9185903586447239, "num_tokens": 149169657.0, "step": 42550 }, { "entropy": 0.28760672761127354, "epoch": 0.6551484609360418, "grad_norm": 0.5606321096420288, "learning_rate": 1.9158376891907662e-05, "loss": 0.2964, "mean_token_accuracy": 0.9095197670161724, "num_tokens": 149235052.0, "step": 42560 }, { "entropy": 0.2815287498757243, "epoch": 0.6553023961947204, "grad_norm": 0.6055011749267578, "learning_rate": 1.915765753045768e-05, "loss": 0.2766, "mean_token_accuracy": 0.9119066663086415, "num_tokens": 149297980.0, "step": 42570 }, { "entropy": 0.2731797148473561, "epoch": 0.655456331453399, "grad_norm": 0.6869502067565918, "learning_rate": 1.9156937875225847e-05, "loss": 0.2784, "mean_token_accuracy": 0.9149393729865551, "num_tokens": 149361464.0, "step": 42580 }, { "entropy": 0.2896247336640954, "epoch": 0.6556102667120776, "grad_norm": 0.7372874617576599, "learning_rate": 1.9156217926235245e-05, "loss": 0.2955, "mean_token_accuracy": 0.9095781691372394, "num_tokens": 149420483.0, "step": 42590 }, { "entropy": 0.28269916828721764, "epoch": 0.6557642019707561, "grad_norm": 0.5973385572433472, "learning_rate": 1.9155497683508976e-05, "loss": 0.2836, "mean_token_accuracy": 0.9145316757261753, "num_tokens": 149486839.0, "step": 42600 }, { "entropy": 0.2882267117500305, "epoch": 0.6559181372294347, "grad_norm": 0.582518458366394, "learning_rate": 1.9154777147070143e-05, "loss": 0.2873, "mean_token_accuracy": 0.9107517100870609, "num_tokens": 149548138.0, "step": 42610 }, { "entropy": 0.27936506224796176, "epoch": 0.6560720724881133, "grad_norm": 0.5787614583969116, "learning_rate": 1.9154056316941863e-05, "loss": 0.2873, "mean_token_accuracy": 0.9125324644148349, "num_tokens": 149611830.0, "step": 42620 }, { "entropy": 0.27871771026402714, "epoch": 0.6562260077467919, "grad_norm": 0.538331925868988, "learning_rate": 1.915333519314726e-05, "loss": 0.2757, "mean_token_accuracy": 0.9130678974092007, "num_tokens": 149677977.0, "step": 42630 }, { "entropy": 0.2750467550009489, "epoch": 0.6563799430054704, "grad_norm": 0.6014618277549744, "learning_rate": 1.915261377570947e-05, "loss": 0.2916, "mean_token_accuracy": 0.9151456281542778, "num_tokens": 149746319.0, "step": 42640 }, { "entropy": 0.2864750779233873, "epoch": 0.6565338782641491, "grad_norm": 0.6686134338378906, "learning_rate": 1.9151892064651626e-05, "loss": 0.2877, "mean_token_accuracy": 0.9099803507328034, "num_tokens": 149818852.0, "step": 42650 }, { "entropy": 0.29461840372532605, "epoch": 0.6566878135228277, "grad_norm": 0.5474424958229065, "learning_rate": 1.9151170059996894e-05, "loss": 0.3, "mean_token_accuracy": 0.9086237594485282, "num_tokens": 149887371.0, "step": 42660 }, { "entropy": 0.2716148192062974, "epoch": 0.6568417487815063, "grad_norm": 0.8025331497192383, "learning_rate": 1.9150447761768433e-05, "loss": 0.2784, "mean_token_accuracy": 0.9184768989682197, "num_tokens": 149949794.0, "step": 42670 }, { "entropy": 0.28323459113016725, "epoch": 0.6569956840401848, "grad_norm": 0.6933644413948059, "learning_rate": 1.9149725169989408e-05, "loss": 0.2883, "mean_token_accuracy": 0.9122236594557762, "num_tokens": 150017151.0, "step": 42680 }, { "entropy": 0.25982147697359326, "epoch": 0.6571496192988634, "grad_norm": 0.5691086649894714, "learning_rate": 1.9149002284683008e-05, "loss": 0.2744, "mean_token_accuracy": 0.9192547738552094, "num_tokens": 150087050.0, "step": 42690 }, { "entropy": 0.2786480753216892, "epoch": 0.657303554557542, "grad_norm": 0.5547440648078918, "learning_rate": 1.914827910587242e-05, "loss": 0.2829, "mean_token_accuracy": 0.9129696980118751, "num_tokens": 150146887.0, "step": 42700 }, { "entropy": 0.27936655981466174, "epoch": 0.6574574898162205, "grad_norm": 0.5770680904388428, "learning_rate": 1.9147555633580844e-05, "loss": 0.2849, "mean_token_accuracy": 0.9133301980793476, "num_tokens": 150219361.0, "step": 42710 }, { "entropy": 0.2851343503221869, "epoch": 0.6576114250748991, "grad_norm": 0.71170574426651, "learning_rate": 1.9146831867831488e-05, "loss": 0.2899, "mean_token_accuracy": 0.9105467334389686, "num_tokens": 150284297.0, "step": 42720 }, { "entropy": 0.280220933072269, "epoch": 0.6577653603335777, "grad_norm": 0.5368054509162903, "learning_rate": 1.9146107808647574e-05, "loss": 0.2774, "mean_token_accuracy": 0.9119432091712951, "num_tokens": 150350605.0, "step": 42730 }, { "entropy": 0.29386113295331595, "epoch": 0.6579192955922563, "grad_norm": 0.7075034379959106, "learning_rate": 1.9145383456052325e-05, "loss": 0.2952, "mean_token_accuracy": 0.9100111141800881, "num_tokens": 150412794.0, "step": 42740 }, { "entropy": 0.29664795119315385, "epoch": 0.6580732308509348, "grad_norm": 0.4869399666786194, "learning_rate": 1.9144658810068987e-05, "loss": 0.2979, "mean_token_accuracy": 0.9065245516598225, "num_tokens": 150488045.0, "step": 42750 }, { "entropy": 0.2932399003766477, "epoch": 0.6582271661096134, "grad_norm": 1.0155164003372192, "learning_rate": 1.9143933870720798e-05, "loss": 0.2901, "mean_token_accuracy": 0.9095729820430278, "num_tokens": 150558274.0, "step": 42760 }, { "entropy": 0.29189778845757247, "epoch": 0.658381101368292, "grad_norm": 0.587307333946228, "learning_rate": 1.9143208638031022e-05, "loss": 0.296, "mean_token_accuracy": 0.9081681504845619, "num_tokens": 150625825.0, "step": 42770 }, { "entropy": 0.27397949164733293, "epoch": 0.6585350366269707, "grad_norm": 0.5683110952377319, "learning_rate": 1.9142483112022918e-05, "loss": 0.2822, "mean_token_accuracy": 0.9138561137020588, "num_tokens": 150704632.0, "step": 42780 }, { "entropy": 0.2766591708175838, "epoch": 0.6586889718856492, "grad_norm": 0.6957571506500244, "learning_rate": 1.9141757292719766e-05, "loss": 0.2816, "mean_token_accuracy": 0.9140371032059192, "num_tokens": 150767390.0, "step": 42790 }, { "entropy": 0.2969094390980899, "epoch": 0.6588429071443278, "grad_norm": 0.6532402038574219, "learning_rate": 1.9141031180144843e-05, "loss": 0.3084, "mean_token_accuracy": 0.9085651323199272, "num_tokens": 150835102.0, "step": 42800 }, { "entropy": 0.28407675242051483, "epoch": 0.6589968424030064, "grad_norm": 0.5942703485488892, "learning_rate": 1.9140304774321455e-05, "loss": 0.2932, "mean_token_accuracy": 0.9122623383998871, "num_tokens": 150894606.0, "step": 42810 }, { "entropy": 0.26734632570296524, "epoch": 0.659150777661685, "grad_norm": 0.5611342191696167, "learning_rate": 1.91395780752729e-05, "loss": 0.2764, "mean_token_accuracy": 0.9185439668595791, "num_tokens": 150968138.0, "step": 42820 }, { "entropy": 0.2846355254761875, "epoch": 0.6593047129203635, "grad_norm": 0.7773696184158325, "learning_rate": 1.9138851083022487e-05, "loss": 0.2894, "mean_token_accuracy": 0.9110493868589401, "num_tokens": 151033249.0, "step": 42830 }, { "entropy": 0.28922735042870046, "epoch": 0.6594586481790421, "grad_norm": 0.6345462203025818, "learning_rate": 1.9138123797593544e-05, "loss": 0.3054, "mean_token_accuracy": 0.9109323792159557, "num_tokens": 151092495.0, "step": 42840 }, { "entropy": 0.2847891945391893, "epoch": 0.6596125834377207, "grad_norm": 0.6479803323745728, "learning_rate": 1.9137396219009398e-05, "loss": 0.282, "mean_token_accuracy": 0.91305797919631, "num_tokens": 151159942.0, "step": 42850 }, { "entropy": 0.2729843482375145, "epoch": 0.6597665186963992, "grad_norm": 0.6861037611961365, "learning_rate": 1.9136668347293393e-05, "loss": 0.2827, "mean_token_accuracy": 0.9144159272313118, "num_tokens": 151229718.0, "step": 42860 }, { "entropy": 0.2754049035254866, "epoch": 0.6599204539550778, "grad_norm": 0.5862581133842468, "learning_rate": 1.9135940182468878e-05, "loss": 0.2833, "mean_token_accuracy": 0.9147102117538453, "num_tokens": 151297275.0, "step": 42870 }, { "entropy": 0.2894028449431062, "epoch": 0.6600743892137564, "grad_norm": 0.8687474131584167, "learning_rate": 1.9135211724559216e-05, "loss": 0.2938, "mean_token_accuracy": 0.912129633128643, "num_tokens": 151361069.0, "step": 42880 }, { "entropy": 0.2879825562238693, "epoch": 0.660228324472435, "grad_norm": 0.7836583852767944, "learning_rate": 1.9134482973587773e-05, "loss": 0.29, "mean_token_accuracy": 0.910827349871397, "num_tokens": 151427978.0, "step": 42890 }, { "entropy": 0.284980889596045, "epoch": 0.6603822597311136, "grad_norm": 0.5267102718353271, "learning_rate": 1.9133753929577925e-05, "loss": 0.2803, "mean_token_accuracy": 0.9106490157544613, "num_tokens": 151496604.0, "step": 42900 }, { "entropy": 0.2825344054959714, "epoch": 0.6605361949897922, "grad_norm": 0.6722867488861084, "learning_rate": 1.9133024592553066e-05, "loss": 0.286, "mean_token_accuracy": 0.9114262245595455, "num_tokens": 151560364.0, "step": 42910 }, { "entropy": 0.2902494288980961, "epoch": 0.6606901302484708, "grad_norm": 0.649868369102478, "learning_rate": 1.913229496253659e-05, "loss": 0.2886, "mean_token_accuracy": 0.9101741582155227, "num_tokens": 151634816.0, "step": 42920 }, { "entropy": 0.28516958532854914, "epoch": 0.6608440655071494, "grad_norm": 0.5998911261558533, "learning_rate": 1.9131565039551904e-05, "loss": 0.2921, "mean_token_accuracy": 0.9125364579260349, "num_tokens": 151704909.0, "step": 42930 }, { "entropy": 0.2794289187528193, "epoch": 0.6609980007658279, "grad_norm": 0.5095033049583435, "learning_rate": 1.9130834823622427e-05, "loss": 0.2845, "mean_token_accuracy": 0.9133609980344772, "num_tokens": 151768248.0, "step": 42940 }, { "entropy": 0.28138429932296277, "epoch": 0.6611519360245065, "grad_norm": 0.616126298904419, "learning_rate": 1.913010431477158e-05, "loss": 0.2859, "mean_token_accuracy": 0.9114858135581017, "num_tokens": 151832536.0, "step": 42950 }, { "entropy": 0.28788768844678997, "epoch": 0.6613058712831851, "grad_norm": 0.6669889092445374, "learning_rate": 1.9129373513022807e-05, "loss": 0.2915, "mean_token_accuracy": 0.9095337584614753, "num_tokens": 151902869.0, "step": 42960 }, { "entropy": 0.29138643108308315, "epoch": 0.6614598065418636, "grad_norm": 0.8322362303733826, "learning_rate": 1.912864241839954e-05, "loss": 0.2892, "mean_token_accuracy": 0.9119146570563317, "num_tokens": 151962436.0, "step": 42970 }, { "entropy": 0.3066822960972786, "epoch": 0.6616137418005422, "grad_norm": 0.6050609350204468, "learning_rate": 1.912791103092524e-05, "loss": 0.3109, "mean_token_accuracy": 0.9041274130344391, "num_tokens": 152026688.0, "step": 42980 }, { "entropy": 0.29994144309312104, "epoch": 0.6617676770592208, "grad_norm": 0.5598137378692627, "learning_rate": 1.9127179350623374e-05, "loss": 0.2961, "mean_token_accuracy": 0.9081742852926254, "num_tokens": 152101848.0, "step": 42990 }, { "entropy": 0.29957744432613254, "epoch": 0.6619216123178994, "grad_norm": 0.6282037496566772, "learning_rate": 1.9126447377517406e-05, "loss": 0.3001, "mean_token_accuracy": 0.906233037263155, "num_tokens": 152167463.0, "step": 43000 }, { "entropy": 0.2710247919894755, "epoch": 0.6620755475765779, "grad_norm": 0.6154890656471252, "learning_rate": 1.912571511163082e-05, "loss": 0.2712, "mean_token_accuracy": 0.9167703285813331, "num_tokens": 152234542.0, "step": 43010 }, { "entropy": 0.30569768575951456, "epoch": 0.6622294828352565, "grad_norm": 0.7486370205879211, "learning_rate": 1.912498255298711e-05, "loss": 0.2994, "mean_token_accuracy": 0.903989101946354, "num_tokens": 152300368.0, "step": 43020 }, { "entropy": 0.27779118772596123, "epoch": 0.6623834180939352, "grad_norm": 0.7208930850028992, "learning_rate": 1.912424970160978e-05, "loss": 0.2844, "mean_token_accuracy": 0.9126914441585541, "num_tokens": 152363672.0, "step": 43030 }, { "entropy": 0.26589377913624046, "epoch": 0.6625373533526138, "grad_norm": 0.6136458516120911, "learning_rate": 1.9123516557522335e-05, "loss": 0.269, "mean_token_accuracy": 0.9178180783987046, "num_tokens": 152438112.0, "step": 43040 }, { "entropy": 0.27729281755164265, "epoch": 0.6626912886112923, "grad_norm": 0.7037742137908936, "learning_rate": 1.9122783120748297e-05, "loss": 0.2873, "mean_token_accuracy": 0.9141374006867409, "num_tokens": 152509161.0, "step": 43050 }, { "entropy": 0.2987836763262749, "epoch": 0.6628452238699709, "grad_norm": 0.741265594959259, "learning_rate": 1.912204939131119e-05, "loss": 0.2956, "mean_token_accuracy": 0.9083013154566288, "num_tokens": 152573353.0, "step": 43060 }, { "entropy": 0.2950741548091173, "epoch": 0.6629991591286495, "grad_norm": 0.6422184705734253, "learning_rate": 1.912131536923456e-05, "loss": 0.3071, "mean_token_accuracy": 0.9091407790780067, "num_tokens": 152637672.0, "step": 43070 }, { "entropy": 0.28693766370415685, "epoch": 0.6631530943873281, "grad_norm": 0.567202091217041, "learning_rate": 1.9120581054541948e-05, "loss": 0.2908, "mean_token_accuracy": 0.9109734356403351, "num_tokens": 152695784.0, "step": 43080 }, { "entropy": 0.2797376153059304, "epoch": 0.6633070296460066, "grad_norm": 0.6182226538658142, "learning_rate": 1.911984644725692e-05, "loss": 0.2844, "mean_token_accuracy": 0.912682393193245, "num_tokens": 152759144.0, "step": 43090 }, { "entropy": 0.28220556462183594, "epoch": 0.6634609649046852, "grad_norm": 0.7196092009544373, "learning_rate": 1.9119111547403033e-05, "loss": 0.287, "mean_token_accuracy": 0.9114175461232662, "num_tokens": 152828418.0, "step": 43100 }, { "entropy": 0.2762479469180107, "epoch": 0.6636149001633638, "grad_norm": 0.5989877581596375, "learning_rate": 1.9118376355003866e-05, "loss": 0.2798, "mean_token_accuracy": 0.9140569925308227, "num_tokens": 152896190.0, "step": 43110 }, { "entropy": 0.26110141817480326, "epoch": 0.6637688354220423, "grad_norm": 0.593693196773529, "learning_rate": 1.9117640870083005e-05, "loss": 0.2672, "mean_token_accuracy": 0.9169657759368419, "num_tokens": 152965997.0, "step": 43120 }, { "entropy": 0.2912089556455612, "epoch": 0.6639227706807209, "grad_norm": 0.5534926652908325, "learning_rate": 1.911690509266405e-05, "loss": 0.287, "mean_token_accuracy": 0.9102806232869625, "num_tokens": 153027927.0, "step": 43130 }, { "entropy": 0.2621348577551544, "epoch": 0.6640767059393995, "grad_norm": 0.5716041326522827, "learning_rate": 1.9116169022770594e-05, "loss": 0.2627, "mean_token_accuracy": 0.9192001067101956, "num_tokens": 153094931.0, "step": 43140 }, { "entropy": 0.28053724551573395, "epoch": 0.6642306411980781, "grad_norm": 0.6471245288848877, "learning_rate": 1.9115432660426257e-05, "loss": 0.2871, "mean_token_accuracy": 0.9120657868683338, "num_tokens": 153173364.0, "step": 43150 }, { "entropy": 0.28969929479062556, "epoch": 0.6643845764567567, "grad_norm": 0.6643755435943604, "learning_rate": 1.9114696005654664e-05, "loss": 0.2908, "mean_token_accuracy": 0.909358910471201, "num_tokens": 153245860.0, "step": 43160 }, { "entropy": 0.2793875581584871, "epoch": 0.6645385117154353, "grad_norm": 0.7461796402931213, "learning_rate": 1.911395905847944e-05, "loss": 0.2779, "mean_token_accuracy": 0.9150628596544266, "num_tokens": 153310826.0, "step": 43170 }, { "entropy": 0.29074644511565567, "epoch": 0.6646924469741139, "grad_norm": 0.6065770983695984, "learning_rate": 1.9113221818924236e-05, "loss": 0.2932, "mean_token_accuracy": 0.9066679820418357, "num_tokens": 153384042.0, "step": 43180 }, { "entropy": 0.29596511330455544, "epoch": 0.6648463822327925, "grad_norm": 0.7078844904899597, "learning_rate": 1.9112484287012696e-05, "loss": 0.3068, "mean_token_accuracy": 0.9080710537731648, "num_tokens": 153442586.0, "step": 43190 }, { "entropy": 0.27467961898073556, "epoch": 0.665000317491471, "grad_norm": 0.5586185455322266, "learning_rate": 1.911174646276848e-05, "loss": 0.2798, "mean_token_accuracy": 0.9162917844951153, "num_tokens": 153499004.0, "step": 43200 }, { "entropy": 0.2879514364525676, "epoch": 0.6651542527501496, "grad_norm": 0.5843605995178223, "learning_rate": 1.9111008346215267e-05, "loss": 0.291, "mean_token_accuracy": 0.9139048613607883, "num_tokens": 153561897.0, "step": 43210 }, { "entropy": 0.27859767330810425, "epoch": 0.6653081880088282, "grad_norm": 0.5909144282341003, "learning_rate": 1.9110269937376723e-05, "loss": 0.2714, "mean_token_accuracy": 0.913109689205885, "num_tokens": 153619153.0, "step": 43220 }, { "entropy": 0.27969750519841907, "epoch": 0.6654621232675068, "grad_norm": 0.5810877084732056, "learning_rate": 1.9109531236276543e-05, "loss": 0.2732, "mean_token_accuracy": 0.9131002500653267, "num_tokens": 153684966.0, "step": 43230 }, { "entropy": 0.28670245883986356, "epoch": 0.6656160585261853, "grad_norm": 0.7164415121078491, "learning_rate": 1.9108792242938425e-05, "loss": 0.2948, "mean_token_accuracy": 0.9096420787274837, "num_tokens": 153753247.0, "step": 43240 }, { "entropy": 0.3021384459920228, "epoch": 0.6657699937848639, "grad_norm": 0.6132495999336243, "learning_rate": 1.910805295738608e-05, "loss": 0.2875, "mean_token_accuracy": 0.9062787137925625, "num_tokens": 153824629.0, "step": 43250 }, { "entropy": 0.270349726267159, "epoch": 0.6659239290435425, "grad_norm": 0.6000775694847107, "learning_rate": 1.9107313379643214e-05, "loss": 0.2801, "mean_token_accuracy": 0.9151061296463012, "num_tokens": 153888908.0, "step": 43260 }, { "entropy": 0.273435090854764, "epoch": 0.666077864302221, "grad_norm": 0.59184330701828, "learning_rate": 1.9106573509733562e-05, "loss": 0.2924, "mean_token_accuracy": 0.9144847877323627, "num_tokens": 153959009.0, "step": 43270 }, { "entropy": 0.2869518126361072, "epoch": 0.6662317995608997, "grad_norm": 0.6508246660232544, "learning_rate": 1.9105833347680858e-05, "loss": 0.2919, "mean_token_accuracy": 0.9101931802928448, "num_tokens": 154020149.0, "step": 43280 }, { "entropy": 0.2743998027406633, "epoch": 0.6663857348195783, "grad_norm": 0.7857657670974731, "learning_rate": 1.9105092893508845e-05, "loss": 0.2804, "mean_token_accuracy": 0.9158991739153862, "num_tokens": 154079433.0, "step": 43290 }, { "entropy": 0.2809670478105545, "epoch": 0.6665396700782569, "grad_norm": 0.8084327578544617, "learning_rate": 1.9104352147241277e-05, "loss": 0.2856, "mean_token_accuracy": 0.9139533028006553, "num_tokens": 154150313.0, "step": 43300 }, { "entropy": 0.25953824883326887, "epoch": 0.6666936053369354, "grad_norm": 0.6944770216941833, "learning_rate": 1.910361110890192e-05, "loss": 0.2754, "mean_token_accuracy": 0.9177754268050193, "num_tokens": 154208922.0, "step": 43310 }, { "entropy": 0.29073634603992105, "epoch": 0.666847540595614, "grad_norm": 0.6094241738319397, "learning_rate": 1.9102869778514543e-05, "loss": 0.3019, "mean_token_accuracy": 0.9094732709228992, "num_tokens": 154276005.0, "step": 43320 }, { "entropy": 0.2850344073958695, "epoch": 0.6670014758542926, "grad_norm": 0.8994444012641907, "learning_rate": 1.910212815610293e-05, "loss": 0.2951, "mean_token_accuracy": 0.9114385075867176, "num_tokens": 154342753.0, "step": 43330 }, { "entropy": 0.288453075196594, "epoch": 0.6671554111129712, "grad_norm": 0.8096613883972168, "learning_rate": 1.910138624169087e-05, "loss": 0.2878, "mean_token_accuracy": 0.9093158468604088, "num_tokens": 154402335.0, "step": 43340 }, { "entropy": 0.26898630941286683, "epoch": 0.6673093463716497, "grad_norm": 0.6901741027832031, "learning_rate": 1.910064403530217e-05, "loss": 0.2778, "mean_token_accuracy": 0.9161612592637539, "num_tokens": 154461328.0, "step": 43350 }, { "entropy": 0.28594037061557176, "epoch": 0.6674632816303283, "grad_norm": 0.6701340675354004, "learning_rate": 1.909990153696064e-05, "loss": 0.2871, "mean_token_accuracy": 0.910988861322403, "num_tokens": 154521240.0, "step": 43360 }, { "entropy": 0.28428381606936454, "epoch": 0.6676172168890069, "grad_norm": 0.6290795207023621, "learning_rate": 1.909915874669009e-05, "loss": 0.282, "mean_token_accuracy": 0.9136812634766102, "num_tokens": 154584959.0, "step": 43370 }, { "entropy": 0.2773758596740663, "epoch": 0.6677711521476855, "grad_norm": 0.630957841873169, "learning_rate": 1.9098415664514363e-05, "loss": 0.2874, "mean_token_accuracy": 0.911760152131319, "num_tokens": 154652068.0, "step": 43380 }, { "entropy": 0.282165186945349, "epoch": 0.667925087406364, "grad_norm": 0.5689296722412109, "learning_rate": 1.9097672290457286e-05, "loss": 0.2839, "mean_token_accuracy": 0.9133245214819908, "num_tokens": 154721994.0, "step": 43390 }, { "entropy": 0.2778062827885151, "epoch": 0.6680790226650426, "grad_norm": 0.7282940149307251, "learning_rate": 1.909692862454271e-05, "loss": 0.2712, "mean_token_accuracy": 0.9148274205625058, "num_tokens": 154774752.0, "step": 43400 }, { "entropy": 0.3049618422053754, "epoch": 0.6682329579237213, "grad_norm": 0.6116664409637451, "learning_rate": 1.9096184666794495e-05, "loss": 0.3086, "mean_token_accuracy": 0.9034510366618633, "num_tokens": 154846159.0, "step": 43410 }, { "entropy": 0.2814322475343943, "epoch": 0.6683868931823999, "grad_norm": 0.6028491258621216, "learning_rate": 1.9095440417236507e-05, "loss": 0.2878, "mean_token_accuracy": 0.912943335622549, "num_tokens": 154915016.0, "step": 43420 }, { "entropy": 0.27289552688598634, "epoch": 0.6685408284410784, "grad_norm": 0.6396311521530151, "learning_rate": 1.909469587589262e-05, "loss": 0.289, "mean_token_accuracy": 0.9154729835689068, "num_tokens": 154981841.0, "step": 43430 }, { "entropy": 0.2843230086378753, "epoch": 0.668694763699757, "grad_norm": 0.7305309176445007, "learning_rate": 1.9093951042786717e-05, "loss": 0.309, "mean_token_accuracy": 0.9099556252360343, "num_tokens": 155044813.0, "step": 43440 }, { "entropy": 0.28051521396264434, "epoch": 0.6688486989584356, "grad_norm": 0.5600211024284363, "learning_rate": 1.90932059179427e-05, "loss": 0.2864, "mean_token_accuracy": 0.91390845105052, "num_tokens": 155108764.0, "step": 43450 }, { "entropy": 0.25980738224461675, "epoch": 0.6690026342171141, "grad_norm": 0.6297683119773865, "learning_rate": 1.9092460501384464e-05, "loss": 0.261, "mean_token_accuracy": 0.9192338518798351, "num_tokens": 155176071.0, "step": 43460 }, { "entropy": 0.2797685387544334, "epoch": 0.6691565694757927, "grad_norm": 0.6150873899459839, "learning_rate": 1.9091714793135926e-05, "loss": 0.2854, "mean_token_accuracy": 0.9094043910503388, "num_tokens": 155240115.0, "step": 43470 }, { "entropy": 0.26568193892017006, "epoch": 0.6693105047344713, "grad_norm": 0.6440735459327698, "learning_rate": 1.909096879322101e-05, "loss": 0.2706, "mean_token_accuracy": 0.9174276329576969, "num_tokens": 155301542.0, "step": 43480 }, { "entropy": 0.31178731033578516, "epoch": 0.6694644399931499, "grad_norm": 1.0605449676513672, "learning_rate": 1.9090222501663648e-05, "loss": 0.3154, "mean_token_accuracy": 0.9049247242510319, "num_tokens": 155361907.0, "step": 43490 }, { "entropy": 0.29898407673463223, "epoch": 0.6696183752518284, "grad_norm": 0.7024593353271484, "learning_rate": 1.9089475918487782e-05, "loss": 0.2913, "mean_token_accuracy": 0.9077180705964565, "num_tokens": 155422756.0, "step": 43500 }, { "entropy": 0.28803847581148145, "epoch": 0.669772310510507, "grad_norm": 0.513947069644928, "learning_rate": 1.908872904371736e-05, "loss": 0.302, "mean_token_accuracy": 0.9115533292293548, "num_tokens": 155483030.0, "step": 43510 }, { "entropy": 0.2843332932330668, "epoch": 0.6699262457691856, "grad_norm": 0.5216326713562012, "learning_rate": 1.9087981877376346e-05, "loss": 0.288, "mean_token_accuracy": 0.9111464954912663, "num_tokens": 155540238.0, "step": 43520 }, { "entropy": 0.27528282115235925, "epoch": 0.6700801810278642, "grad_norm": 0.6236678957939148, "learning_rate": 1.9087234419488702e-05, "loss": 0.2877, "mean_token_accuracy": 0.9160291768610478, "num_tokens": 155601398.0, "step": 43530 }, { "entropy": 0.2903729798272252, "epoch": 0.6702341162865428, "grad_norm": 0.7544373273849487, "learning_rate": 1.9086486670078416e-05, "loss": 0.2931, "mean_token_accuracy": 0.9095414325594902, "num_tokens": 155666867.0, "step": 43540 }, { "entropy": 0.2794504711404443, "epoch": 0.6703880515452214, "grad_norm": 0.7779355645179749, "learning_rate": 1.908573862916947e-05, "loss": 0.2965, "mean_token_accuracy": 0.9109555363655091, "num_tokens": 155726914.0, "step": 43550 }, { "entropy": 0.2719670864753425, "epoch": 0.6705419868039, "grad_norm": 0.7167548537254333, "learning_rate": 1.9084990296785864e-05, "loss": 0.274, "mean_token_accuracy": 0.9151463255286216, "num_tokens": 155788436.0, "step": 43560 }, { "entropy": 0.3018341972492635, "epoch": 0.6706959220625786, "grad_norm": 0.6285359263420105, "learning_rate": 1.90842416729516e-05, "loss": 0.3002, "mean_token_accuracy": 0.9052831336855889, "num_tokens": 155863089.0, "step": 43570 }, { "entropy": 0.3055307688191533, "epoch": 0.6708498573212571, "grad_norm": 0.680708110332489, "learning_rate": 1.9083492757690706e-05, "loss": 0.3026, "mean_token_accuracy": 0.9072289496660233, "num_tokens": 155923137.0, "step": 43580 }, { "entropy": 0.27721311133354903, "epoch": 0.6710037925799357, "grad_norm": 0.5449482202529907, "learning_rate": 1.9082743551027194e-05, "loss": 0.2837, "mean_token_accuracy": 0.9155504323542119, "num_tokens": 155987978.0, "step": 43590 }, { "entropy": 0.2910130187869072, "epoch": 0.6711577278386143, "grad_norm": 0.6428499221801758, "learning_rate": 1.9081994052985107e-05, "loss": 0.2955, "mean_token_accuracy": 0.9107147283852101, "num_tokens": 156050903.0, "step": 43600 }, { "entropy": 0.2731150073930621, "epoch": 0.6713116630972928, "grad_norm": 0.5164161920547485, "learning_rate": 1.9081244263588488e-05, "loss": 0.2643, "mean_token_accuracy": 0.9140905342996121, "num_tokens": 156116008.0, "step": 43610 }, { "entropy": 0.2850996458902955, "epoch": 0.6714655983559714, "grad_norm": 0.8367108106613159, "learning_rate": 1.9080494182861388e-05, "loss": 0.2936, "mean_token_accuracy": 0.9115302927792073, "num_tokens": 156179492.0, "step": 43620 }, { "entropy": 0.26969990828074514, "epoch": 0.67161953361465, "grad_norm": 0.6246567368507385, "learning_rate": 1.907974381082787e-05, "loss": 0.2646, "mean_token_accuracy": 0.9192307636141777, "num_tokens": 156247548.0, "step": 43630 }, { "entropy": 0.30461759138852357, "epoch": 0.6717734688733286, "grad_norm": 0.5860620737075806, "learning_rate": 1.907899314751201e-05, "loss": 0.3025, "mean_token_accuracy": 0.904623418301344, "num_tokens": 156316347.0, "step": 43640 }, { "entropy": 0.3020380949601531, "epoch": 0.6719274041320071, "grad_norm": 0.7026848196983337, "learning_rate": 1.9078242192937888e-05, "loss": 0.3168, "mean_token_accuracy": 0.90665233284235, "num_tokens": 156386055.0, "step": 43650 }, { "entropy": 0.28933491790667176, "epoch": 0.6720813393906858, "grad_norm": 0.5859882235527039, "learning_rate": 1.9077490947129593e-05, "loss": 0.3061, "mean_token_accuracy": 0.911451606452465, "num_tokens": 156452943.0, "step": 43660 }, { "entropy": 0.2841335689648986, "epoch": 0.6722352746493644, "grad_norm": 0.7160584330558777, "learning_rate": 1.907673941011123e-05, "loss": 0.2794, "mean_token_accuracy": 0.9097836814820767, "num_tokens": 156512017.0, "step": 43670 }, { "entropy": 0.26992869209498166, "epoch": 0.672389209908043, "grad_norm": 0.6935170888900757, "learning_rate": 1.90759875819069e-05, "loss": 0.2879, "mean_token_accuracy": 0.9158839322626591, "num_tokens": 156585508.0, "step": 43680 }, { "entropy": 0.2983346282504499, "epoch": 0.6725431451667215, "grad_norm": 0.7599006295204163, "learning_rate": 1.907523546254073e-05, "loss": 0.2978, "mean_token_accuracy": 0.909250108152628, "num_tokens": 156639535.0, "step": 43690 }, { "entropy": 0.2906131321564317, "epoch": 0.6726970804254001, "grad_norm": 0.730605959892273, "learning_rate": 1.9074483052036848e-05, "loss": 0.2899, "mean_token_accuracy": 0.9107715882360935, "num_tokens": 156699665.0, "step": 43700 }, { "entropy": 0.27695224341005087, "epoch": 0.6728510156840787, "grad_norm": 0.626062273979187, "learning_rate": 1.9073730350419386e-05, "loss": 0.279, "mean_token_accuracy": 0.9174249149858952, "num_tokens": 156758113.0, "step": 43710 }, { "entropy": 0.2649639630690217, "epoch": 0.6730049509427573, "grad_norm": 0.7254077792167664, "learning_rate": 1.9072977357712495e-05, "loss": 0.269, "mean_token_accuracy": 0.9181448191404342, "num_tokens": 156816143.0, "step": 43720 }, { "entropy": 0.2812258177436888, "epoch": 0.6731588862014358, "grad_norm": 0.5647891163825989, "learning_rate": 1.907222407394033e-05, "loss": 0.2965, "mean_token_accuracy": 0.9137782484292984, "num_tokens": 156883277.0, "step": 43730 }, { "entropy": 0.2796397945843637, "epoch": 0.6733128214601144, "grad_norm": 0.650261402130127, "learning_rate": 1.907147049912706e-05, "loss": 0.28, "mean_token_accuracy": 0.9123042412102222, "num_tokens": 156956608.0, "step": 43740 }, { "entropy": 0.27357883248478176, "epoch": 0.673466756718793, "grad_norm": 0.5200901031494141, "learning_rate": 1.9070716633296855e-05, "loss": 0.271, "mean_token_accuracy": 0.9183363005518913, "num_tokens": 157018353.0, "step": 43750 }, { "entropy": 0.2852286048233509, "epoch": 0.6736206919774715, "grad_norm": 0.6938188076019287, "learning_rate": 1.9069962476473903e-05, "loss": 0.3012, "mean_token_accuracy": 0.9103206127882004, "num_tokens": 157080600.0, "step": 43760 }, { "entropy": 0.29564509307965636, "epoch": 0.6737746272361501, "grad_norm": 0.7695992588996887, "learning_rate": 1.9069208028682395e-05, "loss": 0.2938, "mean_token_accuracy": 0.9077957220375538, "num_tokens": 157146457.0, "step": 43770 }, { "entropy": 0.2919221095740795, "epoch": 0.6739285624948287, "grad_norm": 0.5212399959564209, "learning_rate": 1.9068453289946537e-05, "loss": 0.2965, "mean_token_accuracy": 0.9108749687671661, "num_tokens": 157222014.0, "step": 43780 }, { "entropy": 0.27582877352833746, "epoch": 0.6740824977535074, "grad_norm": 0.653262734413147, "learning_rate": 1.9067698260290543e-05, "loss": 0.2833, "mean_token_accuracy": 0.9139208339154721, "num_tokens": 157280964.0, "step": 43790 }, { "entropy": 0.2793005960993469, "epoch": 0.6742364330121859, "grad_norm": 0.6773855090141296, "learning_rate": 1.9066942939738627e-05, "loss": 0.2901, "mean_token_accuracy": 0.9090862900018692, "num_tokens": 157345698.0, "step": 43800 }, { "entropy": 0.3016113521531224, "epoch": 0.6743903682708645, "grad_norm": 0.6488208770751953, "learning_rate": 1.9066187328315027e-05, "loss": 0.3065, "mean_token_accuracy": 0.9069923922419548, "num_tokens": 157420036.0, "step": 43810 }, { "entropy": 0.26745306635275484, "epoch": 0.6745443035295431, "grad_norm": 0.5695841312408447, "learning_rate": 1.906543142604398e-05, "loss": 0.2626, "mean_token_accuracy": 0.9181425139307976, "num_tokens": 157492104.0, "step": 43820 }, { "entropy": 0.2931372937746346, "epoch": 0.6746982387882217, "grad_norm": 0.6001129746437073, "learning_rate": 1.9064675232949736e-05, "loss": 0.3072, "mean_token_accuracy": 0.9090686842799187, "num_tokens": 157566451.0, "step": 43830 }, { "entropy": 0.2774711363017559, "epoch": 0.6748521740469002, "grad_norm": 0.5945842266082764, "learning_rate": 1.9063918749056553e-05, "loss": 0.2782, "mean_token_accuracy": 0.9124567940831184, "num_tokens": 157635621.0, "step": 43840 }, { "entropy": 0.2759257884696126, "epoch": 0.6750061093055788, "grad_norm": 0.5323203802108765, "learning_rate": 1.9063161974388705e-05, "loss": 0.2816, "mean_token_accuracy": 0.915196418762207, "num_tokens": 157703237.0, "step": 43850 }, { "entropy": 0.2923116099089384, "epoch": 0.6751600445642574, "grad_norm": 0.5308940410614014, "learning_rate": 1.9062404908970466e-05, "loss": 0.2974, "mean_token_accuracy": 0.9102632574737072, "num_tokens": 157771851.0, "step": 43860 }, { "entropy": 0.2771931115537882, "epoch": 0.675313979822936, "grad_norm": 0.6313517689704895, "learning_rate": 1.906164755282612e-05, "loss": 0.2826, "mean_token_accuracy": 0.9151696987450123, "num_tokens": 157838017.0, "step": 43870 }, { "entropy": 0.28171082539483905, "epoch": 0.6754679150816145, "grad_norm": 0.6285293698310852, "learning_rate": 1.9060889905979968e-05, "loss": 0.2675, "mean_token_accuracy": 0.9135595262050629, "num_tokens": 157908264.0, "step": 43880 }, { "entropy": 0.27486395379528406, "epoch": 0.6756218503402931, "grad_norm": 0.6698090434074402, "learning_rate": 1.906013196845631e-05, "loss": 0.2887, "mean_token_accuracy": 0.9132810853421688, "num_tokens": 157966526.0, "step": 43890 }, { "entropy": 0.29337240010499954, "epoch": 0.6757757855989717, "grad_norm": 0.600729763507843, "learning_rate": 1.905937374027947e-05, "loss": 0.2907, "mean_token_accuracy": 0.9105646014213562, "num_tokens": 158029156.0, "step": 43900 }, { "entropy": 0.2760942796245217, "epoch": 0.6759297208576502, "grad_norm": 0.7478737235069275, "learning_rate": 1.9058615221473764e-05, "loss": 0.2667, "mean_token_accuracy": 0.9146930351853371, "num_tokens": 158090617.0, "step": 43910 }, { "entropy": 0.2734149988740683, "epoch": 0.6760836561163289, "grad_norm": 0.5981205105781555, "learning_rate": 1.905785641206353e-05, "loss": 0.2893, "mean_token_accuracy": 0.9135112226009369, "num_tokens": 158162122.0, "step": 43920 }, { "entropy": 0.26209351839497685, "epoch": 0.6762375913750075, "grad_norm": 0.7048168778419495, "learning_rate": 1.9057097312073107e-05, "loss": 0.2677, "mean_token_accuracy": 0.9198895715177059, "num_tokens": 158232606.0, "step": 43930 }, { "entropy": 0.2939992857165635, "epoch": 0.6763915266336861, "grad_norm": 0.5727118253707886, "learning_rate": 1.9056337921526853e-05, "loss": 0.2894, "mean_token_accuracy": 0.9103999584913254, "num_tokens": 158308805.0, "step": 43940 }, { "entropy": 0.2490351302549243, "epoch": 0.6765454618923646, "grad_norm": 0.5679351091384888, "learning_rate": 1.9055578240449127e-05, "loss": 0.2626, "mean_token_accuracy": 0.9213403716683388, "num_tokens": 158380069.0, "step": 43950 }, { "entropy": 0.28678468652069566, "epoch": 0.6766993971510432, "grad_norm": 0.6231617331504822, "learning_rate": 1.9054818268864295e-05, "loss": 0.2896, "mean_token_accuracy": 0.907254672050476, "num_tokens": 158439977.0, "step": 43960 }, { "entropy": 0.24765914594754576, "epoch": 0.6768533324097218, "grad_norm": 0.537750244140625, "learning_rate": 1.9054058006796748e-05, "loss": 0.2556, "mean_token_accuracy": 0.9236360304057598, "num_tokens": 158509773.0, "step": 43970 }, { "entropy": 0.3105886295437813, "epoch": 0.6770072676684004, "grad_norm": 0.5567288398742676, "learning_rate": 1.9053297454270866e-05, "loss": 0.3002, "mean_token_accuracy": 0.9035798616707325, "num_tokens": 158575734.0, "step": 43980 }, { "entropy": 0.28959731152281165, "epoch": 0.6771612029270789, "grad_norm": 0.57724529504776, "learning_rate": 1.9052536611311047e-05, "loss": 0.2892, "mean_token_accuracy": 0.9084495186805726, "num_tokens": 158646045.0, "step": 43990 }, { "entropy": 0.27037055222317574, "epoch": 0.6773151381857575, "grad_norm": 0.6590880155563354, "learning_rate": 1.9051775477941706e-05, "loss": 0.282, "mean_token_accuracy": 0.9173475444316864, "num_tokens": 158708206.0, "step": 44000 }, { "entropy": 0.2916911454871297, "epoch": 0.6774690734444361, "grad_norm": 0.5078679323196411, "learning_rate": 1.9051014054187262e-05, "loss": 0.2833, "mean_token_accuracy": 0.9088791973888874, "num_tokens": 158771679.0, "step": 44010 }, { "entropy": 0.2830917598679662, "epoch": 0.6776230087031146, "grad_norm": 0.4845653176307678, "learning_rate": 1.9050252340072132e-05, "loss": 0.2867, "mean_token_accuracy": 0.9129835173487664, "num_tokens": 158847025.0, "step": 44020 }, { "entropy": 0.26741807255893946, "epoch": 0.6777769439617932, "grad_norm": 0.7397655844688416, "learning_rate": 1.9049490335620763e-05, "loss": 0.2749, "mean_token_accuracy": 0.915567721426487, "num_tokens": 158919378.0, "step": 44030 }, { "entropy": 0.27804513480514287, "epoch": 0.6779308792204719, "grad_norm": 0.7264512181282043, "learning_rate": 1.9048728040857592e-05, "loss": 0.2834, "mean_token_accuracy": 0.9137001134455204, "num_tokens": 158979804.0, "step": 44040 }, { "entropy": 0.26798576973378657, "epoch": 0.6780848144791505, "grad_norm": 0.6896481513977051, "learning_rate": 1.9047965455807076e-05, "loss": 0.2826, "mean_token_accuracy": 0.9146680302917958, "num_tokens": 159050188.0, "step": 44050 }, { "entropy": 0.28451199680566785, "epoch": 0.678238749737829, "grad_norm": 0.7112587690353394, "learning_rate": 1.904720258049368e-05, "loss": 0.2994, "mean_token_accuracy": 0.9116448380053044, "num_tokens": 159114675.0, "step": 44060 }, { "entropy": 0.26450355667620895, "epoch": 0.6783926849965076, "grad_norm": 0.6128736734390259, "learning_rate": 1.904643941494188e-05, "loss": 0.2645, "mean_token_accuracy": 0.9184626683592796, "num_tokens": 159189116.0, "step": 44070 }, { "entropy": 0.28325996920466423, "epoch": 0.6785466202551862, "grad_norm": 0.7223693132400513, "learning_rate": 1.9045675959176153e-05, "loss": 0.2941, "mean_token_accuracy": 0.9115894496440887, "num_tokens": 159244621.0, "step": 44080 }, { "entropy": 0.276500410400331, "epoch": 0.6787005555138648, "grad_norm": 0.5628377199172974, "learning_rate": 1.9044912213220996e-05, "loss": 0.2902, "mean_token_accuracy": 0.9148767277598381, "num_tokens": 159315021.0, "step": 44090 }, { "entropy": 0.27229378959164025, "epoch": 0.6788544907725433, "grad_norm": 0.6257047057151794, "learning_rate": 1.904414817710091e-05, "loss": 0.2743, "mean_token_accuracy": 0.9162360750138759, "num_tokens": 159379184.0, "step": 44100 }, { "entropy": 0.2656897102482617, "epoch": 0.6790084260312219, "grad_norm": 0.5313580632209778, "learning_rate": 1.90433838508404e-05, "loss": 0.2723, "mean_token_accuracy": 0.9177310936152935, "num_tokens": 159454857.0, "step": 44110 }, { "entropy": 0.2577522801235318, "epoch": 0.6791623612899005, "grad_norm": 0.6437742114067078, "learning_rate": 1.9042619234463993e-05, "loss": 0.2688, "mean_token_accuracy": 0.9184901401400566, "num_tokens": 159531528.0, "step": 44120 }, { "entropy": 0.2995790036395192, "epoch": 0.6793162965485791, "grad_norm": 0.6125620603561401, "learning_rate": 1.9041854327996215e-05, "loss": 0.3018, "mean_token_accuracy": 0.9073189742863178, "num_tokens": 159607562.0, "step": 44130 }, { "entropy": 0.28445497518405316, "epoch": 0.6794702318072576, "grad_norm": 0.6694263815879822, "learning_rate": 1.9041089131461607e-05, "loss": 0.2848, "mean_token_accuracy": 0.9098239101469516, "num_tokens": 159676706.0, "step": 44140 }, { "entropy": 0.2808829491958022, "epoch": 0.6796241670659362, "grad_norm": 0.7658823728561401, "learning_rate": 1.904032364488471e-05, "loss": 0.2844, "mean_token_accuracy": 0.9131200909614563, "num_tokens": 159729896.0, "step": 44150 }, { "entropy": 0.2913707866333425, "epoch": 0.6797781023246148, "grad_norm": 0.5381408333778381, "learning_rate": 1.9039557868290085e-05, "loss": 0.3019, "mean_token_accuracy": 0.910252571105957, "num_tokens": 159799607.0, "step": 44160 }, { "entropy": 0.29226557863876224, "epoch": 0.6799320375832935, "grad_norm": 0.7600438594818115, "learning_rate": 1.9038791801702302e-05, "loss": 0.279, "mean_token_accuracy": 0.9104148641228675, "num_tokens": 159858966.0, "step": 44170 }, { "entropy": 0.2868138233199716, "epoch": 0.680085972841972, "grad_norm": 0.6973016262054443, "learning_rate": 1.903802544514594e-05, "loss": 0.2887, "mean_token_accuracy": 0.9096319124102592, "num_tokens": 159919602.0, "step": 44180 }, { "entropy": 0.26056279996410014, "epoch": 0.6802399081006506, "grad_norm": 0.5257284045219421, "learning_rate": 1.9037258798645567e-05, "loss": 0.2583, "mean_token_accuracy": 0.919570679217577, "num_tokens": 159984085.0, "step": 44190 }, { "entropy": 0.266432648152113, "epoch": 0.6803938433593292, "grad_norm": 0.8164317607879639, "learning_rate": 1.9036491862225793e-05, "loss": 0.2711, "mean_token_accuracy": 0.9127192072570324, "num_tokens": 160042500.0, "step": 44200 }, { "entropy": 0.27875033067539334, "epoch": 0.6805477786180077, "grad_norm": 0.6642779111862183, "learning_rate": 1.9035724635911222e-05, "loss": 0.2903, "mean_token_accuracy": 0.9113708682358265, "num_tokens": 160113266.0, "step": 44210 }, { "entropy": 0.28705096719786527, "epoch": 0.6807017138766863, "grad_norm": 0.837016761302948, "learning_rate": 1.9034957119726456e-05, "loss": 0.2935, "mean_token_accuracy": 0.9108832716941834, "num_tokens": 160176091.0, "step": 44220 }, { "entropy": 0.2838635925203562, "epoch": 0.6808556491353649, "grad_norm": 0.560225784778595, "learning_rate": 1.9034189313696126e-05, "loss": 0.2894, "mean_token_accuracy": 0.913162799179554, "num_tokens": 160242503.0, "step": 44230 }, { "entropy": 0.268189961835742, "epoch": 0.6810095843940435, "grad_norm": 0.748110294342041, "learning_rate": 1.903342121784486e-05, "loss": 0.2693, "mean_token_accuracy": 0.9158104173839092, "num_tokens": 160308515.0, "step": 44240 }, { "entropy": 0.2762747272849083, "epoch": 0.681163519652722, "grad_norm": 0.5905294418334961, "learning_rate": 1.90326528321973e-05, "loss": 0.2785, "mean_token_accuracy": 0.915319700539112, "num_tokens": 160375545.0, "step": 44250 }, { "entropy": 0.27399718230590225, "epoch": 0.6813174549114006, "grad_norm": 0.5718430876731873, "learning_rate": 1.9031884156778097e-05, "loss": 0.2884, "mean_token_accuracy": 0.9132949061691761, "num_tokens": 160435421.0, "step": 44260 }, { "entropy": 0.27036379063501953, "epoch": 0.6814713901700792, "grad_norm": 0.6119801998138428, "learning_rate": 1.903111519161191e-05, "loss": 0.2691, "mean_token_accuracy": 0.9158527992665768, "num_tokens": 160500955.0, "step": 44270 }, { "entropy": 0.2915733135305345, "epoch": 0.6816253254287578, "grad_norm": 0.6361083388328552, "learning_rate": 1.9030345936723405e-05, "loss": 0.2953, "mean_token_accuracy": 0.9068191669881344, "num_tokens": 160563273.0, "step": 44280 }, { "entropy": 0.28699868293479086, "epoch": 0.6817792606874363, "grad_norm": 0.8087133765220642, "learning_rate": 1.9029576392137263e-05, "loss": 0.2896, "mean_token_accuracy": 0.9112575829029084, "num_tokens": 160620755.0, "step": 44290 }, { "entropy": 0.27587278839200735, "epoch": 0.681933195946115, "grad_norm": 0.6309298872947693, "learning_rate": 1.9028806557878172e-05, "loss": 0.2785, "mean_token_accuracy": 0.9149789810180664, "num_tokens": 160695075.0, "step": 44300 }, { "entropy": 0.27558902082964776, "epoch": 0.6820871312047936, "grad_norm": 0.6554559469223022, "learning_rate": 1.9028036433970825e-05, "loss": 0.2913, "mean_token_accuracy": 0.914205702394247, "num_tokens": 160758968.0, "step": 44310 }, { "entropy": 0.26804550979286434, "epoch": 0.6822410664634722, "grad_norm": 0.571194589138031, "learning_rate": 1.9027266020439935e-05, "loss": 0.2731, "mean_token_accuracy": 0.9168618783354759, "num_tokens": 160822810.0, "step": 44320 }, { "entropy": 0.28488434106111526, "epoch": 0.6823950017221507, "grad_norm": 0.6609893441200256, "learning_rate": 1.9026495317310205e-05, "loss": 0.2871, "mean_token_accuracy": 0.9111726514995098, "num_tokens": 160886563.0, "step": 44330 }, { "entropy": 0.27801548605784776, "epoch": 0.6825489369808293, "grad_norm": 0.5872876644134521, "learning_rate": 1.9025724324606372e-05, "loss": 0.2894, "mean_token_accuracy": 0.9151597931981087, "num_tokens": 160954499.0, "step": 44340 }, { "entropy": 0.27500994196161627, "epoch": 0.6827028722395079, "grad_norm": 0.7807015776634216, "learning_rate": 1.902495304235316e-05, "loss": 0.2759, "mean_token_accuracy": 0.9156386815011501, "num_tokens": 161011031.0, "step": 44350 }, { "entropy": 0.2723522925749421, "epoch": 0.6828568074981864, "grad_norm": 0.5788021087646484, "learning_rate": 1.9024181470575325e-05, "loss": 0.2911, "mean_token_accuracy": 0.9150971807539463, "num_tokens": 161072968.0, "step": 44360 }, { "entropy": 0.28501264452934266, "epoch": 0.683010742756865, "grad_norm": 0.7425968647003174, "learning_rate": 1.9023409609297605e-05, "loss": 0.2962, "mean_token_accuracy": 0.9101709760725498, "num_tokens": 161143391.0, "step": 44370 }, { "entropy": 0.2745893469080329, "epoch": 0.6831646780155436, "grad_norm": 0.7189406156539917, "learning_rate": 1.902263745854477e-05, "loss": 0.2793, "mean_token_accuracy": 0.9142208270728588, "num_tokens": 161207983.0, "step": 44380 }, { "entropy": 0.2773917311802506, "epoch": 0.6833186132742222, "grad_norm": 0.6560516357421875, "learning_rate": 1.9021865018341586e-05, "loss": 0.2818, "mean_token_accuracy": 0.9137469589710235, "num_tokens": 161262417.0, "step": 44390 }, { "entropy": 0.28371570287272335, "epoch": 0.6834725485329007, "grad_norm": 0.627678632736206, "learning_rate": 1.9021092288712838e-05, "loss": 0.2844, "mean_token_accuracy": 0.9124453671276569, "num_tokens": 161329105.0, "step": 44400 }, { "entropy": 0.2794519987888634, "epoch": 0.6836264837915793, "grad_norm": 0.6842293739318848, "learning_rate": 1.9020319269683317e-05, "loss": 0.2727, "mean_token_accuracy": 0.9133497267961502, "num_tokens": 161391889.0, "step": 44410 }, { "entropy": 0.27892958363518117, "epoch": 0.683780419050258, "grad_norm": 0.5663219094276428, "learning_rate": 1.9019545961277813e-05, "loss": 0.2864, "mean_token_accuracy": 0.9147569879889488, "num_tokens": 161462972.0, "step": 44420 }, { "entropy": 0.2873703225515783, "epoch": 0.6839343543089366, "grad_norm": 0.5874693989753723, "learning_rate": 1.9018772363521142e-05, "loss": 0.2924, "mean_token_accuracy": 0.9103899352252484, "num_tokens": 161538740.0, "step": 44430 }, { "entropy": 0.29962439434602856, "epoch": 0.6840882895676151, "grad_norm": 0.6718660593032837, "learning_rate": 1.9017998476438117e-05, "loss": 0.2993, "mean_token_accuracy": 0.9084685176610947, "num_tokens": 161612957.0, "step": 44440 }, { "entropy": 0.282204152084887, "epoch": 0.6842422248262937, "grad_norm": 0.726603627204895, "learning_rate": 1.901722430005357e-05, "loss": 0.2967, "mean_token_accuracy": 0.9115023605525494, "num_tokens": 161675080.0, "step": 44450 }, { "entropy": 0.2763680133037269, "epoch": 0.6843961600849723, "grad_norm": 0.7209736704826355, "learning_rate": 1.901644983439233e-05, "loss": 0.2728, "mean_token_accuracy": 0.9151715859770775, "num_tokens": 161730090.0, "step": 44460 }, { "entropy": 0.27750322688370943, "epoch": 0.6845500953436509, "grad_norm": 0.7269104719161987, "learning_rate": 1.9015675079479246e-05, "loss": 0.2822, "mean_token_accuracy": 0.9133912935853005, "num_tokens": 161787847.0, "step": 44470 }, { "entropy": 0.27667969958856703, "epoch": 0.6847040306023294, "grad_norm": 0.699150562286377, "learning_rate": 1.9014900035339173e-05, "loss": 0.2904, "mean_token_accuracy": 0.9125633977353573, "num_tokens": 161855805.0, "step": 44480 }, { "entropy": 0.286936897970736, "epoch": 0.684857965861008, "grad_norm": 0.7540349364280701, "learning_rate": 1.9014124701996973e-05, "loss": 0.2935, "mean_token_accuracy": 0.9109982162714004, "num_tokens": 161914854.0, "step": 44490 }, { "entropy": 0.2831710884347558, "epoch": 0.6850119011196866, "grad_norm": 0.6213226318359375, "learning_rate": 1.9013349079477522e-05, "loss": 0.2751, "mean_token_accuracy": 0.9124741002917289, "num_tokens": 161982344.0, "step": 44500 }, { "entropy": 0.27675996655598284, "epoch": 0.6851658363783651, "grad_norm": 0.6291294693946838, "learning_rate": 1.9012573167805698e-05, "loss": 0.2848, "mean_token_accuracy": 0.9129011109471321, "num_tokens": 162041921.0, "step": 44510 }, { "entropy": 0.2666435334831476, "epoch": 0.6853197716370437, "grad_norm": 0.6530308127403259, "learning_rate": 1.9011796967006395e-05, "loss": 0.2785, "mean_token_accuracy": 0.9147624149918556, "num_tokens": 162106675.0, "step": 44520 }, { "entropy": 0.28963300930336117, "epoch": 0.6854737068957223, "grad_norm": 0.5874329209327698, "learning_rate": 1.9011020477104517e-05, "loss": 0.2893, "mean_token_accuracy": 0.9091944582760334, "num_tokens": 162181902.0, "step": 44530 }, { "entropy": 0.31480387337505816, "epoch": 0.6856276421544009, "grad_norm": 0.6455724239349365, "learning_rate": 1.9010243698124966e-05, "loss": 0.3045, "mean_token_accuracy": 0.9027889609336853, "num_tokens": 162250340.0, "step": 44540 }, { "entropy": 0.2662516546435654, "epoch": 0.6857815774130795, "grad_norm": 0.6241493821144104, "learning_rate": 1.900946663009267e-05, "loss": 0.264, "mean_token_accuracy": 0.917481055110693, "num_tokens": 162317561.0, "step": 44550 }, { "entropy": 0.29221233883872627, "epoch": 0.6859355126717581, "grad_norm": 0.4727589190006256, "learning_rate": 1.9008689273032552e-05, "loss": 0.2904, "mean_token_accuracy": 0.9084780342876911, "num_tokens": 162388964.0, "step": 44560 }, { "entropy": 0.26496908348053694, "epoch": 0.6860894479304367, "grad_norm": 0.6378102898597717, "learning_rate": 1.900791162696955e-05, "loss": 0.2746, "mean_token_accuracy": 0.9172878623008728, "num_tokens": 162458510.0, "step": 44570 }, { "entropy": 0.27750943806022405, "epoch": 0.6862433831891153, "grad_norm": 0.6677014827728271, "learning_rate": 1.9007133691928617e-05, "loss": 0.2826, "mean_token_accuracy": 0.9127932898700237, "num_tokens": 162530446.0, "step": 44580 }, { "entropy": 0.2884434095583856, "epoch": 0.6863973184477938, "grad_norm": 0.5764338970184326, "learning_rate": 1.9006355467934704e-05, "loss": 0.2979, "mean_token_accuracy": 0.9093617834150791, "num_tokens": 162598111.0, "step": 44590 }, { "entropy": 0.2780085179954767, "epoch": 0.6865512537064724, "grad_norm": 0.6120584011077881, "learning_rate": 1.9005576955012783e-05, "loss": 0.2863, "mean_token_accuracy": 0.913066541403532, "num_tokens": 162654607.0, "step": 44600 }, { "entropy": 0.2643111544661224, "epoch": 0.686705188965151, "grad_norm": 0.7020398378372192, "learning_rate": 1.900479815318782e-05, "loss": 0.2642, "mean_token_accuracy": 0.9165577538311481, "num_tokens": 162722962.0, "step": 44610 }, { "entropy": 0.28175307670608163, "epoch": 0.6868591242238296, "grad_norm": 0.5871190428733826, "learning_rate": 1.9004019062484802e-05, "loss": 0.2903, "mean_token_accuracy": 0.9115276388823986, "num_tokens": 162791120.0, "step": 44620 }, { "entropy": 0.30113535951822995, "epoch": 0.6870130594825081, "grad_norm": 0.6587545871734619, "learning_rate": 1.9003239682928728e-05, "loss": 0.3001, "mean_token_accuracy": 0.9073204196989536, "num_tokens": 162854061.0, "step": 44630 }, { "entropy": 0.26596732148900626, "epoch": 0.6871669947411867, "grad_norm": 0.6538835763931274, "learning_rate": 1.9002460014544596e-05, "loss": 0.2731, "mean_token_accuracy": 0.9186862856149673, "num_tokens": 162917132.0, "step": 44640 }, { "entropy": 0.28384239664301275, "epoch": 0.6873209299998653, "grad_norm": 0.5802688002586365, "learning_rate": 1.900168005735742e-05, "loss": 0.2962, "mean_token_accuracy": 0.9128950498998165, "num_tokens": 162984432.0, "step": 44650 }, { "entropy": 0.2523239130154252, "epoch": 0.6874748652585438, "grad_norm": 0.5597543120384216, "learning_rate": 1.900089981139222e-05, "loss": 0.2641, "mean_token_accuracy": 0.921030055731535, "num_tokens": 163053540.0, "step": 44660 }, { "entropy": 0.2456652933731675, "epoch": 0.6876288005172224, "grad_norm": 0.8181042671203613, "learning_rate": 1.900011927667403e-05, "loss": 0.2584, "mean_token_accuracy": 0.9218986928462982, "num_tokens": 163118835.0, "step": 44670 }, { "entropy": 0.28210039613768456, "epoch": 0.6877827357759011, "grad_norm": 0.606970489025116, "learning_rate": 1.8999338453227882e-05, "loss": 0.291, "mean_token_accuracy": 0.9120914399623871, "num_tokens": 163188440.0, "step": 44680 }, { "entropy": 0.2800084094516933, "epoch": 0.6879366710345797, "grad_norm": 0.6566420197486877, "learning_rate": 1.8998557341078837e-05, "loss": 0.2855, "mean_token_accuracy": 0.912006800621748, "num_tokens": 163254160.0, "step": 44690 }, { "entropy": 0.2772180110216141, "epoch": 0.6880906062932582, "grad_norm": 0.5465054512023926, "learning_rate": 1.8997775940251943e-05, "loss": 0.2934, "mean_token_accuracy": 0.914997074753046, "num_tokens": 163322076.0, "step": 44700 }, { "entropy": 0.26581970108672975, "epoch": 0.6882445415519368, "grad_norm": 0.5701799392700195, "learning_rate": 1.899699425077227e-05, "loss": 0.266, "mean_token_accuracy": 0.9154383860528469, "num_tokens": 163389927.0, "step": 44710 }, { "entropy": 0.28172578662633896, "epoch": 0.6883984768106154, "grad_norm": 0.6723424792289734, "learning_rate": 1.89962122726649e-05, "loss": 0.2768, "mean_token_accuracy": 0.914019986987114, "num_tokens": 163452812.0, "step": 44720 }, { "entropy": 0.2756735227070749, "epoch": 0.688552412069294, "grad_norm": 0.6133396029472351, "learning_rate": 1.8995430005954916e-05, "loss": 0.2759, "mean_token_accuracy": 0.9163792282342911, "num_tokens": 163519670.0, "step": 44730 }, { "entropy": 0.25388070112094285, "epoch": 0.6887063473279725, "grad_norm": 0.5335307717323303, "learning_rate": 1.8994647450667415e-05, "loss": 0.2603, "mean_token_accuracy": 0.9224171876907349, "num_tokens": 163583737.0, "step": 44740 }, { "entropy": 0.2855404348112643, "epoch": 0.6888602825866511, "grad_norm": 0.7102593779563904, "learning_rate": 1.8993864606827495e-05, "loss": 0.2703, "mean_token_accuracy": 0.9110117629170418, "num_tokens": 163650619.0, "step": 44750 }, { "entropy": 0.2796757897362113, "epoch": 0.6890142178453297, "grad_norm": 0.6266162395477295, "learning_rate": 1.899308147446028e-05, "loss": 0.2922, "mean_token_accuracy": 0.9118725836277009, "num_tokens": 163720831.0, "step": 44760 }, { "entropy": 0.26636980939656496, "epoch": 0.6891681531040083, "grad_norm": 0.5796332955360413, "learning_rate": 1.8992298053590886e-05, "loss": 0.2797, "mean_token_accuracy": 0.9180410765111446, "num_tokens": 163785968.0, "step": 44770 }, { "entropy": 0.2748131133615971, "epoch": 0.6893220883626868, "grad_norm": 0.5139272212982178, "learning_rate": 1.8991514344244447e-05, "loss": 0.2853, "mean_token_accuracy": 0.9147897154092789, "num_tokens": 163844254.0, "step": 44780 }, { "entropy": 0.2837600462138653, "epoch": 0.6894760236213654, "grad_norm": 0.5972120761871338, "learning_rate": 1.8990730346446104e-05, "loss": 0.2756, "mean_token_accuracy": 0.9131331242620945, "num_tokens": 163912987.0, "step": 44790 }, { "entropy": 0.28730533933267, "epoch": 0.6896299588800441, "grad_norm": 0.6176275014877319, "learning_rate": 1.8989946060221012e-05, "loss": 0.2833, "mean_token_accuracy": 0.9114282950758934, "num_tokens": 163982581.0, "step": 44800 }, { "entropy": 0.25907009886577725, "epoch": 0.6897838941387227, "grad_norm": 0.5031880736351013, "learning_rate": 1.898916148559433e-05, "loss": 0.2723, "mean_token_accuracy": 0.9160083308815956, "num_tokens": 164050033.0, "step": 44810 }, { "entropy": 0.2583266980946064, "epoch": 0.6899378293974012, "grad_norm": 0.5326364636421204, "learning_rate": 1.8988376622591226e-05, "loss": 0.2714, "mean_token_accuracy": 0.9192154496908188, "num_tokens": 164119130.0, "step": 44820 }, { "entropy": 0.27541837813332676, "epoch": 0.6900917646560798, "grad_norm": 0.6410999298095703, "learning_rate": 1.8987591471236876e-05, "loss": 0.2887, "mean_token_accuracy": 0.9132480591535568, "num_tokens": 164182142.0, "step": 44830 }, { "entropy": 0.27488162675872446, "epoch": 0.6902456999147584, "grad_norm": 0.5760348439216614, "learning_rate": 1.8986806031556476e-05, "loss": 0.2825, "mean_token_accuracy": 0.9131511509418487, "num_tokens": 164248240.0, "step": 44840 }, { "entropy": 0.2934636720456183, "epoch": 0.6903996351734369, "grad_norm": 0.5510298013687134, "learning_rate": 1.8986020303575215e-05, "loss": 0.2955, "mean_token_accuracy": 0.9111857779324055, "num_tokens": 164310096.0, "step": 44850 }, { "entropy": 0.2482956934720278, "epoch": 0.6905535704321155, "grad_norm": 0.6275602579116821, "learning_rate": 1.89852342873183e-05, "loss": 0.2514, "mean_token_accuracy": 0.9208963789045811, "num_tokens": 164375056.0, "step": 44860 }, { "entropy": 0.2789968952536583, "epoch": 0.6907075056907941, "grad_norm": 0.579716145992279, "learning_rate": 1.8984447982810956e-05, "loss": 0.2864, "mean_token_accuracy": 0.9120717450976372, "num_tokens": 164436830.0, "step": 44870 }, { "entropy": 0.2825705393217504, "epoch": 0.6908614409494727, "grad_norm": 0.522230863571167, "learning_rate": 1.8983661390078397e-05, "loss": 0.2772, "mean_token_accuracy": 0.912604284286499, "num_tokens": 164496254.0, "step": 44880 }, { "entropy": 0.28498655669391154, "epoch": 0.6910153762081512, "grad_norm": 0.7568225264549255, "learning_rate": 1.8982874509145866e-05, "loss": 0.2795, "mean_token_accuracy": 0.9126522526144981, "num_tokens": 164574794.0, "step": 44890 }, { "entropy": 0.2582881374284625, "epoch": 0.6911693114668298, "grad_norm": 0.5086861848831177, "learning_rate": 1.89820873400386e-05, "loss": 0.2578, "mean_token_accuracy": 0.9190596900880337, "num_tokens": 164643910.0, "step": 44900 }, { "entropy": 0.2827527591958642, "epoch": 0.6913232467255084, "grad_norm": 0.7420859932899475, "learning_rate": 1.8981299882781856e-05, "loss": 0.2864, "mean_token_accuracy": 0.9118117742240429, "num_tokens": 164700670.0, "step": 44910 }, { "entropy": 0.26193333268165586, "epoch": 0.691477181984187, "grad_norm": 0.6133100390434265, "learning_rate": 1.898051213740089e-05, "loss": 0.2644, "mean_token_accuracy": 0.9179864443838597, "num_tokens": 164755862.0, "step": 44920 }, { "entropy": 0.25978118339553474, "epoch": 0.6916311172428656, "grad_norm": 0.6993829607963562, "learning_rate": 1.897972410392098e-05, "loss": 0.2779, "mean_token_accuracy": 0.9189265109598637, "num_tokens": 164820216.0, "step": 44930 }, { "entropy": 0.28367263851687313, "epoch": 0.6917850525015442, "grad_norm": 0.658303439617157, "learning_rate": 1.8978935782367407e-05, "loss": 0.2889, "mean_token_accuracy": 0.9098994918167591, "num_tokens": 164885497.0, "step": 44940 }, { "entropy": 0.2962148949503899, "epoch": 0.6919389877602228, "grad_norm": 0.6458753943443298, "learning_rate": 1.8978147172765454e-05, "loss": 0.2933, "mean_token_accuracy": 0.9106638699769973, "num_tokens": 164953284.0, "step": 44950 }, { "entropy": 0.2653128864243627, "epoch": 0.6920929230189014, "grad_norm": 0.6345463991165161, "learning_rate": 1.8977358275140422e-05, "loss": 0.2708, "mean_token_accuracy": 0.9181375421583653, "num_tokens": 165022149.0, "step": 44960 }, { "entropy": 0.3037816734984517, "epoch": 0.6922468582775799, "grad_norm": 0.5907452702522278, "learning_rate": 1.8976569089517626e-05, "loss": 0.3166, "mean_token_accuracy": 0.9025037907063961, "num_tokens": 165086426.0, "step": 44970 }, { "entropy": 0.27010345198214053, "epoch": 0.6924007935362585, "grad_norm": 0.5517788529396057, "learning_rate": 1.8975779615922374e-05, "loss": 0.2662, "mean_token_accuracy": 0.9157427422702312, "num_tokens": 165153500.0, "step": 44980 }, { "entropy": 0.280867144651711, "epoch": 0.6925547287949371, "grad_norm": 0.5548813939094543, "learning_rate": 1.8974989854379996e-05, "loss": 0.2905, "mean_token_accuracy": 0.9139386579394341, "num_tokens": 165214758.0, "step": 44990 }, { "entropy": 0.2751151526346803, "epoch": 0.6927086640536156, "grad_norm": 0.6124765276908875, "learning_rate": 1.8974199804915832e-05, "loss": 0.2887, "mean_token_accuracy": 0.9146330326795578, "num_tokens": 165276818.0, "step": 45000 }, { "entropy": 0.29429921535775067, "epoch": 0.6928625993122942, "grad_norm": 0.5815081596374512, "learning_rate": 1.8973409467555223e-05, "loss": 0.2897, "mean_token_accuracy": 0.9100518226623535, "num_tokens": 165338159.0, "step": 45010 }, { "entropy": 0.2873254703357816, "epoch": 0.6930165345709728, "grad_norm": 0.729935884475708, "learning_rate": 1.8972618842323523e-05, "loss": 0.285, "mean_token_accuracy": 0.9102135598659515, "num_tokens": 165397061.0, "step": 45020 }, { "entropy": 0.28876144578680396, "epoch": 0.6931704698296514, "grad_norm": 0.6097128987312317, "learning_rate": 1.8971827929246098e-05, "loss": 0.2814, "mean_token_accuracy": 0.9108847640454769, "num_tokens": 165464285.0, "step": 45030 }, { "entropy": 0.28418552223592997, "epoch": 0.6933244050883299, "grad_norm": 0.5201469659805298, "learning_rate": 1.897103672834832e-05, "loss": 0.2939, "mean_token_accuracy": 0.9129217199981212, "num_tokens": 165540542.0, "step": 45040 }, { "entropy": 0.2773055821657181, "epoch": 0.6934783403470085, "grad_norm": 0.590111255645752, "learning_rate": 1.8970245239655572e-05, "loss": 0.2853, "mean_token_accuracy": 0.9139102339744568, "num_tokens": 165599030.0, "step": 45050 }, { "entropy": 0.2699641050770879, "epoch": 0.6936322756056872, "grad_norm": 0.7215502262115479, "learning_rate": 1.8969453463193242e-05, "loss": 0.2701, "mean_token_accuracy": 0.9173597045242786, "num_tokens": 165666095.0, "step": 45060 }, { "entropy": 0.2740931139327586, "epoch": 0.6937862108643658, "grad_norm": 0.655002772808075, "learning_rate": 1.8968661398986735e-05, "loss": 0.2836, "mean_token_accuracy": 0.9178390249609947, "num_tokens": 165736611.0, "step": 45070 }, { "entropy": 0.28223523059859873, "epoch": 0.6939401461230443, "grad_norm": 0.7117425799369812, "learning_rate": 1.8967869047061456e-05, "loss": 0.2787, "mean_token_accuracy": 0.9108494967222214, "num_tokens": 165808496.0, "step": 45080 }, { "entropy": 0.28844691906124353, "epoch": 0.6940940813817229, "grad_norm": 0.529532253742218, "learning_rate": 1.896707640744283e-05, "loss": 0.2879, "mean_token_accuracy": 0.9113955549895764, "num_tokens": 165878849.0, "step": 45090 }, { "entropy": 0.27619710443541406, "epoch": 0.6942480166404015, "grad_norm": 0.6365560293197632, "learning_rate": 1.8966283480156274e-05, "loss": 0.2873, "mean_token_accuracy": 0.9121952593326569, "num_tokens": 165951887.0, "step": 45100 }, { "entropy": 0.285586547665298, "epoch": 0.69440195189908, "grad_norm": 0.6195048689842224, "learning_rate": 1.8965490265227238e-05, "loss": 0.2796, "mean_token_accuracy": 0.9094523593783379, "num_tokens": 166018833.0, "step": 45110 }, { "entropy": 0.264870248362422, "epoch": 0.6945558871577586, "grad_norm": 0.7119628190994263, "learning_rate": 1.896469676268117e-05, "loss": 0.2751, "mean_token_accuracy": 0.9174707315862178, "num_tokens": 166079304.0, "step": 45120 }, { "entropy": 0.2813081622123718, "epoch": 0.6947098224164372, "grad_norm": 0.6121072173118591, "learning_rate": 1.8963902972543513e-05, "loss": 0.2894, "mean_token_accuracy": 0.9130487985908985, "num_tokens": 166146541.0, "step": 45130 }, { "entropy": 0.27164308410137894, "epoch": 0.6948637576751158, "grad_norm": 0.8078771233558655, "learning_rate": 1.8963108894839737e-05, "loss": 0.2668, "mean_token_accuracy": 0.9161887168884277, "num_tokens": 166209777.0, "step": 45140 }, { "entropy": 0.2714681373909116, "epoch": 0.6950176929337943, "grad_norm": 0.6609299182891846, "learning_rate": 1.8962314529595324e-05, "loss": 0.279, "mean_token_accuracy": 0.9161906264722347, "num_tokens": 166280130.0, "step": 45150 }, { "entropy": 0.2937706898897886, "epoch": 0.6951716281924729, "grad_norm": 0.5070076584815979, "learning_rate": 1.896151987683575e-05, "loss": 0.2933, "mean_token_accuracy": 0.9096271894872189, "num_tokens": 166344320.0, "step": 45160 }, { "entropy": 0.26270055221393707, "epoch": 0.6953255634511515, "grad_norm": 0.6898835897445679, "learning_rate": 1.896072493658651e-05, "loss": 0.2713, "mean_token_accuracy": 0.9163460940122604, "num_tokens": 166408045.0, "step": 45170 }, { "entropy": 0.2718539358116686, "epoch": 0.6954794987098302, "grad_norm": 0.5610255599021912, "learning_rate": 1.8959929708873102e-05, "loss": 0.2928, "mean_token_accuracy": 0.9153996765613556, "num_tokens": 166476809.0, "step": 45180 }, { "entropy": 0.298693734779954, "epoch": 0.6956334339685087, "grad_norm": 0.5646334886550903, "learning_rate": 1.895913419372105e-05, "loss": 0.2961, "mean_token_accuracy": 0.9048005633056164, "num_tokens": 166547352.0, "step": 45190 }, { "entropy": 0.26478884890675547, "epoch": 0.6957873692271873, "grad_norm": 0.8287488222122192, "learning_rate": 1.8958338391155856e-05, "loss": 0.266, "mean_token_accuracy": 0.9175065845251084, "num_tokens": 166619475.0, "step": 45200 }, { "entropy": 0.2660355867817998, "epoch": 0.6959413044858659, "grad_norm": 0.6250998377799988, "learning_rate": 1.8957542301203067e-05, "loss": 0.2726, "mean_token_accuracy": 0.9176676802337169, "num_tokens": 166686147.0, "step": 45210 }, { "entropy": 0.28868607729673385, "epoch": 0.6960952397445445, "grad_norm": 0.7037652134895325, "learning_rate": 1.8956745923888208e-05, "loss": 0.3035, "mean_token_accuracy": 0.910186768323183, "num_tokens": 166752923.0, "step": 45220 }, { "entropy": 0.2842561865225434, "epoch": 0.696249175003223, "grad_norm": 0.6194638609886169, "learning_rate": 1.895594925923684e-05, "loss": 0.2909, "mean_token_accuracy": 0.9121138997375965, "num_tokens": 166826787.0, "step": 45230 }, { "entropy": 0.274934121966362, "epoch": 0.6964031102619016, "grad_norm": 0.6135548949241638, "learning_rate": 1.8955152307274507e-05, "loss": 0.2707, "mean_token_accuracy": 0.9143842250108719, "num_tokens": 166891283.0, "step": 45240 }, { "entropy": 0.294114376604557, "epoch": 0.6965570455205802, "grad_norm": 0.6931295990943909, "learning_rate": 1.8954355068026788e-05, "loss": 0.3017, "mean_token_accuracy": 0.9067056089639663, "num_tokens": 166955736.0, "step": 45250 }, { "entropy": 0.2819964999333024, "epoch": 0.6967109807792587, "grad_norm": 0.5530402064323425, "learning_rate": 1.895355754151925e-05, "loss": 0.2904, "mean_token_accuracy": 0.9129805855453015, "num_tokens": 167019840.0, "step": 45260 }, { "entropy": 0.2720128265209496, "epoch": 0.6968649160379373, "grad_norm": 0.6059053540229797, "learning_rate": 1.895275972777748e-05, "loss": 0.2664, "mean_token_accuracy": 0.9155713982880116, "num_tokens": 167082193.0, "step": 45270 }, { "entropy": 0.2878600345924497, "epoch": 0.6970188512966159, "grad_norm": 0.5400505661964417, "learning_rate": 1.895196162682708e-05, "loss": 0.2897, "mean_token_accuracy": 0.9096733339130878, "num_tokens": 167154428.0, "step": 45280 }, { "entropy": 0.2883100813254714, "epoch": 0.6971727865552945, "grad_norm": 0.6714498996734619, "learning_rate": 1.895116323869364e-05, "loss": 0.2926, "mean_token_accuracy": 0.9099299661815167, "num_tokens": 167215600.0, "step": 45290 }, { "entropy": 0.29651803271844984, "epoch": 0.697326721813973, "grad_norm": 0.6076843738555908, "learning_rate": 1.895036456340278e-05, "loss": 0.2851, "mean_token_accuracy": 0.9089212760329246, "num_tokens": 167285291.0, "step": 45300 }, { "entropy": 0.25626105461269616, "epoch": 0.6974806570726517, "grad_norm": 0.5847645401954651, "learning_rate": 1.8949565600980125e-05, "loss": 0.2699, "mean_token_accuracy": 0.920108062773943, "num_tokens": 167351154.0, "step": 45310 }, { "entropy": 0.30970993507653477, "epoch": 0.6976345923313303, "grad_norm": 0.5997050404548645, "learning_rate": 1.89487663514513e-05, "loss": 0.3082, "mean_token_accuracy": 0.9037378922104835, "num_tokens": 167413501.0, "step": 45320 }, { "entropy": 0.27681797556579113, "epoch": 0.6977885275900089, "grad_norm": 0.5757103562355042, "learning_rate": 1.894796681484195e-05, "loss": 0.2807, "mean_token_accuracy": 0.915025744587183, "num_tokens": 167490917.0, "step": 45330 }, { "entropy": 0.2743516942486167, "epoch": 0.6979424628486874, "grad_norm": 0.6209287047386169, "learning_rate": 1.894716699117772e-05, "loss": 0.2808, "mean_token_accuracy": 0.9149707749485969, "num_tokens": 167562038.0, "step": 45340 }, { "entropy": 0.2886689743027091, "epoch": 0.698096398107366, "grad_norm": 0.6011208891868591, "learning_rate": 1.894636688048427e-05, "loss": 0.2782, "mean_token_accuracy": 0.9115883372724056, "num_tokens": 167635493.0, "step": 45350 }, { "entropy": 0.27266751509159803, "epoch": 0.6982503333660446, "grad_norm": 0.541897714138031, "learning_rate": 1.8945566482787273e-05, "loss": 0.2735, "mean_token_accuracy": 0.9148726627230644, "num_tokens": 167708929.0, "step": 45360 }, { "entropy": 0.27064778646454213, "epoch": 0.6984042686247232, "grad_norm": 0.6866198182106018, "learning_rate": 1.8944765798112396e-05, "loss": 0.2756, "mean_token_accuracy": 0.9139136880636215, "num_tokens": 167778782.0, "step": 45370 }, { "entropy": 0.2730270879343152, "epoch": 0.6985582038834017, "grad_norm": 0.6443812251091003, "learning_rate": 1.8943964826485336e-05, "loss": 0.2726, "mean_token_accuracy": 0.9155149444937706, "num_tokens": 167846460.0, "step": 45380 }, { "entropy": 0.28306266833096744, "epoch": 0.6987121391420803, "grad_norm": 0.5849273800849915, "learning_rate": 1.8943163567931784e-05, "loss": 0.298, "mean_token_accuracy": 0.9136308424174786, "num_tokens": 167912155.0, "step": 45390 }, { "entropy": 0.2633565385825932, "epoch": 0.6988660744007589, "grad_norm": 0.7541478872299194, "learning_rate": 1.8942362022477442e-05, "loss": 0.285, "mean_token_accuracy": 0.9196350052952766, "num_tokens": 167970870.0, "step": 45400 }, { "entropy": 0.2878704176284373, "epoch": 0.6990200096594374, "grad_norm": 0.6462510228157043, "learning_rate": 1.894156019014803e-05, "loss": 0.2887, "mean_token_accuracy": 0.9112355224788189, "num_tokens": 168037544.0, "step": 45410 }, { "entropy": 0.2928544824011624, "epoch": 0.699173944918116, "grad_norm": 0.5497059226036072, "learning_rate": 1.8940758070969267e-05, "loss": 0.3059, "mean_token_accuracy": 0.9079118438065052, "num_tokens": 168111569.0, "step": 45420 }, { "entropy": 0.2792126625776291, "epoch": 0.6993278801767946, "grad_norm": 0.5358501672744751, "learning_rate": 1.8939955664966883e-05, "loss": 0.2887, "mean_token_accuracy": 0.9124881297349929, "num_tokens": 168180578.0, "step": 45430 }, { "entropy": 0.27987445052713156, "epoch": 0.6994818154354733, "grad_norm": 0.7999045252799988, "learning_rate": 1.8939152972166624e-05, "loss": 0.2868, "mean_token_accuracy": 0.9123490430414677, "num_tokens": 168246316.0, "step": 45440 }, { "entropy": 0.267829460836947, "epoch": 0.6996357506941518, "grad_norm": 0.592119574546814, "learning_rate": 1.8938349992594243e-05, "loss": 0.2824, "mean_token_accuracy": 0.9135216556489467, "num_tokens": 168319320.0, "step": 45450 }, { "entropy": 0.2666604995727539, "epoch": 0.6997896859528304, "grad_norm": 0.5826367735862732, "learning_rate": 1.893754672627549e-05, "loss": 0.2678, "mean_token_accuracy": 0.9183663606643677, "num_tokens": 168386772.0, "step": 45460 }, { "entropy": 0.27737358026206493, "epoch": 0.699943621211509, "grad_norm": 0.6462878584861755, "learning_rate": 1.8936743173236146e-05, "loss": 0.2915, "mean_token_accuracy": 0.9125666908919812, "num_tokens": 168450213.0, "step": 45470 }, { "entropy": 0.26853621657937765, "epoch": 0.7000975564701876, "grad_norm": 0.6034942269325256, "learning_rate": 1.893593933350198e-05, "loss": 0.2664, "mean_token_accuracy": 0.9130857639014721, "num_tokens": 168512481.0, "step": 45480 }, { "entropy": 0.2910187034867704, "epoch": 0.7002514917288661, "grad_norm": 0.7634368538856506, "learning_rate": 1.8935135207098783e-05, "loss": 0.2921, "mean_token_accuracy": 0.9099361300468445, "num_tokens": 168571252.0, "step": 45490 }, { "entropy": 0.27305917711928485, "epoch": 0.7004054269875447, "grad_norm": 0.5450948476791382, "learning_rate": 1.893433079405236e-05, "loss": 0.2731, "mean_token_accuracy": 0.915716203302145, "num_tokens": 168636082.0, "step": 45500 }, { "entropy": 0.2858160327188671, "epoch": 0.7005593622462233, "grad_norm": 0.593716561794281, "learning_rate": 1.89335260943885e-05, "loss": 0.2879, "mean_token_accuracy": 0.9104527741670608, "num_tokens": 168706192.0, "step": 45510 }, { "entropy": 0.27734532598406075, "epoch": 0.7007132975049019, "grad_norm": 0.5979112982749939, "learning_rate": 1.8932721108133032e-05, "loss": 0.2729, "mean_token_accuracy": 0.9134864255785942, "num_tokens": 168775418.0, "step": 45520 }, { "entropy": 0.2904390743002295, "epoch": 0.7008672327635804, "grad_norm": 0.7865944504737854, "learning_rate": 1.893191583531177e-05, "loss": 0.2958, "mean_token_accuracy": 0.9091293647885322, "num_tokens": 168833637.0, "step": 45530 }, { "entropy": 0.27062080791220067, "epoch": 0.701021168022259, "grad_norm": 0.6657543778419495, "learning_rate": 1.893111027595056e-05, "loss": 0.2811, "mean_token_accuracy": 0.9159065127372742, "num_tokens": 168899116.0, "step": 45540 }, { "entropy": 0.2768267936073244, "epoch": 0.7011751032809376, "grad_norm": 0.6156225800514221, "learning_rate": 1.8930304430075234e-05, "loss": 0.2829, "mean_token_accuracy": 0.9149292588233948, "num_tokens": 168960353.0, "step": 45550 }, { "entropy": 0.29412705888971685, "epoch": 0.7013290385396163, "grad_norm": 0.7077690958976746, "learning_rate": 1.892949829771165e-05, "loss": 0.2903, "mean_token_accuracy": 0.9087356269359589, "num_tokens": 169028995.0, "step": 45560 }, { "entropy": 0.28997896835207937, "epoch": 0.7014829737982948, "grad_norm": 0.6692488193511963, "learning_rate": 1.8928691878885665e-05, "loss": 0.2881, "mean_token_accuracy": 0.9086852177977562, "num_tokens": 169101534.0, "step": 45570 }, { "entropy": 0.2809226730838418, "epoch": 0.7016369090569734, "grad_norm": 0.7044214010238647, "learning_rate": 1.892788517362315e-05, "loss": 0.2843, "mean_token_accuracy": 0.9138387314975261, "num_tokens": 169165379.0, "step": 45580 }, { "entropy": 0.2883514929562807, "epoch": 0.701790844315652, "grad_norm": 0.6350946426391602, "learning_rate": 1.8927078181949986e-05, "loss": 0.2797, "mean_token_accuracy": 0.9126913197338581, "num_tokens": 169228879.0, "step": 45590 }, { "entropy": 0.25994427818804977, "epoch": 0.7019447795743305, "grad_norm": 0.5587230920791626, "learning_rate": 1.8926270903892063e-05, "loss": 0.2631, "mean_token_accuracy": 0.9178469747304916, "num_tokens": 169288560.0, "step": 45600 }, { "entropy": 0.2728152140043676, "epoch": 0.7020987148330091, "grad_norm": 0.5874131917953491, "learning_rate": 1.8925463339475277e-05, "loss": 0.2721, "mean_token_accuracy": 0.9151930660009384, "num_tokens": 169357454.0, "step": 45610 }, { "entropy": 0.27825458683073523, "epoch": 0.7022526500916877, "grad_norm": 0.5571638345718384, "learning_rate": 1.8924655488725534e-05, "loss": 0.2839, "mean_token_accuracy": 0.9147852599620819, "num_tokens": 169423424.0, "step": 45620 }, { "entropy": 0.2951206311583519, "epoch": 0.7024065853503663, "grad_norm": 0.5842522382736206, "learning_rate": 1.892384735166875e-05, "loss": 0.2903, "mean_token_accuracy": 0.9086376950144768, "num_tokens": 169488106.0, "step": 45630 }, { "entropy": 0.26421845480799677, "epoch": 0.7025605206090448, "grad_norm": 0.6388973593711853, "learning_rate": 1.8923038928330853e-05, "loss": 0.2741, "mean_token_accuracy": 0.9169628649950028, "num_tokens": 169548984.0, "step": 45640 }, { "entropy": 0.2809149146080017, "epoch": 0.7027144558677234, "grad_norm": 0.7044681906700134, "learning_rate": 1.8922230218737777e-05, "loss": 0.2884, "mean_token_accuracy": 0.9103379741311073, "num_tokens": 169610933.0, "step": 45650 }, { "entropy": 0.3026619577780366, "epoch": 0.702868391126402, "grad_norm": 0.6652345061302185, "learning_rate": 1.8921421222915466e-05, "loss": 0.2903, "mean_token_accuracy": 0.9080809749662876, "num_tokens": 169663888.0, "step": 45660 }, { "entropy": 0.28988134022802114, "epoch": 0.7030223263850806, "grad_norm": 0.5834031701087952, "learning_rate": 1.892061194088987e-05, "loss": 0.2812, "mean_token_accuracy": 0.9113368026912212, "num_tokens": 169725067.0, "step": 45670 }, { "entropy": 0.2975700656883419, "epoch": 0.7031762616437591, "grad_norm": 0.5641053915023804, "learning_rate": 1.891980237268695e-05, "loss": 0.3006, "mean_token_accuracy": 0.9080562300980091, "num_tokens": 169790493.0, "step": 45680 }, { "entropy": 0.28423391776159407, "epoch": 0.7033301969024378, "grad_norm": 0.7206414937973022, "learning_rate": 1.8918992518332686e-05, "loss": 0.2947, "mean_token_accuracy": 0.9106495022773743, "num_tokens": 169852126.0, "step": 45690 }, { "entropy": 0.26979882307350633, "epoch": 0.7034841321611164, "grad_norm": 0.5175797939300537, "learning_rate": 1.891818237785305e-05, "loss": 0.2668, "mean_token_accuracy": 0.9138843715190887, "num_tokens": 169918996.0, "step": 45700 }, { "entropy": 0.2957953630015254, "epoch": 0.703638067419795, "grad_norm": 0.6174346804618835, "learning_rate": 1.8917371951274036e-05, "loss": 0.3048, "mean_token_accuracy": 0.9067063465714454, "num_tokens": 169983217.0, "step": 45710 }, { "entropy": 0.2662781113758683, "epoch": 0.7037920026784735, "grad_norm": 0.5230692028999329, "learning_rate": 1.8916561238621644e-05, "loss": 0.2738, "mean_token_accuracy": 0.9181770622730255, "num_tokens": 170051255.0, "step": 45720 }, { "entropy": 0.2787764146924019, "epoch": 0.7039459379371521, "grad_norm": 0.6407349109649658, "learning_rate": 1.8915750239921875e-05, "loss": 0.2789, "mean_token_accuracy": 0.9134166680276394, "num_tokens": 170112670.0, "step": 45730 }, { "entropy": 0.28914207853376866, "epoch": 0.7040998731958307, "grad_norm": 0.6560412645339966, "learning_rate": 1.8914938955200753e-05, "loss": 0.2899, "mean_token_accuracy": 0.9095540940761566, "num_tokens": 170177974.0, "step": 45740 }, { "entropy": 0.27896343292668463, "epoch": 0.7042538084545092, "grad_norm": 0.6628237962722778, "learning_rate": 1.8914127384484304e-05, "loss": 0.2835, "mean_token_accuracy": 0.9124539628624916, "num_tokens": 170245331.0, "step": 45750 }, { "entropy": 0.2669643378816545, "epoch": 0.7044077437131878, "grad_norm": 0.5267335772514343, "learning_rate": 1.891331552779856e-05, "loss": 0.261, "mean_token_accuracy": 0.9183203727006912, "num_tokens": 170318330.0, "step": 45760 }, { "entropy": 0.2689900888130069, "epoch": 0.7045616789718664, "grad_norm": 0.615659236907959, "learning_rate": 1.8912503385169568e-05, "loss": 0.2697, "mean_token_accuracy": 0.9173640072345733, "num_tokens": 170379769.0, "step": 45770 }, { "entropy": 0.27506145993247627, "epoch": 0.704715614230545, "grad_norm": 0.6305544376373291, "learning_rate": 1.8911690956623378e-05, "loss": 0.276, "mean_token_accuracy": 0.9134001962840557, "num_tokens": 170451658.0, "step": 45780 }, { "entropy": 0.2786728048697114, "epoch": 0.7048695494892235, "grad_norm": 0.581743061542511, "learning_rate": 1.8910878242186058e-05, "loss": 0.2922, "mean_token_accuracy": 0.9114201322197915, "num_tokens": 170515632.0, "step": 45790 }, { "entropy": 0.2568360401317477, "epoch": 0.7050234847479021, "grad_norm": 0.6311538815498352, "learning_rate": 1.891006524188368e-05, "loss": 0.2706, "mean_token_accuracy": 0.918388831615448, "num_tokens": 170579997.0, "step": 45800 }, { "entropy": 0.2728671310469508, "epoch": 0.7051774200065807, "grad_norm": 0.6606505513191223, "learning_rate": 1.8909251955742323e-05, "loss": 0.2874, "mean_token_accuracy": 0.9146986782550812, "num_tokens": 170643669.0, "step": 45810 }, { "entropy": 0.2747464833781123, "epoch": 0.7053313552652594, "grad_norm": 0.6020164489746094, "learning_rate": 1.890843838378808e-05, "loss": 0.28, "mean_token_accuracy": 0.9155317768454552, "num_tokens": 170710925.0, "step": 45820 }, { "entropy": 0.29144352236762644, "epoch": 0.7054852905239379, "grad_norm": 0.8054155707359314, "learning_rate": 1.890762452604705e-05, "loss": 0.2847, "mean_token_accuracy": 0.909898653626442, "num_tokens": 170773594.0, "step": 45830 }, { "entropy": 0.2707052164711058, "epoch": 0.7056392257826165, "grad_norm": 0.5967512726783752, "learning_rate": 1.8906810382545338e-05, "loss": 0.2718, "mean_token_accuracy": 0.9163473799824715, "num_tokens": 170837685.0, "step": 45840 }, { "entropy": 0.2549693265464157, "epoch": 0.7057931610412951, "grad_norm": 0.6039122939109802, "learning_rate": 1.8905995953309068e-05, "loss": 0.275, "mean_token_accuracy": 0.918518340587616, "num_tokens": 170893028.0, "step": 45850 }, { "entropy": 0.285798598267138, "epoch": 0.7059470962999737, "grad_norm": 0.6708390712738037, "learning_rate": 1.8905181238364366e-05, "loss": 0.2887, "mean_token_accuracy": 0.9140677787363529, "num_tokens": 170959082.0, "step": 45860 }, { "entropy": 0.2630560559220612, "epoch": 0.7061010315586522, "grad_norm": 0.7162461876869202, "learning_rate": 1.8904366237737366e-05, "loss": 0.2829, "mean_token_accuracy": 0.9187082044780255, "num_tokens": 171019942.0, "step": 45870 }, { "entropy": 0.2681437672115862, "epoch": 0.7062549668173308, "grad_norm": 0.6615038514137268, "learning_rate": 1.8903550951454213e-05, "loss": 0.2696, "mean_token_accuracy": 0.9167959652841091, "num_tokens": 171081196.0, "step": 45880 }, { "entropy": 0.2561062886379659, "epoch": 0.7064089020760094, "grad_norm": 0.6435053944587708, "learning_rate": 1.8902735379541064e-05, "loss": 0.2675, "mean_token_accuracy": 0.9169914215803147, "num_tokens": 171149742.0, "step": 45890 }, { "entropy": 0.274282643944025, "epoch": 0.7065628373346879, "grad_norm": 0.7476029396057129, "learning_rate": 1.8901919522024083e-05, "loss": 0.2862, "mean_token_accuracy": 0.9143293865025044, "num_tokens": 171213406.0, "step": 45900 }, { "entropy": 0.28021889394149185, "epoch": 0.7067167725933665, "grad_norm": 0.6624448299407959, "learning_rate": 1.8901103378929445e-05, "loss": 0.2917, "mean_token_accuracy": 0.9116986878216267, "num_tokens": 171274879.0, "step": 45910 }, { "entropy": 0.28444199431687595, "epoch": 0.7068707078520451, "grad_norm": 0.804296612739563, "learning_rate": 1.8900286950283327e-05, "loss": 0.2808, "mean_token_accuracy": 0.9138134635984898, "num_tokens": 171343530.0, "step": 45920 }, { "entropy": 0.2654479348100722, "epoch": 0.7070246431107237, "grad_norm": 0.7320570349693298, "learning_rate": 1.8899470236111927e-05, "loss": 0.2786, "mean_token_accuracy": 0.9154445812106132, "num_tokens": 171407275.0, "step": 45930 }, { "entropy": 0.2803954371251166, "epoch": 0.7071785783694022, "grad_norm": 0.5844241380691528, "learning_rate": 1.8898653236441435e-05, "loss": 0.2865, "mean_token_accuracy": 0.9136653579771519, "num_tokens": 171479009.0, "step": 45940 }, { "entropy": 0.2717167193070054, "epoch": 0.7073325136280809, "grad_norm": 0.5400091409683228, "learning_rate": 1.889783595129807e-05, "loss": 0.2675, "mean_token_accuracy": 0.9170857988297939, "num_tokens": 171543715.0, "step": 45950 }, { "entropy": 0.268998222053051, "epoch": 0.7074864488867595, "grad_norm": 0.47813543677330017, "learning_rate": 1.889701838070805e-05, "loss": 0.2774, "mean_token_accuracy": 0.9168025322258473, "num_tokens": 171615842.0, "step": 45960 }, { "entropy": 0.24437742540612817, "epoch": 0.7076403841454381, "grad_norm": 0.7572637796401978, "learning_rate": 1.8896200524697602e-05, "loss": 0.2598, "mean_token_accuracy": 0.92308359593153, "num_tokens": 171681337.0, "step": 45970 }, { "entropy": 0.2927701950073242, "epoch": 0.7077943194041166, "grad_norm": 0.6638133525848389, "learning_rate": 1.8895382383292962e-05, "loss": 0.2889, "mean_token_accuracy": 0.9097866706550122, "num_tokens": 171747614.0, "step": 45980 }, { "entropy": 0.29155350998044016, "epoch": 0.7079482546627952, "grad_norm": 0.6175015568733215, "learning_rate": 1.8894563956520374e-05, "loss": 0.2898, "mean_token_accuracy": 0.9124462246894837, "num_tokens": 171817228.0, "step": 45990 }, { "entropy": 0.2665152230300009, "epoch": 0.7081021899214738, "grad_norm": 0.7560442090034485, "learning_rate": 1.88937452444061e-05, "loss": 0.2715, "mean_token_accuracy": 0.9159969903528691, "num_tokens": 171887191.0, "step": 46000 }, { "entropy": 0.268849961925298, "epoch": 0.7082561251801524, "grad_norm": 0.554000198841095, "learning_rate": 1.88929262469764e-05, "loss": 0.2757, "mean_token_accuracy": 0.9160850867629051, "num_tokens": 171955304.0, "step": 46010 }, { "entropy": 0.2903327270410955, "epoch": 0.7084100604388309, "grad_norm": 0.6266428828239441, "learning_rate": 1.8892106964257548e-05, "loss": 0.2909, "mean_token_accuracy": 0.9100034885108471, "num_tokens": 172021823.0, "step": 46020 }, { "entropy": 0.2580777920782566, "epoch": 0.7085639956975095, "grad_norm": 0.5617612600326538, "learning_rate": 1.889128739627583e-05, "loss": 0.2598, "mean_token_accuracy": 0.9180049546062946, "num_tokens": 172100457.0, "step": 46030 }, { "entropy": 0.2685320499353111, "epoch": 0.7087179309561881, "grad_norm": 0.5655395984649658, "learning_rate": 1.8890467543057537e-05, "loss": 0.278, "mean_token_accuracy": 0.9140824750065804, "num_tokens": 172171830.0, "step": 46040 }, { "entropy": 0.2638641791418195, "epoch": 0.7088718662148666, "grad_norm": 0.5835657119750977, "learning_rate": 1.8889647404628965e-05, "loss": 0.2648, "mean_token_accuracy": 0.9175072774291039, "num_tokens": 172237298.0, "step": 46050 }, { "entropy": 0.26943207494914534, "epoch": 0.7090258014735452, "grad_norm": 0.5413421392440796, "learning_rate": 1.888882698101643e-05, "loss": 0.2712, "mean_token_accuracy": 0.913127975910902, "num_tokens": 172302969.0, "step": 46060 }, { "entropy": 0.2764554416760802, "epoch": 0.7091797367322239, "grad_norm": 0.5710896849632263, "learning_rate": 1.888800627224625e-05, "loss": 0.2838, "mean_token_accuracy": 0.9144784457981586, "num_tokens": 172365882.0, "step": 46070 }, { "entropy": 0.2752723188139498, "epoch": 0.7093336719909025, "grad_norm": 0.5519530177116394, "learning_rate": 1.8887185278344757e-05, "loss": 0.275, "mean_token_accuracy": 0.9139547914266586, "num_tokens": 172433310.0, "step": 46080 }, { "entropy": 0.27457260694354774, "epoch": 0.709487607249581, "grad_norm": 0.6068763136863708, "learning_rate": 1.8886363999338285e-05, "loss": 0.2877, "mean_token_accuracy": 0.9148390300571918, "num_tokens": 172495163.0, "step": 46090 }, { "entropy": 0.27673274036496875, "epoch": 0.7096415425082596, "grad_norm": 0.6774597764015198, "learning_rate": 1.8885542435253178e-05, "loss": 0.292, "mean_token_accuracy": 0.9132874496281147, "num_tokens": 172566883.0, "step": 46100 }, { "entropy": 0.26822675867006185, "epoch": 0.7097954777669382, "grad_norm": 0.6395755410194397, "learning_rate": 1.88847205861158e-05, "loss": 0.2801, "mean_token_accuracy": 0.917362368106842, "num_tokens": 172630836.0, "step": 46110 }, { "entropy": 0.28997804410755634, "epoch": 0.7099494130256168, "grad_norm": 0.603323221206665, "learning_rate": 1.888389845195251e-05, "loss": 0.2906, "mean_token_accuracy": 0.9114642731845379, "num_tokens": 172703827.0, "step": 46120 }, { "entropy": 0.27565506668761375, "epoch": 0.7101033482842953, "grad_norm": 0.5593700408935547, "learning_rate": 1.8883076032789685e-05, "loss": 0.2733, "mean_token_accuracy": 0.9161887787282467, "num_tokens": 172768834.0, "step": 46130 }, { "entropy": 0.2777728001587093, "epoch": 0.7102572835429739, "grad_norm": 0.8991336822509766, "learning_rate": 1.8882253328653712e-05, "loss": 0.2773, "mean_token_accuracy": 0.9137742422521115, "num_tokens": 172828220.0, "step": 46140 }, { "entropy": 0.2713407525792718, "epoch": 0.7104112188016525, "grad_norm": 0.5320849418640137, "learning_rate": 1.8881430339570976e-05, "loss": 0.2809, "mean_token_accuracy": 0.9141324177384377, "num_tokens": 172897088.0, "step": 46150 }, { "entropy": 0.2872146571986377, "epoch": 0.710565154060331, "grad_norm": 0.5300466418266296, "learning_rate": 1.888060706556788e-05, "loss": 0.2934, "mean_token_accuracy": 0.9098632872104645, "num_tokens": 172960734.0, "step": 46160 }, { "entropy": 0.2700274411588907, "epoch": 0.7107190893190096, "grad_norm": 0.6264575123786926, "learning_rate": 1.8879783506670842e-05, "loss": 0.2785, "mean_token_accuracy": 0.9162834130227566, "num_tokens": 173021323.0, "step": 46170 }, { "entropy": 0.2660750110633671, "epoch": 0.7108730245776882, "grad_norm": 0.549471378326416, "learning_rate": 1.8878959662906278e-05, "loss": 0.2641, "mean_token_accuracy": 0.920140478760004, "num_tokens": 173086365.0, "step": 46180 }, { "entropy": 0.2658633063547313, "epoch": 0.7110269598363668, "grad_norm": 0.6747667193412781, "learning_rate": 1.8878135534300618e-05, "loss": 0.2782, "mean_token_accuracy": 0.9158807776868343, "num_tokens": 173149120.0, "step": 46190 }, { "entropy": 0.2972603375092149, "epoch": 0.7111808950950455, "grad_norm": 0.6399314999580383, "learning_rate": 1.8877311120880302e-05, "loss": 0.2982, "mean_token_accuracy": 0.9091298319399357, "num_tokens": 173215803.0, "step": 46200 }, { "entropy": 0.287031230609864, "epoch": 0.711334830353724, "grad_norm": 0.5528502464294434, "learning_rate": 1.887648642267177e-05, "loss": 0.2976, "mean_token_accuracy": 0.9109546564519405, "num_tokens": 173284994.0, "step": 46210 }, { "entropy": 0.27444598991423846, "epoch": 0.7114887656124026, "grad_norm": 0.680241048336029, "learning_rate": 1.8875661439701487e-05, "loss": 0.2822, "mean_token_accuracy": 0.9135601833462715, "num_tokens": 173345083.0, "step": 46220 }, { "entropy": 0.2853585149161518, "epoch": 0.7116427008710812, "grad_norm": 0.8368792533874512, "learning_rate": 1.8874836171995918e-05, "loss": 0.2884, "mean_token_accuracy": 0.9122063808143139, "num_tokens": 173414054.0, "step": 46230 }, { "entropy": 0.28083999110385777, "epoch": 0.7117966361297597, "grad_norm": 0.6627376675605774, "learning_rate": 1.887401061958153e-05, "loss": 0.2891, "mean_token_accuracy": 0.9136639252305031, "num_tokens": 173471477.0, "step": 46240 }, { "entropy": 0.258889639377594, "epoch": 0.7119505713884383, "grad_norm": 0.5800250768661499, "learning_rate": 1.8873184782484815e-05, "loss": 0.2696, "mean_token_accuracy": 0.9182043693959713, "num_tokens": 173538033.0, "step": 46250 }, { "entropy": 0.28380530783906577, "epoch": 0.7121045066471169, "grad_norm": 0.5755691528320312, "learning_rate": 1.8872358660732266e-05, "loss": 0.2907, "mean_token_accuracy": 0.9117553949356079, "num_tokens": 173600110.0, "step": 46260 }, { "entropy": 0.2634152210317552, "epoch": 0.7122584419057955, "grad_norm": 0.5893577337265015, "learning_rate": 1.8871532254350385e-05, "loss": 0.2782, "mean_token_accuracy": 0.9145892523229122, "num_tokens": 173668449.0, "step": 46270 }, { "entropy": 0.2695906421169639, "epoch": 0.712412377164474, "grad_norm": 0.7187315821647644, "learning_rate": 1.8870705563365678e-05, "loss": 0.2686, "mean_token_accuracy": 0.9167750865221024, "num_tokens": 173726836.0, "step": 46280 }, { "entropy": 0.26967991078272463, "epoch": 0.7125663124231526, "grad_norm": 0.55885910987854, "learning_rate": 1.886987858780467e-05, "loss": 0.2664, "mean_token_accuracy": 0.9165060348808766, "num_tokens": 173791844.0, "step": 46290 }, { "entropy": 0.282695050816983, "epoch": 0.7127202476818312, "grad_norm": 0.5456230640411377, "learning_rate": 1.8869051327693894e-05, "loss": 0.2921, "mean_token_accuracy": 0.9107254914939403, "num_tokens": 173857367.0, "step": 46300 }, { "entropy": 0.30121417678892615, "epoch": 0.7128741829405097, "grad_norm": 0.6399500370025635, "learning_rate": 1.886822378305988e-05, "loss": 0.2929, "mean_token_accuracy": 0.9039551109075546, "num_tokens": 173918377.0, "step": 46310 }, { "entropy": 0.2950227762572467, "epoch": 0.7130281181991883, "grad_norm": 0.6159273386001587, "learning_rate": 1.8867395953929188e-05, "loss": 0.288, "mean_token_accuracy": 0.9087656624615192, "num_tokens": 173978993.0, "step": 46320 }, { "entropy": 0.26163814291357995, "epoch": 0.713182053457867, "grad_norm": 0.8077239990234375, "learning_rate": 1.8866567840328363e-05, "loss": 0.2709, "mean_token_accuracy": 0.9171763859689236, "num_tokens": 174041718.0, "step": 46330 }, { "entropy": 0.27911384543403983, "epoch": 0.7133359887165456, "grad_norm": 0.660335898399353, "learning_rate": 1.8865739442283977e-05, "loss": 0.2871, "mean_token_accuracy": 0.9118652321398258, "num_tokens": 174108051.0, "step": 46340 }, { "entropy": 0.27693718690425156, "epoch": 0.7134899239752241, "grad_norm": 0.5791406631469727, "learning_rate": 1.8864910759822605e-05, "loss": 0.292, "mean_token_accuracy": 0.9149965226650238, "num_tokens": 174174970.0, "step": 46350 }, { "entropy": 0.28624166883528235, "epoch": 0.7136438592339027, "grad_norm": 0.8982306718826294, "learning_rate": 1.8864081792970835e-05, "loss": 0.2831, "mean_token_accuracy": 0.9108321510255337, "num_tokens": 174242056.0, "step": 46360 }, { "entropy": 0.27371904756873844, "epoch": 0.7137977944925813, "grad_norm": 0.4739035665988922, "learning_rate": 1.8863252541755254e-05, "loss": 0.2834, "mean_token_accuracy": 0.9123196080327034, "num_tokens": 174313561.0, "step": 46370 }, { "entropy": 0.26625257860869167, "epoch": 0.7139517297512599, "grad_norm": 0.7062321901321411, "learning_rate": 1.886242300620247e-05, "loss": 0.2715, "mean_token_accuracy": 0.9154143400490284, "num_tokens": 174381549.0, "step": 46380 }, { "entropy": 0.29104502806439997, "epoch": 0.7141056650099384, "grad_norm": 0.6856503486633301, "learning_rate": 1.8861593186339093e-05, "loss": 0.2918, "mean_token_accuracy": 0.9106639675796032, "num_tokens": 174447129.0, "step": 46390 }, { "entropy": 0.27077407874166964, "epoch": 0.714259600268617, "grad_norm": 0.6019087433815002, "learning_rate": 1.8860763082191743e-05, "loss": 0.2769, "mean_token_accuracy": 0.916529742628336, "num_tokens": 174512332.0, "step": 46400 }, { "entropy": 0.28265950735658407, "epoch": 0.7144135355272956, "grad_norm": 0.7017494440078735, "learning_rate": 1.885993269378705e-05, "loss": 0.2953, "mean_token_accuracy": 0.909857414662838, "num_tokens": 174585137.0, "step": 46410 }, { "entropy": 0.2932608684524894, "epoch": 0.7145674707859742, "grad_norm": 0.6859896779060364, "learning_rate": 1.8859102021151655e-05, "loss": 0.3015, "mean_token_accuracy": 0.908940352499485, "num_tokens": 174649853.0, "step": 46420 }, { "entropy": 0.28738476568832994, "epoch": 0.7147214060446527, "grad_norm": 0.7663370370864868, "learning_rate": 1.885827106431221e-05, "loss": 0.298, "mean_token_accuracy": 0.9112087741494179, "num_tokens": 174713276.0, "step": 46430 }, { "entropy": 0.29905507219955324, "epoch": 0.7148753413033313, "grad_norm": 0.6357316374778748, "learning_rate": 1.885743982329536e-05, "loss": 0.297, "mean_token_accuracy": 0.9066379211843014, "num_tokens": 174776888.0, "step": 46440 }, { "entropy": 0.2857418737374246, "epoch": 0.71502927656201, "grad_norm": 0.6230836510658264, "learning_rate": 1.8856608298127784e-05, "loss": 0.2988, "mean_token_accuracy": 0.9092770107090473, "num_tokens": 174845726.0, "step": 46450 }, { "entropy": 0.2728384341113269, "epoch": 0.7151832118206886, "grad_norm": 0.777518630027771, "learning_rate": 1.8855776488836152e-05, "loss": 0.2832, "mean_token_accuracy": 0.918161066621542, "num_tokens": 174909071.0, "step": 46460 }, { "entropy": 0.2765492834150791, "epoch": 0.7153371470793671, "grad_norm": 0.6384899616241455, "learning_rate": 1.8854944395447152e-05, "loss": 0.2706, "mean_token_accuracy": 0.9142571613192558, "num_tokens": 174976864.0, "step": 46470 }, { "entropy": 0.2791677681729198, "epoch": 0.7154910823380457, "grad_norm": 0.5825950503349304, "learning_rate": 1.8854112017987476e-05, "loss": 0.2902, "mean_token_accuracy": 0.9112940572202206, "num_tokens": 175046816.0, "step": 46480 }, { "entropy": 0.2819791798479855, "epoch": 0.7156450175967243, "grad_norm": 0.5316089391708374, "learning_rate": 1.8853279356483828e-05, "loss": 0.2817, "mean_token_accuracy": 0.913087648153305, "num_tokens": 175113359.0, "step": 46490 }, { "entropy": 0.2871778765693307, "epoch": 0.7157989528554028, "grad_norm": 0.6225156188011169, "learning_rate": 1.8852446410962915e-05, "loss": 0.2947, "mean_token_accuracy": 0.9107820577919483, "num_tokens": 175181356.0, "step": 46500 }, { "entropy": 0.2829551286064088, "epoch": 0.7159528881140814, "grad_norm": 0.6291922926902771, "learning_rate": 1.8851613181451466e-05, "loss": 0.2952, "mean_token_accuracy": 0.9115431807935238, "num_tokens": 175250309.0, "step": 46510 }, { "entropy": 0.2987811335362494, "epoch": 0.71610682337276, "grad_norm": 0.5931468605995178, "learning_rate": 1.8850779667976208e-05, "loss": 0.2983, "mean_token_accuracy": 0.9075489446520806, "num_tokens": 175309132.0, "step": 46520 }, { "entropy": 0.27822851706296203, "epoch": 0.7162607586314386, "grad_norm": 0.5062656402587891, "learning_rate": 1.884994587056388e-05, "loss": 0.2835, "mean_token_accuracy": 0.9143104255199432, "num_tokens": 175372567.0, "step": 46530 }, { "entropy": 0.27753059128299357, "epoch": 0.7164146938901171, "grad_norm": 0.6895431876182556, "learning_rate": 1.884911178924123e-05, "loss": 0.2876, "mean_token_accuracy": 0.915132150053978, "num_tokens": 175441308.0, "step": 46540 }, { "entropy": 0.29054964827373625, "epoch": 0.7165686291487957, "grad_norm": 0.5928008556365967, "learning_rate": 1.884827742403502e-05, "loss": 0.2973, "mean_token_accuracy": 0.9091856867074967, "num_tokens": 175507637.0, "step": 46550 }, { "entropy": 0.2709351605735719, "epoch": 0.7167225644074743, "grad_norm": 0.795845627784729, "learning_rate": 1.884744277497201e-05, "loss": 0.2695, "mean_token_accuracy": 0.916879452764988, "num_tokens": 175568344.0, "step": 46560 }, { "entropy": 0.27480000089854, "epoch": 0.7168764996661529, "grad_norm": 0.6370983123779297, "learning_rate": 1.884660784207898e-05, "loss": 0.2804, "mean_token_accuracy": 0.9160125315189361, "num_tokens": 175634293.0, "step": 46570 }, { "entropy": 0.26768350023776294, "epoch": 0.7170304349248315, "grad_norm": 0.5021169185638428, "learning_rate": 1.8845772625382716e-05, "loss": 0.2755, "mean_token_accuracy": 0.9152834832668304, "num_tokens": 175701159.0, "step": 46580 }, { "entropy": 0.2641742693260312, "epoch": 0.7171843701835101, "grad_norm": 0.5956816673278809, "learning_rate": 1.884493712491001e-05, "loss": 0.2778, "mean_token_accuracy": 0.9164406202733517, "num_tokens": 175766162.0, "step": 46590 }, { "entropy": 0.29128649942576884, "epoch": 0.7173383054421887, "grad_norm": 0.5370131731033325, "learning_rate": 1.8844101340687667e-05, "loss": 0.2895, "mean_token_accuracy": 0.9074614390730857, "num_tokens": 175830127.0, "step": 46600 }, { "entropy": 0.2853025312535465, "epoch": 0.7174922407008673, "grad_norm": 0.6637627482414246, "learning_rate": 1.88432652727425e-05, "loss": 0.2901, "mean_token_accuracy": 0.9115528970956802, "num_tokens": 175890075.0, "step": 46610 }, { "entropy": 0.27440949631854894, "epoch": 0.7176461759595458, "grad_norm": 0.6047239303588867, "learning_rate": 1.8842428921101324e-05, "loss": 0.2701, "mean_token_accuracy": 0.9168999060988426, "num_tokens": 175953125.0, "step": 46620 }, { "entropy": 0.2705947322770953, "epoch": 0.7178001112182244, "grad_norm": 0.6544464230537415, "learning_rate": 1.8841592285790977e-05, "loss": 0.2718, "mean_token_accuracy": 0.9128809764981269, "num_tokens": 176018556.0, "step": 46630 }, { "entropy": 0.28007878456264734, "epoch": 0.717954046476903, "grad_norm": 0.6562625765800476, "learning_rate": 1.88407553668383e-05, "loss": 0.2714, "mean_token_accuracy": 0.9124787740409375, "num_tokens": 176084272.0, "step": 46640 }, { "entropy": 0.2763217634521425, "epoch": 0.7181079817355815, "grad_norm": 0.5856375694274902, "learning_rate": 1.883991816427013e-05, "loss": 0.2792, "mean_token_accuracy": 0.9137669630348683, "num_tokens": 176152110.0, "step": 46650 }, { "entropy": 0.2927857863716781, "epoch": 0.7182619169942601, "grad_norm": 0.595267653465271, "learning_rate": 1.8839080678113337e-05, "loss": 0.2931, "mean_token_accuracy": 0.9086197644472123, "num_tokens": 176212044.0, "step": 46660 }, { "entropy": 0.285804336052388, "epoch": 0.7184158522529387, "grad_norm": 0.5260430574417114, "learning_rate": 1.8838242908394782e-05, "loss": 0.2911, "mean_token_accuracy": 0.9128671385347843, "num_tokens": 176275341.0, "step": 46670 }, { "entropy": 0.2897842171601951, "epoch": 0.7185697875116173, "grad_norm": 0.8075722455978394, "learning_rate": 1.883740485514134e-05, "loss": 0.285, "mean_token_accuracy": 0.9088813774287701, "num_tokens": 176338592.0, "step": 46680 }, { "entropy": 0.26840661419555545, "epoch": 0.7187237227702958, "grad_norm": 0.6309696435928345, "learning_rate": 1.88365665183799e-05, "loss": 0.2644, "mean_token_accuracy": 0.9166816845536232, "num_tokens": 176401689.0, "step": 46690 }, { "entropy": 0.273628665599972, "epoch": 0.7188776580289744, "grad_norm": 0.7589080929756165, "learning_rate": 1.883572789813736e-05, "loss": 0.2832, "mean_token_accuracy": 0.9145082324743271, "num_tokens": 176467176.0, "step": 46700 }, { "entropy": 0.2722245424054563, "epoch": 0.7190315932876531, "grad_norm": 0.6776342988014221, "learning_rate": 1.8834888994440616e-05, "loss": 0.2749, "mean_token_accuracy": 0.9145777799189091, "num_tokens": 176526833.0, "step": 46710 }, { "entropy": 0.29293576334603133, "epoch": 0.7191855285463317, "grad_norm": 0.556286633014679, "learning_rate": 1.883404980731658e-05, "loss": 0.2907, "mean_token_accuracy": 0.9094980202615262, "num_tokens": 176604911.0, "step": 46720 }, { "entropy": 0.2857065266929567, "epoch": 0.7193394638050102, "grad_norm": 0.7063271403312683, "learning_rate": 1.8833210336792174e-05, "loss": 0.2926, "mean_token_accuracy": 0.9107996210455894, "num_tokens": 176667915.0, "step": 46730 }, { "entropy": 0.279746151342988, "epoch": 0.7194933990636888, "grad_norm": 0.6838348507881165, "learning_rate": 1.8832370582894336e-05, "loss": 0.2821, "mean_token_accuracy": 0.9117916360497474, "num_tokens": 176733610.0, "step": 46740 }, { "entropy": 0.3067979569546878, "epoch": 0.7196473343223674, "grad_norm": 0.6204955577850342, "learning_rate": 1.8831530545649998e-05, "loss": 0.3092, "mean_token_accuracy": 0.9046007327735424, "num_tokens": 176802525.0, "step": 46750 }, { "entropy": 0.2632765299640596, "epoch": 0.719801269581046, "grad_norm": 0.4819471836090088, "learning_rate": 1.883069022508611e-05, "loss": 0.2743, "mean_token_accuracy": 0.9199503161013126, "num_tokens": 176874077.0, "step": 46760 }, { "entropy": 0.2787231129594147, "epoch": 0.7199552048397245, "grad_norm": 0.544924259185791, "learning_rate": 1.882984962122963e-05, "loss": 0.2803, "mean_token_accuracy": 0.9109675854444503, "num_tokens": 176940217.0, "step": 46770 }, { "entropy": 0.2828382899984717, "epoch": 0.7201091400984031, "grad_norm": 0.65770024061203, "learning_rate": 1.882900873410753e-05, "loss": 0.2873, "mean_token_accuracy": 0.9122068852186203, "num_tokens": 177002594.0, "step": 46780 }, { "entropy": 0.2797721912153065, "epoch": 0.7202630753570817, "grad_norm": 0.6627818942070007, "learning_rate": 1.8828167563746783e-05, "loss": 0.2901, "mean_token_accuracy": 0.9130064845085144, "num_tokens": 177069608.0, "step": 46790 }, { "entropy": 0.2802095977589488, "epoch": 0.7204170106157602, "grad_norm": 0.5835596323013306, "learning_rate": 1.8827326110174368e-05, "loss": 0.2804, "mean_token_accuracy": 0.9125995807349682, "num_tokens": 177135799.0, "step": 46800 }, { "entropy": 0.28860953394323585, "epoch": 0.7205709458744388, "grad_norm": 0.731052041053772, "learning_rate": 1.882648437341729e-05, "loss": 0.2775, "mean_token_accuracy": 0.9103144958615303, "num_tokens": 177198161.0, "step": 46810 }, { "entropy": 0.2916354214772582, "epoch": 0.7207248811331174, "grad_norm": 0.6008538007736206, "learning_rate": 1.882564235350254e-05, "loss": 0.2958, "mean_token_accuracy": 0.9082310691475868, "num_tokens": 177265125.0, "step": 46820 }, { "entropy": 0.2665655085816979, "epoch": 0.7208788163917961, "grad_norm": 0.6525949835777283, "learning_rate": 1.8824800050457143e-05, "loss": 0.263, "mean_token_accuracy": 0.9181693755090237, "num_tokens": 177332975.0, "step": 46830 }, { "entropy": 0.27916210172697903, "epoch": 0.7210327516504746, "grad_norm": 0.6607490181922913, "learning_rate": 1.882395746430811e-05, "loss": 0.283, "mean_token_accuracy": 0.9124748118221759, "num_tokens": 177398085.0, "step": 46840 }, { "entropy": 0.2667331639677286, "epoch": 0.7211866869091532, "grad_norm": 0.5781418085098267, "learning_rate": 1.882311459508248e-05, "loss": 0.2791, "mean_token_accuracy": 0.916219049692154, "num_tokens": 177464264.0, "step": 46850 }, { "entropy": 0.29406912960112097, "epoch": 0.7213406221678318, "grad_norm": 0.5167020559310913, "learning_rate": 1.8822271442807288e-05, "loss": 0.2905, "mean_token_accuracy": 0.9091994225978851, "num_tokens": 177531082.0, "step": 46860 }, { "entropy": 0.2627218272536993, "epoch": 0.7214945574265104, "grad_norm": 0.5643063187599182, "learning_rate": 1.882142800750958e-05, "loss": 0.2747, "mean_token_accuracy": 0.9181987278163433, "num_tokens": 177592671.0, "step": 46870 }, { "entropy": 0.29084020145237444, "epoch": 0.7216484926851889, "grad_norm": 0.788341760635376, "learning_rate": 1.8820584289216422e-05, "loss": 0.2905, "mean_token_accuracy": 0.9118900701403618, "num_tokens": 177657146.0, "step": 46880 }, { "entropy": 0.277149517275393, "epoch": 0.7218024279438675, "grad_norm": 0.6030257344245911, "learning_rate": 1.8819740287954876e-05, "loss": 0.2884, "mean_token_accuracy": 0.9131317555904388, "num_tokens": 177719061.0, "step": 46890 }, { "entropy": 0.29259960241615773, "epoch": 0.7219563632025461, "grad_norm": 0.6943095326423645, "learning_rate": 1.8818896003752013e-05, "loss": 0.3034, "mean_token_accuracy": 0.9077541455626488, "num_tokens": 177782722.0, "step": 46900 }, { "entropy": 0.28530903486534953, "epoch": 0.7221102984612247, "grad_norm": 0.5882741212844849, "learning_rate": 1.8818051436634928e-05, "loss": 0.282, "mean_token_accuracy": 0.9122581802308559, "num_tokens": 177846207.0, "step": 46910 }, { "entropy": 0.28074872996658085, "epoch": 0.7222642337199032, "grad_norm": 0.5197461843490601, "learning_rate": 1.8817206586630706e-05, "loss": 0.2814, "mean_token_accuracy": 0.9155375346541405, "num_tokens": 177914936.0, "step": 46920 }, { "entropy": 0.2710596867837012, "epoch": 0.7224181689785818, "grad_norm": 0.5945757031440735, "learning_rate": 1.8816361453766458e-05, "loss": 0.271, "mean_token_accuracy": 0.9154853671789169, "num_tokens": 177980545.0, "step": 46930 }, { "entropy": 0.2846430056728423, "epoch": 0.7225721042372604, "grad_norm": 0.5627084970474243, "learning_rate": 1.881551603806929e-05, "loss": 0.2839, "mean_token_accuracy": 0.9103705875575543, "num_tokens": 178047780.0, "step": 46940 }, { "entropy": 0.2966036486439407, "epoch": 0.7227260394959389, "grad_norm": 0.5684537291526794, "learning_rate": 1.8814670339566326e-05, "loss": 0.2905, "mean_token_accuracy": 0.9071074463427067, "num_tokens": 178109043.0, "step": 46950 }, { "entropy": 0.2730785527266562, "epoch": 0.7228799747546176, "grad_norm": 0.6907402276992798, "learning_rate": 1.88138243582847e-05, "loss": 0.283, "mean_token_accuracy": 0.9143172793090344, "num_tokens": 178174817.0, "step": 46960 }, { "entropy": 0.27679315498098733, "epoch": 0.7230339100132962, "grad_norm": 0.6556649804115295, "learning_rate": 1.8812978094251542e-05, "loss": 0.2821, "mean_token_accuracy": 0.9120730370283127, "num_tokens": 178241371.0, "step": 46970 }, { "entropy": 0.2694229053333402, "epoch": 0.7231878452719748, "grad_norm": 0.6738699674606323, "learning_rate": 1.881213154749401e-05, "loss": 0.2612, "mean_token_accuracy": 0.9172519758343697, "num_tokens": 178307454.0, "step": 46980 }, { "entropy": 0.28887507217004893, "epoch": 0.7233417805306533, "grad_norm": 0.6656084060668945, "learning_rate": 1.881128471803926e-05, "loss": 0.2956, "mean_token_accuracy": 0.9103210613131523, "num_tokens": 178367672.0, "step": 46990 }, { "entropy": 0.2908062799833715, "epoch": 0.7234957157893319, "grad_norm": 0.7481100559234619, "learning_rate": 1.881043760591445e-05, "loss": 0.2962, "mean_token_accuracy": 0.9074383683502674, "num_tokens": 178429679.0, "step": 47000 }, { "entropy": 0.29236960196867584, "epoch": 0.7236496510480105, "grad_norm": 0.7494210600852966, "learning_rate": 1.880959021114677e-05, "loss": 0.2991, "mean_token_accuracy": 0.9102812469005584, "num_tokens": 178496309.0, "step": 47010 }, { "entropy": 0.2964381232857704, "epoch": 0.7238035863066891, "grad_norm": 0.4937839210033417, "learning_rate": 1.8808742533763392e-05, "loss": 0.2911, "mean_token_accuracy": 0.9089697636663914, "num_tokens": 178569970.0, "step": 47020 }, { "entropy": 0.2838938761502504, "epoch": 0.7239575215653676, "grad_norm": 0.5196161866188049, "learning_rate": 1.8807894573791516e-05, "loss": 0.2805, "mean_token_accuracy": 0.9121592789888382, "num_tokens": 178640464.0, "step": 47030 }, { "entropy": 0.2781289073638618, "epoch": 0.7241114568240462, "grad_norm": 0.6573265194892883, "learning_rate": 1.8807046331258343e-05, "loss": 0.2841, "mean_token_accuracy": 0.9127081014215946, "num_tokens": 178701927.0, "step": 47040 }, { "entropy": 0.26058234050869944, "epoch": 0.7242653920827248, "grad_norm": 0.7017239332199097, "learning_rate": 1.880619780619109e-05, "loss": 0.2586, "mean_token_accuracy": 0.9197558514773846, "num_tokens": 178771040.0, "step": 47050 }, { "entropy": 0.27054145047441125, "epoch": 0.7244193273414034, "grad_norm": 0.5575741529464722, "learning_rate": 1.8805348998616973e-05, "loss": 0.2867, "mean_token_accuracy": 0.9158181384205818, "num_tokens": 178838991.0, "step": 47060 }, { "entropy": 0.2815396553836763, "epoch": 0.7245732626000819, "grad_norm": 0.59820157289505, "learning_rate": 1.880449990856322e-05, "loss": 0.2952, "mean_token_accuracy": 0.9110629193484783, "num_tokens": 178907780.0, "step": 47070 }, { "entropy": 0.2697593031451106, "epoch": 0.7247271978587605, "grad_norm": 0.6562172174453735, "learning_rate": 1.8803650536057073e-05, "loss": 0.27, "mean_token_accuracy": 0.9180623218417168, "num_tokens": 178982733.0, "step": 47080 }, { "entropy": 0.25222532711923124, "epoch": 0.7248811331174392, "grad_norm": 0.4919161796569824, "learning_rate": 1.8802800881125785e-05, "loss": 0.258, "mean_token_accuracy": 0.921234180778265, "num_tokens": 179046129.0, "step": 47090 }, { "entropy": 0.2777066324837506, "epoch": 0.7250350683761178, "grad_norm": 0.59650057554245, "learning_rate": 1.8801950943796606e-05, "loss": 0.2864, "mean_token_accuracy": 0.91199686601758, "num_tokens": 179108109.0, "step": 47100 }, { "entropy": 0.28726622853428124, "epoch": 0.7251890036347963, "grad_norm": 0.433296263217926, "learning_rate": 1.8801100724096807e-05, "loss": 0.3019, "mean_token_accuracy": 0.9121717564761639, "num_tokens": 179184615.0, "step": 47110 }, { "entropy": 0.27812109272927044, "epoch": 0.7253429388934749, "grad_norm": 0.5210338234901428, "learning_rate": 1.8800250222053658e-05, "loss": 0.2886, "mean_token_accuracy": 0.9132994621992111, "num_tokens": 179257468.0, "step": 47120 }, { "entropy": 0.292189160361886, "epoch": 0.7254968741521535, "grad_norm": 0.6239445209503174, "learning_rate": 1.8799399437694452e-05, "loss": 0.2953, "mean_token_accuracy": 0.9089690044522285, "num_tokens": 179320748.0, "step": 47130 }, { "entropy": 0.2771254050545394, "epoch": 0.725650809410832, "grad_norm": 0.49397560954093933, "learning_rate": 1.8798548371046477e-05, "loss": 0.2764, "mean_token_accuracy": 0.915096890926361, "num_tokens": 179395092.0, "step": 47140 }, { "entropy": 0.25936662442982195, "epoch": 0.7258047446695106, "grad_norm": 0.7568424344062805, "learning_rate": 1.8797697022137033e-05, "loss": 0.2783, "mean_token_accuracy": 0.9202179871499538, "num_tokens": 179457345.0, "step": 47150 }, { "entropy": 0.2883282179012895, "epoch": 0.7259586799281892, "grad_norm": 0.5168830752372742, "learning_rate": 1.879684539099344e-05, "loss": 0.2871, "mean_token_accuracy": 0.9104873746633529, "num_tokens": 179518512.0, "step": 47160 }, { "entropy": 0.27005056040361525, "epoch": 0.7261126151868678, "grad_norm": 0.5591107606887817, "learning_rate": 1.8795993477643012e-05, "loss": 0.2728, "mean_token_accuracy": 0.9160346612334251, "num_tokens": 179580847.0, "step": 47170 }, { "entropy": 0.2756882765330374, "epoch": 0.7262665504455463, "grad_norm": 0.7939562201499939, "learning_rate": 1.879514128211308e-05, "loss": 0.2859, "mean_token_accuracy": 0.9127511195838451, "num_tokens": 179643560.0, "step": 47180 }, { "entropy": 0.2747446087189019, "epoch": 0.7264204857042249, "grad_norm": 0.5968903303146362, "learning_rate": 1.8794288804430986e-05, "loss": 0.2797, "mean_token_accuracy": 0.9139226704835892, "num_tokens": 179709826.0, "step": 47190 }, { "entropy": 0.27940909983590245, "epoch": 0.7265744209629035, "grad_norm": 0.6859086751937866, "learning_rate": 1.8793436044624068e-05, "loss": 0.2865, "mean_token_accuracy": 0.911885280907154, "num_tokens": 179770679.0, "step": 47200 }, { "entropy": 0.28150308979675176, "epoch": 0.7267283562215822, "grad_norm": 0.579969048500061, "learning_rate": 1.8792583002719698e-05, "loss": 0.2836, "mean_token_accuracy": 0.9124595642089843, "num_tokens": 179832313.0, "step": 47210 }, { "entropy": 0.27615839801728725, "epoch": 0.7268822914802607, "grad_norm": 0.6251976490020752, "learning_rate": 1.879172967874523e-05, "loss": 0.2662, "mean_token_accuracy": 0.9150531634688377, "num_tokens": 179893809.0, "step": 47220 }, { "entropy": 0.26906847963109615, "epoch": 0.7270362267389393, "grad_norm": 0.6136351823806763, "learning_rate": 1.8790876072728042e-05, "loss": 0.283, "mean_token_accuracy": 0.9171786792576313, "num_tokens": 179964727.0, "step": 47230 }, { "entropy": 0.2901169521734118, "epoch": 0.7271901619976179, "grad_norm": 0.5611817240715027, "learning_rate": 1.8790022184695526e-05, "loss": 0.2918, "mean_token_accuracy": 0.9092640742659569, "num_tokens": 180037803.0, "step": 47240 }, { "entropy": 0.2678467070683837, "epoch": 0.7273440972562965, "grad_norm": 0.776326060295105, "learning_rate": 1.878916801467506e-05, "loss": 0.2715, "mean_token_accuracy": 0.9171694949269295, "num_tokens": 180106390.0, "step": 47250 }, { "entropy": 0.28917033411562443, "epoch": 0.727498032514975, "grad_norm": 0.5740782022476196, "learning_rate": 1.878831356269406e-05, "loss": 0.2866, "mean_token_accuracy": 0.9104723542928695, "num_tokens": 180173111.0, "step": 47260 }, { "entropy": 0.263614942971617, "epoch": 0.7276519677736536, "grad_norm": 0.539150059223175, "learning_rate": 1.878745882877993e-05, "loss": 0.2727, "mean_token_accuracy": 0.9163293063640594, "num_tokens": 180245868.0, "step": 47270 }, { "entropy": 0.2635684854350984, "epoch": 0.7278059030323322, "grad_norm": 0.5300570130348206, "learning_rate": 1.8786603812960097e-05, "loss": 0.2621, "mean_token_accuracy": 0.9171912238001824, "num_tokens": 180310032.0, "step": 47280 }, { "entropy": 0.27918185889720915, "epoch": 0.7279598382910107, "grad_norm": 0.6201032996177673, "learning_rate": 1.8785748515261977e-05, "loss": 0.2797, "mean_token_accuracy": 0.9138856753706932, "num_tokens": 180376113.0, "step": 47290 }, { "entropy": 0.29382985467091205, "epoch": 0.7281137735496893, "grad_norm": 0.6606944799423218, "learning_rate": 1.8784892935713023e-05, "loss": 0.291, "mean_token_accuracy": 0.9086393259465695, "num_tokens": 180451187.0, "step": 47300 }, { "entropy": 0.2685559851117432, "epoch": 0.7282677088083679, "grad_norm": 0.6050626635551453, "learning_rate": 1.8784037074340673e-05, "loss": 0.2703, "mean_token_accuracy": 0.9160167738795281, "num_tokens": 180521169.0, "step": 47310 }, { "entropy": 0.27534030796959996, "epoch": 0.7284216440670465, "grad_norm": 0.7910142540931702, "learning_rate": 1.8783180931172388e-05, "loss": 0.2727, "mean_token_accuracy": 0.9145561888813972, "num_tokens": 180578984.0, "step": 47320 }, { "entropy": 0.2801698693074286, "epoch": 0.728575579325725, "grad_norm": 0.5612896680831909, "learning_rate": 1.878232450623563e-05, "loss": 0.2877, "mean_token_accuracy": 0.9122897081077099, "num_tokens": 180645380.0, "step": 47330 }, { "entropy": 0.2788607904687524, "epoch": 0.7287295145844037, "grad_norm": 0.646554708480835, "learning_rate": 1.8781467799557878e-05, "loss": 0.2835, "mean_token_accuracy": 0.9125910595059394, "num_tokens": 180713476.0, "step": 47340 }, { "entropy": 0.27647988628596065, "epoch": 0.7288834498430823, "grad_norm": 0.6424283981323242, "learning_rate": 1.878061081116661e-05, "loss": 0.2897, "mean_token_accuracy": 0.9137404054403305, "num_tokens": 180784341.0, "step": 47350 }, { "entropy": 0.2544133609160781, "epoch": 0.7290373851017609, "grad_norm": 0.5818415284156799, "learning_rate": 1.8779753541089323e-05, "loss": 0.261, "mean_token_accuracy": 0.9210616201162338, "num_tokens": 180845116.0, "step": 47360 }, { "entropy": 0.2637120632454753, "epoch": 0.7291913203604394, "grad_norm": 0.6185593605041504, "learning_rate": 1.8778895989353517e-05, "loss": 0.2738, "mean_token_accuracy": 0.9183044835925103, "num_tokens": 180909368.0, "step": 47370 }, { "entropy": 0.26489745592698455, "epoch": 0.729345255619118, "grad_norm": 0.6972387433052063, "learning_rate": 1.8778038155986707e-05, "loss": 0.28, "mean_token_accuracy": 0.9159681119024754, "num_tokens": 180971881.0, "step": 47380 }, { "entropy": 0.29090099800378083, "epoch": 0.7294991908777966, "grad_norm": 0.6390918493270874, "learning_rate": 1.8777180041016404e-05, "loss": 0.299, "mean_token_accuracy": 0.907512104511261, "num_tokens": 181039634.0, "step": 47390 }, { "entropy": 0.27481170324608684, "epoch": 0.7296531261364751, "grad_norm": 0.6707773208618164, "learning_rate": 1.877632164447014e-05, "loss": 0.272, "mean_token_accuracy": 0.9151556260883809, "num_tokens": 181097650.0, "step": 47400 }, { "entropy": 0.29299614690244197, "epoch": 0.7298070613951537, "grad_norm": 0.5227223038673401, "learning_rate": 1.8775462966375457e-05, "loss": 0.2895, "mean_token_accuracy": 0.9089139819145202, "num_tokens": 181175615.0, "step": 47410 }, { "entropy": 0.2593810320831835, "epoch": 0.7299609966538323, "grad_norm": 0.6644623875617981, "learning_rate": 1.87746040067599e-05, "loss": 0.2813, "mean_token_accuracy": 0.9163196533918381, "num_tokens": 181238569.0, "step": 47420 }, { "entropy": 0.26940553383901716, "epoch": 0.7301149319125109, "grad_norm": 0.7587987780570984, "learning_rate": 1.8773744765651018e-05, "loss": 0.2633, "mean_token_accuracy": 0.9176800735294819, "num_tokens": 181302016.0, "step": 47430 }, { "entropy": 0.26175381857901814, "epoch": 0.7302688671711894, "grad_norm": 0.6338091492652893, "learning_rate": 1.877288524307639e-05, "loss": 0.266, "mean_token_accuracy": 0.9193636156618595, "num_tokens": 181360081.0, "step": 47440 }, { "entropy": 0.28190412605181336, "epoch": 0.730422802429868, "grad_norm": 0.6100246906280518, "learning_rate": 1.8772025439063575e-05, "loss": 0.27, "mean_token_accuracy": 0.9117810025811195, "num_tokens": 181424914.0, "step": 47450 }, { "entropy": 0.26637041782960297, "epoch": 0.7305767376885466, "grad_norm": 0.5401053428649902, "learning_rate": 1.8771165353640164e-05, "loss": 0.2808, "mean_token_accuracy": 0.9162866458296776, "num_tokens": 181490129.0, "step": 47460 }, { "entropy": 0.2806004146113992, "epoch": 0.7307306729472253, "grad_norm": 0.7246898412704468, "learning_rate": 1.8770304986833746e-05, "loss": 0.2955, "mean_token_accuracy": 0.9121958777308464, "num_tokens": 181554112.0, "step": 47470 }, { "entropy": 0.301965572591871, "epoch": 0.7308846082059038, "grad_norm": 0.5618496537208557, "learning_rate": 1.8769444338671925e-05, "loss": 0.3074, "mean_token_accuracy": 0.9059404753148556, "num_tokens": 181619239.0, "step": 47480 }, { "entropy": 0.28563754372298716, "epoch": 0.7310385434645824, "grad_norm": 0.6832311749458313, "learning_rate": 1.8768583409182307e-05, "loss": 0.2821, "mean_token_accuracy": 0.9114485464990139, "num_tokens": 181686050.0, "step": 47490 }, { "entropy": 0.27278118608519436, "epoch": 0.731192478723261, "grad_norm": 0.6233751177787781, "learning_rate": 1.8767722198392514e-05, "loss": 0.2838, "mean_token_accuracy": 0.9143331781029701, "num_tokens": 181745573.0, "step": 47500 }, { "entropy": 0.2708624922670424, "epoch": 0.7313464139819396, "grad_norm": 0.6163052320480347, "learning_rate": 1.8766860706330175e-05, "loss": 0.2855, "mean_token_accuracy": 0.9167679771780968, "num_tokens": 181812833.0, "step": 47510 }, { "entropy": 0.2798126461915672, "epoch": 0.7315003492406181, "grad_norm": 0.6223185658454895, "learning_rate": 1.8765998933022923e-05, "loss": 0.2754, "mean_token_accuracy": 0.913993063569069, "num_tokens": 181872092.0, "step": 47520 }, { "entropy": 0.2887897864915431, "epoch": 0.7316542844992967, "grad_norm": 0.6852173805236816, "learning_rate": 1.8765136878498408e-05, "loss": 0.2894, "mean_token_accuracy": 0.9088794894516468, "num_tokens": 181938635.0, "step": 47530 }, { "entropy": 0.2655188223347068, "epoch": 0.7318082197579753, "grad_norm": 0.6327770948410034, "learning_rate": 1.876427454278428e-05, "loss": 0.2761, "mean_token_accuracy": 0.9160035394132138, "num_tokens": 182000087.0, "step": 47540 }, { "entropy": 0.2889036701992154, "epoch": 0.7319621550166538, "grad_norm": 0.5879833102226257, "learning_rate": 1.8763411925908212e-05, "loss": 0.2844, "mean_token_accuracy": 0.9123052574694157, "num_tokens": 182061890.0, "step": 47550 }, { "entropy": 0.2734166756272316, "epoch": 0.7321160902753324, "grad_norm": 0.5666911602020264, "learning_rate": 1.876254902789787e-05, "loss": 0.2832, "mean_token_accuracy": 0.9148828938603402, "num_tokens": 182124604.0, "step": 47560 }, { "entropy": 0.27244984786957505, "epoch": 0.732270025534011, "grad_norm": 0.6584586501121521, "learning_rate": 1.8761685848780935e-05, "loss": 0.2785, "mean_token_accuracy": 0.9133166372776031, "num_tokens": 182186510.0, "step": 47570 }, { "entropy": 0.28306491672992706, "epoch": 0.7324239607926896, "grad_norm": 0.6204150319099426, "learning_rate": 1.87608223885851e-05, "loss": 0.2845, "mean_token_accuracy": 0.914777921140194, "num_tokens": 182250580.0, "step": 47580 }, { "entropy": 0.2602881111204624, "epoch": 0.7325778960513682, "grad_norm": 0.7588750123977661, "learning_rate": 1.8759958647338074e-05, "loss": 0.2737, "mean_token_accuracy": 0.9171980269253254, "num_tokens": 182314953.0, "step": 47590 }, { "entropy": 0.26688205171376467, "epoch": 0.7327318313100468, "grad_norm": 0.9308176040649414, "learning_rate": 1.8759094625067552e-05, "loss": 0.275, "mean_token_accuracy": 0.9179552800953388, "num_tokens": 182375766.0, "step": 47600 }, { "entropy": 0.2981922892853618, "epoch": 0.7328857665687254, "grad_norm": 0.5739518404006958, "learning_rate": 1.8758230321801262e-05, "loss": 0.2937, "mean_token_accuracy": 0.908262624591589, "num_tokens": 182441990.0, "step": 47610 }, { "entropy": 0.27451344914734366, "epoch": 0.733039701827404, "grad_norm": 0.4226537048816681, "learning_rate": 1.8757365737566928e-05, "loss": 0.2742, "mean_token_accuracy": 0.9145894594490528, "num_tokens": 182504388.0, "step": 47620 }, { "entropy": 0.2695024442858994, "epoch": 0.7331936370860825, "grad_norm": 0.8477731347084045, "learning_rate": 1.8756500872392283e-05, "loss": 0.2723, "mean_token_accuracy": 0.9136481262743473, "num_tokens": 182568425.0, "step": 47630 }, { "entropy": 0.2538095634430647, "epoch": 0.7333475723447611, "grad_norm": 0.72531658411026, "learning_rate": 1.8755635726305077e-05, "loss": 0.2776, "mean_token_accuracy": 0.9191220112144947, "num_tokens": 182627531.0, "step": 47640 }, { "entropy": 0.2748864979483187, "epoch": 0.7335015076034397, "grad_norm": 0.6424367427825928, "learning_rate": 1.875477029933306e-05, "loss": 0.2759, "mean_token_accuracy": 0.9152735121548176, "num_tokens": 182698042.0, "step": 47650 }, { "entropy": 0.27286092434078457, "epoch": 0.7336554428621183, "grad_norm": 0.5159920454025269, "learning_rate": 1.8753904591504003e-05, "loss": 0.2709, "mean_token_accuracy": 0.9147025547921658, "num_tokens": 182766495.0, "step": 47660 }, { "entropy": 0.26941091250628235, "epoch": 0.7338093781207968, "grad_norm": 0.8065195083618164, "learning_rate": 1.8753038602845673e-05, "loss": 0.2805, "mean_token_accuracy": 0.9162942983210087, "num_tokens": 182834685.0, "step": 47670 }, { "entropy": 0.2759528262540698, "epoch": 0.7339633133794754, "grad_norm": 0.8174416422843933, "learning_rate": 1.875217233338585e-05, "loss": 0.2911, "mean_token_accuracy": 0.9138583049178124, "num_tokens": 182899067.0, "step": 47680 }, { "entropy": 0.27905903197824955, "epoch": 0.734117248638154, "grad_norm": 0.647565484046936, "learning_rate": 1.8751305783152326e-05, "loss": 0.2844, "mean_token_accuracy": 0.9140129677951336, "num_tokens": 182957512.0, "step": 47690 }, { "entropy": 0.2706372532993555, "epoch": 0.7342711838968325, "grad_norm": 0.7095201015472412, "learning_rate": 1.87504389521729e-05, "loss": 0.2741, "mean_token_accuracy": 0.9149083435535431, "num_tokens": 183030206.0, "step": 47700 }, { "entropy": 0.2716413180343807, "epoch": 0.7344251191555111, "grad_norm": 0.6768395304679871, "learning_rate": 1.874957184047538e-05, "loss": 0.2687, "mean_token_accuracy": 0.9155723884701729, "num_tokens": 183096869.0, "step": 47710 }, { "entropy": 0.2678498395718634, "epoch": 0.7345790544141898, "grad_norm": 0.6284934878349304, "learning_rate": 1.8748704448087585e-05, "loss": 0.274, "mean_token_accuracy": 0.9157500363886356, "num_tokens": 183164135.0, "step": 47720 }, { "entropy": 0.27542056106030943, "epoch": 0.7347329896728684, "grad_norm": 0.585565447807312, "learning_rate": 1.874783677503734e-05, "loss": 0.2839, "mean_token_accuracy": 0.9155757516622544, "num_tokens": 183228682.0, "step": 47730 }, { "entropy": 0.28890399299561975, "epoch": 0.734886924931547, "grad_norm": 0.6601151823997498, "learning_rate": 1.874696882135248e-05, "loss": 0.2849, "mean_token_accuracy": 0.9104161009192466, "num_tokens": 183291631.0, "step": 47740 }, { "entropy": 0.28495850740000606, "epoch": 0.7350408601902255, "grad_norm": 0.5888108015060425, "learning_rate": 1.8746100587060853e-05, "loss": 0.3059, "mean_token_accuracy": 0.9101397782564163, "num_tokens": 183356841.0, "step": 47750 }, { "entropy": 0.26674054013565185, "epoch": 0.7351947954489041, "grad_norm": 0.6357328295707703, "learning_rate": 1.874523207219031e-05, "loss": 0.2678, "mean_token_accuracy": 0.9185981310904026, "num_tokens": 183419031.0, "step": 47760 }, { "entropy": 0.2663998082280159, "epoch": 0.7353487307075827, "grad_norm": 0.8157978653907776, "learning_rate": 1.874436327676871e-05, "loss": 0.2693, "mean_token_accuracy": 0.9162728920578956, "num_tokens": 183477372.0, "step": 47770 }, { "entropy": 0.25872800312936306, "epoch": 0.7355026659662612, "grad_norm": 0.4915708005428314, "learning_rate": 1.8743494200823927e-05, "loss": 0.2639, "mean_token_accuracy": 0.9186015151441097, "num_tokens": 183548754.0, "step": 47780 }, { "entropy": 0.28451293846592307, "epoch": 0.7356566012249398, "grad_norm": 0.5286253094673157, "learning_rate": 1.874262484438384e-05, "loss": 0.2873, "mean_token_accuracy": 0.9134308576583863, "num_tokens": 183623178.0, "step": 47790 }, { "entropy": 0.286310924962163, "epoch": 0.7358105364836184, "grad_norm": 0.462809294462204, "learning_rate": 1.874175520747634e-05, "loss": 0.2996, "mean_token_accuracy": 0.9090508729219436, "num_tokens": 183683311.0, "step": 47800 }, { "entropy": 0.2996310106478631, "epoch": 0.735964471742297, "grad_norm": 0.49300557374954224, "learning_rate": 1.8740885290129328e-05, "loss": 0.2922, "mean_token_accuracy": 0.907071353495121, "num_tokens": 183761506.0, "step": 47810 }, { "entropy": 0.27019950896501543, "epoch": 0.7361184070009755, "grad_norm": 0.5348256826400757, "learning_rate": 1.874001509237071e-05, "loss": 0.2738, "mean_token_accuracy": 0.9176261380314827, "num_tokens": 183820028.0, "step": 47820 }, { "entropy": 0.26723490171134473, "epoch": 0.7362723422596541, "grad_norm": 0.7983783483505249, "learning_rate": 1.8739144614228397e-05, "loss": 0.2864, "mean_token_accuracy": 0.9168876610696316, "num_tokens": 183882100.0, "step": 47830 }, { "entropy": 0.2792211130261421, "epoch": 0.7364262775183327, "grad_norm": 0.5386754870414734, "learning_rate": 1.873827385573032e-05, "loss": 0.2934, "mean_token_accuracy": 0.9107469923794269, "num_tokens": 183945745.0, "step": 47840 }, { "entropy": 0.2801339411176741, "epoch": 0.7365802127770114, "grad_norm": 0.5513001084327698, "learning_rate": 1.8737402816904412e-05, "loss": 0.2822, "mean_token_accuracy": 0.9124644979834556, "num_tokens": 184018165.0, "step": 47850 }, { "entropy": 0.2748703299090266, "epoch": 0.7367341480356899, "grad_norm": 0.45873114466667175, "learning_rate": 1.8736531497778614e-05, "loss": 0.2827, "mean_token_accuracy": 0.9115138664841652, "num_tokens": 184080749.0, "step": 47860 }, { "entropy": 0.2866640522144735, "epoch": 0.7368880832943685, "grad_norm": 0.6290655136108398, "learning_rate": 1.8735659898380883e-05, "loss": 0.2893, "mean_token_accuracy": 0.9121323436498642, "num_tokens": 184142976.0, "step": 47870 }, { "entropy": 0.27532689133659005, "epoch": 0.7370420185530471, "grad_norm": 0.5403308868408203, "learning_rate": 1.8734788018739174e-05, "loss": 0.2896, "mean_token_accuracy": 0.9151376739144326, "num_tokens": 184213122.0, "step": 47880 }, { "entropy": 0.2566630123183131, "epoch": 0.7371959538117256, "grad_norm": 0.7086687684059143, "learning_rate": 1.8733915858881462e-05, "loss": 0.2583, "mean_token_accuracy": 0.9206757739186286, "num_tokens": 184279263.0, "step": 47890 }, { "entropy": 0.26659971904009583, "epoch": 0.7373498890704042, "grad_norm": 0.536450982093811, "learning_rate": 1.8733043418835725e-05, "loss": 0.279, "mean_token_accuracy": 0.9158828735351563, "num_tokens": 184349506.0, "step": 47900 }, { "entropy": 0.2784594738855958, "epoch": 0.7375038243290828, "grad_norm": 0.5772756934165955, "learning_rate": 1.8732170698629954e-05, "loss": 0.2818, "mean_token_accuracy": 0.9139470905065536, "num_tokens": 184417411.0, "step": 47910 }, { "entropy": 0.2828568210825324, "epoch": 0.7376577595877614, "grad_norm": 0.6978919506072998, "learning_rate": 1.873129769829214e-05, "loss": 0.2858, "mean_token_accuracy": 0.9109197869896889, "num_tokens": 184492908.0, "step": 47920 }, { "entropy": 0.29368366338312624, "epoch": 0.7378116948464399, "grad_norm": 0.6768296360969543, "learning_rate": 1.8730424417850296e-05, "loss": 0.3016, "mean_token_accuracy": 0.9075121201574803, "num_tokens": 184555242.0, "step": 47930 }, { "entropy": 0.29016707502305505, "epoch": 0.7379656301051185, "grad_norm": 0.680338978767395, "learning_rate": 1.8729550857332432e-05, "loss": 0.2908, "mean_token_accuracy": 0.9108698517084122, "num_tokens": 184614961.0, "step": 47940 }, { "entropy": 0.27002117596566677, "epoch": 0.7381195653637971, "grad_norm": 0.6247902512550354, "learning_rate": 1.8728677016766573e-05, "loss": 0.2802, "mean_token_accuracy": 0.9151929624378681, "num_tokens": 184680379.0, "step": 47950 }, { "entropy": 0.2687271415255964, "epoch": 0.7382735006224757, "grad_norm": 0.7060662508010864, "learning_rate": 1.8727802896180756e-05, "loss": 0.2792, "mean_token_accuracy": 0.9151815608143806, "num_tokens": 184739295.0, "step": 47960 }, { "entropy": 0.28032503100112083, "epoch": 0.7384274358811543, "grad_norm": 0.6597979068756104, "learning_rate": 1.872692849560302e-05, "loss": 0.2655, "mean_token_accuracy": 0.9122538916766644, "num_tokens": 184801014.0, "step": 47970 }, { "entropy": 0.2576602446846664, "epoch": 0.7385813711398329, "grad_norm": 0.6231567859649658, "learning_rate": 1.8726053815061416e-05, "loss": 0.2646, "mean_token_accuracy": 0.9191468328237533, "num_tokens": 184861532.0, "step": 47980 }, { "entropy": 0.27606116635724903, "epoch": 0.7387353063985115, "grad_norm": 0.6201844215393066, "learning_rate": 1.8725178854584007e-05, "loss": 0.2823, "mean_token_accuracy": 0.9144208483397961, "num_tokens": 184922195.0, "step": 47990 }, { "entropy": 0.26051752306520937, "epoch": 0.73888924165719, "grad_norm": 0.5487775206565857, "learning_rate": 1.8724303614198858e-05, "loss": 0.2736, "mean_token_accuracy": 0.9199826084077358, "num_tokens": 184990788.0, "step": 48000 }, { "entropy": 0.26772728506475685, "epoch": 0.7390431769158686, "grad_norm": 0.49836090207099915, "learning_rate": 1.872342809393405e-05, "loss": 0.2693, "mean_token_accuracy": 0.9176815375685692, "num_tokens": 185059220.0, "step": 48010 }, { "entropy": 0.27903773123398423, "epoch": 0.7391971121745472, "grad_norm": 0.6851846575737, "learning_rate": 1.8722552293817672e-05, "loss": 0.2853, "mean_token_accuracy": 0.9131020568311214, "num_tokens": 185138099.0, "step": 48020 }, { "entropy": 0.2831815251149237, "epoch": 0.7393510474332258, "grad_norm": 0.5162007808685303, "learning_rate": 1.8721676213877813e-05, "loss": 0.2857, "mean_token_accuracy": 0.9135765694081783, "num_tokens": 185206196.0, "step": 48030 }, { "entropy": 0.2539249365217984, "epoch": 0.7395049826919043, "grad_norm": 0.582388162612915, "learning_rate": 1.8720799854142585e-05, "loss": 0.2638, "mean_token_accuracy": 0.9204012587666511, "num_tokens": 185272550.0, "step": 48040 }, { "entropy": 0.31287725353613494, "epoch": 0.7396589179505829, "grad_norm": 0.7628118395805359, "learning_rate": 1.87199232146401e-05, "loss": 0.3003, "mean_token_accuracy": 0.90118358284235, "num_tokens": 185339483.0, "step": 48050 }, { "entropy": 0.2934316132217646, "epoch": 0.7398128532092615, "grad_norm": 0.7273001670837402, "learning_rate": 1.871904629539848e-05, "loss": 0.2944, "mean_token_accuracy": 0.9104494251310825, "num_tokens": 185397458.0, "step": 48060 }, { "entropy": 0.2551352414302528, "epoch": 0.7399667884679401, "grad_norm": 0.5471970438957214, "learning_rate": 1.8718169096445856e-05, "loss": 0.2632, "mean_token_accuracy": 0.9208303950726986, "num_tokens": 185464123.0, "step": 48070 }, { "entropy": 0.27948117200285194, "epoch": 0.7401207237266186, "grad_norm": 0.6664542555809021, "learning_rate": 1.871729161781037e-05, "loss": 0.2907, "mean_token_accuracy": 0.9131044946610928, "num_tokens": 185529586.0, "step": 48080 }, { "entropy": 0.2846075266599655, "epoch": 0.7402746589852972, "grad_norm": 0.5228456258773804, "learning_rate": 1.8716413859520176e-05, "loss": 0.2915, "mean_token_accuracy": 0.9108577206730842, "num_tokens": 185596308.0, "step": 48090 }, { "entropy": 0.2789336103014648, "epoch": 0.7404285942439759, "grad_norm": 0.5275773406028748, "learning_rate": 1.871553582160343e-05, "loss": 0.2721, "mean_token_accuracy": 0.9129897147417069, "num_tokens": 185667078.0, "step": 48100 }, { "entropy": 0.2691918319091201, "epoch": 0.7405825295026545, "grad_norm": 0.47267502546310425, "learning_rate": 1.8714657504088295e-05, "loss": 0.2727, "mean_token_accuracy": 0.9140137121081352, "num_tokens": 185727614.0, "step": 48110 }, { "entropy": 0.26269833287224176, "epoch": 0.740736464761333, "grad_norm": 0.8235490918159485, "learning_rate": 1.8713778907002955e-05, "loss": 0.2759, "mean_token_accuracy": 0.9194856084883213, "num_tokens": 185786426.0, "step": 48120 }, { "entropy": 0.2972516412846744, "epoch": 0.7408904000200116, "grad_norm": 0.6360589861869812, "learning_rate": 1.871290003037559e-05, "loss": 0.2908, "mean_token_accuracy": 0.9080623961985111, "num_tokens": 185847233.0, "step": 48130 }, { "entropy": 0.2810745959170163, "epoch": 0.7410443352786902, "grad_norm": 0.6155954599380493, "learning_rate": 1.8712020874234395e-05, "loss": 0.2814, "mean_token_accuracy": 0.9120764575898648, "num_tokens": 185905782.0, "step": 48140 }, { "entropy": 0.25955564798787234, "epoch": 0.7411982705373688, "grad_norm": 0.5438452959060669, "learning_rate": 1.8711141438607582e-05, "loss": 0.2763, "mean_token_accuracy": 0.919135895371437, "num_tokens": 185979273.0, "step": 48150 }, { "entropy": 0.28173026302829385, "epoch": 0.7413522057960473, "grad_norm": 0.6761582493782043, "learning_rate": 1.8710261723523356e-05, "loss": 0.2818, "mean_token_accuracy": 0.9118919186294079, "num_tokens": 186045368.0, "step": 48160 }, { "entropy": 0.2632252351380885, "epoch": 0.7415061410547259, "grad_norm": 0.558504581451416, "learning_rate": 1.870938172900994e-05, "loss": 0.2789, "mean_token_accuracy": 0.9177747085690499, "num_tokens": 186112771.0, "step": 48170 }, { "entropy": 0.29111264618113636, "epoch": 0.7416600763134045, "grad_norm": 0.623576819896698, "learning_rate": 1.8708501455095564e-05, "loss": 0.2841, "mean_token_accuracy": 0.9104017928242684, "num_tokens": 186184686.0, "step": 48180 }, { "entropy": 0.2740080956369638, "epoch": 0.741814011572083, "grad_norm": 0.4910455048084259, "learning_rate": 1.870762090180847e-05, "loss": 0.2971, "mean_token_accuracy": 0.9141025923192501, "num_tokens": 186260531.0, "step": 48190 }, { "entropy": 0.2792298963293433, "epoch": 0.7419679468307616, "grad_norm": 0.7835333943367004, "learning_rate": 1.8706740069176907e-05, "loss": 0.2878, "mean_token_accuracy": 0.9103448487818241, "num_tokens": 186326038.0, "step": 48200 }, { "entropy": 0.28055418785661457, "epoch": 0.7421218820894402, "grad_norm": 0.6190364360809326, "learning_rate": 1.870585895722913e-05, "loss": 0.2793, "mean_token_accuracy": 0.9159918121993542, "num_tokens": 186394773.0, "step": 48210 }, { "entropy": 0.27520265970379115, "epoch": 0.7422758173481188, "grad_norm": 0.5813986659049988, "learning_rate": 1.8704977565993404e-05, "loss": 0.2743, "mean_token_accuracy": 0.913424164801836, "num_tokens": 186464398.0, "step": 48220 }, { "entropy": 0.26036722091957926, "epoch": 0.7424297526067974, "grad_norm": 0.7285448908805847, "learning_rate": 1.870409589549801e-05, "loss": 0.2726, "mean_token_accuracy": 0.919470626115799, "num_tokens": 186531172.0, "step": 48230 }, { "entropy": 0.25141597222536805, "epoch": 0.742583687865476, "grad_norm": 0.5970339775085449, "learning_rate": 1.870321394577123e-05, "loss": 0.2599, "mean_token_accuracy": 0.9203479915857316, "num_tokens": 186590218.0, "step": 48240 }, { "entropy": 0.27314913980662825, "epoch": 0.7427376231241546, "grad_norm": 0.4509308338165283, "learning_rate": 1.870233171684135e-05, "loss": 0.2735, "mean_token_accuracy": 0.9161285623908043, "num_tokens": 186655027.0, "step": 48250 }, { "entropy": 0.2671280876733363, "epoch": 0.7428915583828332, "grad_norm": 0.7125024795532227, "learning_rate": 1.8701449208736685e-05, "loss": 0.2915, "mean_token_accuracy": 0.9166593603789807, "num_tokens": 186720229.0, "step": 48260 }, { "entropy": 0.23942155689001082, "epoch": 0.7430454936415117, "grad_norm": 0.6096566915512085, "learning_rate": 1.8700566421485538e-05, "loss": 0.245, "mean_token_accuracy": 0.9219053819775581, "num_tokens": 186788106.0, "step": 48270 }, { "entropy": 0.2682387506589293, "epoch": 0.7431994289001903, "grad_norm": 0.6391311287879944, "learning_rate": 1.8699683355116232e-05, "loss": 0.2839, "mean_token_accuracy": 0.9140532366931439, "num_tokens": 186851497.0, "step": 48280 }, { "entropy": 0.2894848256371915, "epoch": 0.7433533641588689, "grad_norm": 0.5115017294883728, "learning_rate": 1.8698800009657095e-05, "loss": 0.2866, "mean_token_accuracy": 0.9099365189671517, "num_tokens": 186924640.0, "step": 48290 }, { "entropy": 0.27854219246655704, "epoch": 0.7435072994175475, "grad_norm": 0.668972373008728, "learning_rate": 1.8697916385136462e-05, "loss": 0.2806, "mean_token_accuracy": 0.9129437409341336, "num_tokens": 186992673.0, "step": 48300 }, { "entropy": 0.27663529543206095, "epoch": 0.743661234676226, "grad_norm": 0.734952986240387, "learning_rate": 1.869703248158269e-05, "loss": 0.2746, "mean_token_accuracy": 0.9122163340449333, "num_tokens": 187064183.0, "step": 48310 }, { "entropy": 0.2749444700777531, "epoch": 0.7438151699349046, "grad_norm": 0.6360074281692505, "learning_rate": 1.8696148299024125e-05, "loss": 0.286, "mean_token_accuracy": 0.9136074215173722, "num_tokens": 187127682.0, "step": 48320 }, { "entropy": 0.27491642544046047, "epoch": 0.7439691051935832, "grad_norm": 0.5188555717468262, "learning_rate": 1.8695263837489138e-05, "loss": 0.2712, "mean_token_accuracy": 0.9166292726993561, "num_tokens": 187194239.0, "step": 48330 }, { "entropy": 0.2876333604566753, "epoch": 0.7441230404522617, "grad_norm": 0.6833227276802063, "learning_rate": 1.86943790970061e-05, "loss": 0.2836, "mean_token_accuracy": 0.9087807103991509, "num_tokens": 187267509.0, "step": 48340 }, { "entropy": 0.2707445981912315, "epoch": 0.7442769757109404, "grad_norm": 0.6702616214752197, "learning_rate": 1.869349407760339e-05, "loss": 0.2705, "mean_token_accuracy": 0.9156997665762902, "num_tokens": 187328746.0, "step": 48350 }, { "entropy": 0.2658798567019403, "epoch": 0.744430910969619, "grad_norm": 0.6786824464797974, "learning_rate": 1.869260877930941e-05, "loss": 0.2738, "mean_token_accuracy": 0.9164983026683331, "num_tokens": 187408698.0, "step": 48360 }, { "entropy": 0.2843466212972999, "epoch": 0.7445848462282976, "grad_norm": 0.5614883303642273, "learning_rate": 1.8691723202152553e-05, "loss": 0.2882, "mean_token_accuracy": 0.9108877636492252, "num_tokens": 187482281.0, "step": 48370 }, { "entropy": 0.26623541554436086, "epoch": 0.7447387814869761, "grad_norm": 0.5268957614898682, "learning_rate": 1.8690837346161232e-05, "loss": 0.2686, "mean_token_accuracy": 0.9166571453213692, "num_tokens": 187545711.0, "step": 48380 }, { "entropy": 0.2710176163353026, "epoch": 0.7448927167456547, "grad_norm": 0.726271390914917, "learning_rate": 1.8689951211363863e-05, "loss": 0.2742, "mean_token_accuracy": 0.9156313940882683, "num_tokens": 187612289.0, "step": 48390 }, { "entropy": 0.2886076156049967, "epoch": 0.7450466520043333, "grad_norm": 0.6544974446296692, "learning_rate": 1.868906479778888e-05, "loss": 0.2919, "mean_token_accuracy": 0.9111862815916538, "num_tokens": 187676128.0, "step": 48400 }, { "entropy": 0.27217430593445896, "epoch": 0.7452005872630119, "grad_norm": 0.6851034164428711, "learning_rate": 1.868817810546471e-05, "loss": 0.2707, "mean_token_accuracy": 0.9149783968925476, "num_tokens": 187748608.0, "step": 48410 }, { "entropy": 0.26895329626277087, "epoch": 0.7453545225216904, "grad_norm": 0.6355152130126953, "learning_rate": 1.86872911344198e-05, "loss": 0.2677, "mean_token_accuracy": 0.9153853721916676, "num_tokens": 187814892.0, "step": 48420 }, { "entropy": 0.289797545876354, "epoch": 0.745508457780369, "grad_norm": 0.564807116985321, "learning_rate": 1.8686403884682612e-05, "loss": 0.2844, "mean_token_accuracy": 0.9098317086696625, "num_tokens": 187888855.0, "step": 48430 }, { "entropy": 0.2722950043156743, "epoch": 0.7456623930390476, "grad_norm": 0.5749436616897583, "learning_rate": 1.8685516356281607e-05, "loss": 0.2721, "mean_token_accuracy": 0.9157579496502877, "num_tokens": 187950219.0, "step": 48440 }, { "entropy": 0.26083004036918284, "epoch": 0.7458163282977261, "grad_norm": 0.6724530458450317, "learning_rate": 1.868462854924525e-05, "loss": 0.2682, "mean_token_accuracy": 0.9174384228885174, "num_tokens": 188019347.0, "step": 48450 }, { "entropy": 0.2971156505867839, "epoch": 0.7459702635564047, "grad_norm": 0.6387141942977905, "learning_rate": 1.868374046360203e-05, "loss": 0.2963, "mean_token_accuracy": 0.9064780794084072, "num_tokens": 188089001.0, "step": 48460 }, { "entropy": 0.26455963887274264, "epoch": 0.7461241988150833, "grad_norm": 0.618766188621521, "learning_rate": 1.8682852099380437e-05, "loss": 0.2696, "mean_token_accuracy": 0.9159719869494438, "num_tokens": 188148241.0, "step": 48470 }, { "entropy": 0.2875300687737763, "epoch": 0.746278134073762, "grad_norm": 0.5617566108703613, "learning_rate": 1.8681963456608964e-05, "loss": 0.2783, "mean_token_accuracy": 0.9105733945965767, "num_tokens": 188218531.0, "step": 48480 }, { "entropy": 0.277177807316184, "epoch": 0.7464320693324406, "grad_norm": 0.5320027470588684, "learning_rate": 1.8681074535316127e-05, "loss": 0.2846, "mean_token_accuracy": 0.9159172020852566, "num_tokens": 188282774.0, "step": 48490 }, { "entropy": 0.260828154720366, "epoch": 0.7465860045911191, "grad_norm": 0.638350248336792, "learning_rate": 1.8680185335530435e-05, "loss": 0.2633, "mean_token_accuracy": 0.9181330099701881, "num_tokens": 188342931.0, "step": 48500 }, { "entropy": 0.28999262768775225, "epoch": 0.7467399398497977, "grad_norm": 0.6816715002059937, "learning_rate": 1.867929585728042e-05, "loss": 0.3073, "mean_token_accuracy": 0.9094244912266731, "num_tokens": 188408878.0, "step": 48510 }, { "entropy": 0.2712237244471908, "epoch": 0.7468938751084763, "grad_norm": 0.577602744102478, "learning_rate": 1.8678406100594616e-05, "loss": 0.2822, "mean_token_accuracy": 0.9156387507915497, "num_tokens": 188467649.0, "step": 48520 }, { "entropy": 0.28388122618198397, "epoch": 0.7470478103671548, "grad_norm": 0.593203067779541, "learning_rate": 1.8677516065501562e-05, "loss": 0.2889, "mean_token_accuracy": 0.911713682115078, "num_tokens": 188536409.0, "step": 48530 }, { "entropy": 0.28412556340917944, "epoch": 0.7472017456258334, "grad_norm": 0.5322335362434387, "learning_rate": 1.8676625752029815e-05, "loss": 0.282, "mean_token_accuracy": 0.9131880454719067, "num_tokens": 188608280.0, "step": 48540 }, { "entropy": 0.24946870235726237, "epoch": 0.747355680884512, "grad_norm": 0.5276674032211304, "learning_rate": 1.867573516020794e-05, "loss": 0.2551, "mean_token_accuracy": 0.9205176144838333, "num_tokens": 188671082.0, "step": 48550 }, { "entropy": 0.28573831766843794, "epoch": 0.7475096161431906, "grad_norm": 0.6253941059112549, "learning_rate": 1.86748442900645e-05, "loss": 0.2942, "mean_token_accuracy": 0.9103261396288872, "num_tokens": 188731545.0, "step": 48560 }, { "entropy": 0.2636290847323835, "epoch": 0.7476635514018691, "grad_norm": 0.5562434792518616, "learning_rate": 1.867395314162808e-05, "loss": 0.2655, "mean_token_accuracy": 0.9173695757985115, "num_tokens": 188799298.0, "step": 48570 }, { "entropy": 0.2582477011717856, "epoch": 0.7478174866605477, "grad_norm": 0.7148557305335999, "learning_rate": 1.8673061714927262e-05, "loss": 0.268, "mean_token_accuracy": 0.9189928822219372, "num_tokens": 188869107.0, "step": 48580 }, { "entropy": 0.28157183770090344, "epoch": 0.7479714219192263, "grad_norm": 0.5816941261291504, "learning_rate": 1.8672170009990654e-05, "loss": 0.2942, "mean_token_accuracy": 0.9113032050430775, "num_tokens": 188938253.0, "step": 48590 }, { "entropy": 0.28026556558907034, "epoch": 0.7481253571779048, "grad_norm": 0.6841230988502502, "learning_rate": 1.8671278026846853e-05, "loss": 0.2845, "mean_token_accuracy": 0.912933275848627, "num_tokens": 189002372.0, "step": 48600 }, { "entropy": 0.2753694823011756, "epoch": 0.7482792924365835, "grad_norm": 0.5551029443740845, "learning_rate": 1.8670385765524477e-05, "loss": 0.2804, "mean_token_accuracy": 0.9148269392549991, "num_tokens": 189064086.0, "step": 48610 }, { "entropy": 0.2857323678210378, "epoch": 0.7484332276952621, "grad_norm": 0.7922680974006653, "learning_rate": 1.866949322605215e-05, "loss": 0.2929, "mean_token_accuracy": 0.9098080046474933, "num_tokens": 189132819.0, "step": 48620 }, { "entropy": 0.25975513458251953, "epoch": 0.7485871629539407, "grad_norm": 0.5527614951133728, "learning_rate": 1.8668600408458507e-05, "loss": 0.2654, "mean_token_accuracy": 0.9170138582587242, "num_tokens": 189197414.0, "step": 48630 }, { "entropy": 0.26801561173051597, "epoch": 0.7487410982126192, "grad_norm": 0.5407155156135559, "learning_rate": 1.866770731277219e-05, "loss": 0.2687, "mean_token_accuracy": 0.9176082991063594, "num_tokens": 189268684.0, "step": 48640 }, { "entropy": 0.2867979769594967, "epoch": 0.7488950334712978, "grad_norm": 0.5678980946540833, "learning_rate": 1.866681393902185e-05, "loss": 0.292, "mean_token_accuracy": 0.9109968952834606, "num_tokens": 189336138.0, "step": 48650 }, { "entropy": 0.27892850348725917, "epoch": 0.7490489687299764, "grad_norm": 0.5838626027107239, "learning_rate": 1.866592028723614e-05, "loss": 0.2905, "mean_token_accuracy": 0.9120869778096676, "num_tokens": 189405320.0, "step": 48660 }, { "entropy": 0.2593200056813657, "epoch": 0.749202903988655, "grad_norm": 0.5818244218826294, "learning_rate": 1.866502635744374e-05, "loss": 0.2655, "mean_token_accuracy": 0.9183834731578827, "num_tokens": 189465563.0, "step": 48670 }, { "entropy": 0.27032328974455594, "epoch": 0.7493568392473335, "grad_norm": 0.588126003742218, "learning_rate": 1.8664132149673318e-05, "loss": 0.2735, "mean_token_accuracy": 0.9136845678091049, "num_tokens": 189529273.0, "step": 48680 }, { "entropy": 0.2866950330324471, "epoch": 0.7495107745060121, "grad_norm": 0.6590777039527893, "learning_rate": 1.8663237663953565e-05, "loss": 0.293, "mean_token_accuracy": 0.9097066581249237, "num_tokens": 189594765.0, "step": 48690 }, { "entropy": 0.28559542577713726, "epoch": 0.7496647097646907, "grad_norm": 0.5299332141876221, "learning_rate": 1.8662342900313177e-05, "loss": 0.2805, "mean_token_accuracy": 0.9132518313825131, "num_tokens": 189657769.0, "step": 48700 }, { "entropy": 0.26301196999847887, "epoch": 0.7498186450233693, "grad_norm": 0.6203874945640564, "learning_rate": 1.8661447858780856e-05, "loss": 0.2806, "mean_token_accuracy": 0.9171637900173664, "num_tokens": 189725299.0, "step": 48710 }, { "entropy": 0.29656399600207806, "epoch": 0.7499725802820478, "grad_norm": 0.6860345005989075, "learning_rate": 1.866055253938532e-05, "loss": 0.2959, "mean_token_accuracy": 0.9081163108348846, "num_tokens": 189793836.0, "step": 48720 }, { "entropy": 0.28133940445259215, "epoch": 0.7501265155407265, "grad_norm": 0.8308819532394409, "learning_rate": 1.8659656942155286e-05, "loss": 0.2721, "mean_token_accuracy": 0.9134748980402947, "num_tokens": 189860262.0, "step": 48730 }, { "entropy": 0.2824593440629542, "epoch": 0.7502804507994051, "grad_norm": 0.7287125587463379, "learning_rate": 1.8658761067119485e-05, "loss": 0.2812, "mean_token_accuracy": 0.9123995646834373, "num_tokens": 189915548.0, "step": 48740 }, { "entropy": 0.27860473105683925, "epoch": 0.7504343860580837, "grad_norm": 0.6032159328460693, "learning_rate": 1.865786491430666e-05, "loss": 0.2886, "mean_token_accuracy": 0.9129670619964599, "num_tokens": 189979116.0, "step": 48750 }, { "entropy": 0.27438422152772546, "epoch": 0.7505883213167622, "grad_norm": 0.5978263020515442, "learning_rate": 1.865696848374556e-05, "loss": 0.2713, "mean_token_accuracy": 0.9160892330110073, "num_tokens": 190046582.0, "step": 48760 }, { "entropy": 0.2588395361788571, "epoch": 0.7507422565754408, "grad_norm": 0.6772012710571289, "learning_rate": 1.8656071775464945e-05, "loss": 0.2632, "mean_token_accuracy": 0.9219339251518249, "num_tokens": 190105641.0, "step": 48770 }, { "entropy": 0.2724211739376187, "epoch": 0.7508961918341194, "grad_norm": 0.5608686208724976, "learning_rate": 1.8655174789493575e-05, "loss": 0.2854, "mean_token_accuracy": 0.9123087629675866, "num_tokens": 190169842.0, "step": 48780 }, { "entropy": 0.28275170773267744, "epoch": 0.751050127092798, "grad_norm": 0.6025518774986267, "learning_rate": 1.8654277525860228e-05, "loss": 0.2836, "mean_token_accuracy": 0.9123859651386738, "num_tokens": 190240890.0, "step": 48790 }, { "entropy": 0.2678770240396261, "epoch": 0.7512040623514765, "grad_norm": 0.7427533864974976, "learning_rate": 1.8653379984593696e-05, "loss": 0.2759, "mean_token_accuracy": 0.9162223786115646, "num_tokens": 190301879.0, "step": 48800 }, { "entropy": 0.28138768961653116, "epoch": 0.7513579976101551, "grad_norm": 0.561643660068512, "learning_rate": 1.8652482165722764e-05, "loss": 0.2939, "mean_token_accuracy": 0.9111680805683136, "num_tokens": 190370524.0, "step": 48810 }, { "entropy": 0.2697126128710806, "epoch": 0.7515119328688337, "grad_norm": 0.5905852913856506, "learning_rate": 1.865158406927624e-05, "loss": 0.2724, "mean_token_accuracy": 0.9162194468080997, "num_tokens": 190435120.0, "step": 48820 }, { "entropy": 0.27567188153043387, "epoch": 0.7516658681275122, "grad_norm": 0.7481628656387329, "learning_rate": 1.8650685695282925e-05, "loss": 0.2859, "mean_token_accuracy": 0.9135924309492112, "num_tokens": 190493109.0, "step": 48830 }, { "entropy": 0.28659549159929154, "epoch": 0.7518198033861908, "grad_norm": 0.681229293346405, "learning_rate": 1.8649787043771654e-05, "loss": 0.2946, "mean_token_accuracy": 0.9123890943825245, "num_tokens": 190553806.0, "step": 48840 }, { "entropy": 0.26374933058395983, "epoch": 0.7519737386448694, "grad_norm": 0.77545166015625, "learning_rate": 1.8648888114771247e-05, "loss": 0.2575, "mean_token_accuracy": 0.9195030279457569, "num_tokens": 190611570.0, "step": 48850 }, { "entropy": 0.2625838523730636, "epoch": 0.7521276739035481, "grad_norm": 0.6162158250808716, "learning_rate": 1.8647988908310545e-05, "loss": 0.277, "mean_token_accuracy": 0.9180337995290756, "num_tokens": 190680466.0, "step": 48860 }, { "entropy": 0.2536844119429588, "epoch": 0.7522816091622266, "grad_norm": 0.6807559132575989, "learning_rate": 1.864708942441839e-05, "loss": 0.2744, "mean_token_accuracy": 0.9181945033371448, "num_tokens": 190747802.0, "step": 48870 }, { "entropy": 0.2597523059695959, "epoch": 0.7524355444209052, "grad_norm": 0.571708619594574, "learning_rate": 1.8646189663123647e-05, "loss": 0.2742, "mean_token_accuracy": 0.9182249039411545, "num_tokens": 190812729.0, "step": 48880 }, { "entropy": 0.2631407589651644, "epoch": 0.7525894796795838, "grad_norm": 0.6411423683166504, "learning_rate": 1.8645289624455175e-05, "loss": 0.2635, "mean_token_accuracy": 0.9208660446107387, "num_tokens": 190875437.0, "step": 48890 }, { "entropy": 0.2784984494559467, "epoch": 0.7527434149382624, "grad_norm": 0.5802655816078186, "learning_rate": 1.8644389308441847e-05, "loss": 0.2807, "mean_token_accuracy": 0.9124461360275745, "num_tokens": 190948334.0, "step": 48900 }, { "entropy": 0.262020976562053, "epoch": 0.7528973501969409, "grad_norm": 0.47822999954223633, "learning_rate": 1.8643488715112545e-05, "loss": 0.2773, "mean_token_accuracy": 0.9168849058449269, "num_tokens": 191016663.0, "step": 48910 }, { "entropy": 0.28479773830622435, "epoch": 0.7530512854556195, "grad_norm": 0.5824897289276123, "learning_rate": 1.8642587844496166e-05, "loss": 0.2998, "mean_token_accuracy": 0.9113501965999603, "num_tokens": 191081177.0, "step": 48920 }, { "entropy": 0.269211677275598, "epoch": 0.7532052207142981, "grad_norm": 0.5375756025314331, "learning_rate": 1.8641686696621603e-05, "loss": 0.2691, "mean_token_accuracy": 0.9173179812729358, "num_tokens": 191150494.0, "step": 48930 }, { "entropy": 0.26760396314784884, "epoch": 0.7533591559729766, "grad_norm": 0.5881125926971436, "learning_rate": 1.864078527151777e-05, "loss": 0.2796, "mean_token_accuracy": 0.9164474457502365, "num_tokens": 191215783.0, "step": 48940 }, { "entropy": 0.2536557856015861, "epoch": 0.7535130912316552, "grad_norm": 0.5839555859565735, "learning_rate": 1.863988356921359e-05, "loss": 0.2605, "mean_token_accuracy": 0.9222228787839413, "num_tokens": 191280455.0, "step": 48950 }, { "entropy": 0.26181323267519474, "epoch": 0.7536670264903338, "grad_norm": 0.6368776559829712, "learning_rate": 1.8638981589737976e-05, "loss": 0.2735, "mean_token_accuracy": 0.9153805851936341, "num_tokens": 191345370.0, "step": 48960 }, { "entropy": 0.28218920771032574, "epoch": 0.7538209617490124, "grad_norm": 0.49594372510910034, "learning_rate": 1.8638079333119874e-05, "loss": 0.2849, "mean_token_accuracy": 0.912363800406456, "num_tokens": 191413045.0, "step": 48970 }, { "entropy": 0.2817457351833582, "epoch": 0.7539748970076909, "grad_norm": 0.5857717990875244, "learning_rate": 1.8637176799388228e-05, "loss": 0.273, "mean_token_accuracy": 0.9113378949463368, "num_tokens": 191485509.0, "step": 48980 }, { "entropy": 0.2781449044123292, "epoch": 0.7541288322663696, "grad_norm": 0.6135105490684509, "learning_rate": 1.8636273988571993e-05, "loss": 0.2971, "mean_token_accuracy": 0.912710078805685, "num_tokens": 191543954.0, "step": 48990 }, { "entropy": 0.270864678081125, "epoch": 0.7542827675250482, "grad_norm": 0.686004102230072, "learning_rate": 1.8635370900700124e-05, "loss": 0.2724, "mean_token_accuracy": 0.9168320015072823, "num_tokens": 191631726.0, "step": 49000 }, { "entropy": 0.2952373606152833, "epoch": 0.7544367027837268, "grad_norm": 0.6633973121643066, "learning_rate": 1.86344675358016e-05, "loss": 0.2822, "mean_token_accuracy": 0.9097517907619477, "num_tokens": 191703732.0, "step": 49010 }, { "entropy": 0.2655109147541225, "epoch": 0.7545906380424053, "grad_norm": 0.6329435110092163, "learning_rate": 1.8633563893905395e-05, "loss": 0.2666, "mean_token_accuracy": 0.9143915086984634, "num_tokens": 191766515.0, "step": 49020 }, { "entropy": 0.27614907873794436, "epoch": 0.7547445733010839, "grad_norm": 0.6225886344909668, "learning_rate": 1.8632659975040506e-05, "loss": 0.2816, "mean_token_accuracy": 0.9127144515514374, "num_tokens": 191826668.0, "step": 49030 }, { "entropy": 0.2779842847958207, "epoch": 0.7548985085597625, "grad_norm": 0.6893705129623413, "learning_rate": 1.8631755779235926e-05, "loss": 0.2911, "mean_token_accuracy": 0.9116652823984623, "num_tokens": 191892275.0, "step": 49040 }, { "entropy": 0.27717255991883577, "epoch": 0.755052443818441, "grad_norm": 0.6309623718261719, "learning_rate": 1.8630851306520665e-05, "loss": 0.2769, "mean_token_accuracy": 0.9151891075074673, "num_tokens": 191952188.0, "step": 49050 }, { "entropy": 0.26609683940187095, "epoch": 0.7552063790771196, "grad_norm": 1.0220850706100464, "learning_rate": 1.8629946556923738e-05, "loss": 0.2695, "mean_token_accuracy": 0.9171098574995995, "num_tokens": 192027943.0, "step": 49060 }, { "entropy": 0.28441100744530556, "epoch": 0.7553603143357982, "grad_norm": 0.70188307762146, "learning_rate": 1.8629041530474166e-05, "loss": 0.2854, "mean_token_accuracy": 0.9116407424211502, "num_tokens": 192082166.0, "step": 49070 }, { "entropy": 0.27166938753798603, "epoch": 0.7555142495944768, "grad_norm": 0.590408205986023, "learning_rate": 1.862813622720099e-05, "loss": 0.2668, "mean_token_accuracy": 0.915519443154335, "num_tokens": 192146127.0, "step": 49080 }, { "entropy": 0.27244580127298834, "epoch": 0.7556681848531553, "grad_norm": 0.719247043132782, "learning_rate": 1.8627230647133242e-05, "loss": 0.2783, "mean_token_accuracy": 0.9153725631535053, "num_tokens": 192214302.0, "step": 49090 }, { "entropy": 0.27674592901021244, "epoch": 0.7558221201118339, "grad_norm": 0.5984766483306885, "learning_rate": 1.8626324790299982e-05, "loss": 0.2823, "mean_token_accuracy": 0.9139218829572201, "num_tokens": 192278575.0, "step": 49100 }, { "entropy": 0.2586815059185028, "epoch": 0.7559760553705126, "grad_norm": 0.6425195932388306, "learning_rate": 1.862541865673027e-05, "loss": 0.2672, "mean_token_accuracy": 0.9196793980896473, "num_tokens": 192343923.0, "step": 49110 }, { "entropy": 0.2620922437869012, "epoch": 0.7561299906291912, "grad_norm": 0.6469586491584778, "learning_rate": 1.862451224645317e-05, "loss": 0.2875, "mean_token_accuracy": 0.9176808267831802, "num_tokens": 192412046.0, "step": 49120 }, { "entropy": 0.27352765910327437, "epoch": 0.7562839258878697, "grad_norm": 0.7226448655128479, "learning_rate": 1.8623605559497763e-05, "loss": 0.2791, "mean_token_accuracy": 0.9145478680729866, "num_tokens": 192477972.0, "step": 49130 }, { "entropy": 0.25486066974699495, "epoch": 0.7564378611465483, "grad_norm": 0.5128594636917114, "learning_rate": 1.862269859589314e-05, "loss": 0.272, "mean_token_accuracy": 0.9207976676523686, "num_tokens": 192551688.0, "step": 49140 }, { "entropy": 0.27588850203901527, "epoch": 0.7565917964052269, "grad_norm": 0.562751829624176, "learning_rate": 1.862179135566839e-05, "loss": 0.2855, "mean_token_accuracy": 0.913342471420765, "num_tokens": 192612997.0, "step": 49150 }, { "entropy": 0.2908229285851121, "epoch": 0.7567457316639055, "grad_norm": 0.6271775364875793, "learning_rate": 1.8620883838852618e-05, "loss": 0.2834, "mean_token_accuracy": 0.9114102214574814, "num_tokens": 192677905.0, "step": 49160 }, { "entropy": 0.2753538368269801, "epoch": 0.756899666922584, "grad_norm": 0.6144720911979675, "learning_rate": 1.8619976045474945e-05, "loss": 0.2695, "mean_token_accuracy": 0.9142581544816494, "num_tokens": 192747974.0, "step": 49170 }, { "entropy": 0.27920931223779916, "epoch": 0.7570536021812626, "grad_norm": 0.4994702935218811, "learning_rate": 1.8619067975564487e-05, "loss": 0.2849, "mean_token_accuracy": 0.9137516900897026, "num_tokens": 192816809.0, "step": 49180 }, { "entropy": 0.2760936850681901, "epoch": 0.7572075374399412, "grad_norm": 0.5737549066543579, "learning_rate": 1.8618159629150376e-05, "loss": 0.2774, "mean_token_accuracy": 0.9122656211256981, "num_tokens": 192873062.0, "step": 49190 }, { "entropy": 0.2665581122040749, "epoch": 0.7573614726986198, "grad_norm": 0.691432535648346, "learning_rate": 1.8617251006261753e-05, "loss": 0.2693, "mean_token_accuracy": 0.916641490906477, "num_tokens": 192938812.0, "step": 49200 }, { "entropy": 0.2614286823198199, "epoch": 0.7575154079572983, "grad_norm": 0.6020224094390869, "learning_rate": 1.8616342106927767e-05, "loss": 0.2563, "mean_token_accuracy": 0.918587452173233, "num_tokens": 193006356.0, "step": 49210 }, { "entropy": 0.27609982173889874, "epoch": 0.7576693432159769, "grad_norm": 0.6188429594039917, "learning_rate": 1.861543293117758e-05, "loss": 0.2986, "mean_token_accuracy": 0.9098023056983948, "num_tokens": 193071892.0, "step": 49220 }, { "entropy": 0.27451254688203336, "epoch": 0.7578232784746555, "grad_norm": 0.6620258092880249, "learning_rate": 1.861452347904035e-05, "loss": 0.2878, "mean_token_accuracy": 0.9129255920648575, "num_tokens": 193129528.0, "step": 49230 }, { "entropy": 0.27649910748004913, "epoch": 0.7579772137333342, "grad_norm": 0.6543771028518677, "learning_rate": 1.8613613750545256e-05, "loss": 0.2788, "mean_token_accuracy": 0.9139558203518391, "num_tokens": 193196181.0, "step": 49240 }, { "entropy": 0.2674621583893895, "epoch": 0.7581311489920127, "grad_norm": 0.6457260251045227, "learning_rate": 1.8612703745721488e-05, "loss": 0.2641, "mean_token_accuracy": 0.9175738424062729, "num_tokens": 193259881.0, "step": 49250 }, { "entropy": 0.2807484150864184, "epoch": 0.7582850842506913, "grad_norm": 0.7036094069480896, "learning_rate": 1.861179346459823e-05, "loss": 0.2788, "mean_token_accuracy": 0.9124410457909107, "num_tokens": 193325875.0, "step": 49260 }, { "entropy": 0.2794319996610284, "epoch": 0.7584390195093699, "grad_norm": 0.5256134867668152, "learning_rate": 1.8610882907204696e-05, "loss": 0.2731, "mean_token_accuracy": 0.9144115306437015, "num_tokens": 193392011.0, "step": 49270 }, { "entropy": 0.25769885117188096, "epoch": 0.7585929547680484, "grad_norm": 0.5971353650093079, "learning_rate": 1.860997207357009e-05, "loss": 0.2592, "mean_token_accuracy": 0.9189709976315499, "num_tokens": 193455863.0, "step": 49280 }, { "entropy": 0.2634177059866488, "epoch": 0.758746890026727, "grad_norm": 0.6145702600479126, "learning_rate": 1.8609060963723626e-05, "loss": 0.263, "mean_token_accuracy": 0.9151388138532639, "num_tokens": 193520036.0, "step": 49290 }, { "entropy": 0.28289608089253304, "epoch": 0.7589008252854056, "grad_norm": 0.5393322110176086, "learning_rate": 1.8608149577694545e-05, "loss": 0.2852, "mean_token_accuracy": 0.9126356102526187, "num_tokens": 193590202.0, "step": 49300 }, { "entropy": 0.28544404190033673, "epoch": 0.7590547605440842, "grad_norm": 0.7005900740623474, "learning_rate": 1.8607237915512076e-05, "loss": 0.2827, "mean_token_accuracy": 0.9101469598710537, "num_tokens": 193655755.0, "step": 49310 }, { "entropy": 0.27038397481665016, "epoch": 0.7592086958027627, "grad_norm": 0.6697724461555481, "learning_rate": 1.8606325977205472e-05, "loss": 0.2798, "mean_token_accuracy": 0.9157718196511269, "num_tokens": 193720421.0, "step": 49320 }, { "entropy": 0.2971194120123982, "epoch": 0.7593626310614413, "grad_norm": 0.5083634257316589, "learning_rate": 1.860541376280398e-05, "loss": 0.2977, "mean_token_accuracy": 0.907415259629488, "num_tokens": 193787823.0, "step": 49330 }, { "entropy": 0.2796894909814, "epoch": 0.7595165663201199, "grad_norm": 0.5101590752601624, "learning_rate": 1.8604501272336875e-05, "loss": 0.2777, "mean_token_accuracy": 0.9124010309576989, "num_tokens": 193859218.0, "step": 49340 }, { "entropy": 0.25934944907203317, "epoch": 0.7596705015787985, "grad_norm": 0.7648314237594604, "learning_rate": 1.8603588505833422e-05, "loss": 0.269, "mean_token_accuracy": 0.9183264397084713, "num_tokens": 193926243.0, "step": 49350 }, { "entropy": 0.2572488606907427, "epoch": 0.759824436837477, "grad_norm": 0.7406197786331177, "learning_rate": 1.8602675463322904e-05, "loss": 0.2697, "mean_token_accuracy": 0.9199385859072209, "num_tokens": 193997833.0, "step": 49360 }, { "entropy": 0.25826434828341005, "epoch": 0.7599783720961557, "grad_norm": 0.5334420800209045, "learning_rate": 1.8601762144834616e-05, "loss": 0.2704, "mean_token_accuracy": 0.9203065410256386, "num_tokens": 194056978.0, "step": 49370 }, { "entropy": 0.28615356190130115, "epoch": 0.7601323073548343, "grad_norm": 0.6445715427398682, "learning_rate": 1.860084855039785e-05, "loss": 0.2774, "mean_token_accuracy": 0.9123332910239697, "num_tokens": 194122711.0, "step": 49380 }, { "entropy": 0.2738765560090542, "epoch": 0.7602862426135129, "grad_norm": 0.548578143119812, "learning_rate": 1.8599934680041924e-05, "loss": 0.2823, "mean_token_accuracy": 0.9149070337414742, "num_tokens": 194195520.0, "step": 49390 }, { "entropy": 0.29467150047421453, "epoch": 0.7604401778721914, "grad_norm": 0.5320222973823547, "learning_rate": 1.859902053379615e-05, "loss": 0.2853, "mean_token_accuracy": 0.9103438757359982, "num_tokens": 194256719.0, "step": 49400 }, { "entropy": 0.27025031140074135, "epoch": 0.76059411313087, "grad_norm": 0.616554856300354, "learning_rate": 1.8598106111689853e-05, "loss": 0.2792, "mean_token_accuracy": 0.9156125888228417, "num_tokens": 194325273.0, "step": 49410 }, { "entropy": 0.27811755603179333, "epoch": 0.7607480483895486, "grad_norm": 0.8862802982330322, "learning_rate": 1.859719141375237e-05, "loss": 0.281, "mean_token_accuracy": 0.9142454355955124, "num_tokens": 194384690.0, "step": 49420 }, { "entropy": 0.2634008567780256, "epoch": 0.7609019836482271, "grad_norm": 0.625795304775238, "learning_rate": 1.8596276440013045e-05, "loss": 0.272, "mean_token_accuracy": 0.9156042084097862, "num_tokens": 194459924.0, "step": 49430 }, { "entropy": 0.2893853522837162, "epoch": 0.7610559189069057, "grad_norm": 0.6873787045478821, "learning_rate": 1.859536119050123e-05, "loss": 0.2823, "mean_token_accuracy": 0.9113217242062092, "num_tokens": 194524885.0, "step": 49440 }, { "entropy": 0.2633634306490421, "epoch": 0.7612098541655843, "grad_norm": 0.5099371075630188, "learning_rate": 1.859444566524629e-05, "loss": 0.265, "mean_token_accuracy": 0.916824734210968, "num_tokens": 194594758.0, "step": 49450 }, { "entropy": 0.25624507521279155, "epoch": 0.7613637894242629, "grad_norm": 0.5358495712280273, "learning_rate": 1.8593529864277593e-05, "loss": 0.2688, "mean_token_accuracy": 0.9185904674232006, "num_tokens": 194658488.0, "step": 49460 }, { "entropy": 0.2756058066152036, "epoch": 0.7615177246829414, "grad_norm": 0.7465378642082214, "learning_rate": 1.8592613787624517e-05, "loss": 0.2923, "mean_token_accuracy": 0.9123845018446446, "num_tokens": 194731053.0, "step": 49470 }, { "entropy": 0.29264928717166183, "epoch": 0.76167165994162, "grad_norm": 0.6301212310791016, "learning_rate": 1.859169743531645e-05, "loss": 0.2983, "mean_token_accuracy": 0.9105756632983685, "num_tokens": 194802935.0, "step": 49480 }, { "entropy": 0.28882662411779164, "epoch": 0.7618255952002987, "grad_norm": 0.5583731532096863, "learning_rate": 1.859078080738279e-05, "loss": 0.2936, "mean_token_accuracy": 0.9112548306584358, "num_tokens": 194874028.0, "step": 49490 }, { "entropy": 0.30297104846686124, "epoch": 0.7619795304589773, "grad_norm": 0.5507969260215759, "learning_rate": 1.8589863903852946e-05, "loss": 0.295, "mean_token_accuracy": 0.9057094968855381, "num_tokens": 194947935.0, "step": 49500 }, { "entropy": 0.2714772030711174, "epoch": 0.7621334657176558, "grad_norm": 0.6723789572715759, "learning_rate": 1.8588946724756328e-05, "loss": 0.282, "mean_token_accuracy": 0.9145022355020046, "num_tokens": 195011704.0, "step": 49510 }, { "entropy": 0.26827310789376496, "epoch": 0.7622874009763344, "grad_norm": 0.5702575445175171, "learning_rate": 1.8588029270122363e-05, "loss": 0.2741, "mean_token_accuracy": 0.9150992915034294, "num_tokens": 195080402.0, "step": 49520 }, { "entropy": 0.2791680432856083, "epoch": 0.762441336235013, "grad_norm": 0.5610247254371643, "learning_rate": 1.8587111539980482e-05, "loss": 0.2868, "mean_token_accuracy": 0.9149254634976387, "num_tokens": 195143912.0, "step": 49530 }, { "entropy": 0.27927063442766664, "epoch": 0.7625952714936916, "grad_norm": 0.5187149047851562, "learning_rate": 1.8586193534360122e-05, "loss": 0.288, "mean_token_accuracy": 0.913219191133976, "num_tokens": 195206108.0, "step": 49540 }, { "entropy": 0.26619253009557725, "epoch": 0.7627492067523701, "grad_norm": 0.4565163254737854, "learning_rate": 1.8585275253290744e-05, "loss": 0.2641, "mean_token_accuracy": 0.9171346016228199, "num_tokens": 195282014.0, "step": 49550 }, { "entropy": 0.2751891601830721, "epoch": 0.7629031420110487, "grad_norm": 0.6287184357643127, "learning_rate": 1.8584356696801792e-05, "loss": 0.2931, "mean_token_accuracy": 0.9148409351706505, "num_tokens": 195343852.0, "step": 49560 }, { "entropy": 0.2683402462862432, "epoch": 0.7630570772697273, "grad_norm": 0.8478397727012634, "learning_rate": 1.8583437864922747e-05, "loss": 0.2713, "mean_token_accuracy": 0.9171590469777584, "num_tokens": 195404962.0, "step": 49570 }, { "entropy": 0.2624936822801828, "epoch": 0.7632110125284058, "grad_norm": 0.5980342626571655, "learning_rate": 1.8582518757683077e-05, "loss": 0.2783, "mean_token_accuracy": 0.9165436640381813, "num_tokens": 195474718.0, "step": 49580 }, { "entropy": 0.26467273412272335, "epoch": 0.7633649477870844, "grad_norm": 0.6835181713104248, "learning_rate": 1.858159937511227e-05, "loss": 0.2737, "mean_token_accuracy": 0.9190254963934421, "num_tokens": 195534296.0, "step": 49590 }, { "entropy": 0.26832167655229566, "epoch": 0.763518883045763, "grad_norm": 0.5827786326408386, "learning_rate": 1.8580679717239825e-05, "loss": 0.2809, "mean_token_accuracy": 0.9166721887886524, "num_tokens": 195604023.0, "step": 49600 }, { "entropy": 0.2713644890114665, "epoch": 0.7636728183044416, "grad_norm": 0.6077384948730469, "learning_rate": 1.8579759784095234e-05, "loss": 0.2736, "mean_token_accuracy": 0.9149356126785279, "num_tokens": 195677452.0, "step": 49610 }, { "entropy": 0.2538875613361597, "epoch": 0.7638267535631202, "grad_norm": 0.8041209578514099, "learning_rate": 1.8578839575708022e-05, "loss": 0.2712, "mean_token_accuracy": 0.9208792515099049, "num_tokens": 195743148.0, "step": 49620 }, { "entropy": 0.28327081399038434, "epoch": 0.7639806888217988, "grad_norm": 0.6054731011390686, "learning_rate": 1.85779190921077e-05, "loss": 0.2871, "mean_token_accuracy": 0.9139562129974366, "num_tokens": 195802981.0, "step": 49630 }, { "entropy": 0.2722968259826303, "epoch": 0.7641346240804774, "grad_norm": 0.6021659970283508, "learning_rate": 1.8576998333323802e-05, "loss": 0.2867, "mean_token_accuracy": 0.9138127438724041, "num_tokens": 195874398.0, "step": 49640 }, { "entropy": 0.27292001601308585, "epoch": 0.764288559339156, "grad_norm": 0.5717055201530457, "learning_rate": 1.8576077299385866e-05, "loss": 0.2753, "mean_token_accuracy": 0.9171643868088722, "num_tokens": 195942973.0, "step": 49650 }, { "entropy": 0.26853071758523583, "epoch": 0.7644424945978345, "grad_norm": 0.6914904117584229, "learning_rate": 1.8575155990323436e-05, "loss": 0.2695, "mean_token_accuracy": 0.9165036387741565, "num_tokens": 196003636.0, "step": 49660 }, { "entropy": 0.2632883067242801, "epoch": 0.7645964298565131, "grad_norm": 0.6238709092140198, "learning_rate": 1.8574234406166073e-05, "loss": 0.2756, "mean_token_accuracy": 0.9161060757935047, "num_tokens": 196072827.0, "step": 49670 }, { "entropy": 0.28470040718093514, "epoch": 0.7647503651151917, "grad_norm": 0.668649435043335, "learning_rate": 1.857331254694334e-05, "loss": 0.2877, "mean_token_accuracy": 0.9091488391160965, "num_tokens": 196132354.0, "step": 49680 }, { "entropy": 0.26405077390372755, "epoch": 0.7649043003738702, "grad_norm": 0.7258462309837341, "learning_rate": 1.8572390412684807e-05, "loss": 0.2786, "mean_token_accuracy": 0.9183212213218213, "num_tokens": 196200095.0, "step": 49690 }, { "entropy": 0.2845470143482089, "epoch": 0.7650582356325488, "grad_norm": 0.583998441696167, "learning_rate": 1.857146800342006e-05, "loss": 0.2842, "mean_token_accuracy": 0.909665072709322, "num_tokens": 196267641.0, "step": 49700 }, { "entropy": 0.2669827172998339, "epoch": 0.7652121708912274, "grad_norm": 0.7737205624580383, "learning_rate": 1.857054531917869e-05, "loss": 0.2754, "mean_token_accuracy": 0.915758803486824, "num_tokens": 196321059.0, "step": 49710 }, { "entropy": 0.2648621862754226, "epoch": 0.765366106149906, "grad_norm": 0.6595352292060852, "learning_rate": 1.85696223599903e-05, "loss": 0.2637, "mean_token_accuracy": 0.9181392349302768, "num_tokens": 196386287.0, "step": 49720 }, { "entropy": 0.29196679731830955, "epoch": 0.7655200414085845, "grad_norm": 0.713299572467804, "learning_rate": 1.856869912588449e-05, "loss": 0.3055, "mean_token_accuracy": 0.9087884314358234, "num_tokens": 196449388.0, "step": 49730 }, { "entropy": 0.28708526492118835, "epoch": 0.7656739766672631, "grad_norm": 0.7304220795631409, "learning_rate": 1.8567775616890887e-05, "loss": 0.2961, "mean_token_accuracy": 0.9103723742067814, "num_tokens": 196521096.0, "step": 49740 }, { "entropy": 0.26686020120978354, "epoch": 0.7658279119259418, "grad_norm": 0.8337358832359314, "learning_rate": 1.8566851833039115e-05, "loss": 0.275, "mean_token_accuracy": 0.917206234484911, "num_tokens": 196584974.0, "step": 49750 }, { "entropy": 0.2672646151855588, "epoch": 0.7659818471846204, "grad_norm": 0.7283950448036194, "learning_rate": 1.8565927774358806e-05, "loss": 0.2711, "mean_token_accuracy": 0.9157659761607647, "num_tokens": 196654694.0, "step": 49760 }, { "entropy": 0.2596847268752754, "epoch": 0.7661357824432989, "grad_norm": 0.620305597782135, "learning_rate": 1.856500344087961e-05, "loss": 0.2633, "mean_token_accuracy": 0.9188693918287754, "num_tokens": 196720635.0, "step": 49770 }, { "entropy": 0.27100097481161356, "epoch": 0.7662897177019775, "grad_norm": 0.5156665444374084, "learning_rate": 1.8564078832631175e-05, "loss": 0.2774, "mean_token_accuracy": 0.9164222843945027, "num_tokens": 196789660.0, "step": 49780 }, { "entropy": 0.2914531282149255, "epoch": 0.7664436529606561, "grad_norm": 0.6683536767959595, "learning_rate": 1.8563153949643166e-05, "loss": 0.2896, "mean_token_accuracy": 0.9083258479833602, "num_tokens": 196855984.0, "step": 49790 }, { "entropy": 0.2503671714104712, "epoch": 0.7665975882193347, "grad_norm": 0.5404151082038879, "learning_rate": 1.856222879194525e-05, "loss": 0.2586, "mean_token_accuracy": 0.9231023244559765, "num_tokens": 196925699.0, "step": 49800 }, { "entropy": 0.26161311138421295, "epoch": 0.7667515234780132, "grad_norm": 0.5366895794868469, "learning_rate": 1.856130335956711e-05, "loss": 0.2708, "mean_token_accuracy": 0.9178942635655403, "num_tokens": 196986816.0, "step": 49810 }, { "entropy": 0.2764232291840017, "epoch": 0.7669054587366918, "grad_norm": 0.5089753866195679, "learning_rate": 1.8560377652538434e-05, "loss": 0.2833, "mean_token_accuracy": 0.9135750226676465, "num_tokens": 197060663.0, "step": 49820 }, { "entropy": 0.27942757550626995, "epoch": 0.7670593939953704, "grad_norm": 0.6364976167678833, "learning_rate": 1.855945167088892e-05, "loss": 0.2865, "mean_token_accuracy": 0.9140254721045494, "num_tokens": 197121125.0, "step": 49830 }, { "entropy": 0.2771745862439275, "epoch": 0.767213329254049, "grad_norm": 0.6708118319511414, "learning_rate": 1.8558525414648266e-05, "loss": 0.2847, "mean_token_accuracy": 0.9147635273635387, "num_tokens": 197173576.0, "step": 49840 }, { "entropy": 0.28340664114803077, "epoch": 0.7673672645127275, "grad_norm": 0.5410529971122742, "learning_rate": 1.8557598883846196e-05, "loss": 0.2785, "mean_token_accuracy": 0.9138721704483033, "num_tokens": 197237623.0, "step": 49850 }, { "entropy": 0.29218602254986764, "epoch": 0.7675211997714061, "grad_norm": 0.609720766544342, "learning_rate": 1.8556672078512434e-05, "loss": 0.2956, "mean_token_accuracy": 0.9111540049314499, "num_tokens": 197308836.0, "step": 49860 }, { "entropy": 0.26920866975560787, "epoch": 0.7676751350300848, "grad_norm": 0.5083591341972351, "learning_rate": 1.8555744998676707e-05, "loss": 0.2824, "mean_token_accuracy": 0.9161722511053085, "num_tokens": 197374975.0, "step": 49870 }, { "entropy": 0.2607306305319071, "epoch": 0.7678290702887633, "grad_norm": 0.6230519413948059, "learning_rate": 1.8554817644368758e-05, "loss": 0.272, "mean_token_accuracy": 0.917838940769434, "num_tokens": 197439726.0, "step": 49880 }, { "entropy": 0.25741773992776873, "epoch": 0.7679830055474419, "grad_norm": 0.5045331716537476, "learning_rate": 1.8553890015618333e-05, "loss": 0.2631, "mean_token_accuracy": 0.9215730883181095, "num_tokens": 197500844.0, "step": 49890 }, { "entropy": 0.25857863007113335, "epoch": 0.7681369408061205, "grad_norm": 0.4984235465526581, "learning_rate": 1.85529621124552e-05, "loss": 0.272, "mean_token_accuracy": 0.9195904411375523, "num_tokens": 197568244.0, "step": 49900 }, { "entropy": 0.25920819249004123, "epoch": 0.7682908760647991, "grad_norm": 0.5449269413948059, "learning_rate": 1.855203393490912e-05, "loss": 0.2698, "mean_token_accuracy": 0.9189957112073899, "num_tokens": 197636003.0, "step": 49910 }, { "entropy": 0.25323368115350603, "epoch": 0.7684448113234776, "grad_norm": 0.5942535996437073, "learning_rate": 1.855110548300987e-05, "loss": 0.265, "mean_token_accuracy": 0.9203741379082203, "num_tokens": 197702941.0, "step": 49920 }, { "entropy": 0.26768262395635245, "epoch": 0.7685987465821562, "grad_norm": 0.4953833222389221, "learning_rate": 1.855017675678723e-05, "loss": 0.2707, "mean_token_accuracy": 0.9151345059275627, "num_tokens": 197779901.0, "step": 49930 }, { "entropy": 0.2959285549819469, "epoch": 0.7687526818408348, "grad_norm": 0.6370549201965332, "learning_rate": 1.8549247756271006e-05, "loss": 0.2921, "mean_token_accuracy": 0.9079946972429752, "num_tokens": 197852948.0, "step": 49940 }, { "entropy": 0.2643545904196799, "epoch": 0.7689066170995134, "grad_norm": 0.49608996510505676, "learning_rate": 1.8548318481490996e-05, "loss": 0.2603, "mean_token_accuracy": 0.9172416642308235, "num_tokens": 197918440.0, "step": 49950 }, { "entropy": 0.27385900700464844, "epoch": 0.7690605523581919, "grad_norm": 0.6737620830535889, "learning_rate": 1.8547388932477005e-05, "loss": 0.2833, "mean_token_accuracy": 0.9129000954329968, "num_tokens": 197981583.0, "step": 49960 }, { "entropy": 0.26425964292138815, "epoch": 0.7692144876168705, "grad_norm": 0.6014845371246338, "learning_rate": 1.8546459109258862e-05, "loss": 0.2746, "mean_token_accuracy": 0.9169947966933251, "num_tokens": 198042600.0, "step": 49970 }, { "entropy": 0.25435372376814486, "epoch": 0.7693684228755491, "grad_norm": 0.636353611946106, "learning_rate": 1.8545529011866393e-05, "loss": 0.2632, "mean_token_accuracy": 0.9200123675167561, "num_tokens": 198108872.0, "step": 49980 }, { "entropy": 0.2683893068693578, "epoch": 0.7695223581342276, "grad_norm": 0.5959178805351257, "learning_rate": 1.8544598640329434e-05, "loss": 0.2644, "mean_token_accuracy": 0.9183045849204063, "num_tokens": 198171918.0, "step": 49990 }, { "entropy": 0.2880647495388985, "epoch": 0.7696762933929063, "grad_norm": 0.6051809191703796, "learning_rate": 1.8543667994677834e-05, "loss": 0.2905, "mean_token_accuracy": 0.9103722997009754, "num_tokens": 198234913.0, "step": 50000 }, { "epoch": 0.7696762933929063, "eval_entropy": 0.2754413199257024, "eval_loss": 0.27366578578948975, "eval_mean_token_accuracy": 0.9147083051436186, "eval_num_tokens": 198234913.0, "eval_runtime": 7808.4021, "eval_samples_per_second": 4.16, "eval_steps_per_second": 4.16, "step": 50000 } ], "logging_steps": 10, "max_steps": 194889, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.372115261179225e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }