diff --git "a/checkpoint-10000/trainer_state.json" "b/checkpoint-10000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-10000/trainer_state.json" @@ -0,0 +1,10056 @@ +{ + "best_global_step": 10000, + "best_metric": 0.16012564301490784, + "best_model_checkpoint": "/workspace/llm-storage/output/qwen-32B-V3/checkpoint-10000", + "epoch": 0.5801599791142408, + "eval_steps": 5000, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 0.6092591404914856, + "epoch": 0.0005801599791142408, + "grad_norm": 0.31482434272766113, + "learning_rate": 2.6102088167053363e-07, + "loss": 1.0497, + "mean_token_accuracy": 0.7648690357804299, + "num_tokens": 43143.0, + "step": 10 + }, + { + "entropy": 0.6265671212226153, + "epoch": 0.0011603199582284815, + "grad_norm": 0.38310864567756653, + "learning_rate": 5.510440835266822e-07, + "loss": 1.0354, + "mean_token_accuracy": 0.7595696784555912, + "num_tokens": 87593.0, + "step": 20 + }, + { + "entropy": 0.6312713149935008, + "epoch": 0.0017404799373427222, + "grad_norm": 0.37424585223197937, + "learning_rate": 8.410672853828306e-07, + "loss": 1.0652, + "mean_token_accuracy": 0.7564290009438992, + "num_tokens": 131402.0, + "step": 30 + }, + { + "entropy": 0.5926671892404556, + "epoch": 0.002320639916456963, + "grad_norm": 0.32779833674430847, + "learning_rate": 1.131090487238979e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7731007449328899, + "num_tokens": 175572.0, + "step": 40 + }, + { + "entropy": 0.59578264541924, + "epoch": 0.002900799895571204, + "grad_norm": 0.32650482654571533, + "learning_rate": 1.4211136890951276e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7712436608970166, + "num_tokens": 225350.0, + "step": 50 + }, + { + "entropy": 0.6028633829206228, + "epoch": 0.0034809598746854443, + "grad_norm": 0.4550016522407532, + "learning_rate": 1.7111368909512762e-06, + "loss": 1.04, + "mean_token_accuracy": 0.7677938863635063, + "num_tokens": 267379.0, + "step": 60 + }, + { + "entropy": 0.6093336276710033, + "epoch": 0.004061119853799686, + "grad_norm": 0.6661575436592102, + "learning_rate": 2.001160092807425e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.7664665102958679, + "num_tokens": 311307.0, + "step": 70 + }, + { + "entropy": 0.6007830150425434, + "epoch": 0.004641279832913926, + "grad_norm": 0.30777132511138916, + "learning_rate": 2.291183294663573e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7737567193806172, + "num_tokens": 359277.0, + "step": 80 + }, + { + "entropy": 0.628833281993866, + "epoch": 0.0052214398120281665, + "grad_norm": 0.3702338933944702, + "learning_rate": 2.581206496519722e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.760690987855196, + "num_tokens": 405086.0, + "step": 90 + }, + { + "entropy": 0.6096299141645432, + "epoch": 0.005801599791142408, + "grad_norm": 0.36545589566230774, + "learning_rate": 2.87122969837587e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.777703908085823, + "num_tokens": 452859.0, + "step": 100 + }, + { + "entropy": 0.6223576374351978, + "epoch": 0.006381759770256648, + "grad_norm": 0.42212843894958496, + "learning_rate": 3.1612529002320188e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7787812650203705, + "num_tokens": 497969.0, + "step": 110 + }, + { + "entropy": 0.6298252783715725, + "epoch": 0.006961919749370889, + "grad_norm": 0.5084234476089478, + "learning_rate": 3.451276102088167e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7801030069589615, + "num_tokens": 543148.0, + "step": 120 + }, + { + "entropy": 0.6175421200692653, + "epoch": 0.00754207972848513, + "grad_norm": 0.34466612339019775, + "learning_rate": 3.7412993039443157e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7925293773412705, + "num_tokens": 591404.0, + "step": 130 + }, + { + "entropy": 0.6364047959446907, + "epoch": 0.008122239707599371, + "grad_norm": 0.4404062330722809, + "learning_rate": 4.0313225058004636e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7929937243461609, + "num_tokens": 633201.0, + "step": 140 + }, + { + "entropy": 0.6060394033789634, + "epoch": 0.008702399686713611, + "grad_norm": 0.3953554034233093, + "learning_rate": 4.321345707656613e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.808499849587679, + "num_tokens": 678632.0, + "step": 150 + }, + { + "entropy": 0.6047076687216759, + "epoch": 0.009282559665827852, + "grad_norm": 0.3636057674884796, + "learning_rate": 4.611368909512762e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.8089389830827713, + "num_tokens": 722812.0, + "step": 160 + }, + { + "entropy": 0.5886593971401453, + "epoch": 0.009862719644942093, + "grad_norm": 0.22046884894371033, + "learning_rate": 4.90139211136891e-06, + "loss": 0.6373, + "mean_token_accuracy": 0.8176202610135078, + "num_tokens": 767799.0, + "step": 170 + }, + { + "entropy": 0.5819547358900309, + "epoch": 0.010442879624056333, + "grad_norm": 0.2481335550546646, + "learning_rate": 5.191415313225058e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.8232176326215267, + "num_tokens": 815188.0, + "step": 180 + }, + { + "entropy": 0.5815581317991019, + "epoch": 0.011023039603170574, + "grad_norm": 0.246281236410141, + "learning_rate": 5.4814385150812065e-06, + "loss": 0.5852, + "mean_token_accuracy": 0.8307517014443875, + "num_tokens": 860881.0, + "step": 190 + }, + { + "entropy": 0.6089683990925551, + "epoch": 0.011603199582284816, + "grad_norm": 0.22894756495952606, + "learning_rate": 5.771461716937356e-06, + "loss": 0.6046, + "mean_token_accuracy": 0.8290468625724315, + "num_tokens": 906504.0, + "step": 200 + }, + { + "entropy": 0.602594980597496, + "epoch": 0.012183359561399055, + "grad_norm": 0.36406323313713074, + "learning_rate": 6.061484918793504e-06, + "loss": 0.5932, + "mean_token_accuracy": 0.8330310329794883, + "num_tokens": 953812.0, + "step": 210 + }, + { + "entropy": 0.5921055857092142, + "epoch": 0.012763519540513297, + "grad_norm": 0.24109871685504913, + "learning_rate": 6.351508120649652e-06, + "loss": 0.578, + "mean_token_accuracy": 0.8346533574163914, + "num_tokens": 999911.0, + "step": 220 + }, + { + "entropy": 0.5773816656321287, + "epoch": 0.013343679519627538, + "grad_norm": 0.2699449062347412, + "learning_rate": 6.6415313225058e-06, + "loss": 0.5655, + "mean_token_accuracy": 0.838110388815403, + "num_tokens": 1048360.0, + "step": 230 + }, + { + "entropy": 0.5852162480354309, + "epoch": 0.013923839498741777, + "grad_norm": 0.24839651584625244, + "learning_rate": 6.9315545243619495e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.8362175330519677, + "num_tokens": 1101457.0, + "step": 240 + }, + { + "entropy": 0.5615280520170927, + "epoch": 0.014503999477856019, + "grad_norm": 0.4666668772697449, + "learning_rate": 7.221577726218099e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8428752638399601, + "num_tokens": 1149274.0, + "step": 250 + }, + { + "entropy": 0.5958327937871217, + "epoch": 0.01508415945697026, + "grad_norm": 0.29664549231529236, + "learning_rate": 7.511600928074246e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.8327157489955426, + "num_tokens": 1199326.0, + "step": 260 + }, + { + "entropy": 0.5765781041234732, + "epoch": 0.0156643194360845, + "grad_norm": 0.34510108828544617, + "learning_rate": 7.801624129930394e-06, + "loss": 0.5481, + "mean_token_accuracy": 0.8387525670230389, + "num_tokens": 1245589.0, + "step": 270 + }, + { + "entropy": 0.5647396679967642, + "epoch": 0.016244479415198743, + "grad_norm": 0.4642154276371002, + "learning_rate": 8.091647331786543e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8474723532795906, + "num_tokens": 1296020.0, + "step": 280 + }, + { + "entropy": 0.5554123785346746, + "epoch": 0.01682463939431298, + "grad_norm": 0.4111632704734802, + "learning_rate": 8.381670533642692e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8421548426151275, + "num_tokens": 1341949.0, + "step": 290 + }, + { + "entropy": 0.5540930911898613, + "epoch": 0.017404799373427222, + "grad_norm": 0.5401211380958557, + "learning_rate": 8.67169373549884e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8475939244031906, + "num_tokens": 1393956.0, + "step": 300 + }, + { + "entropy": 0.5825928870588541, + "epoch": 0.017984959352541463, + "grad_norm": 0.36595726013183594, + "learning_rate": 8.961716937354989e-06, + "loss": 0.5489, + "mean_token_accuracy": 0.8401123985648156, + "num_tokens": 1442663.0, + "step": 310 + }, + { + "entropy": 0.5483734723180532, + "epoch": 0.018565119331655704, + "grad_norm": 0.3432326912879944, + "learning_rate": 9.251740139211138e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8518648609519005, + "num_tokens": 1485130.0, + "step": 320 + }, + { + "entropy": 0.539715152978897, + "epoch": 0.019145279310769946, + "grad_norm": 0.3705282211303711, + "learning_rate": 9.541763341067286e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8501971706748008, + "num_tokens": 1531642.0, + "step": 330 + }, + { + "entropy": 0.5503693576902151, + "epoch": 0.019725439289884187, + "grad_norm": 0.33528175950050354, + "learning_rate": 9.831786542923435e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8480824060738087, + "num_tokens": 1574214.0, + "step": 340 + }, + { + "entropy": 0.5398841660469771, + "epoch": 0.020305599268998425, + "grad_norm": 0.37876197695732117, + "learning_rate": 1.0121809744779582e-05, + "loss": 0.5097, + "mean_token_accuracy": 0.8514233089983463, + "num_tokens": 1622759.0, + "step": 350 + }, + { + "entropy": 0.5365910518914461, + "epoch": 0.020885759248112666, + "grad_norm": 0.32982495427131653, + "learning_rate": 1.0411832946635731e-05, + "loss": 0.5154, + "mean_token_accuracy": 0.8518380291759968, + "num_tokens": 1668053.0, + "step": 360 + }, + { + "entropy": 0.546030517667532, + "epoch": 0.021465919227226907, + "grad_norm": 0.3440488874912262, + "learning_rate": 1.070185614849188e-05, + "loss": 0.5044, + "mean_token_accuracy": 0.8508104898035527, + "num_tokens": 1713330.0, + "step": 370 + }, + { + "entropy": 0.5135512065142394, + "epoch": 0.02204607920634115, + "grad_norm": 0.34793198108673096, + "learning_rate": 1.0991879350348028e-05, + "loss": 0.48, + "mean_token_accuracy": 0.8594391435384751, + "num_tokens": 1759826.0, + "step": 380 + }, + { + "entropy": 0.5169919081032276, + "epoch": 0.02262623918545539, + "grad_norm": 0.43590423464775085, + "learning_rate": 1.1281902552204177e-05, + "loss": 0.49, + "mean_token_accuracy": 0.8547763951122761, + "num_tokens": 1807818.0, + "step": 390 + }, + { + "entropy": 0.5108715072274208, + "epoch": 0.02320639916456963, + "grad_norm": 0.4136670231819153, + "learning_rate": 1.1571925754060326e-05, + "loss": 0.478, + "mean_token_accuracy": 0.858852919191122, + "num_tokens": 1852918.0, + "step": 400 + }, + { + "entropy": 0.5136338964104652, + "epoch": 0.02378655914368387, + "grad_norm": 0.35156485438346863, + "learning_rate": 1.1861948955916475e-05, + "loss": 0.4849, + "mean_token_accuracy": 0.8604479238390923, + "num_tokens": 1899312.0, + "step": 410 + }, + { + "entropy": 0.4993183337152004, + "epoch": 0.02436671912279811, + "grad_norm": 0.4222034513950348, + "learning_rate": 1.2151972157772622e-05, + "loss": 0.475, + "mean_token_accuracy": 0.863164746761322, + "num_tokens": 1949582.0, + "step": 420 + }, + { + "entropy": 0.49920870158821345, + "epoch": 0.024946879101912352, + "grad_norm": 0.38237398862838745, + "learning_rate": 1.244199535962877e-05, + "loss": 0.4626, + "mean_token_accuracy": 0.8643533334136009, + "num_tokens": 1994739.0, + "step": 430 + }, + { + "entropy": 0.49481778107583524, + "epoch": 0.025527039081026593, + "grad_norm": 0.4159950911998749, + "learning_rate": 1.273201856148492e-05, + "loss": 0.469, + "mean_token_accuracy": 0.8627370178699494, + "num_tokens": 2041762.0, + "step": 440 + }, + { + "entropy": 0.5128690533339977, + "epoch": 0.026107199060140834, + "grad_norm": 0.4283367991447449, + "learning_rate": 1.3022041763341066e-05, + "loss": 0.4807, + "mean_token_accuracy": 0.8572683438658715, + "num_tokens": 2085077.0, + "step": 450 + }, + { + "entropy": 0.4647744446992874, + "epoch": 0.026687359039255076, + "grad_norm": 0.3297072649002075, + "learning_rate": 1.3312064965197215e-05, + "loss": 0.4633, + "mean_token_accuracy": 0.8636987172067165, + "num_tokens": 2133076.0, + "step": 460 + }, + { + "entropy": 0.47540101408958435, + "epoch": 0.027267519018369317, + "grad_norm": 0.4102586507797241, + "learning_rate": 1.3602088167053364e-05, + "loss": 0.4587, + "mean_token_accuracy": 0.8651154920458793, + "num_tokens": 2177426.0, + "step": 470 + }, + { + "entropy": 0.4590921364724636, + "epoch": 0.027847678997483555, + "grad_norm": 0.4205019176006317, + "learning_rate": 1.3892111368909514e-05, + "loss": 0.4438, + "mean_token_accuracy": 0.8711051598191262, + "num_tokens": 2226703.0, + "step": 480 + }, + { + "entropy": 0.46271922569721935, + "epoch": 0.028427838976597796, + "grad_norm": 0.36271682381629944, + "learning_rate": 1.4182134570765663e-05, + "loss": 0.4384, + "mean_token_accuracy": 0.8709373705089092, + "num_tokens": 2268772.0, + "step": 490 + }, + { + "entropy": 0.4736779436469078, + "epoch": 0.029007998955712037, + "grad_norm": 0.3690861761569977, + "learning_rate": 1.4472157772621812e-05, + "loss": 0.476, + "mean_token_accuracy": 0.8604674808681011, + "num_tokens": 2316346.0, + "step": 500 + }, + { + "entropy": 0.46557683814316986, + "epoch": 0.02958815893482628, + "grad_norm": 0.38637956976890564, + "learning_rate": 1.476218097447796e-05, + "loss": 0.4766, + "mean_token_accuracy": 0.8639958471059799, + "num_tokens": 2360499.0, + "step": 510 + }, + { + "entropy": 0.46088958978652955, + "epoch": 0.03016831891394052, + "grad_norm": 0.4146084785461426, + "learning_rate": 1.5052204176334108e-05, + "loss": 0.4533, + "mean_token_accuracy": 0.8679031573235989, + "num_tokens": 2407530.0, + "step": 520 + }, + { + "entropy": 0.4659605773165822, + "epoch": 0.03074847889305476, + "grad_norm": 0.41284066438674927, + "learning_rate": 1.5342227378190254e-05, + "loss": 0.4608, + "mean_token_accuracy": 0.8647156253457069, + "num_tokens": 2455919.0, + "step": 530 + }, + { + "entropy": 0.4460117544978857, + "epoch": 0.031328638872169, + "grad_norm": 0.3880382478237152, + "learning_rate": 1.5632250580046403e-05, + "loss": 0.4299, + "mean_token_accuracy": 0.8701338037848473, + "num_tokens": 2498401.0, + "step": 540 + }, + { + "entropy": 0.43934665694832803, + "epoch": 0.03190879885128324, + "grad_norm": 0.4032915234565735, + "learning_rate": 1.5922273781902552e-05, + "loss": 0.4408, + "mean_token_accuracy": 0.875631807744503, + "num_tokens": 2542371.0, + "step": 550 + }, + { + "entropy": 0.47000532038509846, + "epoch": 0.032488958830397485, + "grad_norm": 0.4985908567905426, + "learning_rate": 1.62122969837587e-05, + "loss": 0.45, + "mean_token_accuracy": 0.8665805287659168, + "num_tokens": 2590482.0, + "step": 560 + }, + { + "entropy": 0.44049855694174767, + "epoch": 0.03306911880951172, + "grad_norm": 0.5142253637313843, + "learning_rate": 1.650232018561485e-05, + "loss": 0.4551, + "mean_token_accuracy": 0.8705280378460885, + "num_tokens": 2640700.0, + "step": 570 + }, + { + "entropy": 0.4254057455807924, + "epoch": 0.03364927878862596, + "grad_norm": 0.5099009275436401, + "learning_rate": 1.6792343387471e-05, + "loss": 0.4136, + "mean_token_accuracy": 0.8773816056549549, + "num_tokens": 2682036.0, + "step": 580 + }, + { + "entropy": 0.41887914538383486, + "epoch": 0.034229438767740206, + "grad_norm": 0.40679696202278137, + "learning_rate": 1.708236658932715e-05, + "loss": 0.405, + "mean_token_accuracy": 0.8809815347194672, + "num_tokens": 2731325.0, + "step": 590 + }, + { + "entropy": 0.44186822772026063, + "epoch": 0.034809598746854443, + "grad_norm": 0.5680528879165649, + "learning_rate": 1.7372389791183298e-05, + "loss": 0.4419, + "mean_token_accuracy": 0.8706110134720803, + "num_tokens": 2776200.0, + "step": 600 + }, + { + "entropy": 0.4203101532533765, + "epoch": 0.03538975872596869, + "grad_norm": 0.46001675724983215, + "learning_rate": 1.7662412993039443e-05, + "loss": 0.4255, + "mean_token_accuracy": 0.8766615472733974, + "num_tokens": 2825721.0, + "step": 610 + }, + { + "entropy": 0.42122571505606177, + "epoch": 0.035969918705082926, + "grad_norm": 0.4673120081424713, + "learning_rate": 1.7952436194895593e-05, + "loss": 0.4154, + "mean_token_accuracy": 0.8757893688976764, + "num_tokens": 2872319.0, + "step": 620 + }, + { + "entropy": 0.4180071948096156, + "epoch": 0.036550078684197164, + "grad_norm": 0.4085332751274109, + "learning_rate": 1.824245939675174e-05, + "loss": 0.4049, + "mean_token_accuracy": 0.8795566871762276, + "num_tokens": 2923446.0, + "step": 630 + }, + { + "entropy": 0.40333463735878466, + "epoch": 0.03713023866331141, + "grad_norm": 0.4654521942138672, + "learning_rate": 1.853248259860789e-05, + "loss": 0.4023, + "mean_token_accuracy": 0.8806409478187561, + "num_tokens": 2973360.0, + "step": 640 + }, + { + "entropy": 0.3950599055737257, + "epoch": 0.03771039864242565, + "grad_norm": 0.5778453350067139, + "learning_rate": 1.8822505800464036e-05, + "loss": 0.3881, + "mean_token_accuracy": 0.8865383200347423, + "num_tokens": 3019237.0, + "step": 650 + }, + { + "entropy": 0.41338928155601024, + "epoch": 0.03829055862153989, + "grad_norm": 0.6102851033210754, + "learning_rate": 1.9112529002320186e-05, + "loss": 0.4046, + "mean_token_accuracy": 0.8776419334113598, + "num_tokens": 3067301.0, + "step": 660 + }, + { + "entropy": 0.4048546342179179, + "epoch": 0.03887071860065413, + "grad_norm": 0.4595702290534973, + "learning_rate": 1.9402552204176335e-05, + "loss": 0.4045, + "mean_token_accuracy": 0.8821790389716625, + "num_tokens": 3116068.0, + "step": 670 + }, + { + "entropy": 0.3987867733463645, + "epoch": 0.039450878579768374, + "grad_norm": 0.6189276576042175, + "learning_rate": 1.9692575406032484e-05, + "loss": 0.3956, + "mean_token_accuracy": 0.8834411546587944, + "num_tokens": 3161009.0, + "step": 680 + }, + { + "entropy": 0.4175357658416033, + "epoch": 0.04003103855888261, + "grad_norm": 0.6002233028411865, + "learning_rate": 1.998259860788863e-05, + "loss": 0.4152, + "mean_token_accuracy": 0.8810918845236302, + "num_tokens": 3211395.0, + "step": 690 + }, + { + "entropy": 0.42440674882382157, + "epoch": 0.04061119853799685, + "grad_norm": 0.5412352085113525, + "learning_rate": 2.027262180974478e-05, + "loss": 0.4191, + "mean_token_accuracy": 0.8750564321875572, + "num_tokens": 3260695.0, + "step": 700 + }, + { + "entropy": 0.38289534505456685, + "epoch": 0.041191358517111094, + "grad_norm": 0.587879478931427, + "learning_rate": 2.0562645011600928e-05, + "loss": 0.3891, + "mean_token_accuracy": 0.8879447788000107, + "num_tokens": 3308185.0, + "step": 710 + }, + { + "entropy": 0.396764425560832, + "epoch": 0.04177151849622533, + "grad_norm": 0.5996712446212769, + "learning_rate": 2.0852668213457077e-05, + "loss": 0.4011, + "mean_token_accuracy": 0.8831724308431148, + "num_tokens": 3358002.0, + "step": 720 + }, + { + "entropy": 0.38041351214051244, + "epoch": 0.04235167847533958, + "grad_norm": 0.7141826748847961, + "learning_rate": 2.1142691415313226e-05, + "loss": 0.3782, + "mean_token_accuracy": 0.8894676253199577, + "num_tokens": 3400202.0, + "step": 730 + }, + { + "entropy": 0.3839953914284706, + "epoch": 0.042931838454453815, + "grad_norm": 0.6685739159584045, + "learning_rate": 2.1432714617169375e-05, + "loss": 0.3792, + "mean_token_accuracy": 0.8913779281079769, + "num_tokens": 3442178.0, + "step": 740 + }, + { + "entropy": 0.3902773916721344, + "epoch": 0.04351199843356806, + "grad_norm": 0.6567870378494263, + "learning_rate": 2.1722737819025524e-05, + "loss": 0.3953, + "mean_token_accuracy": 0.8875682443380356, + "num_tokens": 3485617.0, + "step": 750 + }, + { + "entropy": 0.3912440575659275, + "epoch": 0.0440921584126823, + "grad_norm": 0.480203241109848, + "learning_rate": 2.2012761020881673e-05, + "loss": 0.3864, + "mean_token_accuracy": 0.8870986111462116, + "num_tokens": 3535953.0, + "step": 760 + }, + { + "entropy": 0.38406776301562784, + "epoch": 0.044672318391796535, + "grad_norm": 0.5438635349273682, + "learning_rate": 2.2302784222737822e-05, + "loss": 0.39, + "mean_token_accuracy": 0.8876713000237941, + "num_tokens": 3584153.0, + "step": 770 + }, + { + "entropy": 0.3509963572025299, + "epoch": 0.04525247837091078, + "grad_norm": 0.6218645572662354, + "learning_rate": 2.2592807424593968e-05, + "loss": 0.3618, + "mean_token_accuracy": 0.8956875406205654, + "num_tokens": 3626138.0, + "step": 780 + }, + { + "entropy": 0.410224013030529, + "epoch": 0.04583263835002502, + "grad_norm": 0.6950373649597168, + "learning_rate": 2.2882830626450117e-05, + "loss": 0.4168, + "mean_token_accuracy": 0.8795195922255516, + "num_tokens": 3670100.0, + "step": 790 + }, + { + "entropy": 0.4062420692294836, + "epoch": 0.04641279832913926, + "grad_norm": 0.5211433172225952, + "learning_rate": 2.3172853828306266e-05, + "loss": 0.3983, + "mean_token_accuracy": 0.8853016197681427, + "num_tokens": 3721840.0, + "step": 800 + }, + { + "entropy": 0.39612339809536934, + "epoch": 0.0469929583082535, + "grad_norm": 0.7575780749320984, + "learning_rate": 2.3462877030162415e-05, + "loss": 0.3811, + "mean_token_accuracy": 0.8861218802630901, + "num_tokens": 3768660.0, + "step": 810 + }, + { + "entropy": 0.375431539863348, + "epoch": 0.04757311828736774, + "grad_norm": 0.6787402033805847, + "learning_rate": 2.3752900232018564e-05, + "loss": 0.3755, + "mean_token_accuracy": 0.8891229540109634, + "num_tokens": 3813668.0, + "step": 820 + }, + { + "entropy": 0.36895305681973695, + "epoch": 0.04815327826648198, + "grad_norm": 0.5642525553703308, + "learning_rate": 2.404292343387471e-05, + "loss": 0.3733, + "mean_token_accuracy": 0.8911125175654888, + "num_tokens": 3861721.0, + "step": 830 + }, + { + "entropy": 0.3749150296673179, + "epoch": 0.04873343824559622, + "grad_norm": 0.6928619742393494, + "learning_rate": 2.433294663573086e-05, + "loss": 0.384, + "mean_token_accuracy": 0.8895471081137657, + "num_tokens": 3907067.0, + "step": 840 + }, + { + "entropy": 0.36954501532018186, + "epoch": 0.049313598224710466, + "grad_norm": 0.5658312439918518, + "learning_rate": 2.462296983758701e-05, + "loss": 0.3674, + "mean_token_accuracy": 0.8910060659050941, + "num_tokens": 3958880.0, + "step": 850 + }, + { + "entropy": 0.36861355025321246, + "epoch": 0.049893758203824704, + "grad_norm": 0.6630789041519165, + "learning_rate": 2.4912993039443154e-05, + "loss": 0.3722, + "mean_token_accuracy": 0.8911329627037048, + "num_tokens": 4006517.0, + "step": 860 + }, + { + "entropy": 0.35659054648131133, + "epoch": 0.05047391818293895, + "grad_norm": 0.5517529845237732, + "learning_rate": 2.5203016241299303e-05, + "loss": 0.3554, + "mean_token_accuracy": 0.8948859825730324, + "num_tokens": 4053364.0, + "step": 870 + }, + { + "entropy": 0.38213329780846833, + "epoch": 0.051054078162053186, + "grad_norm": 0.6579017043113708, + "learning_rate": 2.5493039443155452e-05, + "loss": 0.3908, + "mean_token_accuracy": 0.8897779375314713, + "num_tokens": 4096213.0, + "step": 880 + }, + { + "entropy": 0.37707923762500284, + "epoch": 0.051634238141167424, + "grad_norm": 0.6471875309944153, + "learning_rate": 2.57830626450116e-05, + "loss": 0.3725, + "mean_token_accuracy": 0.8882941015064716, + "num_tokens": 4143188.0, + "step": 890 + }, + { + "entropy": 0.3501476192846894, + "epoch": 0.05221439812028167, + "grad_norm": 0.6628760099411011, + "learning_rate": 2.607308584686775e-05, + "loss": 0.3489, + "mean_token_accuracy": 0.89874022975564, + "num_tokens": 4189912.0, + "step": 900 + }, + { + "entropy": 0.37001798804849384, + "epoch": 0.05279455809939591, + "grad_norm": 0.541143000125885, + "learning_rate": 2.63631090487239e-05, + "loss": 0.3755, + "mean_token_accuracy": 0.8922918282449246, + "num_tokens": 4239698.0, + "step": 910 + }, + { + "entropy": 0.34365569427609444, + "epoch": 0.05337471807851015, + "grad_norm": 0.7561637163162231, + "learning_rate": 2.665313225058005e-05, + "loss": 0.3534, + "mean_token_accuracy": 0.8983624868094922, + "num_tokens": 4284788.0, + "step": 920 + }, + { + "entropy": 0.35005063433200123, + "epoch": 0.05395487805762439, + "grad_norm": 0.7029115557670593, + "learning_rate": 2.6943155452436198e-05, + "loss": 0.3407, + "mean_token_accuracy": 0.9001237884163856, + "num_tokens": 4331833.0, + "step": 930 + }, + { + "entropy": 0.37458054944872854, + "epoch": 0.054535038036738634, + "grad_norm": 0.6400732398033142, + "learning_rate": 2.7233178654292347e-05, + "loss": 0.3832, + "mean_token_accuracy": 0.8898490644991398, + "num_tokens": 4379620.0, + "step": 940 + }, + { + "entropy": 0.36579465996474025, + "epoch": 0.05511519801585287, + "grad_norm": 0.6700307130813599, + "learning_rate": 2.7523201856148496e-05, + "loss": 0.3656, + "mean_token_accuracy": 0.8951072141528129, + "num_tokens": 4423679.0, + "step": 950 + }, + { + "entropy": 0.35494615864008666, + "epoch": 0.05569535799496711, + "grad_norm": 0.6050117015838623, + "learning_rate": 2.7813225058004645e-05, + "loss": 0.3607, + "mean_token_accuracy": 0.8956887379288674, + "num_tokens": 4473950.0, + "step": 960 + }, + { + "entropy": 0.3530488181859255, + "epoch": 0.056275517974081354, + "grad_norm": 0.8418362736701965, + "learning_rate": 2.8103248259860794e-05, + "loss": 0.3603, + "mean_token_accuracy": 0.8947755537927151, + "num_tokens": 4521503.0, + "step": 970 + }, + { + "entropy": 0.36019507255405186, + "epoch": 0.05685567795319559, + "grad_norm": 0.5852341651916504, + "learning_rate": 2.8393271461716937e-05, + "loss": 0.3581, + "mean_token_accuracy": 0.8908769391477108, + "num_tokens": 4571287.0, + "step": 980 + }, + { + "entropy": 0.36673853918910027, + "epoch": 0.05743583793230984, + "grad_norm": 0.7163056135177612, + "learning_rate": 2.8683294663573086e-05, + "loss": 0.368, + "mean_token_accuracy": 0.8892034880816937, + "num_tokens": 4614371.0, + "step": 990 + }, + { + "entropy": 0.34335535652935506, + "epoch": 0.058015997911424075, + "grad_norm": 0.6813564896583557, + "learning_rate": 2.8973317865429235e-05, + "loss": 0.3511, + "mean_token_accuracy": 0.8987673252820969, + "num_tokens": 4666940.0, + "step": 1000 + }, + { + "entropy": 0.35222412273287773, + "epoch": 0.05859615789053831, + "grad_norm": 0.7608205080032349, + "learning_rate": 2.9263341067285384e-05, + "loss": 0.353, + "mean_token_accuracy": 0.8965450279414654, + "num_tokens": 4719514.0, + "step": 1010 + }, + { + "entropy": 0.33759879376739266, + "epoch": 0.05917631786965256, + "grad_norm": 0.6182528138160706, + "learning_rate": 2.9553364269141533e-05, + "loss": 0.342, + "mean_token_accuracy": 0.8990197770297528, + "num_tokens": 4765373.0, + "step": 1020 + }, + { + "entropy": 0.3454021533951163, + "epoch": 0.059756477848766795, + "grad_norm": 0.7237105369567871, + "learning_rate": 2.984338747099768e-05, + "loss": 0.3508, + "mean_token_accuracy": 0.8985992193222045, + "num_tokens": 4811403.0, + "step": 1030 + }, + { + "entropy": 0.36100250408053397, + "epoch": 0.06033663782788104, + "grad_norm": 0.751064121723175, + "learning_rate": 3.0133410672853828e-05, + "loss": 0.3703, + "mean_token_accuracy": 0.8933355711400509, + "num_tokens": 4861938.0, + "step": 1040 + }, + { + "entropy": 0.3391844678670168, + "epoch": 0.06091679780699528, + "grad_norm": 0.7583457827568054, + "learning_rate": 3.0423433874709977e-05, + "loss": 0.3523, + "mean_token_accuracy": 0.9001285172998905, + "num_tokens": 4909431.0, + "step": 1050 + }, + { + "entropy": 0.3209255587309599, + "epoch": 0.06149695778610952, + "grad_norm": 0.6351149678230286, + "learning_rate": 3.0713457076566126e-05, + "loss": 0.3308, + "mean_token_accuracy": 0.906328284740448, + "num_tokens": 4954763.0, + "step": 1060 + }, + { + "entropy": 0.330963964946568, + "epoch": 0.06207711776522376, + "grad_norm": 0.9053978323936462, + "learning_rate": 3.100348027842228e-05, + "loss": 0.34, + "mean_token_accuracy": 0.9045378334820271, + "num_tokens": 4996608.0, + "step": 1070 + }, + { + "entropy": 0.3450266053900123, + "epoch": 0.062657277744338, + "grad_norm": 0.8627491593360901, + "learning_rate": 3.1293503480278424e-05, + "loss": 0.3539, + "mean_token_accuracy": 0.8979708187282085, + "num_tokens": 5041974.0, + "step": 1080 + }, + { + "entropy": 0.32811101488769057, + "epoch": 0.06323743772345224, + "grad_norm": 0.5866477489471436, + "learning_rate": 3.158352668213457e-05, + "loss": 0.322, + "mean_token_accuracy": 0.9066893652081489, + "num_tokens": 5086686.0, + "step": 1090 + }, + { + "entropy": 0.3340771088376641, + "epoch": 0.06381759770256648, + "grad_norm": 0.6504079699516296, + "learning_rate": 3.187354988399072e-05, + "loss": 0.3413, + "mean_token_accuracy": 0.9030842788517475, + "num_tokens": 5134291.0, + "step": 1100 + }, + { + "entropy": 0.33917609360069034, + "epoch": 0.06439775768168073, + "grad_norm": 0.7563319206237793, + "learning_rate": 3.216357308584687e-05, + "loss": 0.3455, + "mean_token_accuracy": 0.9005046658217907, + "num_tokens": 5180663.0, + "step": 1110 + }, + { + "entropy": 0.3190030524507165, + "epoch": 0.06497791766079497, + "grad_norm": 0.8681765198707581, + "learning_rate": 3.245359628770302e-05, + "loss": 0.328, + "mean_token_accuracy": 0.9065721377730369, + "num_tokens": 5230307.0, + "step": 1120 + }, + { + "entropy": 0.32803232986479997, + "epoch": 0.0655580776399092, + "grad_norm": 1.0122687816619873, + "learning_rate": 3.2743619489559166e-05, + "loss": 0.3235, + "mean_token_accuracy": 0.9055903278291225, + "num_tokens": 5273816.0, + "step": 1130 + }, + { + "entropy": 0.32474765107035636, + "epoch": 0.06613823761902345, + "grad_norm": 0.6172017455101013, + "learning_rate": 3.303364269141532e-05, + "loss": 0.327, + "mean_token_accuracy": 0.9024863466620445, + "num_tokens": 5319857.0, + "step": 1140 + }, + { + "entropy": 0.3278034429997206, + "epoch": 0.06671839759813769, + "grad_norm": 0.7470880746841431, + "learning_rate": 3.332366589327146e-05, + "loss": 0.3377, + "mean_token_accuracy": 0.9043868541717529, + "num_tokens": 5366449.0, + "step": 1150 + }, + { + "entropy": 0.32880317997187375, + "epoch": 0.06729855757725192, + "grad_norm": 0.6245658993721008, + "learning_rate": 3.361368909512761e-05, + "loss": 0.3248, + "mean_token_accuracy": 0.903885880857706, + "num_tokens": 5411352.0, + "step": 1160 + }, + { + "entropy": 0.33650453556329013, + "epoch": 0.06787871755636617, + "grad_norm": 0.8718119859695435, + "learning_rate": 3.3903712296983756e-05, + "loss": 0.3502, + "mean_token_accuracy": 0.9024200178682804, + "num_tokens": 5459045.0, + "step": 1170 + }, + { + "entropy": 0.32192675210535526, + "epoch": 0.06845887753548041, + "grad_norm": 0.6714931130409241, + "learning_rate": 3.419373549883991e-05, + "loss": 0.3191, + "mean_token_accuracy": 0.9042942568659782, + "num_tokens": 5507168.0, + "step": 1180 + }, + { + "entropy": 0.30569215696305035, + "epoch": 0.06903903751459466, + "grad_norm": 0.9545487761497498, + "learning_rate": 3.4483758700696054e-05, + "loss": 0.3075, + "mean_token_accuracy": 0.9113899394869804, + "num_tokens": 5550582.0, + "step": 1190 + }, + { + "entropy": 0.3087855382822454, + "epoch": 0.06961919749370889, + "grad_norm": 0.5128186345100403, + "learning_rate": 3.477378190255221e-05, + "loss": 0.32, + "mean_token_accuracy": 0.9096474513411522, + "num_tokens": 5597665.0, + "step": 1200 + }, + { + "entropy": 0.3275372742675245, + "epoch": 0.07019935747282313, + "grad_norm": 0.8742493987083435, + "learning_rate": 3.506380510440835e-05, + "loss": 0.3448, + "mean_token_accuracy": 0.9029547147452831, + "num_tokens": 5637819.0, + "step": 1210 + }, + { + "entropy": 0.30361549500375984, + "epoch": 0.07077951745193738, + "grad_norm": 0.5691332221031189, + "learning_rate": 3.5353828306264505e-05, + "loss": 0.3137, + "mean_token_accuracy": 0.9107619062066078, + "num_tokens": 5689005.0, + "step": 1220 + }, + { + "entropy": 0.31537855304777623, + "epoch": 0.07135967743105161, + "grad_norm": 0.780820906162262, + "learning_rate": 3.564385150812065e-05, + "loss": 0.3247, + "mean_token_accuracy": 0.9074193447828293, + "num_tokens": 5734156.0, + "step": 1230 + }, + { + "entropy": 0.31504823341965676, + "epoch": 0.07193983741016585, + "grad_norm": 0.6359608173370361, + "learning_rate": 3.59338747099768e-05, + "loss": 0.325, + "mean_token_accuracy": 0.9081170357763767, + "num_tokens": 5776028.0, + "step": 1240 + }, + { + "entropy": 0.30476621389389036, + "epoch": 0.0725199973892801, + "grad_norm": 0.8188539147377014, + "learning_rate": 3.622389791183295e-05, + "loss": 0.3165, + "mean_token_accuracy": 0.9092209786176682, + "num_tokens": 5820629.0, + "step": 1250 + }, + { + "entropy": 0.3109021671116352, + "epoch": 0.07310015736839433, + "grad_norm": 0.892595112323761, + "learning_rate": 3.65139211136891e-05, + "loss": 0.3217, + "mean_token_accuracy": 0.9091733299195767, + "num_tokens": 5866803.0, + "step": 1260 + }, + { + "entropy": 0.3030667521059513, + "epoch": 0.07368031734750857, + "grad_norm": 0.7754374146461487, + "learning_rate": 3.680394431554525e-05, + "loss": 0.3021, + "mean_token_accuracy": 0.9113591209053993, + "num_tokens": 5911419.0, + "step": 1270 + }, + { + "entropy": 0.3238999357447028, + "epoch": 0.07426047732662282, + "grad_norm": 0.6384932994842529, + "learning_rate": 3.709396751740139e-05, + "loss": 0.3287, + "mean_token_accuracy": 0.9043565027415752, + "num_tokens": 5964282.0, + "step": 1280 + }, + { + "entropy": 0.30372555535286666, + "epoch": 0.07484063730573706, + "grad_norm": 0.7555716633796692, + "learning_rate": 3.7383990719257545e-05, + "loss": 0.3106, + "mean_token_accuracy": 0.911032336205244, + "num_tokens": 6016832.0, + "step": 1290 + }, + { + "entropy": 0.3099183689802885, + "epoch": 0.0754207972848513, + "grad_norm": 0.7847490310668945, + "learning_rate": 3.767401392111369e-05, + "loss": 0.3223, + "mean_token_accuracy": 0.9080614030361176, + "num_tokens": 6061968.0, + "step": 1300 + }, + { + "entropy": 0.3111672980710864, + "epoch": 0.07600095726396554, + "grad_norm": 0.8858250379562378, + "learning_rate": 3.7964037122969843e-05, + "loss": 0.3106, + "mean_token_accuracy": 0.908698596805334, + "num_tokens": 6116092.0, + "step": 1310 + }, + { + "entropy": 0.2978003408759832, + "epoch": 0.07658111724307978, + "grad_norm": 0.7014368176460266, + "learning_rate": 3.825406032482598e-05, + "loss": 0.3075, + "mean_token_accuracy": 0.9119744315743447, + "num_tokens": 6159140.0, + "step": 1320 + }, + { + "entropy": 0.30285347141325475, + "epoch": 0.07716127722219401, + "grad_norm": 0.7451701164245605, + "learning_rate": 3.8544083526682135e-05, + "loss": 0.3123, + "mean_token_accuracy": 0.9101359650492669, + "num_tokens": 6209432.0, + "step": 1330 + }, + { + "entropy": 0.3168087238445878, + "epoch": 0.07774143720130826, + "grad_norm": 0.6220279932022095, + "learning_rate": 3.883410672853828e-05, + "loss": 0.3315, + "mean_token_accuracy": 0.9054328575730324, + "num_tokens": 6260646.0, + "step": 1340 + }, + { + "entropy": 0.32216351768001916, + "epoch": 0.0783215971804225, + "grad_norm": 0.741764485836029, + "learning_rate": 3.912412993039443e-05, + "loss": 0.323, + "mean_token_accuracy": 0.9068306528031826, + "num_tokens": 6308522.0, + "step": 1350 + }, + { + "entropy": 0.31719786580652, + "epoch": 0.07890175715953675, + "grad_norm": 0.8606551885604858, + "learning_rate": 3.941415313225058e-05, + "loss": 0.3172, + "mean_token_accuracy": 0.9046679921448231, + "num_tokens": 6356358.0, + "step": 1360 + }, + { + "entropy": 0.30493796356022357, + "epoch": 0.07948191713865098, + "grad_norm": 0.7900994420051575, + "learning_rate": 3.970417633410673e-05, + "loss": 0.3115, + "mean_token_accuracy": 0.9090129427611828, + "num_tokens": 6401072.0, + "step": 1370 + }, + { + "entropy": 0.28357998207211493, + "epoch": 0.08006207711776522, + "grad_norm": 0.6944790482521057, + "learning_rate": 3.999419953596288e-05, + "loss": 0.2842, + "mean_token_accuracy": 0.9181315436959266, + "num_tokens": 6443893.0, + "step": 1380 + }, + { + "entropy": 0.29827274810522797, + "epoch": 0.08064223709687947, + "grad_norm": 0.7437567114830017, + "learning_rate": 4.028422273781903e-05, + "loss": 0.31, + "mean_token_accuracy": 0.911144058406353, + "num_tokens": 6495536.0, + "step": 1390 + }, + { + "entropy": 0.2871918672695756, + "epoch": 0.0812223970759937, + "grad_norm": 0.7829996347427368, + "learning_rate": 4.0574245939675175e-05, + "loss": 0.2914, + "mean_token_accuracy": 0.9157790549099445, + "num_tokens": 6542141.0, + "step": 1400 + }, + { + "entropy": 0.3050424698740244, + "epoch": 0.08180255705510794, + "grad_norm": 0.6369175910949707, + "learning_rate": 4.086426914153133e-05, + "loss": 0.3139, + "mean_token_accuracy": 0.9090121239423752, + "num_tokens": 6589554.0, + "step": 1410 + }, + { + "entropy": 0.30965569131076337, + "epoch": 0.08238271703422219, + "grad_norm": 0.8156128525733948, + "learning_rate": 4.115429234338747e-05, + "loss": 0.3204, + "mean_token_accuracy": 0.9098815053701401, + "num_tokens": 6634930.0, + "step": 1420 + }, + { + "entropy": 0.30370572274550794, + "epoch": 0.08296287701333643, + "grad_norm": 0.7009026408195496, + "learning_rate": 4.1444315545243626e-05, + "loss": 0.3103, + "mean_token_accuracy": 0.9091388165950776, + "num_tokens": 6684235.0, + "step": 1430 + }, + { + "entropy": 0.2887363304384053, + "epoch": 0.08354303699245066, + "grad_norm": 0.6463255882263184, + "learning_rate": 4.173433874709977e-05, + "loss": 0.2861, + "mean_token_accuracy": 0.9151381827890873, + "num_tokens": 6729055.0, + "step": 1440 + }, + { + "entropy": 0.3040386477485299, + "epoch": 0.08412319697156491, + "grad_norm": 0.7513805031776428, + "learning_rate": 4.202436194895592e-05, + "loss": 0.3132, + "mean_token_accuracy": 0.9091064013540745, + "num_tokens": 6778597.0, + "step": 1450 + }, + { + "entropy": 0.3067681266926229, + "epoch": 0.08470335695067915, + "grad_norm": 0.7590340375900269, + "learning_rate": 4.231438515081207e-05, + "loss": 0.297, + "mean_token_accuracy": 0.9120954312384129, + "num_tokens": 6827311.0, + "step": 1460 + }, + { + "entropy": 0.29724564850330354, + "epoch": 0.08528351692979338, + "grad_norm": 0.7005306482315063, + "learning_rate": 4.2604408352668216e-05, + "loss": 0.306, + "mean_token_accuracy": 0.9125577352941037, + "num_tokens": 6879913.0, + "step": 1470 + }, + { + "entropy": 0.29294680543243884, + "epoch": 0.08586367690890763, + "grad_norm": 0.8225511908531189, + "learning_rate": 4.289443155452436e-05, + "loss": 0.3024, + "mean_token_accuracy": 0.9159172855317592, + "num_tokens": 6923273.0, + "step": 1480 + }, + { + "entropy": 0.3013799009844661, + "epoch": 0.08644383688802187, + "grad_norm": 0.7784894704818726, + "learning_rate": 4.318445475638051e-05, + "loss": 0.3156, + "mean_token_accuracy": 0.9099559903144836, + "num_tokens": 6970105.0, + "step": 1490 + }, + { + "entropy": 0.29184954073280095, + "epoch": 0.08702399686713612, + "grad_norm": 0.8364036679267883, + "learning_rate": 4.347447795823666e-05, + "loss": 0.2951, + "mean_token_accuracy": 0.9125184990465641, + "num_tokens": 7015664.0, + "step": 1500 + }, + { + "entropy": 0.2919993345625699, + "epoch": 0.08760415684625035, + "grad_norm": 0.778735339641571, + "learning_rate": 4.3764501160092805e-05, + "loss": 0.3043, + "mean_token_accuracy": 0.9137713015079498, + "num_tokens": 7066672.0, + "step": 1510 + }, + { + "entropy": 0.2860091797076166, + "epoch": 0.0881843168253646, + "grad_norm": 0.769975483417511, + "learning_rate": 4.405452436194896e-05, + "loss": 0.297, + "mean_token_accuracy": 0.9169738702476025, + "num_tokens": 7111610.0, + "step": 1520 + }, + { + "entropy": 0.2807452784851193, + "epoch": 0.08876447680447884, + "grad_norm": 0.6086603999137878, + "learning_rate": 4.43445475638051e-05, + "loss": 0.3001, + "mean_token_accuracy": 0.9169657327234745, + "num_tokens": 7157165.0, + "step": 1530 + }, + { + "entropy": 0.2788984961807728, + "epoch": 0.08934463678359307, + "grad_norm": 0.9280663728713989, + "learning_rate": 4.4634570765661256e-05, + "loss": 0.2905, + "mean_token_accuracy": 0.916178935021162, + "num_tokens": 7206639.0, + "step": 1540 + }, + { + "entropy": 0.2887772241607308, + "epoch": 0.08992479676270732, + "grad_norm": 0.9003749489784241, + "learning_rate": 4.49245939675174e-05, + "loss": 0.298, + "mean_token_accuracy": 0.9168709181249142, + "num_tokens": 7252729.0, + "step": 1550 + }, + { + "entropy": 0.2965578636154532, + "epoch": 0.09050495674182156, + "grad_norm": 0.7360148429870605, + "learning_rate": 4.5214617169373554e-05, + "loss": 0.3019, + "mean_token_accuracy": 0.9135144047439099, + "num_tokens": 7299386.0, + "step": 1560 + }, + { + "entropy": 0.2901795361191034, + "epoch": 0.0910851167209358, + "grad_norm": 0.6405844688415527, + "learning_rate": 4.55046403712297e-05, + "loss": 0.2957, + "mean_token_accuracy": 0.9143725715577602, + "num_tokens": 7344537.0, + "step": 1570 + }, + { + "entropy": 0.2946268294006586, + "epoch": 0.09166527670005004, + "grad_norm": 0.7796591520309448, + "learning_rate": 4.579466357308585e-05, + "loss": 0.3116, + "mean_token_accuracy": 0.9126657597720623, + "num_tokens": 7389177.0, + "step": 1580 + }, + { + "entropy": 0.2914118329063058, + "epoch": 0.09224543667916428, + "grad_norm": 0.6452388167381287, + "learning_rate": 4.6084686774942e-05, + "loss": 0.2972, + "mean_token_accuracy": 0.9153137519955635, + "num_tokens": 7433131.0, + "step": 1590 + }, + { + "entropy": 0.2951007978990674, + "epoch": 0.09282559665827853, + "grad_norm": 0.8093860745429993, + "learning_rate": 4.637470997679815e-05, + "loss": 0.3075, + "mean_token_accuracy": 0.912248969078064, + "num_tokens": 7477853.0, + "step": 1600 + }, + { + "entropy": 0.28378327367827294, + "epoch": 0.09340575663739276, + "grad_norm": 0.7582586407661438, + "learning_rate": 4.6664733178654296e-05, + "loss": 0.2817, + "mean_token_accuracy": 0.918566457182169, + "num_tokens": 7527745.0, + "step": 1610 + }, + { + "entropy": 0.30057731214910743, + "epoch": 0.093985916616507, + "grad_norm": 0.8611761927604675, + "learning_rate": 4.695475638051044e-05, + "loss": 0.308, + "mean_token_accuracy": 0.9110127367079258, + "num_tokens": 7573521.0, + "step": 1620 + }, + { + "entropy": 0.2673630990087986, + "epoch": 0.09456607659562125, + "grad_norm": 0.7502352595329285, + "learning_rate": 4.7244779582366594e-05, + "loss": 0.2764, + "mean_token_accuracy": 0.9209020972251892, + "num_tokens": 7615884.0, + "step": 1630 + }, + { + "entropy": 0.28669884596019984, + "epoch": 0.09514623657473548, + "grad_norm": 0.7012315392494202, + "learning_rate": 4.753480278422274e-05, + "loss": 0.2956, + "mean_token_accuracy": 0.9155751645565033, + "num_tokens": 7659625.0, + "step": 1640 + }, + { + "entropy": 0.26666297325864435, + "epoch": 0.09572639655384972, + "grad_norm": 0.9144484996795654, + "learning_rate": 4.7824825986078886e-05, + "loss": 0.2793, + "mean_token_accuracy": 0.9196700178086757, + "num_tokens": 7709331.0, + "step": 1650 + }, + { + "entropy": 0.2813756976276636, + "epoch": 0.09630655653296397, + "grad_norm": 0.6945490837097168, + "learning_rate": 4.811484918793504e-05, + "loss": 0.2848, + "mean_token_accuracy": 0.916472227871418, + "num_tokens": 7760376.0, + "step": 1660 + }, + { + "entropy": 0.27898413948714734, + "epoch": 0.09688671651207821, + "grad_norm": 0.7581312656402588, + "learning_rate": 4.8404872389791184e-05, + "loss": 0.296, + "mean_token_accuracy": 0.9143371537327767, + "num_tokens": 7809036.0, + "step": 1670 + }, + { + "entropy": 0.2824300312437117, + "epoch": 0.09746687649119244, + "grad_norm": 0.8237022757530212, + "learning_rate": 4.869489559164733e-05, + "loss": 0.2773, + "mean_token_accuracy": 0.9186645068228245, + "num_tokens": 7849875.0, + "step": 1680 + }, + { + "entropy": 0.27429430549964307, + "epoch": 0.09804703647030669, + "grad_norm": 0.6170187592506409, + "learning_rate": 4.898491879350348e-05, + "loss": 0.2899, + "mean_token_accuracy": 0.9145666979253292, + "num_tokens": 7895262.0, + "step": 1690 + }, + { + "entropy": 0.29639883916825055, + "epoch": 0.09862719644942093, + "grad_norm": 0.6594032645225525, + "learning_rate": 4.927494199535963e-05, + "loss": 0.3047, + "mean_token_accuracy": 0.9122639246284961, + "num_tokens": 7943577.0, + "step": 1700 + }, + { + "entropy": 0.2817915199324489, + "epoch": 0.09920735642853516, + "grad_norm": 0.613117516040802, + "learning_rate": 4.956496519721578e-05, + "loss": 0.2872, + "mean_token_accuracy": 0.9162633776664734, + "num_tokens": 7991905.0, + "step": 1710 + }, + { + "entropy": 0.27560282312333584, + "epoch": 0.09978751640764941, + "grad_norm": 0.6343222856521606, + "learning_rate": 4.9854988399071926e-05, + "loss": 0.2771, + "mean_token_accuracy": 0.9205437496304512, + "num_tokens": 8043488.0, + "step": 1720 + }, + { + "entropy": 0.2689267177134752, + "epoch": 0.10036767638676365, + "grad_norm": 0.6848794221878052, + "learning_rate": 4.999998718383338e-05, + "loss": 0.2785, + "mean_token_accuracy": 0.9179315030574798, + "num_tokens": 8090630.0, + "step": 1730 + }, + { + "entropy": 0.27072975048795345, + "epoch": 0.1009478363658779, + "grad_norm": 0.7242428064346313, + "learning_rate": 4.9999884654579254e-05, + "loss": 0.2807, + "mean_token_accuracy": 0.9206205457448959, + "num_tokens": 8134453.0, + "step": 1740 + }, + { + "entropy": 0.26652313526719806, + "epoch": 0.10152799634499213, + "grad_norm": 0.8293262124061584, + "learning_rate": 4.999967959649149e-05, + "loss": 0.29, + "mean_token_accuracy": 0.9180788114666939, + "num_tokens": 8179676.0, + "step": 1750 + }, + { + "entropy": 0.2596722885966301, + "epoch": 0.10210815632410637, + "grad_norm": 0.6212183833122253, + "learning_rate": 4.999937201041107e-05, + "loss": 0.2715, + "mean_token_accuracy": 0.9241020932793618, + "num_tokens": 8226661.0, + "step": 1760 + }, + { + "entropy": 0.29126374013721945, + "epoch": 0.10268831630322062, + "grad_norm": 0.5874972343444824, + "learning_rate": 4.999896189759945e-05, + "loss": 0.2876, + "mean_token_accuracy": 0.9155148319900036, + "num_tokens": 8272677.0, + "step": 1770 + }, + { + "entropy": 0.2604101177304983, + "epoch": 0.10326847628233485, + "grad_norm": 0.713620126247406, + "learning_rate": 4.999844925973858e-05, + "loss": 0.2706, + "mean_token_accuracy": 0.9188888490200042, + "num_tokens": 8320237.0, + "step": 1780 + }, + { + "entropy": 0.2498532086610794, + "epoch": 0.10384863626144909, + "grad_norm": 0.7551988363265991, + "learning_rate": 4.999783409893087e-05, + "loss": 0.2542, + "mean_token_accuracy": 0.9265534482896328, + "num_tokens": 8365280.0, + "step": 1790 + }, + { + "entropy": 0.26464840173721316, + "epoch": 0.10442879624056334, + "grad_norm": 0.8986396193504333, + "learning_rate": 4.999711641769921e-05, + "loss": 0.2786, + "mean_token_accuracy": 0.9206510946154595, + "num_tokens": 8408392.0, + "step": 1800 + }, + { + "entropy": 0.2578721922822297, + "epoch": 0.10500895621967758, + "grad_norm": 0.6243773698806763, + "learning_rate": 4.999629621898693e-05, + "loss": 0.2707, + "mean_token_accuracy": 0.9237455770373344, + "num_tokens": 8453974.0, + "step": 1810 + }, + { + "entropy": 0.2655750337988138, + "epoch": 0.10558911619879181, + "grad_norm": 0.6850927472114563, + "learning_rate": 4.99953735061578e-05, + "loss": 0.2729, + "mean_token_accuracy": 0.9220066092908382, + "num_tokens": 8502858.0, + "step": 1820 + }, + { + "entropy": 0.2591411518864334, + "epoch": 0.10616927617790606, + "grad_norm": 0.5950872302055359, + "learning_rate": 4.9994348282996027e-05, + "loss": 0.2702, + "mean_token_accuracy": 0.9222574345767498, + "num_tokens": 8551856.0, + "step": 1830 + }, + { + "entropy": 0.27220588698983195, + "epoch": 0.1067494361570203, + "grad_norm": 0.7156206965446472, + "learning_rate": 4.999322055370623e-05, + "loss": 0.2816, + "mean_token_accuracy": 0.9194983392953873, + "num_tokens": 8602149.0, + "step": 1840 + }, + { + "entropy": 0.2603364289738238, + "epoch": 0.10732959613613453, + "grad_norm": 0.6216195225715637, + "learning_rate": 4.9991990322913426e-05, + "loss": 0.2647, + "mean_token_accuracy": 0.9241082407534122, + "num_tokens": 8647988.0, + "step": 1850 + }, + { + "entropy": 0.27109031714499, + "epoch": 0.10790975611524878, + "grad_norm": 0.6569051742553711, + "learning_rate": 4.9990657595663e-05, + "loss": 0.2885, + "mean_token_accuracy": 0.9145662099123001, + "num_tokens": 8695162.0, + "step": 1860 + }, + { + "entropy": 0.2688842251896858, + "epoch": 0.10848991609436302, + "grad_norm": 0.714058518409729, + "learning_rate": 4.9989222377420696e-05, + "loss": 0.2657, + "mean_token_accuracy": 0.9226457640528679, + "num_tokens": 8741231.0, + "step": 1870 + }, + { + "entropy": 0.2644309110939503, + "epoch": 0.10907007607347727, + "grad_norm": 0.6936904788017273, + "learning_rate": 4.9987684674072586e-05, + "loss": 0.2788, + "mean_token_accuracy": 0.9207229740917683, + "num_tokens": 8789145.0, + "step": 1880 + }, + { + "entropy": 0.2770258506760001, + "epoch": 0.1096502360525915, + "grad_norm": 0.7012014389038086, + "learning_rate": 4.998604449192506e-05, + "loss": 0.2858, + "mean_token_accuracy": 0.9167472742497921, + "num_tokens": 8832385.0, + "step": 1890 + }, + { + "entropy": 0.26609011068940164, + "epoch": 0.11023039603170574, + "grad_norm": 0.6062008142471313, + "learning_rate": 4.9984301837704795e-05, + "loss": 0.2755, + "mean_token_accuracy": 0.921289111673832, + "num_tokens": 8879913.0, + "step": 1900 + }, + { + "entropy": 0.25690990863367913, + "epoch": 0.11081055601081999, + "grad_norm": 0.6987804174423218, + "learning_rate": 4.998245671855871e-05, + "loss": 0.2677, + "mean_token_accuracy": 0.9239314064383507, + "num_tokens": 8921391.0, + "step": 1910 + }, + { + "entropy": 0.27248289808630943, + "epoch": 0.11139071598993422, + "grad_norm": 0.6866018176078796, + "learning_rate": 4.998050914205394e-05, + "loss": 0.2772, + "mean_token_accuracy": 0.9199076242744922, + "num_tokens": 8970774.0, + "step": 1920 + }, + { + "entropy": 0.2705771800130606, + "epoch": 0.11197087596904846, + "grad_norm": 0.5729894638061523, + "learning_rate": 4.9978459116177864e-05, + "loss": 0.2878, + "mean_token_accuracy": 0.918898756057024, + "num_tokens": 9016340.0, + "step": 1930 + }, + { + "entropy": 0.2585883465595543, + "epoch": 0.11255103594816271, + "grad_norm": 0.6543694138526917, + "learning_rate": 4.9976306649337966e-05, + "loss": 0.2687, + "mean_token_accuracy": 0.9229423679411412, + "num_tokens": 9061942.0, + "step": 1940 + }, + { + "entropy": 0.24645599657669665, + "epoch": 0.11313119592727695, + "grad_norm": 0.541439950466156, + "learning_rate": 4.9974051750361896e-05, + "loss": 0.2577, + "mean_token_accuracy": 0.9263320304453373, + "num_tokens": 9106555.0, + "step": 1950 + }, + { + "entropy": 0.2508511829189956, + "epoch": 0.11371135590639118, + "grad_norm": 0.6171912550926208, + "learning_rate": 4.997169442849737e-05, + "loss": 0.2583, + "mean_token_accuracy": 0.9254555203020572, + "num_tokens": 9153548.0, + "step": 1960 + }, + { + "entropy": 0.2571233163587749, + "epoch": 0.11429151588550543, + "grad_norm": 0.8694570064544678, + "learning_rate": 4.9969234693412195e-05, + "loss": 0.2748, + "mean_token_accuracy": 0.9206481039524078, + "num_tokens": 9198601.0, + "step": 1970 + }, + { + "entropy": 0.26682604048401115, + "epoch": 0.11487167586461967, + "grad_norm": 0.756659209728241, + "learning_rate": 4.996667255519414e-05, + "loss": 0.2688, + "mean_token_accuracy": 0.9216569639742375, + "num_tokens": 9245039.0, + "step": 1980 + }, + { + "entropy": 0.260152268409729, + "epoch": 0.1154518358437339, + "grad_norm": 0.5664719939231873, + "learning_rate": 4.9964008024351e-05, + "loss": 0.2682, + "mean_token_accuracy": 0.9203166864812374, + "num_tokens": 9291539.0, + "step": 1990 + }, + { + "entropy": 0.2628775766119361, + "epoch": 0.11603199582284815, + "grad_norm": 0.7043173909187317, + "learning_rate": 4.9961241111810454e-05, + "loss": 0.2668, + "mean_token_accuracy": 0.9213770076632499, + "num_tokens": 9342518.0, + "step": 2000 + }, + { + "entropy": 0.25576556120067834, + "epoch": 0.1166121558019624, + "grad_norm": 0.5857242345809937, + "learning_rate": 4.99583718289201e-05, + "loss": 0.2741, + "mean_token_accuracy": 0.9231432363390922, + "num_tokens": 9388302.0, + "step": 2010 + }, + { + "entropy": 0.2568430436775088, + "epoch": 0.11719231578107663, + "grad_norm": 0.6334712505340576, + "learning_rate": 4.995540018744736e-05, + "loss": 0.2602, + "mean_token_accuracy": 0.9242356009781361, + "num_tokens": 9434426.0, + "step": 2020 + }, + { + "entropy": 0.24862975357100367, + "epoch": 0.11777247576019087, + "grad_norm": 0.8271284699440002, + "learning_rate": 4.9952326199579445e-05, + "loss": 0.2565, + "mean_token_accuracy": 0.9245531909167767, + "num_tokens": 9478463.0, + "step": 2030 + }, + { + "entropy": 0.26271321792155505, + "epoch": 0.11835263573930511, + "grad_norm": 0.6133384704589844, + "learning_rate": 4.9949149877923304e-05, + "loss": 0.2746, + "mean_token_accuracy": 0.9198125831782817, + "num_tokens": 9525802.0, + "step": 2040 + }, + { + "entropy": 0.2644510926678777, + "epoch": 0.11893279571841936, + "grad_norm": 0.8745215535163879, + "learning_rate": 4.994587123550559e-05, + "loss": 0.2757, + "mean_token_accuracy": 0.9209706641733646, + "num_tokens": 9569271.0, + "step": 2050 + }, + { + "entropy": 0.2595982373692095, + "epoch": 0.11951295569753359, + "grad_norm": 0.5087917447090149, + "learning_rate": 4.9942490285772576e-05, + "loss": 0.2613, + "mean_token_accuracy": 0.9242874056100845, + "num_tokens": 9618329.0, + "step": 2060 + }, + { + "entropy": 0.2463766710832715, + "epoch": 0.12009311567664784, + "grad_norm": 0.5843988656997681, + "learning_rate": 4.9939007042590114e-05, + "loss": 0.2616, + "mean_token_accuracy": 0.9276760250329972, + "num_tokens": 9663547.0, + "step": 2070 + }, + { + "entropy": 0.279539078194648, + "epoch": 0.12067327565576208, + "grad_norm": 0.5433543920516968, + "learning_rate": 4.993542152024359e-05, + "loss": 0.2989, + "mean_token_accuracy": 0.9160007424652576, + "num_tokens": 9719617.0, + "step": 2080 + }, + { + "entropy": 0.24361262358725072, + "epoch": 0.12125343563487631, + "grad_norm": 0.69965660572052, + "learning_rate": 4.993173373343785e-05, + "loss": 0.2537, + "mean_token_accuracy": 0.9253748945891858, + "num_tokens": 9763606.0, + "step": 2090 + }, + { + "entropy": 0.24894468253478408, + "epoch": 0.12183359561399056, + "grad_norm": 0.7231481671333313, + "learning_rate": 4.9927943697297144e-05, + "loss": 0.2677, + "mean_token_accuracy": 0.92439334243536, + "num_tokens": 9811915.0, + "step": 2100 + }, + { + "entropy": 0.2520544639788568, + "epoch": 0.1224137555931048, + "grad_norm": 0.5563618540763855, + "learning_rate": 4.992405142736505e-05, + "loss": 0.2515, + "mean_token_accuracy": 0.9277243062853813, + "num_tokens": 9855396.0, + "step": 2110 + }, + { + "entropy": 0.25913242446258666, + "epoch": 0.12299391557221905, + "grad_norm": 0.6788963675498962, + "learning_rate": 4.9920056939604456e-05, + "loss": 0.2828, + "mean_token_accuracy": 0.9205480501055717, + "num_tokens": 9911320.0, + "step": 2120 + }, + { + "entropy": 0.24703723872080446, + "epoch": 0.12357407555133328, + "grad_norm": 0.6745657920837402, + "learning_rate": 4.991596025039743e-05, + "loss": 0.2501, + "mean_token_accuracy": 0.9276753909885883, + "num_tokens": 9959615.0, + "step": 2130 + }, + { + "entropy": 0.2519259788095951, + "epoch": 0.12415423553044752, + "grad_norm": 0.7236958742141724, + "learning_rate": 4.991176137654521e-05, + "loss": 0.2661, + "mean_token_accuracy": 0.9234645597636699, + "num_tokens": 10008629.0, + "step": 2140 + }, + { + "entropy": 0.268074760120362, + "epoch": 0.12473439550956177, + "grad_norm": 0.6498594880104065, + "learning_rate": 4.990746033526809e-05, + "loss": 0.2788, + "mean_token_accuracy": 0.9195640549063683, + "num_tokens": 10056809.0, + "step": 2150 + }, + { + "entropy": 0.2480918040499091, + "epoch": 0.125314555488676, + "grad_norm": 0.5399681329727173, + "learning_rate": 4.990305714420539e-05, + "loss": 0.2634, + "mean_token_accuracy": 0.9267123579978943, + "num_tokens": 10100239.0, + "step": 2160 + }, + { + "entropy": 0.24669867865741252, + "epoch": 0.12589471546779024, + "grad_norm": 0.6594753265380859, + "learning_rate": 4.9898551821415354e-05, + "loss": 0.2582, + "mean_token_accuracy": 0.9247825779020786, + "num_tokens": 10141850.0, + "step": 2170 + }, + { + "entropy": 0.2652435668744147, + "epoch": 0.12647487544690447, + "grad_norm": 0.6089848279953003, + "learning_rate": 4.989394438537509e-05, + "loss": 0.2685, + "mean_token_accuracy": 0.9229709073901177, + "num_tokens": 10189525.0, + "step": 2180 + }, + { + "entropy": 0.23870805129408837, + "epoch": 0.12705503542601873, + "grad_norm": 0.8019735813140869, + "learning_rate": 4.988923485498047e-05, + "loss": 0.2602, + "mean_token_accuracy": 0.9255488894879818, + "num_tokens": 10232926.0, + "step": 2190 + }, + { + "entropy": 0.27198631726205347, + "epoch": 0.12763519540513296, + "grad_norm": 0.49819546937942505, + "learning_rate": 4.988442324954612e-05, + "loss": 0.2707, + "mean_token_accuracy": 0.922489058226347, + "num_tokens": 10280610.0, + "step": 2200 + }, + { + "entropy": 0.23695894731208683, + "epoch": 0.1282153553842472, + "grad_norm": 0.8277322053909302, + "learning_rate": 4.987950958880523e-05, + "loss": 0.2536, + "mean_token_accuracy": 0.9263455778360367, + "num_tokens": 10323869.0, + "step": 2210 + }, + { + "entropy": 0.2598136071115732, + "epoch": 0.12879551536336145, + "grad_norm": 0.552314281463623, + "learning_rate": 4.987449389290959e-05, + "loss": 0.2711, + "mean_token_accuracy": 0.9234828911721706, + "num_tokens": 10371761.0, + "step": 2220 + }, + { + "entropy": 0.24340019840747118, + "epoch": 0.12937567534247568, + "grad_norm": 0.6650964617729187, + "learning_rate": 4.986937618242943e-05, + "loss": 0.2459, + "mean_token_accuracy": 0.9285555556416512, + "num_tokens": 10418335.0, + "step": 2230 + }, + { + "entropy": 0.24143684292212128, + "epoch": 0.12995583532158994, + "grad_norm": 0.7575153708457947, + "learning_rate": 4.9864156478353355e-05, + "loss": 0.2485, + "mean_token_accuracy": 0.9256550617516041, + "num_tokens": 10465609.0, + "step": 2240 + }, + { + "entropy": 0.25854320377111434, + "epoch": 0.13053599530070417, + "grad_norm": 0.6647493839263916, + "learning_rate": 4.985883480208828e-05, + "loss": 0.2672, + "mean_token_accuracy": 0.9227815322577954, + "num_tokens": 10509367.0, + "step": 2250 + }, + { + "entropy": 0.2537616178393364, + "epoch": 0.1311161552798184, + "grad_norm": 0.8977476358413696, + "learning_rate": 4.985341117545931e-05, + "loss": 0.2755, + "mean_token_accuracy": 0.9249600730836391, + "num_tokens": 10557544.0, + "step": 2260 + }, + { + "entropy": 0.25211329543963074, + "epoch": 0.13169631525893266, + "grad_norm": 0.6823932528495789, + "learning_rate": 4.984788562070967e-05, + "loss": 0.2587, + "mean_token_accuracy": 0.9245777301490307, + "num_tokens": 10606796.0, + "step": 2270 + }, + { + "entropy": 0.26160632343962786, + "epoch": 0.1322764752380469, + "grad_norm": 0.6796965599060059, + "learning_rate": 4.984225816050061e-05, + "loss": 0.2741, + "mean_token_accuracy": 0.92069718465209, + "num_tokens": 10653365.0, + "step": 2280 + }, + { + "entropy": 0.26065108422189953, + "epoch": 0.13285663521716112, + "grad_norm": 0.5360117554664612, + "learning_rate": 4.9836528817911324e-05, + "loss": 0.2807, + "mean_token_accuracy": 0.9213643401861191, + "num_tokens": 10704392.0, + "step": 2290 + }, + { + "entropy": 0.24109390340745449, + "epoch": 0.13343679519627538, + "grad_norm": 0.7460360527038574, + "learning_rate": 4.983069761643883e-05, + "loss": 0.2545, + "mean_token_accuracy": 0.9274110339581967, + "num_tokens": 10748602.0, + "step": 2300 + }, + { + "entropy": 0.24397279247641562, + "epoch": 0.1340169551753896, + "grad_norm": 0.6013259887695312, + "learning_rate": 4.982476457999788e-05, + "loss": 0.2532, + "mean_token_accuracy": 0.9253578655421734, + "num_tokens": 10794560.0, + "step": 2310 + }, + { + "entropy": 0.25872422121465205, + "epoch": 0.13459711515450384, + "grad_norm": 0.7177749872207642, + "learning_rate": 4.9818729732920896e-05, + "loss": 0.2686, + "mean_token_accuracy": 0.9234141893684864, + "num_tokens": 10842808.0, + "step": 2320 + }, + { + "entropy": 0.24671107297763228, + "epoch": 0.1351772751336181, + "grad_norm": 0.5370097160339355, + "learning_rate": 4.9812593099957815e-05, + "loss": 0.2603, + "mean_token_accuracy": 0.925677801668644, + "num_tokens": 10887965.0, + "step": 2330 + }, + { + "entropy": 0.252953112963587, + "epoch": 0.13575743511273233, + "grad_norm": 0.512586772441864, + "learning_rate": 4.980635470627602e-05, + "loss": 0.2685, + "mean_token_accuracy": 0.9216497145593167, + "num_tokens": 10934382.0, + "step": 2340 + }, + { + "entropy": 0.22603638656437397, + "epoch": 0.13633759509184656, + "grad_norm": 0.5041609406471252, + "learning_rate": 4.9800014577460256e-05, + "loss": 0.2409, + "mean_token_accuracy": 0.9299511142075062, + "num_tokens": 10981426.0, + "step": 2350 + }, + { + "entropy": 0.24845968801528215, + "epoch": 0.13691775507096082, + "grad_norm": 0.608397901058197, + "learning_rate": 4.9793572739512456e-05, + "loss": 0.2543, + "mean_token_accuracy": 0.9257522024214268, + "num_tokens": 11027703.0, + "step": 2360 + }, + { + "entropy": 0.24288834761828185, + "epoch": 0.13749791505007505, + "grad_norm": 0.6330724358558655, + "learning_rate": 4.9787029218851733e-05, + "loss": 0.2612, + "mean_token_accuracy": 0.9245579600334167, + "num_tokens": 11076143.0, + "step": 2370 + }, + { + "entropy": 0.25457229763269423, + "epoch": 0.1380780750291893, + "grad_norm": 0.5116296410560608, + "learning_rate": 4.978038404231418e-05, + "loss": 0.2615, + "mean_token_accuracy": 0.9241728395223617, + "num_tokens": 11131202.0, + "step": 2380 + }, + { + "entropy": 0.2371656922623515, + "epoch": 0.13865823500830354, + "grad_norm": 0.5970665216445923, + "learning_rate": 4.9773637237152815e-05, + "loss": 0.2504, + "mean_token_accuracy": 0.9288275025784969, + "num_tokens": 11169991.0, + "step": 2390 + }, + { + "entropy": 0.22894019475206734, + "epoch": 0.13923839498741777, + "grad_norm": 0.6836785674095154, + "learning_rate": 4.976678883103744e-05, + "loss": 0.2414, + "mean_token_accuracy": 0.9305731669068337, + "num_tokens": 11214674.0, + "step": 2400 + }, + { + "entropy": 0.2416924899443984, + "epoch": 0.13981855496653203, + "grad_norm": 0.5035353302955627, + "learning_rate": 4.975983885205457e-05, + "loss": 0.2525, + "mean_token_accuracy": 0.9274380512535572, + "num_tokens": 11263196.0, + "step": 2410 + }, + { + "entropy": 0.2437665986828506, + "epoch": 0.14039871494564626, + "grad_norm": 0.7226947546005249, + "learning_rate": 4.9752787328707236e-05, + "loss": 0.2515, + "mean_token_accuracy": 0.9273409470915794, + "num_tokens": 11316612.0, + "step": 2420 + }, + { + "entropy": 0.24210527054965497, + "epoch": 0.1409788749247605, + "grad_norm": 0.5358086228370667, + "learning_rate": 4.974563428991497e-05, + "loss": 0.2544, + "mean_token_accuracy": 0.9273376606404782, + "num_tokens": 11364356.0, + "step": 2430 + }, + { + "entropy": 0.2610197628848255, + "epoch": 0.14155903490387475, + "grad_norm": 0.5456538200378418, + "learning_rate": 4.973837976501361e-05, + "loss": 0.2705, + "mean_token_accuracy": 0.9214465416967869, + "num_tokens": 11410738.0, + "step": 2440 + }, + { + "entropy": 0.22908686995506286, + "epoch": 0.14213919488298898, + "grad_norm": 0.6462218761444092, + "learning_rate": 4.9731023783755214e-05, + "loss": 0.2394, + "mean_token_accuracy": 0.9314847208559514, + "num_tokens": 11455149.0, + "step": 2450 + }, + { + "entropy": 0.2433037543669343, + "epoch": 0.14271935486210321, + "grad_norm": 0.4991690516471863, + "learning_rate": 4.9723566376307926e-05, + "loss": 0.2649, + "mean_token_accuracy": 0.927307202666998, + "num_tokens": 11496878.0, + "step": 2460 + }, + { + "entropy": 0.22638822020962834, + "epoch": 0.14329951484121747, + "grad_norm": 0.5482367277145386, + "learning_rate": 4.971600757325587e-05, + "loss": 0.2369, + "mean_token_accuracy": 0.9311371214687825, + "num_tokens": 11540780.0, + "step": 2470 + }, + { + "entropy": 0.23333994410932063, + "epoch": 0.1438796748203317, + "grad_norm": 0.7447476387023926, + "learning_rate": 4.9708347405598984e-05, + "loss": 0.2494, + "mean_token_accuracy": 0.9279144316911697, + "num_tokens": 11582693.0, + "step": 2480 + }, + { + "entropy": 0.24550476456061004, + "epoch": 0.14445983479944594, + "grad_norm": 0.7074565291404724, + "learning_rate": 4.970058590475294e-05, + "loss": 0.2504, + "mean_token_accuracy": 0.9264522574841976, + "num_tokens": 11630387.0, + "step": 2490 + }, + { + "entropy": 0.24947644881904124, + "epoch": 0.1450399947785602, + "grad_norm": 0.6218745708465576, + "learning_rate": 4.969272310254899e-05, + "loss": 0.2572, + "mean_token_accuracy": 0.9247526183724404, + "num_tokens": 11680757.0, + "step": 2500 + }, + { + "entropy": 0.25409162109717726, + "epoch": 0.14562015475767442, + "grad_norm": 0.5973914265632629, + "learning_rate": 4.968475903123384e-05, + "loss": 0.2558, + "mean_token_accuracy": 0.9242112256586552, + "num_tokens": 11730103.0, + "step": 2510 + }, + { + "entropy": 0.22596047539263964, + "epoch": 0.14620031473678866, + "grad_norm": 0.7123662829399109, + "learning_rate": 4.967669372346953e-05, + "loss": 0.2408, + "mean_token_accuracy": 0.9305530324578285, + "num_tokens": 11776084.0, + "step": 2520 + }, + { + "entropy": 0.228758408036083, + "epoch": 0.14678047471590291, + "grad_norm": 0.6951113343238831, + "learning_rate": 4.966852721233326e-05, + "loss": 0.2416, + "mean_token_accuracy": 0.9290082268416882, + "num_tokens": 11818295.0, + "step": 2530 + }, + { + "entropy": 0.25033763302490114, + "epoch": 0.14736063469501715, + "grad_norm": 0.4864213168621063, + "learning_rate": 4.96602595313173e-05, + "loss": 0.261, + "mean_token_accuracy": 0.9273130305111408, + "num_tokens": 11867462.0, + "step": 2540 + }, + { + "entropy": 0.24239856181666256, + "epoch": 0.1479407946741314, + "grad_norm": 0.5470122694969177, + "learning_rate": 4.965189071432885e-05, + "loss": 0.2572, + "mean_token_accuracy": 0.9257767535746098, + "num_tokens": 11918987.0, + "step": 2550 + }, + { + "entropy": 0.2301312554627657, + "epoch": 0.14852095465324563, + "grad_norm": 0.688961386680603, + "learning_rate": 4.9643420795689857e-05, + "loss": 0.2394, + "mean_token_accuracy": 0.9313627950847149, + "num_tokens": 11961815.0, + "step": 2560 + }, + { + "entropy": 0.21071888115257026, + "epoch": 0.14910111463235987, + "grad_norm": 0.5099593997001648, + "learning_rate": 4.9634849810136906e-05, + "loss": 0.2232, + "mean_token_accuracy": 0.9355365477502346, + "num_tokens": 12001811.0, + "step": 2570 + }, + { + "entropy": 0.24706896413117646, + "epoch": 0.14968127461147412, + "grad_norm": 0.6023861765861511, + "learning_rate": 4.962617779282112e-05, + "loss": 0.259, + "mean_token_accuracy": 0.9239365212619305, + "num_tokens": 12057737.0, + "step": 2580 + }, + { + "entropy": 0.2306947776116431, + "epoch": 0.15026143459058836, + "grad_norm": 0.5581653714179993, + "learning_rate": 4.9617404779307894e-05, + "loss": 0.2384, + "mean_token_accuracy": 0.9297014966607093, + "num_tokens": 12103034.0, + "step": 2590 + }, + { + "entropy": 0.2356986219994724, + "epoch": 0.1508415945697026, + "grad_norm": 0.5773573517799377, + "learning_rate": 4.9608530805576905e-05, + "loss": 0.239, + "mean_token_accuracy": 0.9304272450506688, + "num_tokens": 12147029.0, + "step": 2600 + }, + { + "entropy": 0.22645492665469646, + "epoch": 0.15142175454881684, + "grad_norm": 0.6168541312217712, + "learning_rate": 4.959955590802182e-05, + "loss": 0.2419, + "mean_token_accuracy": 0.932081300765276, + "num_tokens": 12190923.0, + "step": 2610 + }, + { + "entropy": 0.2271472282707691, + "epoch": 0.15200191452793108, + "grad_norm": 0.55539870262146, + "learning_rate": 4.959048012345026e-05, + "loss": 0.2377, + "mean_token_accuracy": 0.9291647724807263, + "num_tokens": 12231729.0, + "step": 2620 + }, + { + "entropy": 0.2423120320774615, + "epoch": 0.1525820745070453, + "grad_norm": 0.5955498814582825, + "learning_rate": 4.958130348908357e-05, + "loss": 0.2444, + "mean_token_accuracy": 0.9264371335506439, + "num_tokens": 12276152.0, + "step": 2630 + }, + { + "entropy": 0.21412776447832585, + "epoch": 0.15316223448615957, + "grad_norm": 0.5757045745849609, + "learning_rate": 4.9572026042556694e-05, + "loss": 0.222, + "mean_token_accuracy": 0.9346926376223564, + "num_tokens": 12319840.0, + "step": 2640 + }, + { + "entropy": 0.24216453162953258, + "epoch": 0.1537423944652738, + "grad_norm": 0.5278294682502747, + "learning_rate": 4.9562647821918075e-05, + "loss": 0.2534, + "mean_token_accuracy": 0.9284684792160988, + "num_tokens": 12364275.0, + "step": 2650 + }, + { + "entropy": 0.22583805080503225, + "epoch": 0.15432255444438803, + "grad_norm": 0.6550887227058411, + "learning_rate": 4.955316886562938e-05, + "loss": 0.2379, + "mean_token_accuracy": 0.930680613219738, + "num_tokens": 12410962.0, + "step": 2660 + }, + { + "entropy": 0.22963961940258742, + "epoch": 0.15490271442350229, + "grad_norm": 0.5252585411071777, + "learning_rate": 4.9543589212565453e-05, + "loss": 0.2512, + "mean_token_accuracy": 0.928978119045496, + "num_tokens": 12461335.0, + "step": 2670 + }, + { + "entropy": 0.2353910928592086, + "epoch": 0.15548287440261652, + "grad_norm": 0.4994027316570282, + "learning_rate": 4.9533908902014105e-05, + "loss": 0.2406, + "mean_token_accuracy": 0.9297120355069637, + "num_tokens": 12503870.0, + "step": 2680 + }, + { + "entropy": 0.24874753830954432, + "epoch": 0.15606303438173078, + "grad_norm": 0.7533536553382874, + "learning_rate": 4.9524127973675956e-05, + "loss": 0.2624, + "mean_token_accuracy": 0.925228051841259, + "num_tokens": 12551246.0, + "step": 2690 + }, + { + "entropy": 0.23545104889199137, + "epoch": 0.156643194360845, + "grad_norm": 0.5818786025047302, + "learning_rate": 4.951424646766427e-05, + "loss": 0.255, + "mean_token_accuracy": 0.9263931758701801, + "num_tokens": 12601374.0, + "step": 2700 + }, + { + "entropy": 0.22164076548069717, + "epoch": 0.15722335433995924, + "grad_norm": 0.5828121900558472, + "learning_rate": 4.950426442450481e-05, + "loss": 0.2354, + "mean_token_accuracy": 0.9324681513011456, + "num_tokens": 12645239.0, + "step": 2710 + }, + { + "entropy": 0.24353402024134993, + "epoch": 0.1578035143190735, + "grad_norm": 0.5427147746086121, + "learning_rate": 4.949418188513566e-05, + "loss": 0.2463, + "mean_token_accuracy": 0.9281445041298866, + "num_tokens": 12691504.0, + "step": 2720 + }, + { + "entropy": 0.22709716809913516, + "epoch": 0.15838367429818773, + "grad_norm": 0.6611645817756653, + "learning_rate": 4.948399889090705e-05, + "loss": 0.2496, + "mean_token_accuracy": 0.9289138957858085, + "num_tokens": 12736301.0, + "step": 2730 + }, + { + "entropy": 0.24094573482871057, + "epoch": 0.15896383427730196, + "grad_norm": 0.7900369763374329, + "learning_rate": 4.947371548358119e-05, + "loss": 0.2531, + "mean_token_accuracy": 0.9274966597557068, + "num_tokens": 12786009.0, + "step": 2740 + }, + { + "entropy": 0.21535170646384358, + "epoch": 0.15954399425641622, + "grad_norm": 0.5301403999328613, + "learning_rate": 4.94633317053321e-05, + "loss": 0.2332, + "mean_token_accuracy": 0.9358279384672642, + "num_tokens": 12824027.0, + "step": 2750 + }, + { + "entropy": 0.22699127178639172, + "epoch": 0.16012415423553045, + "grad_norm": 0.6651958227157593, + "learning_rate": 4.945284759874544e-05, + "loss": 0.2419, + "mean_token_accuracy": 0.932181203365326, + "num_tokens": 12866417.0, + "step": 2760 + }, + { + "entropy": 0.22900630263611674, + "epoch": 0.16070431421464468, + "grad_norm": 0.5129234194755554, + "learning_rate": 4.944226320681835e-05, + "loss": 0.2474, + "mean_token_accuracy": 0.9273658707737923, + "num_tokens": 12910023.0, + "step": 2770 + }, + { + "entropy": 0.24179538674652576, + "epoch": 0.16128447419375894, + "grad_norm": 0.5888381600379944, + "learning_rate": 4.9431578572959245e-05, + "loss": 0.2523, + "mean_token_accuracy": 0.9272563695907593, + "num_tokens": 12959763.0, + "step": 2780 + }, + { + "entropy": 0.22313232766464353, + "epoch": 0.16186463417287317, + "grad_norm": 0.6060441732406616, + "learning_rate": 4.942079374098765e-05, + "loss": 0.2396, + "mean_token_accuracy": 0.9306677743792534, + "num_tokens": 13007603.0, + "step": 2790 + }, + { + "entropy": 0.23654372477903962, + "epoch": 0.1624447941519874, + "grad_norm": 0.5284214615821838, + "learning_rate": 4.9409908755134004e-05, + "loss": 0.245, + "mean_token_accuracy": 0.9300524212419987, + "num_tokens": 13056978.0, + "step": 2800 + }, + { + "entropy": 0.22591229677200317, + "epoch": 0.16302495413110166, + "grad_norm": 0.46177852153778076, + "learning_rate": 4.939892366003952e-05, + "loss": 0.2435, + "mean_token_accuracy": 0.9306000806391239, + "num_tokens": 13098134.0, + "step": 2810 + }, + { + "entropy": 0.22791590187698602, + "epoch": 0.1636051141102159, + "grad_norm": 0.584980845451355, + "learning_rate": 4.938783850075596e-05, + "loss": 0.242, + "mean_token_accuracy": 0.9299115337431431, + "num_tokens": 13143712.0, + "step": 2820 + }, + { + "entropy": 0.20182297620922326, + "epoch": 0.16418527408933015, + "grad_norm": 0.46102726459503174, + "learning_rate": 4.937665332274548e-05, + "loss": 0.2117, + "mean_token_accuracy": 0.9381147347390651, + "num_tokens": 13188573.0, + "step": 2830 + }, + { + "entropy": 0.20154008883982896, + "epoch": 0.16476543406844438, + "grad_norm": 0.48307520151138306, + "learning_rate": 4.936536817188041e-05, + "loss": 0.226, + "mean_token_accuracy": 0.9363874800503253, + "num_tokens": 13230122.0, + "step": 2840 + }, + { + "entropy": 0.23018551748245955, + "epoch": 0.1653455940475586, + "grad_norm": 0.5604698657989502, + "learning_rate": 4.935398309444311e-05, + "loss": 0.2422, + "mean_token_accuracy": 0.931646253168583, + "num_tokens": 13276718.0, + "step": 2850 + }, + { + "entropy": 0.24946487545967103, + "epoch": 0.16592575402667287, + "grad_norm": 0.5893325805664062, + "learning_rate": 4.934249813712574e-05, + "loss": 0.256, + "mean_token_accuracy": 0.9232370875775814, + "num_tokens": 13327352.0, + "step": 2860 + }, + { + "entropy": 0.24300083378329873, + "epoch": 0.1665059140057871, + "grad_norm": 0.5226495265960693, + "learning_rate": 4.933091334703009e-05, + "loss": 0.2444, + "mean_token_accuracy": 0.9288116693496704, + "num_tokens": 13381424.0, + "step": 2870 + }, + { + "entropy": 0.21801077760756016, + "epoch": 0.16708607398490133, + "grad_norm": 0.487408846616745, + "learning_rate": 4.931922877166737e-05, + "loss": 0.2327, + "mean_token_accuracy": 0.9331744261085987, + "num_tokens": 13425780.0, + "step": 2880 + }, + { + "entropy": 0.22886358993127942, + "epoch": 0.1676662339640156, + "grad_norm": 0.45596927404403687, + "learning_rate": 4.930744445895805e-05, + "loss": 0.2429, + "mean_token_accuracy": 0.9295140974223614, + "num_tokens": 13472554.0, + "step": 2890 + }, + { + "entropy": 0.2427712651900947, + "epoch": 0.16824639394312982, + "grad_norm": 0.5616423487663269, + "learning_rate": 4.929556045723162e-05, + "loss": 0.2609, + "mean_token_accuracy": 0.924220583587885, + "num_tokens": 13524012.0, + "step": 2900 + }, + { + "entropy": 0.24258834794163703, + "epoch": 0.16882655392224405, + "grad_norm": 0.4376303553581238, + "learning_rate": 4.9283576815226404e-05, + "loss": 0.2542, + "mean_token_accuracy": 0.9275100819766522, + "num_tokens": 13578810.0, + "step": 2910 + }, + { + "entropy": 0.23221430759876965, + "epoch": 0.1694067139013583, + "grad_norm": 0.6046859622001648, + "learning_rate": 4.9271493582089415e-05, + "loss": 0.2418, + "mean_token_accuracy": 0.9287591382861138, + "num_tokens": 13623344.0, + "step": 2920 + }, + { + "entropy": 0.22016856037080287, + "epoch": 0.16998687388047254, + "grad_norm": 0.5059357285499573, + "learning_rate": 4.925931080737604e-05, + "loss": 0.2274, + "mean_token_accuracy": 0.9328511685132981, + "num_tokens": 13671204.0, + "step": 2930 + }, + { + "entropy": 0.230393850710243, + "epoch": 0.17056703385958677, + "grad_norm": 0.5744109153747559, + "learning_rate": 4.924702854104996e-05, + "loss": 0.2498, + "mean_token_accuracy": 0.9299418836832046, + "num_tokens": 13722722.0, + "step": 2940 + }, + { + "entropy": 0.22403340972959995, + "epoch": 0.17114719383870103, + "grad_norm": 0.5041079521179199, + "learning_rate": 4.9234646833482857e-05, + "loss": 0.2367, + "mean_token_accuracy": 0.9316170141100883, + "num_tokens": 13773327.0, + "step": 2950 + }, + { + "entropy": 0.22908725552260875, + "epoch": 0.17172735381781526, + "grad_norm": 0.4489116668701172, + "learning_rate": 4.922216573545425e-05, + "loss": 0.2401, + "mean_token_accuracy": 0.929831539094448, + "num_tokens": 13824475.0, + "step": 2960 + }, + { + "entropy": 0.22610533041879535, + "epoch": 0.1723075137969295, + "grad_norm": 0.6265159845352173, + "learning_rate": 4.9209585298151264e-05, + "loss": 0.2316, + "mean_token_accuracy": 0.9314801268279552, + "num_tokens": 13874551.0, + "step": 2970 + }, + { + "entropy": 0.20110784210264682, + "epoch": 0.17288767377604375, + "grad_norm": 0.6167561411857605, + "learning_rate": 4.919690557316844e-05, + "loss": 0.2191, + "mean_token_accuracy": 0.938017500936985, + "num_tokens": 13920533.0, + "step": 2980 + }, + { + "entropy": 0.22401857385411858, + "epoch": 0.17346783375515798, + "grad_norm": 0.5413254499435425, + "learning_rate": 4.918412661250752e-05, + "loss": 0.232, + "mean_token_accuracy": 0.9295246057212353, + "num_tokens": 13971095.0, + "step": 2990 + }, + { + "entropy": 0.2363914219662547, + "epoch": 0.17404799373427224, + "grad_norm": 0.4723835587501526, + "learning_rate": 4.917124846857722e-05, + "loss": 0.2392, + "mean_token_accuracy": 0.9293315403163434, + "num_tokens": 14018683.0, + "step": 3000 + }, + { + "entropy": 0.2324467858299613, + "epoch": 0.17462815371338647, + "grad_norm": 0.5366467833518982, + "learning_rate": 4.915827119419304e-05, + "loss": 0.2548, + "mean_token_accuracy": 0.9281632632017136, + "num_tokens": 14067926.0, + "step": 3010 + }, + { + "entropy": 0.21662747263908386, + "epoch": 0.1752083136925007, + "grad_norm": 0.5410497188568115, + "learning_rate": 4.9145194842577e-05, + "loss": 0.2291, + "mean_token_accuracy": 0.9332032054662704, + "num_tokens": 14111845.0, + "step": 3020 + }, + { + "entropy": 0.2280825436115265, + "epoch": 0.17578847367161496, + "grad_norm": 0.5273613929748535, + "learning_rate": 4.913201946735748e-05, + "loss": 0.2375, + "mean_token_accuracy": 0.9290619127452373, + "num_tokens": 14160618.0, + "step": 3030 + }, + { + "entropy": 0.22731232466176152, + "epoch": 0.1763686336507292, + "grad_norm": 0.5042228698730469, + "learning_rate": 4.911874512256895e-05, + "loss": 0.2408, + "mean_token_accuracy": 0.9305116273462772, + "num_tokens": 14206626.0, + "step": 3040 + }, + { + "entropy": 0.22788220876827836, + "epoch": 0.17694879362984342, + "grad_norm": 0.4939880073070526, + "learning_rate": 4.910537186265181e-05, + "loss": 0.2408, + "mean_token_accuracy": 0.9305845081806183, + "num_tokens": 14253994.0, + "step": 3050 + }, + { + "entropy": 0.23591083874925972, + "epoch": 0.17752895360895768, + "grad_norm": 0.6042311787605286, + "learning_rate": 4.909189974245208e-05, + "loss": 0.2403, + "mean_token_accuracy": 0.9243310630321503, + "num_tokens": 14302084.0, + "step": 3060 + }, + { + "entropy": 0.21141142481938005, + "epoch": 0.1781091135880719, + "grad_norm": 0.6198765635490417, + "learning_rate": 4.9078328817221254e-05, + "loss": 0.2274, + "mean_token_accuracy": 0.9360906191170215, + "num_tokens": 14348010.0, + "step": 3070 + }, + { + "entropy": 0.21201363792642952, + "epoch": 0.17868927356718614, + "grad_norm": 0.6266659498214722, + "learning_rate": 4.9064659142616034e-05, + "loss": 0.2226, + "mean_token_accuracy": 0.9376968041062355, + "num_tokens": 14392703.0, + "step": 3080 + }, + { + "entropy": 0.22430537100881337, + "epoch": 0.1792694335463004, + "grad_norm": 0.4451443552970886, + "learning_rate": 4.905089077469811e-05, + "loss": 0.2375, + "mean_token_accuracy": 0.9309919483959674, + "num_tokens": 14441392.0, + "step": 3090 + }, + { + "entropy": 0.21128207352012396, + "epoch": 0.17984959352541463, + "grad_norm": 0.7574012875556946, + "learning_rate": 4.903702376993393e-05, + "loss": 0.2175, + "mean_token_accuracy": 0.9375483751296997, + "num_tokens": 14482752.0, + "step": 3100 + }, + { + "entropy": 0.23162122778594493, + "epoch": 0.18042975350452886, + "grad_norm": 0.4594537019729614, + "learning_rate": 4.902305818519447e-05, + "loss": 0.2448, + "mean_token_accuracy": 0.929555106163025, + "num_tokens": 14529885.0, + "step": 3110 + }, + { + "entropy": 0.2356396524235606, + "epoch": 0.18100991348364312, + "grad_norm": 0.4247165620326996, + "learning_rate": 4.9008994077755e-05, + "loss": 0.2472, + "mean_token_accuracy": 0.9278498165309429, + "num_tokens": 14582410.0, + "step": 3120 + }, + { + "entropy": 0.2056024202145636, + "epoch": 0.18159007346275735, + "grad_norm": 0.47717151045799255, + "learning_rate": 4.899483150529485e-05, + "loss": 0.2151, + "mean_token_accuracy": 0.9377128437161446, + "num_tokens": 14629896.0, + "step": 3130 + }, + { + "entropy": 0.21019009239971637, + "epoch": 0.1821702334418716, + "grad_norm": 0.612072765827179, + "learning_rate": 4.8980570525897164e-05, + "loss": 0.2236, + "mean_token_accuracy": 0.9353572398424148, + "num_tokens": 14673535.0, + "step": 3140 + }, + { + "entropy": 0.213212897721678, + "epoch": 0.18275039342098584, + "grad_norm": 0.569848895072937, + "learning_rate": 4.896621119804867e-05, + "loss": 0.2302, + "mean_token_accuracy": 0.9323088079690933, + "num_tokens": 14721276.0, + "step": 3150 + }, + { + "entropy": 0.21546710170805455, + "epoch": 0.18333055340010007, + "grad_norm": 0.6138198375701904, + "learning_rate": 4.8951753580639465e-05, + "loss": 0.2349, + "mean_token_accuracy": 0.9347792230546474, + "num_tokens": 14766184.0, + "step": 3160 + }, + { + "entropy": 0.23566469438374044, + "epoch": 0.18391071337921433, + "grad_norm": 0.556420624256134, + "learning_rate": 4.893719773296271e-05, + "loss": 0.2407, + "mean_token_accuracy": 0.9287654548883438, + "num_tokens": 14817608.0, + "step": 3170 + }, + { + "entropy": 0.20662189042195678, + "epoch": 0.18449087335832856, + "grad_norm": 0.7384880781173706, + "learning_rate": 4.892254371471445e-05, + "loss": 0.2244, + "mean_token_accuracy": 0.9348165556788445, + "num_tokens": 14866609.0, + "step": 3180 + }, + { + "entropy": 0.20848252726718783, + "epoch": 0.1850710333374428, + "grad_norm": 0.5100448131561279, + "learning_rate": 4.890779158599333e-05, + "loss": 0.2288, + "mean_token_accuracy": 0.9349535062909127, + "num_tokens": 14912076.0, + "step": 3190 + }, + { + "entropy": 0.21848638206720353, + "epoch": 0.18565119331655705, + "grad_norm": 0.6783629655838013, + "learning_rate": 4.889294140730039e-05, + "loss": 0.2249, + "mean_token_accuracy": 0.9346417836844921, + "num_tokens": 14959176.0, + "step": 3200 + }, + { + "entropy": 0.21758969323709607, + "epoch": 0.18623135329567128, + "grad_norm": 0.5793417096138, + "learning_rate": 4.887799323953875e-05, + "loss": 0.2325, + "mean_token_accuracy": 0.932265616953373, + "num_tokens": 15007000.0, + "step": 3210 + }, + { + "entropy": 0.22435894645750523, + "epoch": 0.1868115132747855, + "grad_norm": 0.5868973135948181, + "learning_rate": 4.886294714401345e-05, + "loss": 0.234, + "mean_token_accuracy": 0.9285348080098629, + "num_tokens": 15052880.0, + "step": 3220 + }, + { + "entropy": 0.22446088809520007, + "epoch": 0.18739167325389977, + "grad_norm": 0.5677152872085571, + "learning_rate": 4.884780318243108e-05, + "loss": 0.2357, + "mean_token_accuracy": 0.9334528259932995, + "num_tokens": 15096016.0, + "step": 3230 + }, + { + "entropy": 0.2249863190576434, + "epoch": 0.187971833233014, + "grad_norm": 0.5062674880027771, + "learning_rate": 4.8832561416899666e-05, + "loss": 0.2351, + "mean_token_accuracy": 0.9315970242023468, + "num_tokens": 15144862.0, + "step": 3240 + }, + { + "entropy": 0.21153551228344442, + "epoch": 0.18855199321212823, + "grad_norm": 0.5741562843322754, + "learning_rate": 4.881722190992831e-05, + "loss": 0.2303, + "mean_token_accuracy": 0.9362453766167164, + "num_tokens": 15188421.0, + "step": 3250 + }, + { + "entropy": 0.21869329242035745, + "epoch": 0.1891321531912425, + "grad_norm": 0.46246302127838135, + "learning_rate": 4.8801784724426955e-05, + "loss": 0.2305, + "mean_token_accuracy": 0.933679711818695, + "num_tokens": 15240655.0, + "step": 3260 + }, + { + "entropy": 0.22525724107399583, + "epoch": 0.18971231317035672, + "grad_norm": 0.8487122654914856, + "learning_rate": 4.878624992370617e-05, + "loss": 0.2444, + "mean_token_accuracy": 0.9320090629160405, + "num_tokens": 15286555.0, + "step": 3270 + }, + { + "entropy": 0.21956042116507887, + "epoch": 0.19029247314947095, + "grad_norm": 0.5654147267341614, + "learning_rate": 4.877061757147686e-05, + "loss": 0.2265, + "mean_token_accuracy": 0.9331092551350594, + "num_tokens": 15330233.0, + "step": 3280 + }, + { + "entropy": 0.21436818102374672, + "epoch": 0.1908726331285852, + "grad_norm": 0.5871095657348633, + "learning_rate": 4.875488773184997e-05, + "loss": 0.2236, + "mean_token_accuracy": 0.9342731289565563, + "num_tokens": 15375882.0, + "step": 3290 + }, + { + "entropy": 0.22209350736811756, + "epoch": 0.19145279310769944, + "grad_norm": 0.7583034634590149, + "learning_rate": 4.873906046933631e-05, + "loss": 0.2351, + "mean_token_accuracy": 0.9331734322011471, + "num_tokens": 15426410.0, + "step": 3300 + }, + { + "entropy": 0.2043465775437653, + "epoch": 0.1920329530868137, + "grad_norm": 0.5400552749633789, + "learning_rate": 4.872313584884619e-05, + "loss": 0.2143, + "mean_token_accuracy": 0.9386071145534516, + "num_tokens": 15474465.0, + "step": 3310 + }, + { + "entropy": 0.20643043788149953, + "epoch": 0.19261311306592793, + "grad_norm": 0.47623029351234436, + "learning_rate": 4.870711393568922e-05, + "loss": 0.2206, + "mean_token_accuracy": 0.9350184559822082, + "num_tokens": 15525615.0, + "step": 3320 + }, + { + "entropy": 0.22230770923197268, + "epoch": 0.19319327304504216, + "grad_norm": 0.4153057038784027, + "learning_rate": 4.869099479557404e-05, + "loss": 0.2343, + "mean_token_accuracy": 0.932657214999199, + "num_tokens": 15572220.0, + "step": 3330 + }, + { + "entropy": 0.22769315969198942, + "epoch": 0.19377343302415642, + "grad_norm": 0.5515888333320618, + "learning_rate": 4.867477849460801e-05, + "loss": 0.2398, + "mean_token_accuracy": 0.9315300233662128, + "num_tokens": 15620869.0, + "step": 3340 + }, + { + "entropy": 0.22345890710130334, + "epoch": 0.19435359300327065, + "grad_norm": 0.535883367061615, + "learning_rate": 4.865846509929699e-05, + "loss": 0.2338, + "mean_token_accuracy": 0.9313367612659931, + "num_tokens": 15665711.0, + "step": 3350 + }, + { + "entropy": 0.20086854100227355, + "epoch": 0.19493375298238488, + "grad_norm": 0.5591940879821777, + "learning_rate": 4.8642054676544995e-05, + "loss": 0.2202, + "mean_token_accuracy": 0.9367882005870343, + "num_tokens": 15706488.0, + "step": 3360 + }, + { + "entropy": 0.21088826097548008, + "epoch": 0.19551391296149914, + "grad_norm": 0.41410505771636963, + "learning_rate": 4.8625547293654016e-05, + "loss": 0.2148, + "mean_token_accuracy": 0.9360318586230278, + "num_tokens": 15756518.0, + "step": 3370 + }, + { + "entropy": 0.22003289479762317, + "epoch": 0.19609407294061337, + "grad_norm": 0.41268664598464966, + "learning_rate": 4.860894301832368e-05, + "loss": 0.239, + "mean_token_accuracy": 0.9294947683811188, + "num_tokens": 15801989.0, + "step": 3380 + }, + { + "entropy": 0.21069823503494262, + "epoch": 0.1966742329197276, + "grad_norm": 0.4836124777793884, + "learning_rate": 4.859224191865096e-05, + "loss": 0.2246, + "mean_token_accuracy": 0.9346454039216041, + "num_tokens": 15848345.0, + "step": 3390 + }, + { + "entropy": 0.20483749387785793, + "epoch": 0.19725439289884186, + "grad_norm": 0.5193631649017334, + "learning_rate": 4.857544406312996e-05, + "loss": 0.2154, + "mean_token_accuracy": 0.9347153976559639, + "num_tokens": 15891112.0, + "step": 3400 + }, + { + "entropy": 0.21235422920435668, + "epoch": 0.1978345528779561, + "grad_norm": 0.5894874334335327, + "learning_rate": 4.8558549520651576e-05, + "loss": 0.224, + "mean_token_accuracy": 0.9346397936344146, + "num_tokens": 15938995.0, + "step": 3410 + }, + { + "entropy": 0.22362595032900573, + "epoch": 0.19841471285707032, + "grad_norm": 0.4738730192184448, + "learning_rate": 4.854155836050323e-05, + "loss": 0.234, + "mean_token_accuracy": 0.9306644752621651, + "num_tokens": 15990243.0, + "step": 3420 + }, + { + "entropy": 0.20369641818106174, + "epoch": 0.19899487283618458, + "grad_norm": 0.6050360798835754, + "learning_rate": 4.8524470652368605e-05, + "loss": 0.2094, + "mean_token_accuracy": 0.9386633589863778, + "num_tokens": 16036747.0, + "step": 3430 + }, + { + "entropy": 0.21607417557388545, + "epoch": 0.19957503281529881, + "grad_norm": 0.46845728158950806, + "learning_rate": 4.8507286466327325e-05, + "loss": 0.2297, + "mean_token_accuracy": 0.932365907728672, + "num_tokens": 16087081.0, + "step": 3440 + }, + { + "entropy": 0.2165160103701055, + "epoch": 0.20015519279441307, + "grad_norm": 0.4557925760746002, + "learning_rate": 4.8490005872854705e-05, + "loss": 0.2237, + "mean_token_accuracy": 0.935276222974062, + "num_tokens": 16136627.0, + "step": 3450 + }, + { + "entropy": 0.19594889180734754, + "epoch": 0.2007353527735273, + "grad_norm": 0.5375991463661194, + "learning_rate": 4.8472628942821434e-05, + "loss": 0.2062, + "mean_token_accuracy": 0.939439382404089, + "num_tokens": 16181836.0, + "step": 3460 + }, + { + "entropy": 0.21792814126238227, + "epoch": 0.20131551275264153, + "grad_norm": 0.5240413546562195, + "learning_rate": 4.84551557474933e-05, + "loss": 0.2351, + "mean_token_accuracy": 0.9313452154397964, + "num_tokens": 16233495.0, + "step": 3470 + }, + { + "entropy": 0.22336672740057112, + "epoch": 0.2018956727317558, + "grad_norm": 0.5529021620750427, + "learning_rate": 4.8437586358530886e-05, + "loss": 0.2345, + "mean_token_accuracy": 0.931741351634264, + "num_tokens": 16283960.0, + "step": 3480 + }, + { + "entropy": 0.21436627982184292, + "epoch": 0.20247583271087002, + "grad_norm": 0.5123047232627869, + "learning_rate": 4.8419920847989276e-05, + "loss": 0.2254, + "mean_token_accuracy": 0.9330189742147923, + "num_tokens": 16331962.0, + "step": 3490 + }, + { + "entropy": 0.21620165817439557, + "epoch": 0.20305599268998425, + "grad_norm": 0.5535455942153931, + "learning_rate": 4.8402159288317774e-05, + "loss": 0.2276, + "mean_token_accuracy": 0.9338933601975441, + "num_tokens": 16383783.0, + "step": 3500 + }, + { + "entropy": 0.22764494689181447, + "epoch": 0.2036361526690985, + "grad_norm": 0.5306493639945984, + "learning_rate": 4.838430175235959e-05, + "loss": 0.2439, + "mean_token_accuracy": 0.9292599484324455, + "num_tokens": 16427907.0, + "step": 3510 + }, + { + "entropy": 0.22134519619867205, + "epoch": 0.20421631264821274, + "grad_norm": 0.5804843902587891, + "learning_rate": 4.836634831335158e-05, + "loss": 0.2334, + "mean_token_accuracy": 0.9312367618083954, + "num_tokens": 16476575.0, + "step": 3520 + }, + { + "entropy": 0.20778467981144785, + "epoch": 0.20479647262732698, + "grad_norm": 0.4420047104358673, + "learning_rate": 4.834829904492387e-05, + "loss": 0.2177, + "mean_token_accuracy": 0.9361670479178429, + "num_tokens": 16525318.0, + "step": 3530 + }, + { + "entropy": 0.19020560318604113, + "epoch": 0.20537663260644123, + "grad_norm": 0.503233015537262, + "learning_rate": 4.833015402109962e-05, + "loss": 0.2087, + "mean_token_accuracy": 0.9398520894348621, + "num_tokens": 16569666.0, + "step": 3540 + }, + { + "entropy": 0.20770877627655865, + "epoch": 0.20595679258555546, + "grad_norm": 0.5581107139587402, + "learning_rate": 4.8311913316294696e-05, + "loss": 0.2226, + "mean_token_accuracy": 0.935036338865757, + "num_tokens": 16616961.0, + "step": 3550 + }, + { + "entropy": 0.2245352765545249, + "epoch": 0.2065369525646697, + "grad_norm": 0.5490514636039734, + "learning_rate": 4.829357700531738e-05, + "loss": 0.2292, + "mean_token_accuracy": 0.9319665767252445, + "num_tokens": 16665889.0, + "step": 3560 + }, + { + "entropy": 0.19434359865263104, + "epoch": 0.20711711254378395, + "grad_norm": 0.5543225407600403, + "learning_rate": 4.827514516336804e-05, + "loss": 0.207, + "mean_token_accuracy": 0.9370627529919148, + "num_tokens": 16711190.0, + "step": 3570 + }, + { + "entropy": 0.20287939431145788, + "epoch": 0.20769727252289819, + "grad_norm": 0.5180935263633728, + "learning_rate": 4.8256617866038836e-05, + "loss": 0.2185, + "mean_token_accuracy": 0.9351489208638668, + "num_tokens": 16758635.0, + "step": 3580 + }, + { + "entropy": 0.21579952826723456, + "epoch": 0.20827743250201242, + "grad_norm": 0.49279242753982544, + "learning_rate": 4.823799518931339e-05, + "loss": 0.2227, + "mean_token_accuracy": 0.9348270982503891, + "num_tokens": 16811863.0, + "step": 3590 + }, + { + "entropy": 0.2157499306835234, + "epoch": 0.20885759248112667, + "grad_norm": 0.544121503829956, + "learning_rate": 4.821927720956653e-05, + "loss": 0.2329, + "mean_token_accuracy": 0.9324086248874665, + "num_tokens": 16862619.0, + "step": 3600 + }, + { + "entropy": 0.20684012779965996, + "epoch": 0.2094377524602409, + "grad_norm": 0.4262360632419586, + "learning_rate": 4.82004640035639e-05, + "loss": 0.2207, + "mean_token_accuracy": 0.9382325381040573, + "num_tokens": 16913557.0, + "step": 3610 + }, + { + "entropy": 0.20470981393009424, + "epoch": 0.21001791243935516, + "grad_norm": 0.7546806931495667, + "learning_rate": 4.818155564846171e-05, + "loss": 0.2253, + "mean_token_accuracy": 0.9325879275798797, + "num_tokens": 16958266.0, + "step": 3620 + }, + { + "entropy": 0.21832176949828863, + "epoch": 0.2105980724184694, + "grad_norm": 0.48170948028564453, + "learning_rate": 4.8162552221806366e-05, + "loss": 0.2254, + "mean_token_accuracy": 0.9330754652619362, + "num_tokens": 17010693.0, + "step": 3630 + }, + { + "entropy": 0.2072186628356576, + "epoch": 0.21117823239758363, + "grad_norm": 0.46969473361968994, + "learning_rate": 4.8143453801534216e-05, + "loss": 0.2182, + "mean_token_accuracy": 0.9360827006399631, + "num_tokens": 17053293.0, + "step": 3640 + }, + { + "entropy": 0.20906007792800665, + "epoch": 0.21175839237669788, + "grad_norm": 0.5067852735519409, + "learning_rate": 4.8124260465971154e-05, + "loss": 0.2307, + "mean_token_accuracy": 0.9361007727682591, + "num_tokens": 17096934.0, + "step": 3650 + }, + { + "entropy": 0.21725732050836086, + "epoch": 0.21233855235581212, + "grad_norm": 0.46726423501968384, + "learning_rate": 4.810497229383236e-05, + "loss": 0.2325, + "mean_token_accuracy": 0.9331971801817417, + "num_tokens": 17149169.0, + "step": 3660 + }, + { + "entropy": 0.21568809747695922, + "epoch": 0.21291871233492635, + "grad_norm": 0.4911714196205139, + "learning_rate": 4.8085589364221954e-05, + "loss": 0.2244, + "mean_token_accuracy": 0.9322738297283649, + "num_tokens": 17193954.0, + "step": 3670 + }, + { + "entropy": 0.20482491608709097, + "epoch": 0.2134988723140406, + "grad_norm": 0.478519469499588, + "learning_rate": 4.806611175663267e-05, + "loss": 0.2127, + "mean_token_accuracy": 0.9363058671355248, + "num_tokens": 17240217.0, + "step": 3680 + }, + { + "entropy": 0.205538971722126, + "epoch": 0.21407903229315484, + "grad_norm": 0.49216026067733765, + "learning_rate": 4.804653955094552e-05, + "loss": 0.2253, + "mean_token_accuracy": 0.9350344993174076, + "num_tokens": 17283580.0, + "step": 3690 + }, + { + "entropy": 0.22439903365448116, + "epoch": 0.21465919227226907, + "grad_norm": 0.41275811195373535, + "learning_rate": 4.802687282742952e-05, + "loss": 0.2347, + "mean_token_accuracy": 0.931698563694954, + "num_tokens": 17334328.0, + "step": 3700 + }, + { + "entropy": 0.20268806330859662, + "epoch": 0.21523935225138333, + "grad_norm": 0.43220409750938416, + "learning_rate": 4.800711166674125e-05, + "loss": 0.2239, + "mean_token_accuracy": 0.9376021914184094, + "num_tokens": 17380127.0, + "step": 3710 + }, + { + "entropy": 0.21536349263042212, + "epoch": 0.21581951223049756, + "grad_norm": 0.4750637710094452, + "learning_rate": 4.798725614992467e-05, + "loss": 0.2194, + "mean_token_accuracy": 0.9356134064495564, + "num_tokens": 17423241.0, + "step": 3720 + }, + { + "entropy": 0.20201460719108583, + "epoch": 0.2163996722096118, + "grad_norm": 0.49889251589775085, + "learning_rate": 4.7967306358410655e-05, + "loss": 0.2173, + "mean_token_accuracy": 0.9363719061017036, + "num_tokens": 17467129.0, + "step": 3730 + }, + { + "entropy": 0.2058021416887641, + "epoch": 0.21697983218872605, + "grad_norm": 0.49397414922714233, + "learning_rate": 4.7947262374016746e-05, + "loss": 0.2159, + "mean_token_accuracy": 0.9376700825989246, + "num_tokens": 17516294.0, + "step": 3740 + }, + { + "entropy": 0.21063542012125253, + "epoch": 0.21755999216784028, + "grad_norm": 0.40509095788002014, + "learning_rate": 4.7927124278946763e-05, + "loss": 0.2265, + "mean_token_accuracy": 0.9345964543521404, + "num_tokens": 17564702.0, + "step": 3750 + }, + { + "entropy": 0.20479513900354504, + "epoch": 0.21814015214695454, + "grad_norm": 0.49233299493789673, + "learning_rate": 4.7906892155790526e-05, + "loss": 0.2138, + "mean_token_accuracy": 0.9362231433391571, + "num_tokens": 17612916.0, + "step": 3760 + }, + { + "entropy": 0.2081054573878646, + "epoch": 0.21872031212606877, + "grad_norm": 0.5161234736442566, + "learning_rate": 4.788656608752344e-05, + "loss": 0.2265, + "mean_token_accuracy": 0.935090222209692, + "num_tokens": 17663083.0, + "step": 3770 + }, + { + "entropy": 0.20970828384160994, + "epoch": 0.219300472105183, + "grad_norm": 0.6927179098129272, + "learning_rate": 4.786614615750621e-05, + "loss": 0.2184, + "mean_token_accuracy": 0.9350227914750576, + "num_tokens": 17706210.0, + "step": 3780 + }, + { + "entropy": 0.203233555983752, + "epoch": 0.21988063208429726, + "grad_norm": 0.4430566430091858, + "learning_rate": 4.78456324494845e-05, + "loss": 0.2241, + "mean_token_accuracy": 0.9355765037238598, + "num_tokens": 17751405.0, + "step": 3790 + }, + { + "entropy": 0.21286698011681437, + "epoch": 0.2204607920634115, + "grad_norm": 0.49385738372802734, + "learning_rate": 4.7825025047588546e-05, + "loss": 0.2127, + "mean_token_accuracy": 0.9353455156087875, + "num_tokens": 17798996.0, + "step": 3800 + }, + { + "entropy": 0.1983182031661272, + "epoch": 0.22104095204252572, + "grad_norm": 0.48408204317092896, + "learning_rate": 4.780432403633287e-05, + "loss": 0.2119, + "mean_token_accuracy": 0.9384646512567997, + "num_tokens": 17842404.0, + "step": 3810 + }, + { + "entropy": 0.20450984751805662, + "epoch": 0.22162111202163998, + "grad_norm": 0.5382962226867676, + "learning_rate": 4.778352950061587e-05, + "loss": 0.2162, + "mean_token_accuracy": 0.9384715110063553, + "num_tokens": 17886467.0, + "step": 3820 + }, + { + "entropy": 0.20185880754142999, + "epoch": 0.2222012720007542, + "grad_norm": 0.46590447425842285, + "learning_rate": 4.7762641525719524e-05, + "loss": 0.2021, + "mean_token_accuracy": 0.9374426051974296, + "num_tokens": 17930695.0, + "step": 3830 + }, + { + "entropy": 0.19705468099564313, + "epoch": 0.22278143197986844, + "grad_norm": 0.6423255801200867, + "learning_rate": 4.774166019730902e-05, + "loss": 0.2245, + "mean_token_accuracy": 0.9373861603438854, + "num_tokens": 17976748.0, + "step": 3840 + }, + { + "entropy": 0.21414560144767164, + "epoch": 0.2233615919589827, + "grad_norm": 0.45483607053756714, + "learning_rate": 4.77205856014324e-05, + "loss": 0.2175, + "mean_token_accuracy": 0.934902535378933, + "num_tokens": 18019563.0, + "step": 3850 + }, + { + "entropy": 0.18687412347644566, + "epoch": 0.22394175193809693, + "grad_norm": 0.5898351669311523, + "learning_rate": 4.76994178245202e-05, + "loss": 0.1954, + "mean_token_accuracy": 0.9406257934868336, + "num_tokens": 18067999.0, + "step": 3860 + }, + { + "entropy": 0.19021482225507497, + "epoch": 0.22452191191721116, + "grad_norm": 0.6612934470176697, + "learning_rate": 4.767815695338514e-05, + "loss": 0.2081, + "mean_token_accuracy": 0.9406044170260429, + "num_tokens": 18107258.0, + "step": 3870 + }, + { + "entropy": 0.22130839619785547, + "epoch": 0.22510207189632542, + "grad_norm": 0.4955511689186096, + "learning_rate": 4.765680307522169e-05, + "loss": 0.2366, + "mean_token_accuracy": 0.930343372374773, + "num_tokens": 18158023.0, + "step": 3880 + }, + { + "entropy": 0.21906097168102862, + "epoch": 0.22568223187543965, + "grad_norm": 0.6124922633171082, + "learning_rate": 4.76353562776058e-05, + "loss": 0.2349, + "mean_token_accuracy": 0.932503878325224, + "num_tokens": 18211991.0, + "step": 3890 + }, + { + "entropy": 0.20890653673559428, + "epoch": 0.2262623918545539, + "grad_norm": 0.5365882515907288, + "learning_rate": 4.761381664849449e-05, + "loss": 0.2231, + "mean_token_accuracy": 0.9328603260219097, + "num_tokens": 18262039.0, + "step": 3900 + }, + { + "entropy": 0.21737108742818237, + "epoch": 0.22684255183366814, + "grad_norm": 0.8186827898025513, + "learning_rate": 4.7592184276225465e-05, + "loss": 0.2299, + "mean_token_accuracy": 0.9339910477399826, + "num_tokens": 18307280.0, + "step": 3910 + }, + { + "entropy": 0.20889012468978763, + "epoch": 0.22742271181278237, + "grad_norm": 0.5230360627174377, + "learning_rate": 4.757045924951683e-05, + "loss": 0.2181, + "mean_token_accuracy": 0.9364873923361301, + "num_tokens": 18351330.0, + "step": 3920 + }, + { + "entropy": 0.1970261473208666, + "epoch": 0.22800287179189663, + "grad_norm": 0.4875032305717468, + "learning_rate": 4.7548641657466655e-05, + "loss": 0.2116, + "mean_token_accuracy": 0.9362821891903877, + "num_tokens": 18398849.0, + "step": 3930 + }, + { + "entropy": 0.2102089051157236, + "epoch": 0.22858303177101086, + "grad_norm": 0.4128032326698303, + "learning_rate": 4.7526731589552636e-05, + "loss": 0.2217, + "mean_token_accuracy": 0.9344322137534619, + "num_tokens": 18450429.0, + "step": 3940 + }, + { + "entropy": 0.21359951589256526, + "epoch": 0.2291631917501251, + "grad_norm": 0.5366026759147644, + "learning_rate": 4.7504729135631746e-05, + "loss": 0.228, + "mean_token_accuracy": 0.9354204870760441, + "num_tokens": 18497589.0, + "step": 3950 + }, + { + "entropy": 0.20847713705152274, + "epoch": 0.22974335172923935, + "grad_norm": 0.5145766139030457, + "learning_rate": 4.748263438593984e-05, + "loss": 0.2243, + "mean_token_accuracy": 0.9339506052434444, + "num_tokens": 18540439.0, + "step": 3960 + }, + { + "entropy": 0.20513907438144088, + "epoch": 0.23032351170835358, + "grad_norm": 0.5451750159263611, + "learning_rate": 4.7460447431091285e-05, + "loss": 0.2212, + "mean_token_accuracy": 0.9347366623580455, + "num_tokens": 18585579.0, + "step": 3970 + }, + { + "entropy": 0.20649288278073072, + "epoch": 0.2309036716874678, + "grad_norm": 0.5134602785110474, + "learning_rate": 4.7438168362078614e-05, + "loss": 0.2248, + "mean_token_accuracy": 0.9342507265508175, + "num_tokens": 18631312.0, + "step": 3980 + }, + { + "entropy": 0.2095065454021096, + "epoch": 0.23148383166658207, + "grad_norm": 0.4622044563293457, + "learning_rate": 4.741579727027212e-05, + "loss": 0.211, + "mean_token_accuracy": 0.9385435178875923, + "num_tokens": 18676881.0, + "step": 3990 + }, + { + "entropy": 0.2103260327130556, + "epoch": 0.2320639916456963, + "grad_norm": 0.5241548418998718, + "learning_rate": 4.73933342474195e-05, + "loss": 0.2229, + "mean_token_accuracy": 0.9350397765636445, + "num_tokens": 18723659.0, + "step": 4000 + }, + { + "entropy": 0.21186708183959127, + "epoch": 0.23264415162481053, + "grad_norm": 0.5258693695068359, + "learning_rate": 4.7370779385645495e-05, + "loss": 0.2235, + "mean_token_accuracy": 0.933479105681181, + "num_tokens": 18776792.0, + "step": 4010 + }, + { + "entropy": 0.21147484304383396, + "epoch": 0.2332243116039248, + "grad_norm": 0.6025766730308533, + "learning_rate": 4.734813277745146e-05, + "loss": 0.2219, + "mean_token_accuracy": 0.9330485247075557, + "num_tokens": 18822665.0, + "step": 4020 + }, + { + "entropy": 0.20675705671310424, + "epoch": 0.23380447158303902, + "grad_norm": 0.55415278673172, + "learning_rate": 4.732539451571506e-05, + "loss": 0.2242, + "mean_token_accuracy": 0.9339765839278698, + "num_tokens": 18866324.0, + "step": 4030 + }, + { + "entropy": 0.22277802443131806, + "epoch": 0.23438463156215325, + "grad_norm": 0.4260983467102051, + "learning_rate": 4.73025646936898e-05, + "loss": 0.2289, + "mean_token_accuracy": 0.9338542819023132, + "num_tokens": 18914170.0, + "step": 4040 + }, + { + "entropy": 0.19311339650303125, + "epoch": 0.2349647915412675, + "grad_norm": 0.5404362082481384, + "learning_rate": 4.727964340500472e-05, + "loss": 0.2098, + "mean_token_accuracy": 0.938558080047369, + "num_tokens": 18955744.0, + "step": 4050 + }, + { + "entropy": 0.22226034495979546, + "epoch": 0.23554495152038174, + "grad_norm": 0.5025978088378906, + "learning_rate": 4.725663074366399e-05, + "loss": 0.2321, + "mean_token_accuracy": 0.9307832181453705, + "num_tokens": 19001231.0, + "step": 4060 + }, + { + "entropy": 0.20555600393563508, + "epoch": 0.236125111499496, + "grad_norm": 0.5506867170333862, + "learning_rate": 4.723352680404648e-05, + "loss": 0.2126, + "mean_token_accuracy": 0.9372551806271077, + "num_tokens": 19059967.0, + "step": 4070 + }, + { + "entropy": 0.2020394585095346, + "epoch": 0.23670527147861023, + "grad_norm": 0.6257017850875854, + "learning_rate": 4.721033168090543e-05, + "loss": 0.2243, + "mean_token_accuracy": 0.9344319425523281, + "num_tokens": 19107715.0, + "step": 4080 + }, + { + "entropy": 0.20211742147803308, + "epoch": 0.23728543145772446, + "grad_norm": 0.44078755378723145, + "learning_rate": 4.718704546936803e-05, + "loss": 0.2199, + "mean_token_accuracy": 0.9362830147147179, + "num_tokens": 19156518.0, + "step": 4090 + }, + { + "entropy": 0.2129787920974195, + "epoch": 0.23786559143683872, + "grad_norm": 0.5184502601623535, + "learning_rate": 4.7163668264935056e-05, + "loss": 0.2204, + "mean_token_accuracy": 0.9358193688094616, + "num_tokens": 19198987.0, + "step": 4100 + }, + { + "entropy": 0.19654914513230323, + "epoch": 0.23844575141595295, + "grad_norm": 0.4312357008457184, + "learning_rate": 4.7140200163480455e-05, + "loss": 0.2146, + "mean_token_accuracy": 0.9372913725674152, + "num_tokens": 19245098.0, + "step": 4110 + }, + { + "entropy": 0.1951150639913976, + "epoch": 0.23902591139506718, + "grad_norm": 0.45724502205848694, + "learning_rate": 4.7116641261250935e-05, + "loss": 0.2172, + "mean_token_accuracy": 0.9369843855500222, + "num_tokens": 19286951.0, + "step": 4120 + }, + { + "entropy": 0.20128755941987037, + "epoch": 0.23960607137418144, + "grad_norm": 0.4933720827102661, + "learning_rate": 4.709299165486563e-05, + "loss": 0.2088, + "mean_token_accuracy": 0.9373709596693516, + "num_tokens": 19332518.0, + "step": 4130 + }, + { + "entropy": 0.20488802660256625, + "epoch": 0.24018623135329567, + "grad_norm": 0.44518086314201355, + "learning_rate": 4.7069251441315644e-05, + "loss": 0.2195, + "mean_token_accuracy": 0.9366111367940902, + "num_tokens": 19375806.0, + "step": 4140 + }, + { + "entropy": 0.2026549375616014, + "epoch": 0.2407663913324099, + "grad_norm": 0.6609086990356445, + "learning_rate": 4.7045420717963676e-05, + "loss": 0.2252, + "mean_token_accuracy": 0.9345681451261043, + "num_tokens": 19421630.0, + "step": 4150 + }, + { + "entropy": 0.19887538813054562, + "epoch": 0.24134655131152416, + "grad_norm": 0.5068793296813965, + "learning_rate": 4.702149958254363e-05, + "loss": 0.2017, + "mean_token_accuracy": 0.9385142393410206, + "num_tokens": 19469086.0, + "step": 4160 + }, + { + "entropy": 0.21070900959894062, + "epoch": 0.2419267112906384, + "grad_norm": 0.49139463901519775, + "learning_rate": 4.699748813316021e-05, + "loss": 0.226, + "mean_token_accuracy": 0.9320964053273201, + "num_tokens": 19519865.0, + "step": 4170 + }, + { + "entropy": 0.21176093704998494, + "epoch": 0.24250687126975262, + "grad_norm": 0.534067690372467, + "learning_rate": 4.69733864682885e-05, + "loss": 0.2239, + "mean_token_accuracy": 0.9342532321810723, + "num_tokens": 19565296.0, + "step": 4180 + }, + { + "entropy": 0.19915788620710373, + "epoch": 0.24308703124886688, + "grad_norm": 0.5252606868743896, + "learning_rate": 4.694919468677359e-05, + "loss": 0.2057, + "mean_token_accuracy": 0.9375826925039291, + "num_tokens": 19609444.0, + "step": 4190 + }, + { + "entropy": 0.20580140659585594, + "epoch": 0.2436671912279811, + "grad_norm": 0.41595250368118286, + "learning_rate": 4.692491288783013e-05, + "loss": 0.228, + "mean_token_accuracy": 0.9352385923266411, + "num_tokens": 19660963.0, + "step": 4200 + }, + { + "entropy": 0.20524226939305662, + "epoch": 0.24424735120709537, + "grad_norm": 0.4919288754463196, + "learning_rate": 4.690054117104197e-05, + "loss": 0.2043, + "mean_token_accuracy": 0.9401963673532009, + "num_tokens": 19709019.0, + "step": 4210 + }, + { + "entropy": 0.19210529569536447, + "epoch": 0.2448275111862096, + "grad_norm": 0.5677445530891418, + "learning_rate": 4.687607963636171e-05, + "loss": 0.2136, + "mean_token_accuracy": 0.9386193588376045, + "num_tokens": 19760032.0, + "step": 4220 + }, + { + "entropy": 0.2057210562750697, + "epoch": 0.24540767116532383, + "grad_norm": 0.47238871455192566, + "learning_rate": 4.685152838411032e-05, + "loss": 0.2197, + "mean_token_accuracy": 0.9344862304627896, + "num_tokens": 19800324.0, + "step": 4230 + }, + { + "entropy": 0.21925745084881781, + "epoch": 0.2459878311444381, + "grad_norm": 0.5404976606369019, + "learning_rate": 4.682688751497673e-05, + "loss": 0.2316, + "mean_token_accuracy": 0.9343979746103287, + "num_tokens": 19850930.0, + "step": 4240 + }, + { + "entropy": 0.19827272957190872, + "epoch": 0.24656799112355232, + "grad_norm": 0.4838732182979584, + "learning_rate": 4.680215713001737e-05, + "loss": 0.2084, + "mean_token_accuracy": 0.9378572024405003, + "num_tokens": 19901346.0, + "step": 4250 + }, + { + "entropy": 0.19619887955486776, + "epoch": 0.24714815110266655, + "grad_norm": 0.5380393862724304, + "learning_rate": 4.677733733065581e-05, + "loss": 0.2086, + "mean_token_accuracy": 0.9378792434930802, + "num_tokens": 19947882.0, + "step": 4260 + }, + { + "entropy": 0.19241483574733137, + "epoch": 0.2477283110817808, + "grad_norm": 0.5434103608131409, + "learning_rate": 4.675242821868233e-05, + "loss": 0.2105, + "mean_token_accuracy": 0.939323166012764, + "num_tokens": 19994853.0, + "step": 4270 + }, + { + "entropy": 0.1936852429062128, + "epoch": 0.24830847106089504, + "grad_norm": 0.5637681484222412, + "learning_rate": 4.672742989625349e-05, + "loss": 0.2022, + "mean_token_accuracy": 0.9408436231315136, + "num_tokens": 20040979.0, + "step": 4280 + }, + { + "entropy": 0.19993506409227849, + "epoch": 0.24888863104000927, + "grad_norm": 0.4547356069087982, + "learning_rate": 4.670234246589171e-05, + "loss": 0.2084, + "mean_token_accuracy": 0.9368733942508698, + "num_tokens": 20092012.0, + "step": 4290 + }, + { + "entropy": 0.1976324682123959, + "epoch": 0.24946879101912353, + "grad_norm": 0.6019384264945984, + "learning_rate": 4.667716603048487e-05, + "loss": 0.2112, + "mean_token_accuracy": 0.9389707684516907, + "num_tokens": 20137019.0, + "step": 4300 + }, + { + "entropy": 0.20288022430613636, + "epoch": 0.25004895099823776, + "grad_norm": 0.5171788930892944, + "learning_rate": 4.6651900693285876e-05, + "loss": 0.2145, + "mean_token_accuracy": 0.936156066507101, + "num_tokens": 20182210.0, + "step": 4310 + }, + { + "entropy": 0.19311839006841183, + "epoch": 0.250629110977352, + "grad_norm": 0.43355512619018555, + "learning_rate": 4.662654655791221e-05, + "loss": 0.2106, + "mean_token_accuracy": 0.9378086969256401, + "num_tokens": 20226409.0, + "step": 4320 + }, + { + "entropy": 0.20073226392269133, + "epoch": 0.2512092709564662, + "grad_norm": 0.6707788705825806, + "learning_rate": 4.6601103728345564e-05, + "loss": 0.2104, + "mean_token_accuracy": 0.9371313653886318, + "num_tokens": 20272010.0, + "step": 4330 + }, + { + "entropy": 0.20173704605549575, + "epoch": 0.2517894309355805, + "grad_norm": 0.5421533584594727, + "learning_rate": 4.657557230893135e-05, + "loss": 0.2224, + "mean_token_accuracy": 0.9354595065116882, + "num_tokens": 20322321.0, + "step": 4340 + }, + { + "entropy": 0.2113415576517582, + "epoch": 0.25236959091469474, + "grad_norm": 0.3972405791282654, + "learning_rate": 4.6549952404378335e-05, + "loss": 0.2211, + "mean_token_accuracy": 0.9360534459352493, + "num_tokens": 20371391.0, + "step": 4350 + }, + { + "entropy": 0.19243698343634605, + "epoch": 0.25294975089380894, + "grad_norm": 0.489467054605484, + "learning_rate": 4.652424411975815e-05, + "loss": 0.201, + "mean_token_accuracy": 0.938987047970295, + "num_tokens": 20414578.0, + "step": 4360 + }, + { + "entropy": 0.20929394476115704, + "epoch": 0.2535299108729232, + "grad_norm": 0.4043976366519928, + "learning_rate": 4.64984475605049e-05, + "loss": 0.2212, + "mean_token_accuracy": 0.9360509037971496, + "num_tokens": 20467301.0, + "step": 4370 + }, + { + "entropy": 0.18945014141499997, + "epoch": 0.25411007085203746, + "grad_norm": 0.4858134388923645, + "learning_rate": 4.647256283241471e-05, + "loss": 0.202, + "mean_token_accuracy": 0.9401775024831295, + "num_tokens": 20509670.0, + "step": 4380 + }, + { + "entropy": 0.19244696283712984, + "epoch": 0.25469023083115166, + "grad_norm": 0.43093934655189514, + "learning_rate": 4.644659004164533e-05, + "loss": 0.2102, + "mean_token_accuracy": 0.93842184394598, + "num_tokens": 20556685.0, + "step": 4390 + }, + { + "entropy": 0.19686746066436173, + "epoch": 0.2552703908102659, + "grad_norm": 0.5705140233039856, + "learning_rate": 4.6420529294715634e-05, + "loss": 0.2099, + "mean_token_accuracy": 0.9386506892740727, + "num_tokens": 20599935.0, + "step": 4400 + }, + { + "entropy": 0.2062140077352524, + "epoch": 0.2558505507893802, + "grad_norm": 0.4421403706073761, + "learning_rate": 4.639438069850524e-05, + "loss": 0.2207, + "mean_token_accuracy": 0.9336604550480843, + "num_tokens": 20648753.0, + "step": 4410 + }, + { + "entropy": 0.20171306412667037, + "epoch": 0.2564307107684944, + "grad_norm": 0.5584080815315247, + "learning_rate": 4.636814436025404e-05, + "loss": 0.2044, + "mean_token_accuracy": 0.9381263568997383, + "num_tokens": 20691766.0, + "step": 4420 + }, + { + "entropy": 0.2017810949124396, + "epoch": 0.25701087074760864, + "grad_norm": 0.5136546492576599, + "learning_rate": 4.634182038756179e-05, + "loss": 0.2147, + "mean_token_accuracy": 0.9355583935976028, + "num_tokens": 20734879.0, + "step": 4430 + }, + { + "entropy": 0.19564786059781908, + "epoch": 0.2575910307267229, + "grad_norm": 0.4433939456939697, + "learning_rate": 4.631540888838761e-05, + "loss": 0.2062, + "mean_token_accuracy": 0.9382089242339134, + "num_tokens": 20783064.0, + "step": 4440 + }, + { + "entropy": 0.200284936837852, + "epoch": 0.25817119070583716, + "grad_norm": 0.49705740809440613, + "learning_rate": 4.6288909971049644e-05, + "loss": 0.2101, + "mean_token_accuracy": 0.9377306647598743, + "num_tokens": 20828367.0, + "step": 4450 + }, + { + "entropy": 0.19394247215241195, + "epoch": 0.25875135068495136, + "grad_norm": 0.5381738543510437, + "learning_rate": 4.626232374422449e-05, + "loss": 0.2052, + "mean_token_accuracy": 0.9388545423746109, + "num_tokens": 20871154.0, + "step": 4460 + }, + { + "entropy": 0.200459392555058, + "epoch": 0.2593315106640656, + "grad_norm": 0.5851970314979553, + "learning_rate": 4.623565031694685e-05, + "loss": 0.219, + "mean_token_accuracy": 0.9346887215971946, + "num_tokens": 20920823.0, + "step": 4470 + }, + { + "entropy": 0.1883087906986475, + "epoch": 0.2599116706431799, + "grad_norm": 0.5354660153388977, + "learning_rate": 4.620888979860906e-05, + "loss": 0.2021, + "mean_token_accuracy": 0.9430471271276474, + "num_tokens": 20964197.0, + "step": 4480 + }, + { + "entropy": 0.1983618415892124, + "epoch": 0.2604918306222941, + "grad_norm": 0.47820013761520386, + "learning_rate": 4.6182042298960585e-05, + "loss": 0.209, + "mean_token_accuracy": 0.939156661182642, + "num_tokens": 21011402.0, + "step": 4490 + }, + { + "entropy": 0.18946091299876572, + "epoch": 0.26107199060140834, + "grad_norm": 0.46002793312072754, + "learning_rate": 4.615510792810767e-05, + "loss": 0.2066, + "mean_token_accuracy": 0.9396903857588768, + "num_tokens": 21056565.0, + "step": 4500 + }, + { + "entropy": 0.19246515780687332, + "epoch": 0.2616521505805226, + "grad_norm": 0.4315761923789978, + "learning_rate": 4.6128086796512804e-05, + "loss": 0.2064, + "mean_token_accuracy": 0.9409561946988105, + "num_tokens": 21097021.0, + "step": 4510 + }, + { + "entropy": 0.20006441986188292, + "epoch": 0.2622323105596368, + "grad_norm": 0.48008298873901367, + "learning_rate": 4.6100979014994306e-05, + "loss": 0.2164, + "mean_token_accuracy": 0.9365143477916718, + "num_tokens": 21144679.0, + "step": 4520 + }, + { + "entropy": 0.20264245523139834, + "epoch": 0.26281247053875106, + "grad_norm": 0.4869425296783447, + "learning_rate": 4.607378469472584e-05, + "loss": 0.2172, + "mean_token_accuracy": 0.9339435592293739, + "num_tokens": 21194288.0, + "step": 4530 + }, + { + "entropy": 0.20341125335544347, + "epoch": 0.2633926305178653, + "grad_norm": 0.5397939085960388, + "learning_rate": 4.604650394723603e-05, + "loss": 0.2111, + "mean_token_accuracy": 0.9365370564162732, + "num_tokens": 21239153.0, + "step": 4540 + }, + { + "entropy": 0.18873490002006293, + "epoch": 0.2639727904969795, + "grad_norm": 0.42418336868286133, + "learning_rate": 4.6019136884407896e-05, + "loss": 0.1959, + "mean_token_accuracy": 0.9409293435513973, + "num_tokens": 21287753.0, + "step": 4550 + }, + { + "entropy": 0.19427702724933624, + "epoch": 0.2645529504760938, + "grad_norm": 0.5141423940658569, + "learning_rate": 4.5991683618478476e-05, + "loss": 0.2097, + "mean_token_accuracy": 0.9374813355505467, + "num_tokens": 21338912.0, + "step": 4560 + }, + { + "entropy": 0.19414519546553494, + "epoch": 0.26513311045520804, + "grad_norm": 0.6582860946655273, + "learning_rate": 4.596414426203835e-05, + "loss": 0.2126, + "mean_token_accuracy": 0.9376175962388515, + "num_tokens": 21384337.0, + "step": 4570 + }, + { + "entropy": 0.20092508532106876, + "epoch": 0.26571327043432225, + "grad_norm": 0.5025556683540344, + "learning_rate": 4.593651892803118e-05, + "loss": 0.2108, + "mean_token_accuracy": 0.9351400010287761, + "num_tokens": 21433477.0, + "step": 4580 + }, + { + "entropy": 0.19750893944874406, + "epoch": 0.2662934304134365, + "grad_norm": 0.738443911075592, + "learning_rate": 4.590880772975319e-05, + "loss": 0.2099, + "mean_token_accuracy": 0.9368741504848004, + "num_tokens": 21478366.0, + "step": 4590 + }, + { + "entropy": 0.19282453479245304, + "epoch": 0.26687359039255076, + "grad_norm": 0.48710373044013977, + "learning_rate": 4.58810107808528e-05, + "loss": 0.2047, + "mean_token_accuracy": 0.9388439431786537, + "num_tokens": 21526560.0, + "step": 4600 + }, + { + "entropy": 0.196967382915318, + "epoch": 0.26745375037166497, + "grad_norm": 0.4703672528266907, + "learning_rate": 4.5853128195330066e-05, + "loss": 0.2056, + "mean_token_accuracy": 0.9391059011220932, + "num_tokens": 21570519.0, + "step": 4610 + }, + { + "entropy": 0.19948362791910768, + "epoch": 0.2680339103507792, + "grad_norm": 0.5299443006515503, + "learning_rate": 4.582516008753629e-05, + "loss": 0.2163, + "mean_token_accuracy": 0.9383684001863003, + "num_tokens": 21617043.0, + "step": 4620 + }, + { + "entropy": 0.197217450896278, + "epoch": 0.2686140703298935, + "grad_norm": 0.43258094787597656, + "learning_rate": 4.57971065721735e-05, + "loss": 0.2164, + "mean_token_accuracy": 0.937662535905838, + "num_tokens": 21664160.0, + "step": 4630 + }, + { + "entropy": 0.20410069748759269, + "epoch": 0.2691942303090077, + "grad_norm": 0.5522704720497131, + "learning_rate": 4.576896776429399e-05, + "loss": 0.217, + "mean_token_accuracy": 0.9349062457680702, + "num_tokens": 21711713.0, + "step": 4640 + }, + { + "entropy": 0.19813506240025164, + "epoch": 0.26977439028812195, + "grad_norm": 0.42131516337394714, + "learning_rate": 4.574074377929985e-05, + "loss": 0.2142, + "mean_token_accuracy": 0.9356303736567497, + "num_tokens": 21763460.0, + "step": 4650 + }, + { + "entropy": 0.1979994830675423, + "epoch": 0.2703545502672362, + "grad_norm": 0.4001702070236206, + "learning_rate": 4.571243473294252e-05, + "loss": 0.2033, + "mean_token_accuracy": 0.9384423069655895, + "num_tokens": 21814730.0, + "step": 4660 + }, + { + "entropy": 0.20771349063143135, + "epoch": 0.2709347102463504, + "grad_norm": 0.4003629684448242, + "learning_rate": 4.5684040741322274e-05, + "loss": 0.2259, + "mean_token_accuracy": 0.9349867656826973, + "num_tokens": 21861270.0, + "step": 4670 + }, + { + "entropy": 0.19541749553754925, + "epoch": 0.27151487022546467, + "grad_norm": 0.4892110824584961, + "learning_rate": 4.565556192088776e-05, + "loss": 0.21, + "mean_token_accuracy": 0.940095967054367, + "num_tokens": 21907670.0, + "step": 4680 + }, + { + "entropy": 0.19452951615676284, + "epoch": 0.2720950302045789, + "grad_norm": 0.5545443892478943, + "learning_rate": 4.5626998388435516e-05, + "loss": 0.2131, + "mean_token_accuracy": 0.9359357498586178, + "num_tokens": 21947556.0, + "step": 4690 + }, + { + "entropy": 0.20332091618329287, + "epoch": 0.27267519018369313, + "grad_norm": 0.4792189300060272, + "learning_rate": 4.5598350261109524e-05, + "loss": 0.2169, + "mean_token_accuracy": 0.936721558123827, + "num_tokens": 21999913.0, + "step": 4700 + }, + { + "entropy": 0.2039651249535382, + "epoch": 0.2732553501628074, + "grad_norm": 0.45039328932762146, + "learning_rate": 4.556961765640068e-05, + "loss": 0.222, + "mean_token_accuracy": 0.9365391418337822, + "num_tokens": 22048107.0, + "step": 4710 + }, + { + "entropy": 0.19542556134983896, + "epoch": 0.27383551014192165, + "grad_norm": 0.5828747153282166, + "learning_rate": 4.5540800692146355e-05, + "loss": 0.2111, + "mean_token_accuracy": 0.9365010820329189, + "num_tokens": 22094807.0, + "step": 4720 + }, + { + "entropy": 0.209370584692806, + "epoch": 0.27441567012103585, + "grad_norm": 0.45267152786254883, + "learning_rate": 4.5511899486529875e-05, + "loss": 0.219, + "mean_token_accuracy": 0.9347944878041744, + "num_tokens": 22144839.0, + "step": 4730 + }, + { + "entropy": 0.20776735162362456, + "epoch": 0.2749958301001501, + "grad_norm": 0.5448766946792603, + "learning_rate": 4.548291415808007e-05, + "loss": 0.2257, + "mean_token_accuracy": 0.933895293623209, + "num_tokens": 22190133.0, + "step": 4740 + }, + { + "entropy": 0.17840658528730274, + "epoch": 0.27557599007926437, + "grad_norm": 0.4423179030418396, + "learning_rate": 4.545384482567076e-05, + "loss": 0.1865, + "mean_token_accuracy": 0.942785707116127, + "num_tokens": 22233129.0, + "step": 4750 + }, + { + "entropy": 0.1884486500173807, + "epoch": 0.2761561500583786, + "grad_norm": 0.4291665852069855, + "learning_rate": 4.542469160852028e-05, + "loss": 0.2016, + "mean_token_accuracy": 0.9390126474201679, + "num_tokens": 22281807.0, + "step": 4760 + }, + { + "entropy": 0.18179945265874267, + "epoch": 0.2767363100374928, + "grad_norm": 0.43880847096443176, + "learning_rate": 4.5395454626191004e-05, + "loss": 0.1931, + "mean_token_accuracy": 0.9422627955675125, + "num_tokens": 22327923.0, + "step": 4770 + }, + { + "entropy": 0.1961971430107951, + "epoch": 0.2773164700166071, + "grad_norm": 0.4060024321079254, + "learning_rate": 4.5366133998588835e-05, + "loss": 0.2061, + "mean_token_accuracy": 0.9379254907369614, + "num_tokens": 22380567.0, + "step": 4780 + }, + { + "entropy": 0.19550915956497192, + "epoch": 0.27789662999572134, + "grad_norm": 0.42962974309921265, + "learning_rate": 4.533672984596271e-05, + "loss": 0.2023, + "mean_token_accuracy": 0.9383601248264313, + "num_tokens": 22429986.0, + "step": 4790 + }, + { + "entropy": 0.192627436760813, + "epoch": 0.27847678997483555, + "grad_norm": 0.40980446338653564, + "learning_rate": 4.530724228890413e-05, + "loss": 0.2098, + "mean_token_accuracy": 0.939090508967638, + "num_tokens": 22482121.0, + "step": 4800 + }, + { + "entropy": 0.1929157697595656, + "epoch": 0.2790569499539498, + "grad_norm": 0.455552339553833, + "learning_rate": 4.527767144834665e-05, + "loss": 0.205, + "mean_token_accuracy": 0.9387724131345749, + "num_tokens": 22531149.0, + "step": 4810 + }, + { + "entropy": 0.19124340293928982, + "epoch": 0.27963710993306407, + "grad_norm": 0.6377856731414795, + "learning_rate": 4.524801744556537e-05, + "loss": 0.1968, + "mean_token_accuracy": 0.9401277489960194, + "num_tokens": 22575067.0, + "step": 4820 + }, + { + "entropy": 0.19201124273240566, + "epoch": 0.28021726991217827, + "grad_norm": 0.5463549494743347, + "learning_rate": 4.521828040217647e-05, + "loss": 0.2126, + "mean_token_accuracy": 0.9387569382786751, + "num_tokens": 22620968.0, + "step": 4830 + }, + { + "entropy": 0.20683917645365, + "epoch": 0.2807974298912925, + "grad_norm": 0.5127503871917725, + "learning_rate": 4.518846044013668e-05, + "loss": 0.2234, + "mean_token_accuracy": 0.9335066489875317, + "num_tokens": 22675827.0, + "step": 4840 + }, + { + "entropy": 0.19939272683113812, + "epoch": 0.2813775898704068, + "grad_norm": 0.4918792247772217, + "learning_rate": 4.5158557681742817e-05, + "loss": 0.2053, + "mean_token_accuracy": 0.938949779421091, + "num_tokens": 22721449.0, + "step": 4850 + }, + { + "entropy": 0.19507512776181102, + "epoch": 0.281957749849521, + "grad_norm": 0.5250486135482788, + "learning_rate": 4.512857224963123e-05, + "loss": 0.2048, + "mean_token_accuracy": 0.9392285287380219, + "num_tokens": 22769048.0, + "step": 4860 + }, + { + "entropy": 0.182545904815197, + "epoch": 0.28253790982863525, + "grad_norm": 0.444903165102005, + "learning_rate": 4.509850426677735e-05, + "loss": 0.1963, + "mean_token_accuracy": 0.9418340779840946, + "num_tokens": 22813082.0, + "step": 4870 + }, + { + "entropy": 0.19679356887936592, + "epoch": 0.2831180698077495, + "grad_norm": 0.44396936893463135, + "learning_rate": 4.506835385649516e-05, + "loss": 0.2067, + "mean_token_accuracy": 0.9386748068034649, + "num_tokens": 22860440.0, + "step": 4880 + }, + { + "entropy": 0.17847408005036414, + "epoch": 0.2836982297868637, + "grad_norm": 0.3958888053894043, + "learning_rate": 4.503812114243667e-05, + "loss": 0.1868, + "mean_token_accuracy": 0.9442094333469868, + "num_tokens": 22902343.0, + "step": 4890 + }, + { + "entropy": 0.1998564526438713, + "epoch": 0.28427838976597797, + "grad_norm": 0.49972668290138245, + "learning_rate": 4.5007806248591454e-05, + "loss": 0.2111, + "mean_token_accuracy": 0.937311265617609, + "num_tokens": 22949637.0, + "step": 4900 + }, + { + "entropy": 0.2069408523850143, + "epoch": 0.2848585497450922, + "grad_norm": 0.41584011912345886, + "learning_rate": 4.497740929928612e-05, + "loss": 0.2248, + "mean_token_accuracy": 0.9343293726444244, + "num_tokens": 22999922.0, + "step": 4910 + }, + { + "entropy": 0.19312181659042835, + "epoch": 0.28543870972420643, + "grad_norm": 0.49668174982070923, + "learning_rate": 4.4946930419183795e-05, + "loss": 0.2005, + "mean_token_accuracy": 0.9379349321126937, + "num_tokens": 23047995.0, + "step": 4920 + }, + { + "entropy": 0.1847459458746016, + "epoch": 0.2860188697033207, + "grad_norm": 0.42898693680763245, + "learning_rate": 4.49163697332836e-05, + "loss": 0.1975, + "mean_token_accuracy": 0.9421278312802315, + "num_tokens": 23091603.0, + "step": 4930 + }, + { + "entropy": 0.19483440490439535, + "epoch": 0.28659902968243495, + "grad_norm": 0.5070547461509705, + "learning_rate": 4.48857273669202e-05, + "loss": 0.2103, + "mean_token_accuracy": 0.9378108859062195, + "num_tokens": 23138102.0, + "step": 4940 + }, + { + "entropy": 0.1999665291979909, + "epoch": 0.28717918966154915, + "grad_norm": 0.5442050695419312, + "learning_rate": 4.4855003445763196e-05, + "loss": 0.217, + "mean_token_accuracy": 0.9382676690816879, + "num_tokens": 23183749.0, + "step": 4950 + }, + { + "entropy": 0.18188294507563113, + "epoch": 0.2877593496406634, + "grad_norm": 0.4933253228664398, + "learning_rate": 4.4824198095816684e-05, + "loss": 0.1923, + "mean_token_accuracy": 0.9427701093256473, + "num_tokens": 23226965.0, + "step": 4960 + }, + { + "entropy": 0.18849715059623123, + "epoch": 0.28833950961977767, + "grad_norm": 0.49177348613739014, + "learning_rate": 4.4793311443418726e-05, + "loss": 0.1963, + "mean_token_accuracy": 0.9406029611825943, + "num_tokens": 23275379.0, + "step": 4970 + }, + { + "entropy": 0.19033315563574432, + "epoch": 0.28891966959889187, + "grad_norm": 0.38876739144325256, + "learning_rate": 4.4762343615240776e-05, + "loss": 0.1971, + "mean_token_accuracy": 0.9384737253189087, + "num_tokens": 23321724.0, + "step": 4980 + }, + { + "entropy": 0.20144046237692237, + "epoch": 0.28949982957800613, + "grad_norm": 0.4132806062698364, + "learning_rate": 4.4731294738287266e-05, + "loss": 0.2166, + "mean_token_accuracy": 0.9375121526420116, + "num_tokens": 23370128.0, + "step": 4990 + }, + { + "entropy": 0.1957883132621646, + "epoch": 0.2900799895571204, + "grad_norm": 0.455134779214859, + "learning_rate": 4.470016493989496e-05, + "loss": 0.196, + "mean_token_accuracy": 0.9404206328094006, + "num_tokens": 23418109.0, + "step": 5000 + }, + { + "epoch": 0.2900799895571204, + "eval_entropy": 0.18952917018072182, + "eval_loss": 0.18933752179145813, + "eval_mean_token_accuracy": 0.9393608011811028, + "eval_num_tokens": 23418109.0, + "eval_runtime": 1678.9331, + "eval_samples_per_second": 5.134, + "eval_steps_per_second": 5.134, + "step": 5000 + }, + { + "entropy": 0.18384186653420329, + "epoch": 0.2906601495362346, + "grad_norm": 0.415244996547699, + "learning_rate": 4.466895434773255e-05, + "loss": 0.1958, + "mean_token_accuracy": 0.9410963021218777, + "num_tokens": 23463764.0, + "step": 5010 + }, + { + "entropy": 0.19440616406500338, + "epoch": 0.29124030951534885, + "grad_norm": 0.4620552957057953, + "learning_rate": 4.463766308980003e-05, + "loss": 0.2081, + "mean_token_accuracy": 0.9376123026013374, + "num_tokens": 23514499.0, + "step": 5020 + }, + { + "entropy": 0.18568031396716833, + "epoch": 0.2918204694944631, + "grad_norm": 0.449211984872818, + "learning_rate": 4.4606291294428235e-05, + "loss": 0.1865, + "mean_token_accuracy": 0.9412092573940753, + "num_tokens": 23556908.0, + "step": 5030 + }, + { + "entropy": 0.20334421722218393, + "epoch": 0.2924006294735773, + "grad_norm": 0.3820071518421173, + "learning_rate": 4.457483909027833e-05, + "loss": 0.21, + "mean_token_accuracy": 0.935131011903286, + "num_tokens": 23612893.0, + "step": 5040 + }, + { + "entropy": 0.1874255149625242, + "epoch": 0.29298078945269157, + "grad_norm": 0.3542194962501526, + "learning_rate": 4.4543306606341204e-05, + "loss": 0.201, + "mean_token_accuracy": 0.9408526137471199, + "num_tokens": 23660904.0, + "step": 5050 + }, + { + "entropy": 0.18695307457819582, + "epoch": 0.29356094943180583, + "grad_norm": 0.5337816476821899, + "learning_rate": 4.4511693971937e-05, + "loss": 0.1927, + "mean_token_accuracy": 0.9398588627576828, + "num_tokens": 23705831.0, + "step": 5060 + }, + { + "entropy": 0.1831263717263937, + "epoch": 0.2941411094109201, + "grad_norm": 0.5006949305534363, + "learning_rate": 4.4480001316714595e-05, + "loss": 0.1952, + "mean_token_accuracy": 0.9420724049210548, + "num_tokens": 23745046.0, + "step": 5070 + }, + { + "entropy": 0.19877206617966295, + "epoch": 0.2947212693900343, + "grad_norm": 0.5280159115791321, + "learning_rate": 4.4448228770651015e-05, + "loss": 0.2098, + "mean_token_accuracy": 0.9365513853728771, + "num_tokens": 23791344.0, + "step": 5080 + }, + { + "entropy": 0.196091147698462, + "epoch": 0.29530142936914855, + "grad_norm": 0.539138913154602, + "learning_rate": 4.441637646405094e-05, + "loss": 0.2129, + "mean_token_accuracy": 0.9372456386685372, + "num_tokens": 23836851.0, + "step": 5090 + }, + { + "entropy": 0.18319514309987425, + "epoch": 0.2958815893482628, + "grad_norm": 0.6992892026901245, + "learning_rate": 4.4384444527546175e-05, + "loss": 0.1942, + "mean_token_accuracy": 0.9423131316900253, + "num_tokens": 23884074.0, + "step": 5100 + }, + { + "entropy": 0.18510634936392306, + "epoch": 0.296461749327377, + "grad_norm": 0.5870891809463501, + "learning_rate": 4.43524330920951e-05, + "loss": 0.2073, + "mean_token_accuracy": 0.9404550582170487, + "num_tokens": 23934725.0, + "step": 5110 + }, + { + "entropy": 0.18993359934538603, + "epoch": 0.29704190930649127, + "grad_norm": 0.488034188747406, + "learning_rate": 4.432034228898212e-05, + "loss": 0.2023, + "mean_token_accuracy": 0.9389263287186622, + "num_tokens": 23983975.0, + "step": 5120 + }, + { + "entropy": 0.19510410781949758, + "epoch": 0.29762206928560553, + "grad_norm": 0.5086171627044678, + "learning_rate": 4.428817224981714e-05, + "loss": 0.2101, + "mean_token_accuracy": 0.937411729246378, + "num_tokens": 24030390.0, + "step": 5130 + }, + { + "entropy": 0.19463763926178218, + "epoch": 0.29820222926471973, + "grad_norm": 0.46839433908462524, + "learning_rate": 4.4255923106535034e-05, + "loss": 0.2037, + "mean_token_accuracy": 0.9382486924529075, + "num_tokens": 24073627.0, + "step": 5140 + }, + { + "entropy": 0.18102540001273154, + "epoch": 0.298782389243834, + "grad_norm": 0.4893419146537781, + "learning_rate": 4.42235949913951e-05, + "loss": 0.1853, + "mean_token_accuracy": 0.9463913284242154, + "num_tokens": 24116933.0, + "step": 5150 + }, + { + "entropy": 0.1964911294169724, + "epoch": 0.29936254922294825, + "grad_norm": 0.384427547454834, + "learning_rate": 4.41911880369805e-05, + "loss": 0.2087, + "mean_token_accuracy": 0.9356391303241253, + "num_tokens": 24164044.0, + "step": 5160 + }, + { + "entropy": 0.18902273941785097, + "epoch": 0.29994270920206245, + "grad_norm": 0.48850327730178833, + "learning_rate": 4.415870237619775e-05, + "loss": 0.2103, + "mean_token_accuracy": 0.9380629472434521, + "num_tokens": 24212472.0, + "step": 5170 + }, + { + "entropy": 0.2028914052993059, + "epoch": 0.3005228691811767, + "grad_norm": 0.42227962613105774, + "learning_rate": 4.412613814227613e-05, + "loss": 0.2189, + "mean_token_accuracy": 0.9385775402188301, + "num_tokens": 24267910.0, + "step": 5180 + }, + { + "entropy": 0.19062174456194042, + "epoch": 0.30110302916029097, + "grad_norm": 0.4292152523994446, + "learning_rate": 4.4093495468767176e-05, + "loss": 0.1987, + "mean_token_accuracy": 0.9425582364201546, + "num_tokens": 24314118.0, + "step": 5190 + }, + { + "entropy": 0.1960416967049241, + "epoch": 0.3016831891394052, + "grad_norm": 0.41508540511131287, + "learning_rate": 4.406077448954411e-05, + "loss": 0.2177, + "mean_token_accuracy": 0.9369618482887745, + "num_tokens": 24364894.0, + "step": 5200 + }, + { + "entropy": 0.18834490990266203, + "epoch": 0.30226334911851943, + "grad_norm": 0.43025848269462585, + "learning_rate": 4.402797533880132e-05, + "loss": 0.2039, + "mean_token_accuracy": 0.9404608644545078, + "num_tokens": 24415726.0, + "step": 5210 + }, + { + "entropy": 0.19504495663568377, + "epoch": 0.3028435090976337, + "grad_norm": 0.48916810750961304, + "learning_rate": 4.3995098151053747e-05, + "loss": 0.2095, + "mean_token_accuracy": 0.9382291570305824, + "num_tokens": 24464831.0, + "step": 5220 + }, + { + "entropy": 0.19366464214399456, + "epoch": 0.3034236690767479, + "grad_norm": 0.428820937871933, + "learning_rate": 4.396214306113643e-05, + "loss": 0.2081, + "mean_token_accuracy": 0.9382305301725864, + "num_tokens": 24512296.0, + "step": 5230 + }, + { + "entropy": 0.1913530702702701, + "epoch": 0.30400382905586215, + "grad_norm": 0.3594709634780884, + "learning_rate": 4.392911020420386e-05, + "loss": 0.2036, + "mean_token_accuracy": 0.9412068620324134, + "num_tokens": 24564693.0, + "step": 5240 + }, + { + "entropy": 0.18619473101571202, + "epoch": 0.3045839890349764, + "grad_norm": 0.4140602946281433, + "learning_rate": 4.389599971572946e-05, + "loss": 0.1999, + "mean_token_accuracy": 0.9420188933610916, + "num_tokens": 24612841.0, + "step": 5250 + }, + { + "entropy": 0.17897500726394355, + "epoch": 0.3051641490140906, + "grad_norm": 0.4587506651878357, + "learning_rate": 4.386281173150506e-05, + "loss": 0.1883, + "mean_token_accuracy": 0.9425855837762356, + "num_tokens": 24659713.0, + "step": 5260 + }, + { + "entropy": 0.1911114257760346, + "epoch": 0.30574430899320487, + "grad_norm": 0.4702602028846741, + "learning_rate": 4.382954638764031e-05, + "loss": 0.2044, + "mean_token_accuracy": 0.9374974519014359, + "num_tokens": 24705065.0, + "step": 5270 + }, + { + "entropy": 0.19082235293462874, + "epoch": 0.30632446897231913, + "grad_norm": 0.582396388053894, + "learning_rate": 4.3796203820562084e-05, + "loss": 0.2006, + "mean_token_accuracy": 0.9405093386769294, + "num_tokens": 24752202.0, + "step": 5280 + }, + { + "entropy": 0.19576047947630287, + "epoch": 0.30690462895143333, + "grad_norm": 0.45986616611480713, + "learning_rate": 4.376278416701403e-05, + "loss": 0.1989, + "mean_token_accuracy": 0.9394418343901634, + "num_tokens": 24800696.0, + "step": 5290 + }, + { + "entropy": 0.1922426424920559, + "epoch": 0.3074847889305476, + "grad_norm": 0.47855502367019653, + "learning_rate": 4.372928756405589e-05, + "loss": 0.2041, + "mean_token_accuracy": 0.9388028778135776, + "num_tokens": 24847877.0, + "step": 5300 + }, + { + "entropy": 0.19925200743600727, + "epoch": 0.30806494890966185, + "grad_norm": 0.4357258677482605, + "learning_rate": 4.369571414906299e-05, + "loss": 0.2108, + "mean_token_accuracy": 0.9370548762381077, + "num_tokens": 24899804.0, + "step": 5310 + }, + { + "entropy": 0.17894351752474905, + "epoch": 0.30864510888877605, + "grad_norm": 0.4799479842185974, + "learning_rate": 4.366206405972571e-05, + "loss": 0.193, + "mean_token_accuracy": 0.9400643669068813, + "num_tokens": 24941835.0, + "step": 5320 + }, + { + "entropy": 0.18642524480819703, + "epoch": 0.3092252688678903, + "grad_norm": 0.5121386051177979, + "learning_rate": 4.362833743404885e-05, + "loss": 0.198, + "mean_token_accuracy": 0.9434010744094848, + "num_tokens": 24984794.0, + "step": 5330 + }, + { + "entropy": 0.17932178396731616, + "epoch": 0.30980542884700457, + "grad_norm": 0.3637288212776184, + "learning_rate": 4.3594534410351105e-05, + "loss": 0.1889, + "mean_token_accuracy": 0.9430772311985492, + "num_tokens": 25031352.0, + "step": 5340 + }, + { + "entropy": 0.18821288626641036, + "epoch": 0.3103855888261188, + "grad_norm": 0.4590301215648651, + "learning_rate": 4.356065512726452e-05, + "loss": 0.2125, + "mean_token_accuracy": 0.9384908527135849, + "num_tokens": 25074818.0, + "step": 5350 + }, + { + "entropy": 0.17901689233258367, + "epoch": 0.31096574880523303, + "grad_norm": 0.5150878429412842, + "learning_rate": 4.352669972373385e-05, + "loss": 0.1853, + "mean_token_accuracy": 0.94421561434865, + "num_tokens": 25120154.0, + "step": 5360 + }, + { + "entropy": 0.18443326326087117, + "epoch": 0.3115459087843473, + "grad_norm": 0.5095081925392151, + "learning_rate": 4.349266833901607e-05, + "loss": 0.1989, + "mean_token_accuracy": 0.9388502359390258, + "num_tokens": 25162154.0, + "step": 5370 + }, + { + "entropy": 0.19005813486874104, + "epoch": 0.31212606876346155, + "grad_norm": 0.4360039234161377, + "learning_rate": 4.3458561112679754e-05, + "loss": 0.2058, + "mean_token_accuracy": 0.9398770056664943, + "num_tokens": 25210053.0, + "step": 5380 + }, + { + "entropy": 0.1855242455378175, + "epoch": 0.31270622874257575, + "grad_norm": 0.5857658386230469, + "learning_rate": 4.342437818460449e-05, + "loss": 0.2132, + "mean_token_accuracy": 0.9398932121694088, + "num_tokens": 25254911.0, + "step": 5390 + }, + { + "entropy": 0.2107409192249179, + "epoch": 0.31328638872169, + "grad_norm": 0.44289225339889526, + "learning_rate": 4.339011969498038e-05, + "loss": 0.2167, + "mean_token_accuracy": 0.9340468741953373, + "num_tokens": 25301051.0, + "step": 5400 + }, + { + "entropy": 0.20040012374520302, + "epoch": 0.31386654870080427, + "grad_norm": 0.48400241136550903, + "learning_rate": 4.335578578430737e-05, + "loss": 0.2114, + "mean_token_accuracy": 0.9372204139828682, + "num_tokens": 25349588.0, + "step": 5410 + }, + { + "entropy": 0.18364614695310594, + "epoch": 0.3144467086799185, + "grad_norm": 0.42406517267227173, + "learning_rate": 4.3321376593394755e-05, + "loss": 0.1933, + "mean_token_accuracy": 0.9412652783095836, + "num_tokens": 25395645.0, + "step": 5420 + }, + { + "entropy": 0.18526196079328655, + "epoch": 0.31502686865903273, + "grad_norm": 0.5867842435836792, + "learning_rate": 4.3286892263360546e-05, + "loss": 0.197, + "mean_token_accuracy": 0.942443060874939, + "num_tokens": 25447213.0, + "step": 5430 + }, + { + "entropy": 0.19685105476528406, + "epoch": 0.315607028638147, + "grad_norm": 0.3856854736804962, + "learning_rate": 4.325233293563094e-05, + "loss": 0.2034, + "mean_token_accuracy": 0.9367602445185185, + "num_tokens": 25492942.0, + "step": 5440 + }, + { + "entropy": 0.18892899230122567, + "epoch": 0.3161871886172612, + "grad_norm": 0.5054187774658203, + "learning_rate": 4.3217698751939674e-05, + "loss": 0.2, + "mean_token_accuracy": 0.9399071708321571, + "num_tokens": 25539868.0, + "step": 5450 + }, + { + "entropy": 0.18678578473627566, + "epoch": 0.31676734859637545, + "grad_norm": 0.6719598770141602, + "learning_rate": 4.3182989854327507e-05, + "loss": 0.2043, + "mean_token_accuracy": 0.9428942665457726, + "num_tokens": 25584393.0, + "step": 5460 + }, + { + "entropy": 0.18429280910640955, + "epoch": 0.3173475085754897, + "grad_norm": 0.4357883930206299, + "learning_rate": 4.314820638514161e-05, + "loss": 0.1926, + "mean_token_accuracy": 0.9418228790163994, + "num_tokens": 25633833.0, + "step": 5470 + }, + { + "entropy": 0.18343933243304492, + "epoch": 0.3179276685546039, + "grad_norm": 0.47512537240982056, + "learning_rate": 4.311334848703498e-05, + "loss": 0.1986, + "mean_token_accuracy": 0.9387409679591656, + "num_tokens": 25679158.0, + "step": 5480 + }, + { + "entropy": 0.17572011519223452, + "epoch": 0.3185078285337182, + "grad_norm": 0.4089210331439972, + "learning_rate": 4.307841630296585e-05, + "loss": 0.1883, + "mean_token_accuracy": 0.9426230058073998, + "num_tokens": 25722774.0, + "step": 5490 + }, + { + "entropy": 0.18530759969726204, + "epoch": 0.31908798851283243, + "grad_norm": 0.3985731303691864, + "learning_rate": 4.304340997619716e-05, + "loss": 0.2025, + "mean_token_accuracy": 0.9397201985120773, + "num_tokens": 25767790.0, + "step": 5500 + }, + { + "entropy": 0.1900131364353001, + "epoch": 0.31966814849194664, + "grad_norm": 0.49463513493537903, + "learning_rate": 4.300832965029585e-05, + "loss": 0.2001, + "mean_token_accuracy": 0.9413102716207504, + "num_tokens": 25812671.0, + "step": 5510 + }, + { + "entropy": 0.18818023400381206, + "epoch": 0.3202483084710609, + "grad_norm": 0.41859859228134155, + "learning_rate": 4.297317546913239e-05, + "loss": 0.1993, + "mean_token_accuracy": 0.9397134833037853, + "num_tokens": 25863929.0, + "step": 5520 + }, + { + "entropy": 0.18623900953680278, + "epoch": 0.32082846845017515, + "grad_norm": 0.5179916024208069, + "learning_rate": 4.293794757688015e-05, + "loss": 0.2071, + "mean_token_accuracy": 0.9388180360198021, + "num_tokens": 25914029.0, + "step": 5530 + }, + { + "entropy": 0.1984161018859595, + "epoch": 0.32140862842928936, + "grad_norm": 0.46212202310562134, + "learning_rate": 4.290264611801475e-05, + "loss": 0.2, + "mean_token_accuracy": 0.9389272019267082, + "num_tokens": 25961544.0, + "step": 5540 + }, + { + "entropy": 0.19867049902677536, + "epoch": 0.3219887884084036, + "grad_norm": 0.4526830315589905, + "learning_rate": 4.2867271237313596e-05, + "loss": 0.2039, + "mean_token_accuracy": 0.9377216011285782, + "num_tokens": 26010294.0, + "step": 5550 + }, + { + "entropy": 0.18333152309060097, + "epoch": 0.3225689483875179, + "grad_norm": 0.43515512347221375, + "learning_rate": 4.283182307985514e-05, + "loss": 0.1937, + "mean_token_accuracy": 0.9425471216440201, + "num_tokens": 26052806.0, + "step": 5560 + }, + { + "entropy": 0.1664991958066821, + "epoch": 0.3231491083666321, + "grad_norm": 0.44685444235801697, + "learning_rate": 4.2796301791018386e-05, + "loss": 0.1778, + "mean_token_accuracy": 0.9467088744044304, + "num_tokens": 26092111.0, + "step": 5570 + }, + { + "entropy": 0.1913020852021873, + "epoch": 0.32372926834574633, + "grad_norm": 0.5555086731910706, + "learning_rate": 4.2760707516482254e-05, + "loss": 0.2045, + "mean_token_accuracy": 0.9390090145170689, + "num_tokens": 26135696.0, + "step": 5580 + }, + { + "entropy": 0.17753601390868426, + "epoch": 0.3243094283248606, + "grad_norm": 0.4322827458381653, + "learning_rate": 4.272504040222502e-05, + "loss": 0.1923, + "mean_token_accuracy": 0.9439040325582028, + "num_tokens": 26178321.0, + "step": 5590 + }, + { + "entropy": 0.18513043140992522, + "epoch": 0.3248895883039748, + "grad_norm": 0.4772743582725525, + "learning_rate": 4.268930059452364e-05, + "loss": 0.1992, + "mean_token_accuracy": 0.9405495315790177, + "num_tokens": 26228880.0, + "step": 5600 + }, + { + "entropy": 0.18025170573964716, + "epoch": 0.32546974828308906, + "grad_norm": 0.44736286997795105, + "learning_rate": 4.2653488239953234e-05, + "loss": 0.1952, + "mean_token_accuracy": 0.9416764236986637, + "num_tokens": 26276959.0, + "step": 5610 + }, + { + "entropy": 0.19205641224980355, + "epoch": 0.3260499082622033, + "grad_norm": 0.46423229575157166, + "learning_rate": 4.261760348538642e-05, + "loss": 0.2052, + "mean_token_accuracy": 0.9369760736823082, + "num_tokens": 26320633.0, + "step": 5620 + }, + { + "entropy": 0.1962360733188689, + "epoch": 0.3266300682413175, + "grad_norm": 0.4633448123931885, + "learning_rate": 4.258164647799278e-05, + "loss": 0.2146, + "mean_token_accuracy": 0.9395989067852497, + "num_tokens": 26368154.0, + "step": 5630 + }, + { + "entropy": 0.19138576313853264, + "epoch": 0.3272102282204318, + "grad_norm": 0.44543471932411194, + "learning_rate": 4.254561736523819e-05, + "loss": 0.2135, + "mean_token_accuracy": 0.9396027751266957, + "num_tokens": 26416512.0, + "step": 5640 + }, + { + "entropy": 0.18187291650101542, + "epoch": 0.32779038819954603, + "grad_norm": 0.5173605680465698, + "learning_rate": 4.2509516294884235e-05, + "loss": 0.1931, + "mean_token_accuracy": 0.9415681891143322, + "num_tokens": 26462785.0, + "step": 5650 + }, + { + "entropy": 0.17933684457093477, + "epoch": 0.3283705481786603, + "grad_norm": 0.47270870208740234, + "learning_rate": 4.2473343414987634e-05, + "loss": 0.2002, + "mean_token_accuracy": 0.9425729550421238, + "num_tokens": 26510721.0, + "step": 5660 + }, + { + "entropy": 0.1832438691984862, + "epoch": 0.3289507081577745, + "grad_norm": 0.43374529480934143, + "learning_rate": 4.243709887389961e-05, + "loss": 0.1982, + "mean_token_accuracy": 0.9429618306457996, + "num_tokens": 26559759.0, + "step": 5670 + }, + { + "entropy": 0.20096280463039876, + "epoch": 0.32953086813688875, + "grad_norm": 0.4954383969306946, + "learning_rate": 4.240078282026524e-05, + "loss": 0.2076, + "mean_token_accuracy": 0.937217416614294, + "num_tokens": 26606991.0, + "step": 5680 + }, + { + "entropy": 0.19580697165802122, + "epoch": 0.330111028116003, + "grad_norm": 0.5028213858604431, + "learning_rate": 4.236439540302295e-05, + "loss": 0.2045, + "mean_token_accuracy": 0.9370015300810337, + "num_tokens": 26657720.0, + "step": 5690 + }, + { + "entropy": 0.1787126907147467, + "epoch": 0.3306911880951172, + "grad_norm": 0.5304685235023499, + "learning_rate": 4.2327936771403785e-05, + "loss": 0.1984, + "mean_token_accuracy": 0.9410437673330307, + "num_tokens": 26699697.0, + "step": 5700 + }, + { + "entropy": 0.18522155405953528, + "epoch": 0.3312713480742315, + "grad_norm": 0.44366881251335144, + "learning_rate": 4.229140707493088e-05, + "loss": 0.194, + "mean_token_accuracy": 0.9422594718635082, + "num_tokens": 26744211.0, + "step": 5710 + }, + { + "entropy": 0.19238233100622892, + "epoch": 0.33185150805334573, + "grad_norm": 0.4870074689388275, + "learning_rate": 4.2254806463418804e-05, + "loss": 0.2153, + "mean_token_accuracy": 0.9377231776714325, + "num_tokens": 26789679.0, + "step": 5720 + }, + { + "entropy": 0.19302238039672376, + "epoch": 0.33243166803245994, + "grad_norm": 0.44375544786453247, + "learning_rate": 4.221813508697299e-05, + "loss": 0.2114, + "mean_token_accuracy": 0.9414068318903446, + "num_tokens": 26837444.0, + "step": 5730 + }, + { + "entropy": 0.18404998490586877, + "epoch": 0.3330118280115742, + "grad_norm": 0.39567533135414124, + "learning_rate": 4.218139309598905e-05, + "loss": 0.1964, + "mean_token_accuracy": 0.9392804376780987, + "num_tokens": 26886536.0, + "step": 5740 + }, + { + "entropy": 0.19031182350590825, + "epoch": 0.33359198799068845, + "grad_norm": 0.475467711687088, + "learning_rate": 4.214458064115223e-05, + "loss": 0.2091, + "mean_token_accuracy": 0.9375824384391308, + "num_tokens": 26935166.0, + "step": 5750 + }, + { + "entropy": 0.19999441346153618, + "epoch": 0.33417214796980266, + "grad_norm": 0.4771309494972229, + "learning_rate": 4.210769787343675e-05, + "loss": 0.21, + "mean_token_accuracy": 0.9378918506205082, + "num_tokens": 26987677.0, + "step": 5760 + }, + { + "entropy": 0.18080097828060387, + "epoch": 0.3347523079489169, + "grad_norm": 0.42649853229522705, + "learning_rate": 4.2070744944105183e-05, + "loss": 0.2056, + "mean_token_accuracy": 0.9422137275338173, + "num_tokens": 27033743.0, + "step": 5770 + }, + { + "entropy": 0.19062768314033746, + "epoch": 0.3353324679280312, + "grad_norm": 0.4450823962688446, + "learning_rate": 4.203372200470787e-05, + "loss": 0.1929, + "mean_token_accuracy": 0.9404326431453228, + "num_tokens": 27083420.0, + "step": 5780 + }, + { + "entropy": 0.17763711158186196, + "epoch": 0.3359126279071454, + "grad_norm": 0.49232926964759827, + "learning_rate": 4.199662920708225e-05, + "loss": 0.1906, + "mean_token_accuracy": 0.9427829816937446, + "num_tokens": 27126054.0, + "step": 5790 + }, + { + "entropy": 0.19098413651809096, + "epoch": 0.33649278788625964, + "grad_norm": 0.45811927318573, + "learning_rate": 4.19594667033523e-05, + "loss": 0.2019, + "mean_token_accuracy": 0.9406383410096169, + "num_tokens": 27177444.0, + "step": 5800 + }, + { + "entropy": 0.1813780483789742, + "epoch": 0.3370729478653739, + "grad_norm": 0.5376542806625366, + "learning_rate": 4.192223464592781e-05, + "loss": 0.186, + "mean_token_accuracy": 0.9424885831773281, + "num_tokens": 27222365.0, + "step": 5810 + }, + { + "entropy": 0.17860100772231818, + "epoch": 0.3376531078444881, + "grad_norm": 0.4458209276199341, + "learning_rate": 4.18849331875039e-05, + "loss": 0.1946, + "mean_token_accuracy": 0.9407327003777027, + "num_tokens": 27270935.0, + "step": 5820 + }, + { + "entropy": 0.18453552899882197, + "epoch": 0.33823326782360236, + "grad_norm": 0.43578699231147766, + "learning_rate": 4.1847562481060265e-05, + "loss": 0.1984, + "mean_token_accuracy": 0.9419894121587277, + "num_tokens": 27318811.0, + "step": 5830 + }, + { + "entropy": 0.18685418805107473, + "epoch": 0.3388134278027166, + "grad_norm": 0.465982586145401, + "learning_rate": 4.181012267986061e-05, + "loss": 0.1958, + "mean_token_accuracy": 0.9427566863596439, + "num_tokens": 27369947.0, + "step": 5840 + }, + { + "entropy": 0.18346712393686176, + "epoch": 0.3393935877818308, + "grad_norm": 0.6134108304977417, + "learning_rate": 4.1772613937451996e-05, + "loss": 0.1846, + "mean_token_accuracy": 0.9413300812244415, + "num_tokens": 27412654.0, + "step": 5850 + }, + { + "entropy": 0.17814030647277831, + "epoch": 0.3399737477609451, + "grad_norm": 0.5343588590621948, + "learning_rate": 4.173503640766425e-05, + "loss": 0.1947, + "mean_token_accuracy": 0.9394714653491973, + "num_tokens": 27459393.0, + "step": 5860 + }, + { + "entropy": 0.1877170274965465, + "epoch": 0.34055390774005934, + "grad_norm": 0.4850185513496399, + "learning_rate": 4.169739024460929e-05, + "loss": 0.2028, + "mean_token_accuracy": 0.9398059688508511, + "num_tokens": 27505210.0, + "step": 5870 + }, + { + "entropy": 0.19109226558357478, + "epoch": 0.34113406771917354, + "grad_norm": 0.49941664934158325, + "learning_rate": 4.1659675602680525e-05, + "loss": 0.2022, + "mean_token_accuracy": 0.9397695355117321, + "num_tokens": 27552283.0, + "step": 5880 + }, + { + "entropy": 0.18384080845862627, + "epoch": 0.3417142276982878, + "grad_norm": 0.5075311660766602, + "learning_rate": 4.1621892636552185e-05, + "loss": 0.1971, + "mean_token_accuracy": 0.9411305837333203, + "num_tokens": 27597168.0, + "step": 5890 + }, + { + "entropy": 0.17930453950539232, + "epoch": 0.34229438767740206, + "grad_norm": 0.4375913143157959, + "learning_rate": 4.158404150117873e-05, + "loss": 0.1948, + "mean_token_accuracy": 0.9431046262383461, + "num_tokens": 27646810.0, + "step": 5900 + }, + { + "entropy": 0.18102289624512197, + "epoch": 0.34287454765651626, + "grad_norm": 0.3563831150531769, + "learning_rate": 4.1546122351794195e-05, + "loss": 0.1988, + "mean_token_accuracy": 0.9404317557811737, + "num_tokens": 27696585.0, + "step": 5910 + }, + { + "entropy": 0.17202505473978819, + "epoch": 0.3434547076356305, + "grad_norm": 0.4850895404815674, + "learning_rate": 4.150813534391152e-05, + "loss": 0.1859, + "mean_token_accuracy": 0.9456000126898289, + "num_tokens": 27740639.0, + "step": 5920 + }, + { + "entropy": 0.18649025894701482, + "epoch": 0.3440348676147448, + "grad_norm": 0.4928680658340454, + "learning_rate": 4.1470080633321994e-05, + "loss": 0.2042, + "mean_token_accuracy": 0.9412629194557667, + "num_tokens": 27783755.0, + "step": 5930 + }, + { + "entropy": 0.1742583812214434, + "epoch": 0.344615027593859, + "grad_norm": 0.4543076157569885, + "learning_rate": 4.143195837609454e-05, + "loss": 0.1853, + "mean_token_accuracy": 0.9457469046115875, + "num_tokens": 27824220.0, + "step": 5940 + }, + { + "entropy": 0.1858961084857583, + "epoch": 0.34519518757297324, + "grad_norm": 0.48219993710517883, + "learning_rate": 4.139376872857508e-05, + "loss": 0.1955, + "mean_token_accuracy": 0.9415961012244225, + "num_tokens": 27872507.0, + "step": 5950 + }, + { + "entropy": 0.18115026894956826, + "epoch": 0.3457753475520875, + "grad_norm": 0.4846329987049103, + "learning_rate": 4.135551184738594e-05, + "loss": 0.1992, + "mean_token_accuracy": 0.9413276076316833, + "num_tokens": 27915623.0, + "step": 5960 + }, + { + "entropy": 0.18825820209458471, + "epoch": 0.34635550753120176, + "grad_norm": 0.42593616247177124, + "learning_rate": 4.1317187889425214e-05, + "loss": 0.1964, + "mean_token_accuracy": 0.9395013928413392, + "num_tokens": 27963749.0, + "step": 5970 + }, + { + "entropy": 0.18870370658114552, + "epoch": 0.34693566751031596, + "grad_norm": 0.5161728262901306, + "learning_rate": 4.127879701186601e-05, + "loss": 0.1984, + "mean_token_accuracy": 0.9395153321325779, + "num_tokens": 28010908.0, + "step": 5980 + }, + { + "entropy": 0.17825634283944963, + "epoch": 0.3475158274894302, + "grad_norm": 0.4234047532081604, + "learning_rate": 4.124033937215596e-05, + "loss": 0.1962, + "mean_token_accuracy": 0.9407003708183765, + "num_tokens": 28054706.0, + "step": 5990 + }, + { + "entropy": 0.17384292539209129, + "epoch": 0.3480959874685445, + "grad_norm": 0.4276653528213501, + "learning_rate": 4.1201815128016464e-05, + "loss": 0.1914, + "mean_token_accuracy": 0.9448108226060867, + "num_tokens": 28100546.0, + "step": 6000 + }, + { + "entropy": 0.18779370598495007, + "epoch": 0.3486761474476587, + "grad_norm": 0.4864109754562378, + "learning_rate": 4.116322443744208e-05, + "loss": 0.1994, + "mean_token_accuracy": 0.9404923714697361, + "num_tokens": 28146991.0, + "step": 6010 + }, + { + "entropy": 0.1904761590063572, + "epoch": 0.34925630742677294, + "grad_norm": 0.4790533781051636, + "learning_rate": 4.1124567458699883e-05, + "loss": 0.2029, + "mean_token_accuracy": 0.9388478018343449, + "num_tokens": 28197106.0, + "step": 6020 + }, + { + "entropy": 0.16955198887735606, + "epoch": 0.3498364674058872, + "grad_norm": 0.3631395399570465, + "learning_rate": 4.108584435032879e-05, + "loss": 0.1776, + "mean_token_accuracy": 0.9451467052102089, + "num_tokens": 28238612.0, + "step": 6030 + }, + { + "entropy": 0.18991271844133734, + "epoch": 0.3504166273850014, + "grad_norm": 0.41851806640625, + "learning_rate": 4.1047055271138945e-05, + "loss": 0.205, + "mean_token_accuracy": 0.9383891329169274, + "num_tokens": 28292022.0, + "step": 6040 + }, + { + "entropy": 0.19310937114059926, + "epoch": 0.35099678736411566, + "grad_norm": 0.39950641989707947, + "learning_rate": 4.100820038021105e-05, + "loss": 0.2017, + "mean_token_accuracy": 0.9411644205451012, + "num_tokens": 28340594.0, + "step": 6050 + }, + { + "entropy": 0.1857264949940145, + "epoch": 0.3515769473432299, + "grad_norm": 0.4625358581542969, + "learning_rate": 4.096927983689571e-05, + "loss": 0.1946, + "mean_token_accuracy": 0.9404945522546768, + "num_tokens": 28384991.0, + "step": 6060 + }, + { + "entropy": 0.17652665302157403, + "epoch": 0.3521571073223441, + "grad_norm": 0.5476397275924683, + "learning_rate": 4.093029380081276e-05, + "loss": 0.1892, + "mean_token_accuracy": 0.9429823912680149, + "num_tokens": 28427819.0, + "step": 6070 + }, + { + "entropy": 0.18392771417275072, + "epoch": 0.3527372673014584, + "grad_norm": 0.5096374750137329, + "learning_rate": 4.089124243185066e-05, + "loss": 0.2019, + "mean_token_accuracy": 0.9380778096616268, + "num_tokens": 28476553.0, + "step": 6080 + }, + { + "entropy": 0.18134585544466972, + "epoch": 0.35331742728057264, + "grad_norm": 0.37714871764183044, + "learning_rate": 4.085212589016581e-05, + "loss": 0.1953, + "mean_token_accuracy": 0.9412830851972103, + "num_tokens": 28525466.0, + "step": 6090 + }, + { + "entropy": 0.19747757213190198, + "epoch": 0.35389758725968684, + "grad_norm": 0.4157574772834778, + "learning_rate": 4.081294433618187e-05, + "loss": 0.2225, + "mean_token_accuracy": 0.9366738103330136, + "num_tokens": 28576005.0, + "step": 6100 + }, + { + "entropy": 0.1794060418382287, + "epoch": 0.3544777472388011, + "grad_norm": 0.43961596488952637, + "learning_rate": 4.077369793058916e-05, + "loss": 0.1804, + "mean_token_accuracy": 0.9430822230875492, + "num_tokens": 28624266.0, + "step": 6110 + }, + { + "entropy": 0.1786673218011856, + "epoch": 0.35505790721791536, + "grad_norm": 0.498909592628479, + "learning_rate": 4.073438683434393e-05, + "loss": 0.191, + "mean_token_accuracy": 0.9418392173945904, + "num_tokens": 28672177.0, + "step": 6120 + }, + { + "entropy": 0.18425078755244612, + "epoch": 0.35563806719702956, + "grad_norm": 0.44084709882736206, + "learning_rate": 4.069501120866778e-05, + "loss": 0.1978, + "mean_token_accuracy": 0.9400599762797356, + "num_tokens": 28721186.0, + "step": 6130 + }, + { + "entropy": 0.17669694437645375, + "epoch": 0.3562182271761438, + "grad_norm": 0.48375892639160156, + "learning_rate": 4.0655571215046905e-05, + "loss": 0.181, + "mean_token_accuracy": 0.9449338369071484, + "num_tokens": 28768464.0, + "step": 6140 + }, + { + "entropy": 0.18262583585456013, + "epoch": 0.3567983871552581, + "grad_norm": 0.364143043756485, + "learning_rate": 4.061606701523154e-05, + "loss": 0.1951, + "mean_token_accuracy": 0.9414925843477249, + "num_tokens": 28815038.0, + "step": 6150 + }, + { + "entropy": 0.179447390884161, + "epoch": 0.3573785471343723, + "grad_norm": 0.5123677253723145, + "learning_rate": 4.05764987712352e-05, + "loss": 0.2009, + "mean_token_accuracy": 0.9397066980600357, + "num_tokens": 28858582.0, + "step": 6160 + }, + { + "entropy": 0.18513180809095503, + "epoch": 0.35795870711348654, + "grad_norm": 0.40458229184150696, + "learning_rate": 4.053686664533408e-05, + "loss": 0.1937, + "mean_token_accuracy": 0.9416390374302864, + "num_tokens": 28907477.0, + "step": 6170 + }, + { + "entropy": 0.19263990381732582, + "epoch": 0.3585388670926008, + "grad_norm": 0.411726176738739, + "learning_rate": 4.0497170800066346e-05, + "loss": 0.202, + "mean_token_accuracy": 0.9385622903704643, + "num_tokens": 28955618.0, + "step": 6180 + }, + { + "entropy": 0.17016669930890202, + "epoch": 0.359119027071715, + "grad_norm": 0.5707817077636719, + "learning_rate": 4.0457411398231503e-05, + "loss": 0.1805, + "mean_token_accuracy": 0.9423522904515267, + "num_tokens": 29001093.0, + "step": 6190 + }, + { + "entropy": 0.17696984810754657, + "epoch": 0.35969918705082926, + "grad_norm": 0.4820272922515869, + "learning_rate": 4.0417588602889705e-05, + "loss": 0.1867, + "mean_token_accuracy": 0.9432531513273716, + "num_tokens": 29044438.0, + "step": 6200 + }, + { + "entropy": 0.2008556004613638, + "epoch": 0.3602793470299435, + "grad_norm": 0.44081419706344604, + "learning_rate": 4.037770257736109e-05, + "loss": 0.2068, + "mean_token_accuracy": 0.937172657251358, + "num_tokens": 29091516.0, + "step": 6210 + }, + { + "entropy": 0.17387844063341618, + "epoch": 0.3608595070090577, + "grad_norm": 0.4054587185382843, + "learning_rate": 4.033775348522514e-05, + "loss": 0.1854, + "mean_token_accuracy": 0.9449115253984928, + "num_tokens": 29136046.0, + "step": 6220 + }, + { + "entropy": 0.17367928046733142, + "epoch": 0.361439666988172, + "grad_norm": 0.5486534833908081, + "learning_rate": 4.029774149031994e-05, + "loss": 0.1898, + "mean_token_accuracy": 0.9416969545185566, + "num_tokens": 29180644.0, + "step": 6230 + }, + { + "entropy": 0.1829654837027192, + "epoch": 0.36201982696728624, + "grad_norm": 0.43673497438430786, + "learning_rate": 4.025766675674158e-05, + "loss": 0.195, + "mean_token_accuracy": 0.942721726000309, + "num_tokens": 29226628.0, + "step": 6240 + }, + { + "entropy": 0.1897281812503934, + "epoch": 0.36259998694640044, + "grad_norm": 0.4111742377281189, + "learning_rate": 4.021752944884346e-05, + "loss": 0.1953, + "mean_token_accuracy": 0.9398799039423466, + "num_tokens": 29276334.0, + "step": 6250 + }, + { + "entropy": 0.20372099364176394, + "epoch": 0.3631801469255147, + "grad_norm": 0.49731412529945374, + "learning_rate": 4.017732973123558e-05, + "loss": 0.221, + "mean_token_accuracy": 0.9330419234931469, + "num_tokens": 29326609.0, + "step": 6260 + }, + { + "entropy": 0.19235314233228565, + "epoch": 0.36376030690462896, + "grad_norm": 0.46285343170166016, + "learning_rate": 4.01370677687839e-05, + "loss": 0.1996, + "mean_token_accuracy": 0.940583098679781, + "num_tokens": 29375884.0, + "step": 6270 + }, + { + "entropy": 0.1927501330152154, + "epoch": 0.3643404668837432, + "grad_norm": 0.4979090094566345, + "learning_rate": 4.009674372660969e-05, + "loss": 0.2146, + "mean_token_accuracy": 0.9385734669864177, + "num_tokens": 29423395.0, + "step": 6280 + }, + { + "entropy": 0.18268118603155017, + "epoch": 0.3649206268628574, + "grad_norm": 0.40441012382507324, + "learning_rate": 4.005635777008877e-05, + "loss": 0.1924, + "mean_token_accuracy": 0.940732141584158, + "num_tokens": 29469220.0, + "step": 6290 + }, + { + "entropy": 0.17607199000194668, + "epoch": 0.3655007868419717, + "grad_norm": 0.4806465208530426, + "learning_rate": 4.0015910064850914e-05, + "loss": 0.1911, + "mean_token_accuracy": 0.942966352403164, + "num_tokens": 29515289.0, + "step": 6300 + }, + { + "entropy": 0.18218752853572368, + "epoch": 0.36608094682108594, + "grad_norm": 0.4561408460140228, + "learning_rate": 3.997540077677913e-05, + "loss": 0.2012, + "mean_token_accuracy": 0.9405402541160583, + "num_tokens": 29557589.0, + "step": 6310 + }, + { + "entropy": 0.18351538581773638, + "epoch": 0.36666110680020014, + "grad_norm": 0.41122138500213623, + "learning_rate": 3.9934830072008986e-05, + "loss": 0.1924, + "mean_token_accuracy": 0.9420288681983948, + "num_tokens": 29607943.0, + "step": 6320 + }, + { + "entropy": 0.1753424869850278, + "epoch": 0.3672412667793144, + "grad_norm": 0.44864052534103394, + "learning_rate": 3.989419811692793e-05, + "loss": 0.1875, + "mean_token_accuracy": 0.9441530197858811, + "num_tokens": 29656872.0, + "step": 6330 + }, + { + "entropy": 0.17596827102825047, + "epoch": 0.36782142675842866, + "grad_norm": 0.4543643295764923, + "learning_rate": 3.985350507817461e-05, + "loss": 0.1885, + "mean_token_accuracy": 0.9427838534116745, + "num_tokens": 29702738.0, + "step": 6340 + }, + { + "entropy": 0.1821872666478157, + "epoch": 0.36840158673754286, + "grad_norm": 0.4841812551021576, + "learning_rate": 3.981275112263818e-05, + "loss": 0.1923, + "mean_token_accuracy": 0.9417825400829315, + "num_tokens": 29749752.0, + "step": 6350 + }, + { + "entropy": 0.1940952089615166, + "epoch": 0.3689817467166571, + "grad_norm": 0.40240734815597534, + "learning_rate": 3.977193641745764e-05, + "loss": 0.2062, + "mean_token_accuracy": 0.9375139743089675, + "num_tokens": 29804244.0, + "step": 6360 + }, + { + "entropy": 0.18420653892681002, + "epoch": 0.3695619066957714, + "grad_norm": 0.4521697163581848, + "learning_rate": 3.973106113002114e-05, + "loss": 0.1935, + "mean_token_accuracy": 0.942259457707405, + "num_tokens": 29852141.0, + "step": 6370 + }, + { + "entropy": 0.17153383535332978, + "epoch": 0.3701420666748856, + "grad_norm": 0.5134953260421753, + "learning_rate": 3.969012542796525e-05, + "loss": 0.1876, + "mean_token_accuracy": 0.9443728156387806, + "num_tokens": 29896862.0, + "step": 6380 + }, + { + "entropy": 0.1765799831598997, + "epoch": 0.37072222665399984, + "grad_norm": 0.4541497528553009, + "learning_rate": 3.964912947917435e-05, + "loss": 0.1894, + "mean_token_accuracy": 0.9434644296765328, + "num_tokens": 29942427.0, + "step": 6390 + }, + { + "entropy": 0.17869546404108405, + "epoch": 0.3713023866331141, + "grad_norm": 0.4203937351703644, + "learning_rate": 3.9608073451779894e-05, + "loss": 0.1923, + "mean_token_accuracy": 0.9433117769658566, + "num_tokens": 29992778.0, + "step": 6400 + }, + { + "entropy": 0.18394129881635307, + "epoch": 0.3718825466122283, + "grad_norm": 0.448595255613327, + "learning_rate": 3.9566957514159704e-05, + "loss": 0.2064, + "mean_token_accuracy": 0.9397278696298599, + "num_tokens": 30039985.0, + "step": 6410 + }, + { + "entropy": 0.1901105625554919, + "epoch": 0.37246270659134256, + "grad_norm": 0.4462684988975525, + "learning_rate": 3.952578183493734e-05, + "loss": 0.1998, + "mean_token_accuracy": 0.9406438216567039, + "num_tokens": 30090652.0, + "step": 6420 + }, + { + "entropy": 0.18052596263587475, + "epoch": 0.3730428665704568, + "grad_norm": 0.38858747482299805, + "learning_rate": 3.948454658298135e-05, + "loss": 0.1875, + "mean_token_accuracy": 0.9431588731706142, + "num_tokens": 30138707.0, + "step": 6430 + }, + { + "entropy": 0.1758591502904892, + "epoch": 0.373623026549571, + "grad_norm": 0.41893795132637024, + "learning_rate": 3.94432519274046e-05, + "loss": 0.1867, + "mean_token_accuracy": 0.941341532766819, + "num_tokens": 30190181.0, + "step": 6440 + }, + { + "entropy": 0.1724454050883651, + "epoch": 0.3742031865286853, + "grad_norm": 0.41733527183532715, + "learning_rate": 3.94018980375636e-05, + "loss": 0.1801, + "mean_token_accuracy": 0.9460037790238858, + "num_tokens": 30231550.0, + "step": 6450 + }, + { + "entropy": 0.1712999941781163, + "epoch": 0.37478334650779954, + "grad_norm": 0.3979969620704651, + "learning_rate": 3.936048508305776e-05, + "loss": 0.1828, + "mean_token_accuracy": 0.9439408771693707, + "num_tokens": 30278898.0, + "step": 6460 + }, + { + "entropy": 0.1756836327724159, + "epoch": 0.37536350648691374, + "grad_norm": 0.5756676197052002, + "learning_rate": 3.9319013233728763e-05, + "loss": 0.1874, + "mean_token_accuracy": 0.9428585924208164, + "num_tokens": 30325313.0, + "step": 6470 + }, + { + "entropy": 0.1842501800507307, + "epoch": 0.375943666466028, + "grad_norm": 0.43221214413642883, + "learning_rate": 3.927748265965978e-05, + "loss": 0.2029, + "mean_token_accuracy": 0.9393207125365735, + "num_tokens": 30374153.0, + "step": 6480 + }, + { + "entropy": 0.18189961416646838, + "epoch": 0.37652382644514226, + "grad_norm": 0.4583556652069092, + "learning_rate": 3.923589353117487e-05, + "loss": 0.1933, + "mean_token_accuracy": 0.9410657279193402, + "num_tokens": 30419344.0, + "step": 6490 + }, + { + "entropy": 0.18451149240136147, + "epoch": 0.37710398642425647, + "grad_norm": 0.5167770981788635, + "learning_rate": 3.91942460188382e-05, + "loss": 0.1979, + "mean_token_accuracy": 0.9433084264397621, + "num_tokens": 30463800.0, + "step": 6500 + }, + { + "entropy": 0.16674640378914773, + "epoch": 0.3776841464033707, + "grad_norm": 0.3506390452384949, + "learning_rate": 3.91525402934534e-05, + "loss": 0.1774, + "mean_token_accuracy": 0.945024161785841, + "num_tokens": 30503082.0, + "step": 6510 + }, + { + "entropy": 0.18598327990621327, + "epoch": 0.378264306382485, + "grad_norm": 0.3987070620059967, + "learning_rate": 3.911077652606284e-05, + "loss": 0.1978, + "mean_token_accuracy": 0.9415906891226768, + "num_tokens": 30551060.0, + "step": 6520 + }, + { + "entropy": 0.18557860478758811, + "epoch": 0.3788444663615992, + "grad_norm": 0.41282761096954346, + "learning_rate": 3.906895488794691e-05, + "loss": 0.1979, + "mean_token_accuracy": 0.940519816428423, + "num_tokens": 30598743.0, + "step": 6530 + }, + { + "entropy": 0.1818525324575603, + "epoch": 0.37942462634071344, + "grad_norm": 0.4710368514060974, + "learning_rate": 3.902707555062336e-05, + "loss": 0.1997, + "mean_token_accuracy": 0.9399299800395966, + "num_tokens": 30645822.0, + "step": 6540 + }, + { + "entropy": 0.18871624590829014, + "epoch": 0.3800047863198277, + "grad_norm": 0.4532318711280823, + "learning_rate": 3.898513868584658e-05, + "loss": 0.2053, + "mean_token_accuracy": 0.9393002271652222, + "num_tokens": 30694557.0, + "step": 6550 + }, + { + "entropy": 0.1718988874927163, + "epoch": 0.3805849462989419, + "grad_norm": 0.43795520067214966, + "learning_rate": 3.8943144465606845e-05, + "loss": 0.1817, + "mean_token_accuracy": 0.9448684252798557, + "num_tokens": 30742150.0, + "step": 6560 + }, + { + "entropy": 0.18731521684676408, + "epoch": 0.38116510627805616, + "grad_norm": 0.4024735689163208, + "learning_rate": 3.890109306212971e-05, + "loss": 0.2019, + "mean_token_accuracy": 0.9414200395345688, + "num_tokens": 30790734.0, + "step": 6570 + }, + { + "entropy": 0.18132814140990378, + "epoch": 0.3817452662571704, + "grad_norm": 0.49794289469718933, + "learning_rate": 3.885898464787523e-05, + "loss": 0.1932, + "mean_token_accuracy": 0.9435012213885784, + "num_tokens": 30834851.0, + "step": 6580 + }, + { + "entropy": 0.1773290304467082, + "epoch": 0.3823254262362847, + "grad_norm": 0.5125042796134949, + "learning_rate": 3.881681939553725e-05, + "loss": 0.1974, + "mean_token_accuracy": 0.941772072762251, + "num_tokens": 30883420.0, + "step": 6590 + }, + { + "entropy": 0.1636314954608679, + "epoch": 0.3829055862153989, + "grad_norm": 0.4270589053630829, + "learning_rate": 3.877459747804274e-05, + "loss": 0.1854, + "mean_token_accuracy": 0.9453643307089805, + "num_tokens": 30925494.0, + "step": 6600 + }, + { + "entropy": 0.18031403087079526, + "epoch": 0.38348574619451314, + "grad_norm": 0.5391045808792114, + "learning_rate": 3.8732319068551055e-05, + "loss": 0.1857, + "mean_token_accuracy": 0.9427533827722072, + "num_tokens": 30966382.0, + "step": 6610 + }, + { + "entropy": 0.17222106019034983, + "epoch": 0.3840659061736274, + "grad_norm": 0.44612446427345276, + "learning_rate": 3.868998434045323e-05, + "loss": 0.1922, + "mean_token_accuracy": 0.9446250841021537, + "num_tokens": 31013925.0, + "step": 6620 + }, + { + "entropy": 0.18110078247264028, + "epoch": 0.3846460661527416, + "grad_norm": 0.4327133893966675, + "learning_rate": 3.864759346737129e-05, + "loss": 0.1919, + "mean_token_accuracy": 0.9424110174179077, + "num_tokens": 31065264.0, + "step": 6630 + }, + { + "entropy": 0.18222592221572995, + "epoch": 0.38522622613185586, + "grad_norm": 0.3923996388912201, + "learning_rate": 3.860514662315751e-05, + "loss": 0.1954, + "mean_token_accuracy": 0.9416597224771976, + "num_tokens": 31113954.0, + "step": 6640 + }, + { + "entropy": 0.17981998762115836, + "epoch": 0.3858063861109701, + "grad_norm": 0.5243796110153198, + "learning_rate": 3.85626439818937e-05, + "loss": 0.1946, + "mean_token_accuracy": 0.9412631519138813, + "num_tokens": 31160593.0, + "step": 6650 + }, + { + "entropy": 0.1679622040130198, + "epoch": 0.3863865460900843, + "grad_norm": 0.47950565814971924, + "learning_rate": 3.852008571789051e-05, + "loss": 0.1935, + "mean_token_accuracy": 0.9427594177424907, + "num_tokens": 31204744.0, + "step": 6660 + }, + { + "entropy": 0.1831962681375444, + "epoch": 0.3869667060691986, + "grad_norm": 0.46405816078186035, + "learning_rate": 3.8477472005686724e-05, + "loss": 0.1909, + "mean_token_accuracy": 0.942121110856533, + "num_tokens": 31252309.0, + "step": 6670 + }, + { + "entropy": 0.18525166353210806, + "epoch": 0.38754686604831284, + "grad_norm": 0.49207213521003723, + "learning_rate": 3.843480302004851e-05, + "loss": 0.1947, + "mean_token_accuracy": 0.9407951846718788, + "num_tokens": 31301516.0, + "step": 6680 + }, + { + "entropy": 0.18375185485929252, + "epoch": 0.38812702602742705, + "grad_norm": 0.5122168064117432, + "learning_rate": 3.839207893596873e-05, + "loss": 0.1933, + "mean_token_accuracy": 0.9431557707488537, + "num_tokens": 31345760.0, + "step": 6690 + }, + { + "entropy": 0.17810683492571117, + "epoch": 0.3887071860065413, + "grad_norm": 0.6958397626876831, + "learning_rate": 3.834929992866622e-05, + "loss": 0.1945, + "mean_token_accuracy": 0.9428967341780663, + "num_tokens": 31390247.0, + "step": 6700 + }, + { + "entropy": 0.17605960816144944, + "epoch": 0.38928734598565556, + "grad_norm": 0.7244221568107605, + "learning_rate": 3.830646617358504e-05, + "loss": 0.1911, + "mean_token_accuracy": 0.9444792829453945, + "num_tokens": 31434732.0, + "step": 6710 + }, + { + "entropy": 0.17296987799927593, + "epoch": 0.38986750596476977, + "grad_norm": 0.47899818420410156, + "learning_rate": 3.826357784639382e-05, + "loss": 0.1834, + "mean_token_accuracy": 0.9450072765350341, + "num_tokens": 31480295.0, + "step": 6720 + }, + { + "entropy": 0.17829689485952258, + "epoch": 0.390447665943884, + "grad_norm": 0.4394999146461487, + "learning_rate": 3.822063512298496e-05, + "loss": 0.1859, + "mean_token_accuracy": 0.9435353308916092, + "num_tokens": 31526533.0, + "step": 6730 + }, + { + "entropy": 0.1747908434830606, + "epoch": 0.3910278259229983, + "grad_norm": 0.44079238176345825, + "learning_rate": 3.817763817947398e-05, + "loss": 0.1887, + "mean_token_accuracy": 0.9424167916178703, + "num_tokens": 31569042.0, + "step": 6740 + }, + { + "entropy": 0.18239487744867802, + "epoch": 0.3916079859021125, + "grad_norm": 0.39098280668258667, + "learning_rate": 3.8134587192198745e-05, + "loss": 0.1886, + "mean_token_accuracy": 0.9415286689996719, + "num_tokens": 31615363.0, + "step": 6750 + }, + { + "entropy": 0.17495393911376594, + "epoch": 0.39218814588122675, + "grad_norm": 0.3825701177120209, + "learning_rate": 3.809148233771876e-05, + "loss": 0.1857, + "mean_token_accuracy": 0.9446469381451607, + "num_tokens": 31664415.0, + "step": 6760 + }, + { + "entropy": 0.18106409395113587, + "epoch": 0.392768305860341, + "grad_norm": 0.48030486702919006, + "learning_rate": 3.8048323792814486e-05, + "loss": 0.2019, + "mean_token_accuracy": 0.940441469848156, + "num_tokens": 31711251.0, + "step": 6770 + }, + { + "entropy": 0.1727092968299985, + "epoch": 0.3933484658394552, + "grad_norm": 0.4204908609390259, + "learning_rate": 3.800511173448653e-05, + "loss": 0.1774, + "mean_token_accuracy": 0.9468284137547016, + "num_tokens": 31755675.0, + "step": 6780 + }, + { + "entropy": 0.17710094358772038, + "epoch": 0.39392862581856947, + "grad_norm": 0.4029178321361542, + "learning_rate": 3.7961846339955e-05, + "loss": 0.1859, + "mean_token_accuracy": 0.943480334430933, + "num_tokens": 31801583.0, + "step": 6790 + }, + { + "entropy": 0.1636716655921191, + "epoch": 0.3945087857976837, + "grad_norm": 0.40167343616485596, + "learning_rate": 3.7918527786658716e-05, + "loss": 0.1771, + "mean_token_accuracy": 0.9452772669494152, + "num_tokens": 31844626.0, + "step": 6800 + }, + { + "entropy": 0.1800838670693338, + "epoch": 0.39508894577679793, + "grad_norm": 0.4580978453159332, + "learning_rate": 3.787515625225453e-05, + "loss": 0.1931, + "mean_token_accuracy": 0.9388901807367802, + "num_tokens": 31892198.0, + "step": 6810 + }, + { + "entropy": 0.19548722123727202, + "epoch": 0.3956691057559122, + "grad_norm": 0.42503225803375244, + "learning_rate": 3.783173191461659e-05, + "loss": 0.1994, + "mean_token_accuracy": 0.9396751441061497, + "num_tokens": 31943398.0, + "step": 6820 + }, + { + "entropy": 0.1943204928189516, + "epoch": 0.39624926573502645, + "grad_norm": 0.40914562344551086, + "learning_rate": 3.7788254951835574e-05, + "loss": 0.2104, + "mean_token_accuracy": 0.9383298039436341, + "num_tokens": 31997180.0, + "step": 6830 + }, + { + "entropy": 0.18522732509300113, + "epoch": 0.39682942571414065, + "grad_norm": 0.46836623549461365, + "learning_rate": 3.7744725542218e-05, + "loss": 0.1979, + "mean_token_accuracy": 0.9394366063177586, + "num_tokens": 32044561.0, + "step": 6840 + }, + { + "entropy": 0.1725301381200552, + "epoch": 0.3974095856932549, + "grad_norm": 0.44604629278182983, + "learning_rate": 3.7701143864285485e-05, + "loss": 0.1905, + "mean_token_accuracy": 0.9444777697324753, + "num_tokens": 32091001.0, + "step": 6850 + }, + { + "entropy": 0.17741436008363962, + "epoch": 0.39798974567236917, + "grad_norm": 0.5081518888473511, + "learning_rate": 3.7657510096773965e-05, + "loss": 0.1881, + "mean_token_accuracy": 0.945154870301485, + "num_tokens": 32135364.0, + "step": 6860 + }, + { + "entropy": 0.17087941318750383, + "epoch": 0.39856990565148337, + "grad_norm": 0.36041465401649475, + "learning_rate": 3.761382441863307e-05, + "loss": 0.1863, + "mean_token_accuracy": 0.9435390256345272, + "num_tokens": 32181382.0, + "step": 6870 + }, + { + "entropy": 0.17722983127459885, + "epoch": 0.39915006563059763, + "grad_norm": 0.4578292667865753, + "learning_rate": 3.757008700902529e-05, + "loss": 0.1891, + "mean_token_accuracy": 0.9422714248299598, + "num_tokens": 32225325.0, + "step": 6880 + }, + { + "entropy": 0.18836118336766958, + "epoch": 0.3997302256097119, + "grad_norm": 0.5909493565559387, + "learning_rate": 3.752629804732526e-05, + "loss": 0.2028, + "mean_token_accuracy": 0.9402878932654858, + "num_tokens": 32272680.0, + "step": 6890 + }, + { + "entropy": 0.18106345599517226, + "epoch": 0.40031038558882615, + "grad_norm": 0.4702394902706146, + "learning_rate": 3.7482457713119066e-05, + "loss": 0.1856, + "mean_token_accuracy": 0.944454163312912, + "num_tokens": 32314121.0, + "step": 6900 + }, + { + "entropy": 0.1817781964316964, + "epoch": 0.40089054556794035, + "grad_norm": 0.4690053462982178, + "learning_rate": 3.7438566186203474e-05, + "loss": 0.2042, + "mean_token_accuracy": 0.9398420296609402, + "num_tokens": 32359863.0, + "step": 6910 + }, + { + "entropy": 0.18268163558095693, + "epoch": 0.4014707055470546, + "grad_norm": 0.3624870777130127, + "learning_rate": 3.739462364658518e-05, + "loss": 0.1977, + "mean_token_accuracy": 0.940448247641325, + "num_tokens": 32410215.0, + "step": 6920 + }, + { + "entropy": 0.17682492174208164, + "epoch": 0.40205086552616887, + "grad_norm": 0.3944310247898102, + "learning_rate": 3.735063027448012e-05, + "loss": 0.1896, + "mean_token_accuracy": 0.9432011306285858, + "num_tokens": 32458044.0, + "step": 6930 + }, + { + "entropy": 0.1760819842107594, + "epoch": 0.40263102550528307, + "grad_norm": 0.36696767807006836, + "learning_rate": 3.7306586250312705e-05, + "loss": 0.1877, + "mean_token_accuracy": 0.9434549517929554, + "num_tokens": 32505188.0, + "step": 6940 + }, + { + "entropy": 0.17351605081930757, + "epoch": 0.4032111854843973, + "grad_norm": 0.5595124959945679, + "learning_rate": 3.7262491754715036e-05, + "loss": 0.1906, + "mean_token_accuracy": 0.9429864846169949, + "num_tokens": 32545526.0, + "step": 6950 + }, + { + "entropy": 0.1644945800304413, + "epoch": 0.4037913454635116, + "grad_norm": 0.43218109011650085, + "learning_rate": 3.721834696852626e-05, + "loss": 0.1794, + "mean_token_accuracy": 0.9466404393315315, + "num_tokens": 32588625.0, + "step": 6960 + }, + { + "entropy": 0.1749804056249559, + "epoch": 0.4043715054426258, + "grad_norm": 0.5044981837272644, + "learning_rate": 3.717415207279174e-05, + "loss": 0.1844, + "mean_token_accuracy": 0.9439681686460972, + "num_tokens": 32636300.0, + "step": 6970 + }, + { + "entropy": 0.1808977469801903, + "epoch": 0.40495166542174005, + "grad_norm": 0.3961392343044281, + "learning_rate": 3.712990724876235e-05, + "loss": 0.1884, + "mean_token_accuracy": 0.9442379385232925, + "num_tokens": 32684714.0, + "step": 6980 + }, + { + "entropy": 0.179692873172462, + "epoch": 0.4055318254008543, + "grad_norm": 0.42921167612075806, + "learning_rate": 3.708561267789376e-05, + "loss": 0.1917, + "mean_token_accuracy": 0.942794892191887, + "num_tokens": 32735520.0, + "step": 6990 + }, + { + "entropy": 0.1605877958238125, + "epoch": 0.4061119853799685, + "grad_norm": 0.41140127182006836, + "learning_rate": 3.704126854184561e-05, + "loss": 0.1694, + "mean_token_accuracy": 0.9485975980758667, + "num_tokens": 32778723.0, + "step": 7000 + }, + { + "entropy": 0.1704848145134747, + "epoch": 0.40669214535908277, + "grad_norm": 0.4324689209461212, + "learning_rate": 3.699687502248085e-05, + "loss": 0.1895, + "mean_token_accuracy": 0.9437277413904667, + "num_tokens": 32826480.0, + "step": 7010 + }, + { + "entropy": 0.17487102830782533, + "epoch": 0.407272305338197, + "grad_norm": 0.5377344489097595, + "learning_rate": 3.695243230186494e-05, + "loss": 0.1893, + "mean_token_accuracy": 0.9447576411068439, + "num_tokens": 32872800.0, + "step": 7020 + }, + { + "entropy": 0.18698208779096603, + "epoch": 0.40785246531731123, + "grad_norm": 0.46365851163864136, + "learning_rate": 3.690794056226516e-05, + "loss": 0.2019, + "mean_token_accuracy": 0.9406974829733372, + "num_tokens": 32925569.0, + "step": 7030 + }, + { + "entropy": 0.17784799300134183, + "epoch": 0.4084326252964255, + "grad_norm": 0.45428919792175293, + "learning_rate": 3.6863399986149775e-05, + "loss": 0.1857, + "mean_token_accuracy": 0.9437539398670196, + "num_tokens": 32970413.0, + "step": 7040 + }, + { + "entropy": 0.17829010020941496, + "epoch": 0.40901278527553975, + "grad_norm": 0.3932447135448456, + "learning_rate": 3.681881075618737e-05, + "loss": 0.1876, + "mean_token_accuracy": 0.9415183290839195, + "num_tokens": 33025497.0, + "step": 7050 + }, + { + "entropy": 0.16724251983687283, + "epoch": 0.40959294525465395, + "grad_norm": 0.4690726399421692, + "learning_rate": 3.677417305524606e-05, + "loss": 0.1809, + "mean_token_accuracy": 0.9456048548221588, + "num_tokens": 33072179.0, + "step": 7060 + }, + { + "entropy": 0.16831655241549015, + "epoch": 0.4101731052337682, + "grad_norm": 0.43986421823501587, + "learning_rate": 3.672948706639274e-05, + "loss": 0.17, + "mean_token_accuracy": 0.9472413562238217, + "num_tokens": 33115899.0, + "step": 7070 + }, + { + "entropy": 0.18198816413059832, + "epoch": 0.41075326521288247, + "grad_norm": 0.4611426293849945, + "learning_rate": 3.6684752972892347e-05, + "loss": 0.1999, + "mean_token_accuracy": 0.9409242458641529, + "num_tokens": 33164728.0, + "step": 7080 + }, + { + "entropy": 0.17500849021598697, + "epoch": 0.41133342519199667, + "grad_norm": 0.3747369349002838, + "learning_rate": 3.663997095820711e-05, + "loss": 0.1892, + "mean_token_accuracy": 0.9431763991713524, + "num_tokens": 33212452.0, + "step": 7090 + }, + { + "entropy": 0.16788928732275962, + "epoch": 0.41191358517111093, + "grad_norm": 0.5338327288627625, + "learning_rate": 3.659514120599579e-05, + "loss": 0.1764, + "mean_token_accuracy": 0.9478957839310169, + "num_tokens": 33253684.0, + "step": 7100 + }, + { + "entropy": 0.16783091071993111, + "epoch": 0.4124937451502252, + "grad_norm": 0.4290676414966583, + "learning_rate": 3.655026390011291e-05, + "loss": 0.188, + "mean_token_accuracy": 0.9440383724868298, + "num_tokens": 33300945.0, + "step": 7110 + }, + { + "entropy": 0.18100420301780104, + "epoch": 0.4130739051293394, + "grad_norm": 0.37962427735328674, + "learning_rate": 3.650533922460804e-05, + "loss": 0.191, + "mean_token_accuracy": 0.94116270840168, + "num_tokens": 33344721.0, + "step": 7120 + }, + { + "entropy": 0.18950425712391733, + "epoch": 0.41365406510845365, + "grad_norm": 0.6690701842308044, + "learning_rate": 3.646036736372502e-05, + "loss": 0.1985, + "mean_token_accuracy": 0.9395280808210373, + "num_tokens": 33396798.0, + "step": 7130 + }, + { + "entropy": 0.1749830137938261, + "epoch": 0.4142342250875679, + "grad_norm": 0.49683138728141785, + "learning_rate": 3.641534850190118e-05, + "loss": 0.1934, + "mean_token_accuracy": 0.941690581291914, + "num_tokens": 33449457.0, + "step": 7140 + }, + { + "entropy": 0.18296645451337099, + "epoch": 0.4148143850666821, + "grad_norm": 0.43448758125305176, + "learning_rate": 3.637028282376666e-05, + "loss": 0.1898, + "mean_token_accuracy": 0.942917887121439, + "num_tokens": 33499444.0, + "step": 7150 + }, + { + "entropy": 0.16935697579756379, + "epoch": 0.41539454504579637, + "grad_norm": 0.39070844650268555, + "learning_rate": 3.6325170514143524e-05, + "loss": 0.1736, + "mean_token_accuracy": 0.9469477735459805, + "num_tokens": 33541977.0, + "step": 7160 + }, + { + "entropy": 0.16148536475375294, + "epoch": 0.41597470502491063, + "grad_norm": 0.436962366104126, + "learning_rate": 3.6280011758045165e-05, + "loss": 0.1742, + "mean_token_accuracy": 0.947004608809948, + "num_tokens": 33584661.0, + "step": 7170 + }, + { + "entropy": 0.1691883247811347, + "epoch": 0.41655486500402483, + "grad_norm": 0.507943332195282, + "learning_rate": 3.62348067406754e-05, + "loss": 0.188, + "mean_token_accuracy": 0.9450710609555244, + "num_tokens": 33631328.0, + "step": 7180 + }, + { + "entropy": 0.18260740414261817, + "epoch": 0.4171350249831391, + "grad_norm": 0.4435980021953583, + "learning_rate": 3.618955564742781e-05, + "loss": 0.2086, + "mean_token_accuracy": 0.9391208313405514, + "num_tokens": 33681151.0, + "step": 7190 + }, + { + "entropy": 0.17636866867542267, + "epoch": 0.41771518496225335, + "grad_norm": 0.46011683344841003, + "learning_rate": 3.61442586638849e-05, + "loss": 0.179, + "mean_token_accuracy": 0.9453225299715996, + "num_tokens": 33725839.0, + "step": 7200 + }, + { + "entropy": 0.17887682607397437, + "epoch": 0.4182953449413676, + "grad_norm": 0.46498608589172363, + "learning_rate": 3.6098915975817424e-05, + "loss": 0.1894, + "mean_token_accuracy": 0.94220100492239, + "num_tokens": 33774275.0, + "step": 7210 + }, + { + "entropy": 0.1699100023135543, + "epoch": 0.4188755049204818, + "grad_norm": 0.4263991415500641, + "learning_rate": 3.605352776918354e-05, + "loss": 0.1902, + "mean_token_accuracy": 0.94272196367383, + "num_tokens": 33823444.0, + "step": 7220 + }, + { + "entropy": 0.1859594164416194, + "epoch": 0.41945566489959607, + "grad_norm": 0.5168442726135254, + "learning_rate": 3.600809423012811e-05, + "loss": 0.2025, + "mean_token_accuracy": 0.9395956307649612, + "num_tokens": 33866425.0, + "step": 7230 + }, + { + "entropy": 0.18047962095588446, + "epoch": 0.42003582487871033, + "grad_norm": 0.45975539088249207, + "learning_rate": 3.5962615544981914e-05, + "loss": 0.1893, + "mean_token_accuracy": 0.9428842552006245, + "num_tokens": 33912048.0, + "step": 7240 + }, + { + "entropy": 0.18580907676368952, + "epoch": 0.42061598485782453, + "grad_norm": 0.47968772053718567, + "learning_rate": 3.5917091900260854e-05, + "loss": 0.1973, + "mean_token_accuracy": 0.9404053151607513, + "num_tokens": 33964069.0, + "step": 7250 + }, + { + "entropy": 0.167741743568331, + "epoch": 0.4211961448369388, + "grad_norm": 0.3901202976703644, + "learning_rate": 3.5871523482665265e-05, + "loss": 0.181, + "mean_token_accuracy": 0.9463054060935974, + "num_tokens": 34009105.0, + "step": 7260 + }, + { + "entropy": 0.1726191102527082, + "epoch": 0.42177630481605305, + "grad_norm": 0.5129478573799133, + "learning_rate": 3.582591047907906e-05, + "loss": 0.1885, + "mean_token_accuracy": 0.9437590159475804, + "num_tokens": 34056992.0, + "step": 7270 + }, + { + "entropy": 0.17678427025675775, + "epoch": 0.42235646479516725, + "grad_norm": 0.42883723974227905, + "learning_rate": 3.578025307656901e-05, + "loss": 0.1896, + "mean_token_accuracy": 0.9433455400168895, + "num_tokens": 34109929.0, + "step": 7280 + }, + { + "entropy": 0.1679611991159618, + "epoch": 0.4229366247742815, + "grad_norm": 0.43149304389953613, + "learning_rate": 3.5734551462384025e-05, + "loss": 0.1769, + "mean_token_accuracy": 0.9448987565934658, + "num_tokens": 34154396.0, + "step": 7290 + }, + { + "entropy": 0.1729629147797823, + "epoch": 0.42351678475339577, + "grad_norm": 0.42287546396255493, + "learning_rate": 3.568880582395427e-05, + "loss": 0.1863, + "mean_token_accuracy": 0.9447868876159191, + "num_tokens": 34196892.0, + "step": 7300 + }, + { + "entropy": 0.1668147119693458, + "epoch": 0.42409694473251, + "grad_norm": 0.4258587956428528, + "learning_rate": 3.5643016348890494e-05, + "loss": 0.1738, + "mean_token_accuracy": 0.9463316857814789, + "num_tokens": 34242112.0, + "step": 7310 + }, + { + "entropy": 0.16632893970236182, + "epoch": 0.42467710471162423, + "grad_norm": 0.4890320897102356, + "learning_rate": 3.559718322498323e-05, + "loss": 0.1783, + "mean_token_accuracy": 0.9462191581726074, + "num_tokens": 34288642.0, + "step": 7320 + }, + { + "entropy": 0.1746590020135045, + "epoch": 0.4252572646907385, + "grad_norm": 0.5330976843833923, + "learning_rate": 3.555130664020201e-05, + "loss": 0.1829, + "mean_token_accuracy": 0.9434598192572594, + "num_tokens": 34331191.0, + "step": 7330 + }, + { + "entropy": 0.17595587261021137, + "epoch": 0.4258374246698527, + "grad_norm": 0.4158857762813568, + "learning_rate": 3.5505386782694604e-05, + "loss": 0.1873, + "mean_token_accuracy": 0.9433610133826733, + "num_tokens": 34378470.0, + "step": 7340 + }, + { + "entropy": 0.17125609181821347, + "epoch": 0.42641758464896695, + "grad_norm": 0.39074185490608215, + "learning_rate": 3.5459423840786267e-05, + "loss": 0.1833, + "mean_token_accuracy": 0.9437054388225079, + "num_tokens": 34423742.0, + "step": 7350 + }, + { + "entropy": 0.16711760656908153, + "epoch": 0.4269977446280812, + "grad_norm": 0.522204577922821, + "learning_rate": 3.541341800297895e-05, + "loss": 0.1786, + "mean_token_accuracy": 0.9462713405489922, + "num_tokens": 34467361.0, + "step": 7360 + }, + { + "entropy": 0.1718498262576759, + "epoch": 0.4275779046071954, + "grad_norm": 0.563748300075531, + "learning_rate": 3.5367369457950496e-05, + "loss": 0.1821, + "mean_token_accuracy": 0.9452539779245853, + "num_tokens": 34510367.0, + "step": 7370 + }, + { + "entropy": 0.17780220918357373, + "epoch": 0.4281580645863097, + "grad_norm": 0.47857925295829773, + "learning_rate": 3.5321278394553934e-05, + "loss": 0.201, + "mean_token_accuracy": 0.9419735886156559, + "num_tokens": 34557816.0, + "step": 7380 + }, + { + "entropy": 0.18328024987131358, + "epoch": 0.42873822456542393, + "grad_norm": 0.40884608030319214, + "learning_rate": 3.5275145001816655e-05, + "loss": 0.1985, + "mean_token_accuracy": 0.9399777725338936, + "num_tokens": 34608825.0, + "step": 7390 + }, + { + "entropy": 0.181365849589929, + "epoch": 0.42931838454453813, + "grad_norm": 0.4526987075805664, + "learning_rate": 3.522896946893966e-05, + "loss": 0.1902, + "mean_token_accuracy": 0.9428675331175327, + "num_tokens": 34663515.0, + "step": 7400 + }, + { + "entropy": 0.1734090038575232, + "epoch": 0.4298985445236524, + "grad_norm": 0.44622698426246643, + "learning_rate": 3.518275198529675e-05, + "loss": 0.1801, + "mean_token_accuracy": 0.9446061864495278, + "num_tokens": 34711921.0, + "step": 7410 + }, + { + "entropy": 0.17163175139576198, + "epoch": 0.43047870450276665, + "grad_norm": 0.4656537175178528, + "learning_rate": 3.513649274043378e-05, + "loss": 0.184, + "mean_token_accuracy": 0.9425687521696091, + "num_tokens": 34759023.0, + "step": 7420 + }, + { + "entropy": 0.17136323498561978, + "epoch": 0.43105886448188085, + "grad_norm": 0.40900731086730957, + "learning_rate": 3.50901919240679e-05, + "loss": 0.1852, + "mean_token_accuracy": 0.9449736908078193, + "num_tokens": 34811692.0, + "step": 7430 + }, + { + "entropy": 0.1677059425506741, + "epoch": 0.4316390244609951, + "grad_norm": 0.38418325781822205, + "learning_rate": 3.5043849726086724e-05, + "loss": 0.1741, + "mean_token_accuracy": 0.9457319281995297, + "num_tokens": 34857302.0, + "step": 7440 + }, + { + "entropy": 0.1624292747117579, + "epoch": 0.43221918444010937, + "grad_norm": 0.3584018349647522, + "learning_rate": 3.49974663365476e-05, + "loss": 0.1717, + "mean_token_accuracy": 0.9463366121053696, + "num_tokens": 34901758.0, + "step": 7450 + }, + { + "entropy": 0.1691069696098566, + "epoch": 0.4327993444192236, + "grad_norm": 0.4321309030056, + "learning_rate": 3.495104194567679e-05, + "loss": 0.1807, + "mean_token_accuracy": 0.9461843430995941, + "num_tokens": 34948469.0, + "step": 7460 + }, + { + "entropy": 0.17528624488040806, + "epoch": 0.43337950439833783, + "grad_norm": 0.4442686140537262, + "learning_rate": 3.4904576743868704e-05, + "loss": 0.1856, + "mean_token_accuracy": 0.9418997175991535, + "num_tokens": 34999005.0, + "step": 7470 + }, + { + "entropy": 0.17213935079053044, + "epoch": 0.4339596643774521, + "grad_norm": 0.39608338475227356, + "learning_rate": 3.485807092168517e-05, + "loss": 0.1849, + "mean_token_accuracy": 0.9430527843534946, + "num_tokens": 35043139.0, + "step": 7480 + }, + { + "entropy": 0.17761487727984787, + "epoch": 0.4345398243565663, + "grad_norm": 0.4367091655731201, + "learning_rate": 3.481152466985455e-05, + "loss": 0.1922, + "mean_token_accuracy": 0.9420728705823421, + "num_tokens": 35088273.0, + "step": 7490 + }, + { + "entropy": 0.17562436861917377, + "epoch": 0.43511998433568055, + "grad_norm": 0.36865130066871643, + "learning_rate": 3.4764938179271054e-05, + "loss": 0.1817, + "mean_token_accuracy": 0.9450137317180634, + "num_tokens": 35133223.0, + "step": 7500 + }, + { + "entropy": 0.1733238535001874, + "epoch": 0.4357001443147948, + "grad_norm": 0.4055769443511963, + "learning_rate": 3.47183116409939e-05, + "loss": 0.1947, + "mean_token_accuracy": 0.9401926137506962, + "num_tokens": 35181243.0, + "step": 7510 + }, + { + "entropy": 0.17777962209656833, + "epoch": 0.43628030429390907, + "grad_norm": 0.4202897250652313, + "learning_rate": 3.467164524624657e-05, + "loss": 0.1875, + "mean_token_accuracy": 0.9417169697582721, + "num_tokens": 35227761.0, + "step": 7520 + }, + { + "entropy": 0.190540861338377, + "epoch": 0.4368604642730233, + "grad_norm": 0.4159415364265442, + "learning_rate": 3.462493918641597e-05, + "loss": 0.2021, + "mean_token_accuracy": 0.9398764669895172, + "num_tokens": 35278549.0, + "step": 7530 + }, + { + "entropy": 0.17427377356216311, + "epoch": 0.43744062425213753, + "grad_norm": 0.5262669920921326, + "learning_rate": 3.457819365305169e-05, + "loss": 0.1912, + "mean_token_accuracy": 0.941782108694315, + "num_tokens": 35329643.0, + "step": 7540 + }, + { + "entropy": 0.17409297777339816, + "epoch": 0.4380207842312518, + "grad_norm": 0.5587676763534546, + "learning_rate": 3.453140883786524e-05, + "loss": 0.1863, + "mean_token_accuracy": 0.9438704140484333, + "num_tokens": 35376376.0, + "step": 7550 + }, + { + "entropy": 0.18225516732782127, + "epoch": 0.438600944210366, + "grad_norm": 0.4059174656867981, + "learning_rate": 3.448458493272918e-05, + "loss": 0.1913, + "mean_token_accuracy": 0.9408869221806526, + "num_tokens": 35425115.0, + "step": 7560 + }, + { + "entropy": 0.18234021523967386, + "epoch": 0.43918110418948025, + "grad_norm": 0.383766233921051, + "learning_rate": 3.443772212967644e-05, + "loss": 0.1985, + "mean_token_accuracy": 0.9414879597723484, + "num_tokens": 35474917.0, + "step": 7570 + }, + { + "entropy": 0.17213527327403427, + "epoch": 0.4397612641685945, + "grad_norm": 0.45395001769065857, + "learning_rate": 3.439082062089942e-05, + "loss": 0.1891, + "mean_token_accuracy": 0.9424972467124462, + "num_tokens": 35519807.0, + "step": 7580 + }, + { + "entropy": 0.1767697749659419, + "epoch": 0.4403414241477087, + "grad_norm": 0.4101344645023346, + "learning_rate": 3.4343880598749315e-05, + "loss": 0.1824, + "mean_token_accuracy": 0.9435585349798202, + "num_tokens": 35566454.0, + "step": 7590 + }, + { + "entropy": 0.17496791272424161, + "epoch": 0.440921584126823, + "grad_norm": 0.41049808263778687, + "learning_rate": 3.429690225573521e-05, + "loss": 0.1953, + "mean_token_accuracy": 0.9424116529524327, + "num_tokens": 35609046.0, + "step": 7600 + }, + { + "entropy": 0.1722237253561616, + "epoch": 0.44150174410593723, + "grad_norm": 0.46416428685188293, + "learning_rate": 3.424988578452342e-05, + "loss": 0.1786, + "mean_token_accuracy": 0.9442384950816631, + "num_tokens": 35652688.0, + "step": 7610 + }, + { + "entropy": 0.1824321463704109, + "epoch": 0.44208190408505144, + "grad_norm": 0.48781418800354004, + "learning_rate": 3.4202831377936555e-05, + "loss": 0.201, + "mean_token_accuracy": 0.9406830325722695, + "num_tokens": 35702405.0, + "step": 7620 + }, + { + "entropy": 0.17100682714954019, + "epoch": 0.4426620640641657, + "grad_norm": 0.4054839611053467, + "learning_rate": 3.4155739228952866e-05, + "loss": 0.1908, + "mean_token_accuracy": 0.9437445238232612, + "num_tokens": 35747009.0, + "step": 7630 + }, + { + "entropy": 0.15809463090263307, + "epoch": 0.44324222404327995, + "grad_norm": 0.396712988615036, + "learning_rate": 3.410860953070536e-05, + "loss": 0.1675, + "mean_token_accuracy": 0.950005977600813, + "num_tokens": 35787330.0, + "step": 7640 + }, + { + "entropy": 0.18344695214182138, + "epoch": 0.44382238402239416, + "grad_norm": 0.3827393651008606, + "learning_rate": 3.406144247648105e-05, + "loss": 0.2038, + "mean_token_accuracy": 0.9396763779222965, + "num_tokens": 35836941.0, + "step": 7650 + }, + { + "entropy": 0.17946529667824507, + "epoch": 0.4444025440015084, + "grad_norm": 0.40904203057289124, + "learning_rate": 3.401423825972016e-05, + "loss": 0.1892, + "mean_token_accuracy": 0.9410572983324528, + "num_tokens": 35884039.0, + "step": 7660 + }, + { + "entropy": 0.1821272809058428, + "epoch": 0.4449827039806227, + "grad_norm": 0.4709073305130005, + "learning_rate": 3.39669970740153e-05, + "loss": 0.1942, + "mean_token_accuracy": 0.9421871677041054, + "num_tokens": 35935855.0, + "step": 7670 + }, + { + "entropy": 0.18428019238635898, + "epoch": 0.4455628639597369, + "grad_norm": 0.4095437824726105, + "learning_rate": 3.391971911311071e-05, + "loss": 0.1971, + "mean_token_accuracy": 0.9398056790232658, + "num_tokens": 35986349.0, + "step": 7680 + }, + { + "entropy": 0.1753478514030576, + "epoch": 0.44614302393885114, + "grad_norm": 0.4490523636341095, + "learning_rate": 3.3872404570901476e-05, + "loss": 0.1883, + "mean_token_accuracy": 0.9430297248065471, + "num_tokens": 36037433.0, + "step": 7690 + }, + { + "entropy": 0.16963940002024175, + "epoch": 0.4467231839179654, + "grad_norm": 0.43331336975097656, + "learning_rate": 3.382505364143265e-05, + "loss": 0.1818, + "mean_token_accuracy": 0.9445929273962974, + "num_tokens": 36087199.0, + "step": 7700 + }, + { + "entropy": 0.17252364438027143, + "epoch": 0.4473033438970796, + "grad_norm": 0.4243619740009308, + "learning_rate": 3.377766651889858e-05, + "loss": 0.184, + "mean_token_accuracy": 0.9441722214221955, + "num_tokens": 36129237.0, + "step": 7710 + }, + { + "entropy": 0.17775068059563637, + "epoch": 0.44788350387619386, + "grad_norm": 0.3890032470226288, + "learning_rate": 3.373024339764201e-05, + "loss": 0.1933, + "mean_token_accuracy": 0.9428026862442493, + "num_tokens": 36178142.0, + "step": 7720 + }, + { + "entropy": 0.17591477800160646, + "epoch": 0.4484636638553081, + "grad_norm": 0.40901046991348267, + "learning_rate": 3.368278447215332e-05, + "loss": 0.1949, + "mean_token_accuracy": 0.943128464370966, + "num_tokens": 36226182.0, + "step": 7730 + }, + { + "entropy": 0.1581026030704379, + "epoch": 0.4490438238344223, + "grad_norm": 0.4428081512451172, + "learning_rate": 3.363528993706976e-05, + "loss": 0.1648, + "mean_token_accuracy": 0.9485232539474964, + "num_tokens": 36266312.0, + "step": 7740 + }, + { + "entropy": 0.15679234713315965, + "epoch": 0.4496239838135366, + "grad_norm": 0.3939167559146881, + "learning_rate": 3.358775998717458e-05, + "loss": 0.1659, + "mean_token_accuracy": 0.9496321305632591, + "num_tokens": 36315035.0, + "step": 7750 + }, + { + "entropy": 0.17333023194223643, + "epoch": 0.45020414379265083, + "grad_norm": 0.5292229056358337, + "learning_rate": 3.3540194817396306e-05, + "loss": 0.1958, + "mean_token_accuracy": 0.9407670021057128, + "num_tokens": 36359012.0, + "step": 7760 + }, + { + "entropy": 0.17620274759829044, + "epoch": 0.45078430377176504, + "grad_norm": 0.4889729619026184, + "learning_rate": 3.349259462280788e-05, + "loss": 0.1922, + "mean_token_accuracy": 0.9424700453877449, + "num_tokens": 36401081.0, + "step": 7770 + }, + { + "entropy": 0.1728500454686582, + "epoch": 0.4513644637508793, + "grad_norm": 0.41306111216545105, + "learning_rate": 3.3444959598625916e-05, + "loss": 0.1843, + "mean_token_accuracy": 0.9440127201378345, + "num_tokens": 36447956.0, + "step": 7780 + }, + { + "entropy": 0.1730897706001997, + "epoch": 0.45194462372999356, + "grad_norm": 0.4501255452632904, + "learning_rate": 3.3397289940209834e-05, + "loss": 0.1825, + "mean_token_accuracy": 0.9445605784654617, + "num_tokens": 36493284.0, + "step": 7790 + }, + { + "entropy": 0.17593724047765136, + "epoch": 0.4525247837091078, + "grad_norm": 0.4565020501613617, + "learning_rate": 3.334958584306113e-05, + "loss": 0.1964, + "mean_token_accuracy": 0.941444843262434, + "num_tokens": 36539538.0, + "step": 7800 + }, + { + "entropy": 0.17858766857534647, + "epoch": 0.453104943688222, + "grad_norm": 0.6305527687072754, + "learning_rate": 3.3301847502822524e-05, + "loss": 0.1888, + "mean_token_accuracy": 0.9418301671743393, + "num_tokens": 36581049.0, + "step": 7810 + }, + { + "entropy": 0.1704928230494261, + "epoch": 0.4536851036673363, + "grad_norm": 0.5597914457321167, + "learning_rate": 3.3254075115277175e-05, + "loss": 0.1792, + "mean_token_accuracy": 0.9457172617316246, + "num_tokens": 36622083.0, + "step": 7820 + }, + { + "entropy": 0.17105721179395914, + "epoch": 0.45426526364645053, + "grad_norm": 0.4232339859008789, + "learning_rate": 3.320626887634786e-05, + "loss": 0.186, + "mean_token_accuracy": 0.9450602442026138, + "num_tokens": 36667674.0, + "step": 7830 + }, + { + "entropy": 0.16620323988609015, + "epoch": 0.45484542362556474, + "grad_norm": 0.5057905912399292, + "learning_rate": 3.315842898209622e-05, + "loss": 0.1809, + "mean_token_accuracy": 0.9452323228120804, + "num_tokens": 36712219.0, + "step": 7840 + }, + { + "entropy": 0.17276617456227542, + "epoch": 0.455425583604679, + "grad_norm": 0.5853102803230286, + "learning_rate": 3.3110555628721885e-05, + "loss": 0.1872, + "mean_token_accuracy": 0.9426279075443744, + "num_tokens": 36757316.0, + "step": 7850 + }, + { + "entropy": 0.18316365126520395, + "epoch": 0.45600574358379325, + "grad_norm": 0.6364777088165283, + "learning_rate": 3.306264901256173e-05, + "loss": 0.1895, + "mean_token_accuracy": 0.9423549734055996, + "num_tokens": 36804490.0, + "step": 7860 + }, + { + "entropy": 0.17354477029293774, + "epoch": 0.45658590356290746, + "grad_norm": 0.4475865960121155, + "learning_rate": 3.301470933008904e-05, + "loss": 0.1864, + "mean_token_accuracy": 0.9443018920719624, + "num_tokens": 36855047.0, + "step": 7870 + }, + { + "entropy": 0.1783720664680004, + "epoch": 0.4571660635420217, + "grad_norm": 0.4054356515407562, + "learning_rate": 3.2966736777912723e-05, + "loss": 0.1927, + "mean_token_accuracy": 0.9419763013720512, + "num_tokens": 36901621.0, + "step": 7880 + }, + { + "entropy": 0.16720719253644348, + "epoch": 0.457746223521136, + "grad_norm": 0.3998508155345917, + "learning_rate": 3.291873155277646e-05, + "loss": 0.188, + "mean_token_accuracy": 0.9442383706569671, + "num_tokens": 36946540.0, + "step": 7890 + }, + { + "entropy": 0.1859224579297006, + "epoch": 0.4583263835002502, + "grad_norm": 0.5400324463844299, + "learning_rate": 3.287069385155796e-05, + "loss": 0.1959, + "mean_token_accuracy": 0.9395829647779464, + "num_tokens": 36997307.0, + "step": 7900 + }, + { + "entropy": 0.18311282163485884, + "epoch": 0.45890654347936444, + "grad_norm": 0.3947174847126007, + "learning_rate": 3.282262387126811e-05, + "loss": 0.187, + "mean_token_accuracy": 0.9418772615492343, + "num_tokens": 37048580.0, + "step": 7910 + }, + { + "entropy": 0.16808812627568842, + "epoch": 0.4594867034584787, + "grad_norm": 0.456355482339859, + "learning_rate": 3.277452180905018e-05, + "loss": 0.1819, + "mean_token_accuracy": 0.9447632350027562, + "num_tokens": 37095285.0, + "step": 7920 + }, + { + "entropy": 0.15868745064362882, + "epoch": 0.4600668634375929, + "grad_norm": 0.47921884059906006, + "learning_rate": 3.272638786217901e-05, + "loss": 0.1727, + "mean_token_accuracy": 0.9455358870327473, + "num_tokens": 37137003.0, + "step": 7930 + }, + { + "entropy": 0.1607928649522364, + "epoch": 0.46064702341670716, + "grad_norm": 0.38130784034729004, + "learning_rate": 3.267822222806022e-05, + "loss": 0.1736, + "mean_token_accuracy": 0.9478852033615113, + "num_tokens": 37182070.0, + "step": 7940 + }, + { + "entropy": 0.1735306503251195, + "epoch": 0.4612271833958214, + "grad_norm": 0.41431066393852234, + "learning_rate": 3.2630025104229355e-05, + "loss": 0.1839, + "mean_token_accuracy": 0.944563377648592, + "num_tokens": 37231898.0, + "step": 7950 + }, + { + "entropy": 0.18016524575650691, + "epoch": 0.4618073433749356, + "grad_norm": 0.39642634987831116, + "learning_rate": 3.258179668835114e-05, + "loss": 0.1902, + "mean_token_accuracy": 0.9423787713050842, + "num_tokens": 37282540.0, + "step": 7960 + }, + { + "entropy": 0.17314906911924483, + "epoch": 0.4623875033540499, + "grad_norm": 0.4256667494773865, + "learning_rate": 3.25335371782186e-05, + "loss": 0.1911, + "mean_token_accuracy": 0.9422760687768459, + "num_tokens": 37331876.0, + "step": 7970 + }, + { + "entropy": 0.17314035217277707, + "epoch": 0.46296766333316414, + "grad_norm": 0.40858936309814453, + "learning_rate": 3.248524677175231e-05, + "loss": 0.1901, + "mean_token_accuracy": 0.9430893130600453, + "num_tokens": 37378525.0, + "step": 7980 + }, + { + "entropy": 0.1699408959597349, + "epoch": 0.46354782331227834, + "grad_norm": 0.40741077065467834, + "learning_rate": 3.243692566699955e-05, + "loss": 0.177, + "mean_token_accuracy": 0.944687084108591, + "num_tokens": 37423583.0, + "step": 7990 + }, + { + "entropy": 0.1759395389817655, + "epoch": 0.4641279832913926, + "grad_norm": 0.38594120740890503, + "learning_rate": 3.2388574062133484e-05, + "loss": 0.1852, + "mean_token_accuracy": 0.9438326768577099, + "num_tokens": 37473443.0, + "step": 8000 + }, + { + "entropy": 0.1834797970019281, + "epoch": 0.46470814327050686, + "grad_norm": 0.4073975086212158, + "learning_rate": 3.2340192155452376e-05, + "loss": 0.2013, + "mean_token_accuracy": 0.9406263038516045, + "num_tokens": 37524660.0, + "step": 8010 + }, + { + "entropy": 0.17364355251193048, + "epoch": 0.46528830324962106, + "grad_norm": 0.4124443531036377, + "learning_rate": 3.229178014537877e-05, + "loss": 0.1832, + "mean_token_accuracy": 0.9435022868216038, + "num_tokens": 37573634.0, + "step": 8020 + }, + { + "entropy": 0.17387441219761968, + "epoch": 0.4658684632287353, + "grad_norm": 0.3629413843154907, + "learning_rate": 3.2243338230458645e-05, + "loss": 0.1886, + "mean_token_accuracy": 0.9420187614858151, + "num_tokens": 37626225.0, + "step": 8030 + }, + { + "entropy": 0.17595630325376987, + "epoch": 0.4664486232078496, + "grad_norm": 0.37559252977371216, + "learning_rate": 3.219486660936064e-05, + "loss": 0.1908, + "mean_token_accuracy": 0.9437500812113285, + "num_tokens": 37673756.0, + "step": 8040 + }, + { + "entropy": 0.172442477196455, + "epoch": 0.4670287831869638, + "grad_norm": 0.5087631940841675, + "learning_rate": 3.214636548087523e-05, + "loss": 0.1833, + "mean_token_accuracy": 0.9451712928712368, + "num_tokens": 37722162.0, + "step": 8050 + }, + { + "entropy": 0.15384310502558945, + "epoch": 0.46760894316607804, + "grad_norm": 0.4122638404369354, + "learning_rate": 3.209783504391391e-05, + "loss": 0.1713, + "mean_token_accuracy": 0.9498492695391179, + "num_tokens": 37765380.0, + "step": 8060 + }, + { + "entropy": 0.1599185736849904, + "epoch": 0.4681891031451923, + "grad_norm": 0.401643842458725, + "learning_rate": 3.2049275497508335e-05, + "loss": 0.1732, + "mean_token_accuracy": 0.9467650771141052, + "num_tokens": 37806654.0, + "step": 8070 + }, + { + "entropy": 0.16561200898140668, + "epoch": 0.4687692631243065, + "grad_norm": 0.44515377283096313, + "learning_rate": 3.2000687040809584e-05, + "loss": 0.1836, + "mean_token_accuracy": 0.9461215913295746, + "num_tokens": 37849919.0, + "step": 8080 + }, + { + "entropy": 0.16869171811267733, + "epoch": 0.46934942310342076, + "grad_norm": 0.4872053563594818, + "learning_rate": 3.1952069873087285e-05, + "loss": 0.1883, + "mean_token_accuracy": 0.9446787603199482, + "num_tokens": 37890635.0, + "step": 8090 + }, + { + "entropy": 0.17286886293441056, + "epoch": 0.469929583082535, + "grad_norm": 0.46300023794174194, + "learning_rate": 3.1903424193728825e-05, + "loss": 0.1853, + "mean_token_accuracy": 0.9445849724113942, + "num_tokens": 37942857.0, + "step": 8100 + }, + { + "entropy": 0.1639576220884919, + "epoch": 0.4705097430616493, + "grad_norm": 0.47407010197639465, + "learning_rate": 3.185475020223852e-05, + "loss": 0.1741, + "mean_token_accuracy": 0.946852695941925, + "num_tokens": 37992820.0, + "step": 8110 + }, + { + "entropy": 0.161442634742707, + "epoch": 0.4710899030407635, + "grad_norm": 0.5241191387176514, + "learning_rate": 3.1806048098236765e-05, + "loss": 0.1746, + "mean_token_accuracy": 0.9465925134718418, + "num_tokens": 38034523.0, + "step": 8120 + }, + { + "entropy": 0.1566352159716189, + "epoch": 0.47167006301987774, + "grad_norm": 0.4201061427593231, + "learning_rate": 3.1757318081459305e-05, + "loss": 0.1687, + "mean_token_accuracy": 0.9474760428071022, + "num_tokens": 38078983.0, + "step": 8130 + }, + { + "entropy": 0.1657921805046499, + "epoch": 0.472250222998992, + "grad_norm": 0.44805440306663513, + "learning_rate": 3.170856035175633e-05, + "loss": 0.1688, + "mean_token_accuracy": 0.9467803351581097, + "num_tokens": 38128101.0, + "step": 8140 + }, + { + "entropy": 0.1590811276808381, + "epoch": 0.4728303829781062, + "grad_norm": 0.4904957115650177, + "learning_rate": 3.165977510909168e-05, + "loss": 0.1665, + "mean_token_accuracy": 0.9476091884076595, + "num_tokens": 38171559.0, + "step": 8150 + }, + { + "entropy": 0.16439850796014072, + "epoch": 0.47341054295722046, + "grad_norm": 0.5050429701805115, + "learning_rate": 3.161096255354205e-05, + "loss": 0.1806, + "mean_token_accuracy": 0.9446990296244622, + "num_tokens": 38218421.0, + "step": 8160 + }, + { + "entropy": 0.17393589136190712, + "epoch": 0.4739907029363347, + "grad_norm": 0.4193595349788666, + "learning_rate": 3.156212288529614e-05, + "loss": 0.1861, + "mean_token_accuracy": 0.9442672118544578, + "num_tokens": 38266216.0, + "step": 8170 + }, + { + "entropy": 0.1670900379307568, + "epoch": 0.4745708629154489, + "grad_norm": 0.5799826383590698, + "learning_rate": 3.1513256304653826e-05, + "loss": 0.1781, + "mean_token_accuracy": 0.9460842855274677, + "num_tokens": 38312731.0, + "step": 8180 + }, + { + "entropy": 0.1690112728625536, + "epoch": 0.4751510228945632, + "grad_norm": 0.3996475040912628, + "learning_rate": 3.1464363012025376e-05, + "loss": 0.1848, + "mean_token_accuracy": 0.9434234328567982, + "num_tokens": 38360839.0, + "step": 8190 + }, + { + "entropy": 0.16463953573256732, + "epoch": 0.47573118287367744, + "grad_norm": 0.38823920488357544, + "learning_rate": 3.141544320793063e-05, + "loss": 0.1793, + "mean_token_accuracy": 0.9464644953608513, + "num_tokens": 38406600.0, + "step": 8200 + }, + { + "entropy": 0.16501461248844862, + "epoch": 0.47631134285279164, + "grad_norm": 0.44330766797065735, + "learning_rate": 3.13664970929981e-05, + "loss": 0.1843, + "mean_token_accuracy": 0.9442371062934398, + "num_tokens": 38458494.0, + "step": 8210 + }, + { + "entropy": 0.16705239256843923, + "epoch": 0.4768915028319059, + "grad_norm": 0.46959781646728516, + "learning_rate": 3.131752486796427e-05, + "loss": 0.1752, + "mean_token_accuracy": 0.9470366314053535, + "num_tokens": 38500173.0, + "step": 8220 + }, + { + "entropy": 0.1582988055422902, + "epoch": 0.47747166281102016, + "grad_norm": 0.4356507658958435, + "learning_rate": 3.126852673367264e-05, + "loss": 0.1697, + "mean_token_accuracy": 0.9485973887145519, + "num_tokens": 38546766.0, + "step": 8230 + }, + { + "entropy": 0.15800197846256198, + "epoch": 0.47805182279013436, + "grad_norm": 0.40555018186569214, + "learning_rate": 3.1219502891073016e-05, + "loss": 0.1717, + "mean_token_accuracy": 0.9468589149415493, + "num_tokens": 38595972.0, + "step": 8240 + }, + { + "entropy": 0.16929144971072674, + "epoch": 0.4786319827692486, + "grad_norm": 0.4088403284549713, + "learning_rate": 3.117045354122061e-05, + "loss": 0.185, + "mean_token_accuracy": 0.943747116625309, + "num_tokens": 38643454.0, + "step": 8250 + }, + { + "entropy": 0.17592263892292975, + "epoch": 0.4792121427483629, + "grad_norm": 0.429304301738739, + "learning_rate": 3.112137888527527e-05, + "loss": 0.1943, + "mean_token_accuracy": 0.9419780470430851, + "num_tokens": 38694071.0, + "step": 8260 + }, + { + "entropy": 0.17685515694320203, + "epoch": 0.4797923027274771, + "grad_norm": 0.4318338632583618, + "learning_rate": 3.10722791245006e-05, + "loss": 0.1912, + "mean_token_accuracy": 0.9420236147940159, + "num_tokens": 38740507.0, + "step": 8270 + }, + { + "entropy": 0.17180330213159323, + "epoch": 0.48037246270659134, + "grad_norm": 0.42575502395629883, + "learning_rate": 3.1023154460263194e-05, + "loss": 0.1777, + "mean_token_accuracy": 0.9457370191812515, + "num_tokens": 38785265.0, + "step": 8280 + }, + { + "entropy": 0.17306322045624256, + "epoch": 0.4809526226857056, + "grad_norm": 0.4791145920753479, + "learning_rate": 3.097400509403174e-05, + "loss": 0.1805, + "mean_token_accuracy": 0.9449944190680981, + "num_tokens": 38831949.0, + "step": 8290 + }, + { + "entropy": 0.15909845614805818, + "epoch": 0.4815327826648198, + "grad_norm": 0.40465861558914185, + "learning_rate": 3.092483122737628e-05, + "loss": 0.176, + "mean_token_accuracy": 0.946733620762825, + "num_tokens": 38875578.0, + "step": 8300 + }, + { + "entropy": 0.165493771340698, + "epoch": 0.48211294264393406, + "grad_norm": 0.4685038626194, + "learning_rate": 3.0875633061967294e-05, + "loss": 0.1764, + "mean_token_accuracy": 0.9458193510770798, + "num_tokens": 38922506.0, + "step": 8310 + }, + { + "entropy": 0.16567518413066865, + "epoch": 0.4826931026230483, + "grad_norm": 0.44083112478256226, + "learning_rate": 3.0826410799574946e-05, + "loss": 0.1803, + "mean_token_accuracy": 0.9464865364134312, + "num_tokens": 38969448.0, + "step": 8320 + }, + { + "entropy": 0.16825048122555017, + "epoch": 0.4832732626021625, + "grad_norm": 0.4166043698787689, + "learning_rate": 3.0777164642068195e-05, + "loss": 0.1898, + "mean_token_accuracy": 0.9440950363874435, + "num_tokens": 39017667.0, + "step": 8330 + }, + { + "entropy": 0.17871626755222678, + "epoch": 0.4838534225812768, + "grad_norm": 0.5093548893928528, + "learning_rate": 3.072789479141404e-05, + "loss": 0.1829, + "mean_token_accuracy": 0.943255165964365, + "num_tokens": 39065569.0, + "step": 8340 + }, + { + "entropy": 0.1637207169085741, + "epoch": 0.48443358256039104, + "grad_norm": 0.4341610372066498, + "learning_rate": 3.067860144967661e-05, + "loss": 0.175, + "mean_token_accuracy": 0.9481724545359611, + "num_tokens": 39113930.0, + "step": 8350 + }, + { + "entropy": 0.1620858261361718, + "epoch": 0.48501374253950524, + "grad_norm": 0.451816201210022, + "learning_rate": 3.0629284819016395e-05, + "loss": 0.1718, + "mean_token_accuracy": 0.9453750275075435, + "num_tokens": 39155610.0, + "step": 8360 + }, + { + "entropy": 0.17347168698906898, + "epoch": 0.4855939025186195, + "grad_norm": 0.5037128329277039, + "learning_rate": 3.0579945101689393e-05, + "loss": 0.1855, + "mean_token_accuracy": 0.9410739719867707, + "num_tokens": 39198709.0, + "step": 8370 + }, + { + "entropy": 0.16140768034383654, + "epoch": 0.48617406249773376, + "grad_norm": 0.4283958375453949, + "learning_rate": 3.053058250004629e-05, + "loss": 0.1773, + "mean_token_accuracy": 0.9464427605271339, + "num_tokens": 39241518.0, + "step": 8380 + }, + { + "entropy": 0.1765934806317091, + "epoch": 0.48675422247684796, + "grad_norm": 0.49489104747772217, + "learning_rate": 3.0481197216531614e-05, + "loss": 0.1894, + "mean_token_accuracy": 0.9417820803821086, + "num_tokens": 39287752.0, + "step": 8390 + }, + { + "entropy": 0.16703589893877507, + "epoch": 0.4873343824559622, + "grad_norm": 0.5416718125343323, + "learning_rate": 3.0431789453682917e-05, + "loss": 0.177, + "mean_token_accuracy": 0.9452290743589401, + "num_tokens": 39331340.0, + "step": 8400 + }, + { + "entropy": 0.16267693070694805, + "epoch": 0.4879145424350765, + "grad_norm": 0.4262806177139282, + "learning_rate": 3.0382359414129956e-05, + "loss": 0.1734, + "mean_token_accuracy": 0.9475521795451641, + "num_tokens": 39376702.0, + "step": 8410 + }, + { + "entropy": 0.163469997420907, + "epoch": 0.48849470241419074, + "grad_norm": 0.454074501991272, + "learning_rate": 3.0332907300593827e-05, + "loss": 0.1744, + "mean_token_accuracy": 0.9449733972549439, + "num_tokens": 39422179.0, + "step": 8420 + }, + { + "entropy": 0.17110976474359632, + "epoch": 0.48907486239330494, + "grad_norm": 0.4029136002063751, + "learning_rate": 3.0283433315886168e-05, + "loss": 0.1836, + "mean_token_accuracy": 0.9440277047455311, + "num_tokens": 39462127.0, + "step": 8430 + }, + { + "entropy": 0.1764822371304035, + "epoch": 0.4896550223724192, + "grad_norm": 0.4175196886062622, + "learning_rate": 3.023393766290833e-05, + "loss": 0.1866, + "mean_token_accuracy": 0.9440544925630092, + "num_tokens": 39505740.0, + "step": 8440 + }, + { + "entropy": 0.17522826893255114, + "epoch": 0.49023518235153346, + "grad_norm": 0.45613810420036316, + "learning_rate": 3.0184420544650483e-05, + "loss": 0.1825, + "mean_token_accuracy": 0.943079648911953, + "num_tokens": 39553056.0, + "step": 8450 + }, + { + "entropy": 0.17085548797622324, + "epoch": 0.49081534233064766, + "grad_norm": 0.4538099765777588, + "learning_rate": 3.013488216419088e-05, + "loss": 0.1829, + "mean_token_accuracy": 0.9452882707118988, + "num_tokens": 39600184.0, + "step": 8460 + }, + { + "entropy": 0.16457013748586177, + "epoch": 0.4913955023097619, + "grad_norm": 0.3749425411224365, + "learning_rate": 3.0085322724694954e-05, + "loss": 0.178, + "mean_token_accuracy": 0.9460626810789108, + "num_tokens": 39646982.0, + "step": 8470 + }, + { + "entropy": 0.16895728455856443, + "epoch": 0.4919756622888762, + "grad_norm": 0.4597746729850769, + "learning_rate": 3.0035742429414492e-05, + "loss": 0.187, + "mean_token_accuracy": 0.9437876284122467, + "num_tokens": 39692935.0, + "step": 8480 + }, + { + "entropy": 0.17100888825953006, + "epoch": 0.4925558222679904, + "grad_norm": 0.4776872396469116, + "learning_rate": 2.9986141481686837e-05, + "loss": 0.1821, + "mean_token_accuracy": 0.9433431334793567, + "num_tokens": 39741016.0, + "step": 8490 + }, + { + "entropy": 0.17228604545816778, + "epoch": 0.49313598224710464, + "grad_norm": 0.36947688460350037, + "learning_rate": 2.9936520084934005e-05, + "loss": 0.183, + "mean_token_accuracy": 0.9436006538569928, + "num_tokens": 39786066.0, + "step": 8500 + }, + { + "entropy": 0.1669056244660169, + "epoch": 0.4937161422262189, + "grad_norm": 0.5374568700790405, + "learning_rate": 2.9886878442661915e-05, + "loss": 0.1788, + "mean_token_accuracy": 0.9451548315584659, + "num_tokens": 39835846.0, + "step": 8510 + }, + { + "entropy": 0.17296021515503526, + "epoch": 0.4942963022053331, + "grad_norm": 0.44647207856178284, + "learning_rate": 2.9837216758459467e-05, + "loss": 0.184, + "mean_token_accuracy": 0.9434181332588196, + "num_tokens": 39885400.0, + "step": 8520 + }, + { + "entropy": 0.17169405249878764, + "epoch": 0.49487646218444736, + "grad_norm": 0.4542631208896637, + "learning_rate": 2.978753523599781e-05, + "loss": 0.1843, + "mean_token_accuracy": 0.9451465234160423, + "num_tokens": 39937434.0, + "step": 8530 + }, + { + "entropy": 0.17164804926142097, + "epoch": 0.4954566221635616, + "grad_norm": 0.41338449716567993, + "learning_rate": 2.9737834079029402e-05, + "loss": 0.1779, + "mean_token_accuracy": 0.9453194215893745, + "num_tokens": 39983846.0, + "step": 8540 + }, + { + "entropy": 0.17130106785334648, + "epoch": 0.4960367821426758, + "grad_norm": 0.46771982312202454, + "learning_rate": 2.968811349138727e-05, + "loss": 0.1833, + "mean_token_accuracy": 0.9432528421282769, + "num_tokens": 40029316.0, + "step": 8550 + }, + { + "entropy": 0.16206809361465274, + "epoch": 0.4966169421217901, + "grad_norm": 0.39189720153808594, + "learning_rate": 2.96383736769841e-05, + "loss": 0.1724, + "mean_token_accuracy": 0.9477090053260326, + "num_tokens": 40076489.0, + "step": 8560 + }, + { + "entropy": 0.16949255093932153, + "epoch": 0.49719710210090434, + "grad_norm": 0.39477548003196716, + "learning_rate": 2.9588614839811434e-05, + "loss": 0.1849, + "mean_token_accuracy": 0.9443994626402855, + "num_tokens": 40121028.0, + "step": 8570 + }, + { + "entropy": 0.18138641593977808, + "epoch": 0.49777726208001855, + "grad_norm": 0.46649444103240967, + "learning_rate": 2.9538837183938846e-05, + "loss": 0.1996, + "mean_token_accuracy": 0.9392085924744606, + "num_tokens": 40169091.0, + "step": 8580 + }, + { + "entropy": 0.17700270880013705, + "epoch": 0.4983574220591328, + "grad_norm": 0.46116018295288086, + "learning_rate": 2.9489040913513068e-05, + "loss": 0.1882, + "mean_token_accuracy": 0.9435209140181542, + "num_tokens": 40222879.0, + "step": 8590 + }, + { + "entropy": 0.16585291661322116, + "epoch": 0.49893758203824706, + "grad_norm": 0.4218469560146332, + "learning_rate": 2.943922623275719e-05, + "loss": 0.1792, + "mean_token_accuracy": 0.9453875742852688, + "num_tokens": 40267935.0, + "step": 8600 + }, + { + "entropy": 0.1587002668529749, + "epoch": 0.49951774201736127, + "grad_norm": 0.5107164978981018, + "learning_rate": 2.938939334596979e-05, + "loss": 0.182, + "mean_token_accuracy": 0.9462330967187882, + "num_tokens": 40312931.0, + "step": 8610 + }, + { + "entropy": 0.16753232860937714, + "epoch": 0.5000979019964755, + "grad_norm": 0.37327226996421814, + "learning_rate": 2.933954245752414e-05, + "loss": 0.1732, + "mean_token_accuracy": 0.9461178667843342, + "num_tokens": 40361657.0, + "step": 8620 + }, + { + "entropy": 0.17417701743543149, + "epoch": 0.5006780619755897, + "grad_norm": 0.41866761445999146, + "learning_rate": 2.92896737718673e-05, + "loss": 0.1911, + "mean_token_accuracy": 0.9414475351572037, + "num_tokens": 40405235.0, + "step": 8630 + }, + { + "entropy": 0.16874113581143318, + "epoch": 0.501258221954704, + "grad_norm": 0.3958810567855835, + "learning_rate": 2.9239787493519354e-05, + "loss": 0.1768, + "mean_token_accuracy": 0.9454725593328476, + "num_tokens": 40450243.0, + "step": 8640 + }, + { + "entropy": 0.15562377981841563, + "epoch": 0.5018383819338182, + "grad_norm": 0.4682304263114929, + "learning_rate": 2.9189883827072524e-05, + "loss": 0.161, + "mean_token_accuracy": 0.9504593417048455, + "num_tokens": 40496625.0, + "step": 8650 + }, + { + "entropy": 0.16569286528974772, + "epoch": 0.5024185419129324, + "grad_norm": 0.6049787402153015, + "learning_rate": 2.913996297719033e-05, + "loss": 0.1766, + "mean_token_accuracy": 0.9447605706751346, + "num_tokens": 40543929.0, + "step": 8660 + }, + { + "entropy": 0.16654700997751207, + "epoch": 0.5029987018920468, + "grad_norm": 0.47592246532440186, + "learning_rate": 2.9090025148606787e-05, + "loss": 0.1846, + "mean_token_accuracy": 0.9445287466049195, + "num_tokens": 40590034.0, + "step": 8670 + }, + { + "entropy": 0.17346955980174245, + "epoch": 0.503578861871161, + "grad_norm": 0.4507816731929779, + "learning_rate": 2.904007054612553e-05, + "loss": 0.1865, + "mean_token_accuracy": 0.9425362482666969, + "num_tokens": 40638318.0, + "step": 8680 + }, + { + "entropy": 0.16786886360496284, + "epoch": 0.5041590218502752, + "grad_norm": 0.42272499203681946, + "learning_rate": 2.8990099374618996e-05, + "loss": 0.1758, + "mean_token_accuracy": 0.9460474669933319, + "num_tokens": 40686438.0, + "step": 8690 + }, + { + "entropy": 0.16649736613035201, + "epoch": 0.5047391818293895, + "grad_norm": 0.34476473927497864, + "learning_rate": 2.8940111839027555e-05, + "loss": 0.1734, + "mean_token_accuracy": 0.945746548473835, + "num_tokens": 40736514.0, + "step": 8700 + }, + { + "entropy": 0.16115882876329124, + "epoch": 0.5053193418085037, + "grad_norm": 0.37035441398620605, + "learning_rate": 2.8890108144358712e-05, + "loss": 0.1668, + "mean_token_accuracy": 0.9461585856974125, + "num_tokens": 40785287.0, + "step": 8710 + }, + { + "entropy": 0.17356630377471446, + "epoch": 0.5058995017876179, + "grad_norm": 0.45182883739471436, + "learning_rate": 2.884008849568623e-05, + "loss": 0.1872, + "mean_token_accuracy": 0.9448146045207977, + "num_tokens": 40833291.0, + "step": 8720 + }, + { + "entropy": 0.1660636038519442, + "epoch": 0.5064796617667322, + "grad_norm": 0.5861982703208923, + "learning_rate": 2.87900530981493e-05, + "loss": 0.1801, + "mean_token_accuracy": 0.9466930814087391, + "num_tokens": 40877524.0, + "step": 8730 + }, + { + "entropy": 0.17367669511586428, + "epoch": 0.5070598217458464, + "grad_norm": 0.456103652715683, + "learning_rate": 2.8740002156951724e-05, + "loss": 0.1877, + "mean_token_accuracy": 0.9437623836100102, + "num_tokens": 40924287.0, + "step": 8740 + }, + { + "entropy": 0.16775277443230152, + "epoch": 0.5076399817249606, + "grad_norm": 0.49331727623939514, + "learning_rate": 2.8689935877361006e-05, + "loss": 0.1763, + "mean_token_accuracy": 0.9446254387497902, + "num_tokens": 40967806.0, + "step": 8750 + }, + { + "entropy": 0.1722121736034751, + "epoch": 0.5082201417040749, + "grad_norm": 0.469926118850708, + "learning_rate": 2.8639854464707605e-05, + "loss": 0.1786, + "mean_token_accuracy": 0.9429228663444519, + "num_tokens": 41015665.0, + "step": 8760 + }, + { + "entropy": 0.16097563942894338, + "epoch": 0.5088003016831891, + "grad_norm": 0.5368056297302246, + "learning_rate": 2.8589758124384014e-05, + "loss": 0.1773, + "mean_token_accuracy": 0.9467252388596534, + "num_tokens": 41061148.0, + "step": 8770 + }, + { + "entropy": 0.1735880217514932, + "epoch": 0.5093804616623033, + "grad_norm": 0.424911230802536, + "learning_rate": 2.853964706184396e-05, + "loss": 0.1818, + "mean_token_accuracy": 0.943539397418499, + "num_tokens": 41109212.0, + "step": 8780 + }, + { + "entropy": 0.17161042438820004, + "epoch": 0.5099606216414176, + "grad_norm": 0.37340906262397766, + "learning_rate": 2.8489521482601538e-05, + "loss": 0.1815, + "mean_token_accuracy": 0.9440919525921345, + "num_tokens": 41159037.0, + "step": 8790 + }, + { + "entropy": 0.17092387867160141, + "epoch": 0.5105407816205318, + "grad_norm": 0.4235631823539734, + "learning_rate": 2.8439381592230385e-05, + "loss": 0.1769, + "mean_token_accuracy": 0.9449858352541923, + "num_tokens": 41210406.0, + "step": 8800 + }, + { + "entropy": 0.15925732152536512, + "epoch": 0.511120941599646, + "grad_norm": 0.4428273141384125, + "learning_rate": 2.8389227596362826e-05, + "loss": 0.1746, + "mean_token_accuracy": 0.9485030405223369, + "num_tokens": 41259937.0, + "step": 8810 + }, + { + "entropy": 0.15318927997723222, + "epoch": 0.5117011015787604, + "grad_norm": 0.40703344345092773, + "learning_rate": 2.8339059700689034e-05, + "loss": 0.166, + "mean_token_accuracy": 0.9496708013117313, + "num_tokens": 41304570.0, + "step": 8820 + }, + { + "entropy": 0.16080571841448546, + "epoch": 0.5122812615578746, + "grad_norm": 0.43229687213897705, + "learning_rate": 2.8288878110956213e-05, + "loss": 0.1753, + "mean_token_accuracy": 0.9473040826618672, + "num_tokens": 41350793.0, + "step": 8830 + }, + { + "entropy": 0.1671501286327839, + "epoch": 0.5128614215369888, + "grad_norm": 0.4248155951499939, + "learning_rate": 2.8238683032967682e-05, + "loss": 0.1844, + "mean_token_accuracy": 0.9440686739981174, + "num_tokens": 41399399.0, + "step": 8840 + }, + { + "entropy": 0.16634151143953205, + "epoch": 0.5134415815161031, + "grad_norm": 0.39828136563301086, + "learning_rate": 2.8188474672582116e-05, + "loss": 0.1743, + "mean_token_accuracy": 0.9465730123221874, + "num_tokens": 41446321.0, + "step": 8850 + }, + { + "entropy": 0.1648148291744292, + "epoch": 0.5140217414952173, + "grad_norm": 0.454592227935791, + "learning_rate": 2.8138253235712653e-05, + "loss": 0.1746, + "mean_token_accuracy": 0.9483383007347583, + "num_tokens": 41492427.0, + "step": 8860 + }, + { + "entropy": 0.15513906106352807, + "epoch": 0.5146019014743316, + "grad_norm": 0.43605899810791016, + "learning_rate": 2.808801892832604e-05, + "loss": 0.1671, + "mean_token_accuracy": 0.9486884124577045, + "num_tokens": 41535921.0, + "step": 8870 + }, + { + "entropy": 0.16739982599392533, + "epoch": 0.5151820614534458, + "grad_norm": 0.39981818199157715, + "learning_rate": 2.8037771956441837e-05, + "loss": 0.1826, + "mean_token_accuracy": 0.9435298353433609, + "num_tokens": 41584579.0, + "step": 8880 + }, + { + "entropy": 0.16788930762559176, + "epoch": 0.51576222143256, + "grad_norm": 0.5113769769668579, + "learning_rate": 2.7987512526131536e-05, + "loss": 0.1807, + "mean_token_accuracy": 0.9445333778858185, + "num_tokens": 41636317.0, + "step": 8890 + }, + { + "entropy": 0.16648211209103464, + "epoch": 0.5163423814116743, + "grad_norm": 0.44056060910224915, + "learning_rate": 2.7937240843517713e-05, + "loss": 0.1796, + "mean_token_accuracy": 0.9470652565360069, + "num_tokens": 41686601.0, + "step": 8900 + }, + { + "entropy": 0.15560025200247765, + "epoch": 0.5169225413907885, + "grad_norm": 0.6175201535224915, + "learning_rate": 2.78869571147732e-05, + "loss": 0.1697, + "mean_token_accuracy": 0.9487142622470855, + "num_tokens": 41730810.0, + "step": 8910 + }, + { + "entropy": 0.16293275402858853, + "epoch": 0.5175027013699027, + "grad_norm": 0.5179318189620972, + "learning_rate": 2.7836661546120224e-05, + "loss": 0.1754, + "mean_token_accuracy": 0.9464200370013713, + "num_tokens": 41778219.0, + "step": 8920 + }, + { + "entropy": 0.16767940269783138, + "epoch": 0.518082861349017, + "grad_norm": 0.42390674352645874, + "learning_rate": 2.778635434382959e-05, + "loss": 0.1811, + "mean_token_accuracy": 0.9457180812954903, + "num_tokens": 41825293.0, + "step": 8930 + }, + { + "entropy": 0.17281589368358255, + "epoch": 0.5186630213281312, + "grad_norm": 0.45752277970314026, + "learning_rate": 2.773603571421979e-05, + "loss": 0.1894, + "mean_token_accuracy": 0.9440454617142677, + "num_tokens": 41871428.0, + "step": 8940 + }, + { + "entropy": 0.16482806871645153, + "epoch": 0.5192431813072454, + "grad_norm": 0.49600058794021606, + "learning_rate": 2.7685705863656198e-05, + "loss": 0.1777, + "mean_token_accuracy": 0.9445014469325542, + "num_tokens": 41916981.0, + "step": 8950 + }, + { + "entropy": 0.16709604393690825, + "epoch": 0.5198233412863598, + "grad_norm": 0.45547837018966675, + "learning_rate": 2.7635364998550195e-05, + "loss": 0.176, + "mean_token_accuracy": 0.9471689634025097, + "num_tokens": 41968573.0, + "step": 8960 + }, + { + "entropy": 0.17253656024113298, + "epoch": 0.520403501265474, + "grad_norm": 0.42922985553741455, + "learning_rate": 2.758501332535835e-05, + "loss": 0.1873, + "mean_token_accuracy": 0.9424532979726792, + "num_tokens": 42016076.0, + "step": 8970 + }, + { + "entropy": 0.1651924449019134, + "epoch": 0.5209836612445882, + "grad_norm": 0.4083471894264221, + "learning_rate": 2.7534651050581543e-05, + "loss": 0.188, + "mean_token_accuracy": 0.9456571877002716, + "num_tokens": 42057292.0, + "step": 8980 + }, + { + "entropy": 0.16905175289139152, + "epoch": 0.5215638212237025, + "grad_norm": 0.4557422399520874, + "learning_rate": 2.748427838076414e-05, + "loss": 0.1784, + "mean_token_accuracy": 0.9471900500357151, + "num_tokens": 42103818.0, + "step": 8990 + }, + { + "entropy": 0.16041738130152225, + "epoch": 0.5221439812028167, + "grad_norm": 0.40316042304039, + "learning_rate": 2.7433895522493135e-05, + "loss": 0.1709, + "mean_token_accuracy": 0.9468243516981601, + "num_tokens": 42146572.0, + "step": 9000 + }, + { + "entropy": 0.17861140416935087, + "epoch": 0.5227241411819309, + "grad_norm": 0.45082399249076843, + "learning_rate": 2.738350268239731e-05, + "loss": 0.1849, + "mean_token_accuracy": 0.9422081857919693, + "num_tokens": 42198646.0, + "step": 9010 + }, + { + "entropy": 0.16903355671092868, + "epoch": 0.5233043011610452, + "grad_norm": 0.38484877347946167, + "learning_rate": 2.733310006714639e-05, + "loss": 0.1779, + "mean_token_accuracy": 0.9440260700881481, + "num_tokens": 42248562.0, + "step": 9020 + }, + { + "entropy": 0.16446058303117753, + "epoch": 0.5238844611401594, + "grad_norm": 0.3902280926704407, + "learning_rate": 2.728268788345017e-05, + "loss": 0.176, + "mean_token_accuracy": 0.9459143429994583, + "num_tokens": 42294928.0, + "step": 9030 + }, + { + "entropy": 0.15905745211057365, + "epoch": 0.5244646211192736, + "grad_norm": 0.384782999753952, + "learning_rate": 2.723226633805771e-05, + "loss": 0.1715, + "mean_token_accuracy": 0.9474834434688091, + "num_tokens": 42339555.0, + "step": 9040 + }, + { + "entropy": 0.15691307215020062, + "epoch": 0.5250447810983879, + "grad_norm": 0.5381737351417542, + "learning_rate": 2.718183563775646e-05, + "loss": 0.1739, + "mean_token_accuracy": 0.9453691519796849, + "num_tokens": 42383848.0, + "step": 9050 + }, + { + "entropy": 0.16307076425291597, + "epoch": 0.5256249410775021, + "grad_norm": 0.5486294031143188, + "learning_rate": 2.7131395989371385e-05, + "loss": 0.1802, + "mean_token_accuracy": 0.9452343493700027, + "num_tokens": 42428712.0, + "step": 9060 + }, + { + "entropy": 0.1695011556148529, + "epoch": 0.5262051010566163, + "grad_norm": 0.4892549514770508, + "learning_rate": 2.7080947599764188e-05, + "loss": 0.1817, + "mean_token_accuracy": 0.9444872170686722, + "num_tokens": 42473808.0, + "step": 9070 + }, + { + "entropy": 0.1676627116277814, + "epoch": 0.5267852610357306, + "grad_norm": 0.4139344096183777, + "learning_rate": 2.7030490675832414e-05, + "loss": 0.1733, + "mean_token_accuracy": 0.9449983663856983, + "num_tokens": 42519634.0, + "step": 9080 + }, + { + "entropy": 0.16674949564039707, + "epoch": 0.5273654210148448, + "grad_norm": 0.5177903175354004, + "learning_rate": 2.6980025424508587e-05, + "loss": 0.1816, + "mean_token_accuracy": 0.9438636682927608, + "num_tokens": 42566663.0, + "step": 9090 + }, + { + "entropy": 0.15759858069941401, + "epoch": 0.527945580993959, + "grad_norm": 0.5165897607803345, + "learning_rate": 2.6929552052759398e-05, + "loss": 0.1792, + "mean_token_accuracy": 0.9485062882304192, + "num_tokens": 42609107.0, + "step": 9100 + }, + { + "entropy": 0.17389797382056713, + "epoch": 0.5285257409730734, + "grad_norm": 0.5269511938095093, + "learning_rate": 2.6879070767584848e-05, + "loss": 0.1843, + "mean_token_accuracy": 0.9445799067616463, + "num_tokens": 42653658.0, + "step": 9110 + }, + { + "entropy": 0.17889742171391845, + "epoch": 0.5291059009521876, + "grad_norm": 0.34971532225608826, + "learning_rate": 2.682858177601737e-05, + "loss": 0.181, + "mean_token_accuracy": 0.94356245175004, + "num_tokens": 42709031.0, + "step": 9120 + }, + { + "entropy": 0.16605154238641262, + "epoch": 0.5296860609313018, + "grad_norm": 0.4752177298069, + "learning_rate": 2.677808528512102e-05, + "loss": 0.1836, + "mean_token_accuracy": 0.9440542809665203, + "num_tokens": 42749855.0, + "step": 9130 + }, + { + "entropy": 0.17195372022688388, + "epoch": 0.5302662209104161, + "grad_norm": 0.3715435266494751, + "learning_rate": 2.6727581501990616e-05, + "loss": 0.1813, + "mean_token_accuracy": 0.9448219604790211, + "num_tokens": 42791558.0, + "step": 9140 + }, + { + "entropy": 0.16133958715945482, + "epoch": 0.5308463808895303, + "grad_norm": 0.48621347546577454, + "learning_rate": 2.667707063375086e-05, + "loss": 0.1758, + "mean_token_accuracy": 0.9457940697669983, + "num_tokens": 42838364.0, + "step": 9150 + }, + { + "entropy": 0.1673739342018962, + "epoch": 0.5314265408686445, + "grad_norm": 0.5434457659721375, + "learning_rate": 2.6626552887555538e-05, + "loss": 0.1776, + "mean_token_accuracy": 0.944213280826807, + "num_tokens": 42889556.0, + "step": 9160 + }, + { + "entropy": 0.16635732902213932, + "epoch": 0.5320067008477588, + "grad_norm": 0.5175045132637024, + "learning_rate": 2.6576028470586617e-05, + "loss": 0.1769, + "mean_token_accuracy": 0.9449176661670208, + "num_tokens": 42936596.0, + "step": 9170 + }, + { + "entropy": 0.16402168460190297, + "epoch": 0.532586860826873, + "grad_norm": 0.4503480792045593, + "learning_rate": 2.6525497590053445e-05, + "loss": 0.1798, + "mean_token_accuracy": 0.9459506720304489, + "num_tokens": 42984471.0, + "step": 9180 + }, + { + "entropy": 0.1576265714596957, + "epoch": 0.5331670208059872, + "grad_norm": 0.39443016052246094, + "learning_rate": 2.6474960453191857e-05, + "loss": 0.1666, + "mean_token_accuracy": 0.9496423929929734, + "num_tokens": 43032367.0, + "step": 9190 + }, + { + "entropy": 0.15125372000038623, + "epoch": 0.5337471807851015, + "grad_norm": 0.45288538932800293, + "learning_rate": 2.6424417267263364e-05, + "loss": 0.1627, + "mean_token_accuracy": 0.9509739637374878, + "num_tokens": 43085178.0, + "step": 9200 + }, + { + "entropy": 0.15563413305208088, + "epoch": 0.5343273407642157, + "grad_norm": 0.486808180809021, + "learning_rate": 2.6373868239554278e-05, + "loss": 0.169, + "mean_token_accuracy": 0.9466548673808575, + "num_tokens": 43133147.0, + "step": 9210 + }, + { + "entropy": 0.1629641281440854, + "epoch": 0.5349075007433299, + "grad_norm": 0.3701784312725067, + "learning_rate": 2.6323313577374863e-05, + "loss": 0.1779, + "mean_token_accuracy": 0.9462707154452801, + "num_tokens": 43180896.0, + "step": 9220 + }, + { + "entropy": 0.1682162970304489, + "epoch": 0.5354876607224442, + "grad_norm": 0.5038458704948425, + "learning_rate": 2.6272753488058516e-05, + "loss": 0.1809, + "mean_token_accuracy": 0.9440638527274132, + "num_tokens": 43235198.0, + "step": 9230 + }, + { + "entropy": 0.160696579515934, + "epoch": 0.5360678207015585, + "grad_norm": 0.4341427981853485, + "learning_rate": 2.6222188178960848e-05, + "loss": 0.1693, + "mean_token_accuracy": 0.9460996977984906, + "num_tokens": 43275392.0, + "step": 9240 + }, + { + "entropy": 0.17299333848059179, + "epoch": 0.5366479806806727, + "grad_norm": 0.45954564213752747, + "learning_rate": 2.617161785745892e-05, + "loss": 0.1835, + "mean_token_accuracy": 0.9450544893741608, + "num_tokens": 43325187.0, + "step": 9250 + }, + { + "entropy": 0.16810349486768245, + "epoch": 0.537228140659787, + "grad_norm": 0.36082723736763, + "learning_rate": 2.612104273095033e-05, + "loss": 0.1774, + "mean_token_accuracy": 0.9450991690158844, + "num_tokens": 43373347.0, + "step": 9260 + }, + { + "entropy": 0.15850407760590315, + "epoch": 0.5378083006389012, + "grad_norm": 0.45759648084640503, + "learning_rate": 2.6070463006852374e-05, + "loss": 0.1625, + "mean_token_accuracy": 0.9491868197917939, + "num_tokens": 43421814.0, + "step": 9270 + }, + { + "entropy": 0.16139659904874862, + "epoch": 0.5383884606180154, + "grad_norm": 0.4799107313156128, + "learning_rate": 2.6019878892601224e-05, + "loss": 0.1723, + "mean_token_accuracy": 0.9473508208990097, + "num_tokens": 43468424.0, + "step": 9280 + }, + { + "entropy": 0.163993626460433, + "epoch": 0.5389686205971297, + "grad_norm": 0.4831599295139313, + "learning_rate": 2.5969290595651043e-05, + "loss": 0.1788, + "mean_token_accuracy": 0.9465563364326954, + "num_tokens": 43518261.0, + "step": 9290 + }, + { + "entropy": 0.1703875140286982, + "epoch": 0.5395487805762439, + "grad_norm": 0.4122602939605713, + "learning_rate": 2.5918698323473156e-05, + "loss": 0.1825, + "mean_token_accuracy": 0.9440789453685283, + "num_tokens": 43566797.0, + "step": 9300 + }, + { + "entropy": 0.16565121673047542, + "epoch": 0.5401289405553581, + "grad_norm": 0.49758052825927734, + "learning_rate": 2.5868102283555178e-05, + "loss": 0.1788, + "mean_token_accuracy": 0.9447576500475406, + "num_tokens": 43612854.0, + "step": 9310 + }, + { + "entropy": 0.16146022314205766, + "epoch": 0.5407091005344724, + "grad_norm": 0.44309765100479126, + "learning_rate": 2.5817502683400192e-05, + "loss": 0.1778, + "mean_token_accuracy": 0.9450323827564716, + "num_tokens": 43656038.0, + "step": 9320 + }, + { + "entropy": 0.17004582565277815, + "epoch": 0.5412892605135866, + "grad_norm": 0.4046518802642822, + "learning_rate": 2.576689973052587e-05, + "loss": 0.1793, + "mean_token_accuracy": 0.9455995336174965, + "num_tokens": 43701901.0, + "step": 9330 + }, + { + "entropy": 0.15940480190329254, + "epoch": 0.5418694204927008, + "grad_norm": 0.43702781200408936, + "learning_rate": 2.5716293632463632e-05, + "loss": 0.1709, + "mean_token_accuracy": 0.9488982416689395, + "num_tokens": 43747440.0, + "step": 9340 + }, + { + "entropy": 0.16041765650734305, + "epoch": 0.5424495804718151, + "grad_norm": 0.42245015501976013, + "learning_rate": 2.5665684596757832e-05, + "loss": 0.1742, + "mean_token_accuracy": 0.9460077852010726, + "num_tokens": 43797745.0, + "step": 9350 + }, + { + "entropy": 0.16312142638489605, + "epoch": 0.5430297404509293, + "grad_norm": 0.4149959981441498, + "learning_rate": 2.561507283096481e-05, + "loss": 0.1832, + "mean_token_accuracy": 0.9451576635241509, + "num_tokens": 43847597.0, + "step": 9360 + }, + { + "entropy": 0.1688772052526474, + "epoch": 0.5436099004300435, + "grad_norm": 0.45885875821113586, + "learning_rate": 2.556445854265216e-05, + "loss": 0.1846, + "mean_token_accuracy": 0.9441474325954914, + "num_tokens": 43895018.0, + "step": 9370 + }, + { + "entropy": 0.17162321833893657, + "epoch": 0.5441900604091578, + "grad_norm": 0.4567737579345703, + "learning_rate": 2.5513841939397782e-05, + "loss": 0.1882, + "mean_token_accuracy": 0.9440203510224819, + "num_tokens": 43942392.0, + "step": 9380 + }, + { + "entropy": 0.1641680029220879, + "epoch": 0.544770220388272, + "grad_norm": 0.4154190421104431, + "learning_rate": 2.5463223228789095e-05, + "loss": 0.1768, + "mean_token_accuracy": 0.9470834277570248, + "num_tokens": 43984736.0, + "step": 9390 + }, + { + "entropy": 0.15933284908533096, + "epoch": 0.5453503803673863, + "grad_norm": 0.5181949138641357, + "learning_rate": 2.5412602618422153e-05, + "loss": 0.1709, + "mean_token_accuracy": 0.9487712480127811, + "num_tokens": 44032862.0, + "step": 9400 + }, + { + "entropy": 0.16291980724781752, + "epoch": 0.5459305403465006, + "grad_norm": 0.37184661626815796, + "learning_rate": 2.536198031590079e-05, + "loss": 0.1752, + "mean_token_accuracy": 0.9452111005783081, + "num_tokens": 44082874.0, + "step": 9410 + }, + { + "entropy": 0.15996323442086577, + "epoch": 0.5465107003256148, + "grad_norm": 0.5319421887397766, + "learning_rate": 2.5311356528835794e-05, + "loss": 0.1733, + "mean_token_accuracy": 0.9473493114113808, + "num_tokens": 44127865.0, + "step": 9420 + }, + { + "entropy": 0.1714618039317429, + "epoch": 0.547090860304729, + "grad_norm": 0.458799809217453, + "learning_rate": 2.5260731464844033e-05, + "loss": 0.1812, + "mean_token_accuracy": 0.943031957000494, + "num_tokens": 44176988.0, + "step": 9430 + }, + { + "entropy": 0.16981745697557926, + "epoch": 0.5476710202838433, + "grad_norm": 0.4158706068992615, + "learning_rate": 2.521010533154763e-05, + "loss": 0.1837, + "mean_token_accuracy": 0.9451175943017006, + "num_tokens": 44226606.0, + "step": 9440 + }, + { + "entropy": 0.1656275845132768, + "epoch": 0.5482511802629575, + "grad_norm": 0.5028427839279175, + "learning_rate": 2.515947833657306e-05, + "loss": 0.1803, + "mean_token_accuracy": 0.9456063888967037, + "num_tokens": 44274111.0, + "step": 9450 + }, + { + "entropy": 0.17572863604873418, + "epoch": 0.5488313402420717, + "grad_norm": 0.3881778419017792, + "learning_rate": 2.5108850687550354e-05, + "loss": 0.1904, + "mean_token_accuracy": 0.9437159344553947, + "num_tokens": 44320436.0, + "step": 9460 + }, + { + "entropy": 0.17380598415620624, + "epoch": 0.549411500221186, + "grad_norm": 0.486258864402771, + "learning_rate": 2.505822259211224e-05, + "loss": 0.1785, + "mean_token_accuracy": 0.9435172617435456, + "num_tokens": 44372837.0, + "step": 9470 + }, + { + "entropy": 0.1654342211317271, + "epoch": 0.5499916602003002, + "grad_norm": 0.4930897355079651, + "learning_rate": 2.5007594257893236e-05, + "loss": 0.1731, + "mean_token_accuracy": 0.9454665139317513, + "num_tokens": 44419790.0, + "step": 9480 + }, + { + "entropy": 0.1683452964760363, + "epoch": 0.5505718201794145, + "grad_norm": 0.4404093325138092, + "learning_rate": 2.4956965892528882e-05, + "loss": 0.1798, + "mean_token_accuracy": 0.9453298918902874, + "num_tokens": 44466709.0, + "step": 9490 + }, + { + "entropy": 0.16797020575031638, + "epoch": 0.5511519801585287, + "grad_norm": 0.47315728664398193, + "learning_rate": 2.4906337703654806e-05, + "loss": 0.1826, + "mean_token_accuracy": 0.9442701898515224, + "num_tokens": 44512687.0, + "step": 9500 + }, + { + "entropy": 0.17097735358402133, + "epoch": 0.5517321401376429, + "grad_norm": 0.5022308230400085, + "learning_rate": 2.4855709898905947e-05, + "loss": 0.1861, + "mean_token_accuracy": 0.9432589441537857, + "num_tokens": 44560277.0, + "step": 9510 + }, + { + "entropy": 0.16296194046735762, + "epoch": 0.5523123001167572, + "grad_norm": 0.4730151295661926, + "learning_rate": 2.4805082685915644e-05, + "loss": 0.178, + "mean_token_accuracy": 0.9471959851682186, + "num_tokens": 44605153.0, + "step": 9520 + }, + { + "entropy": 0.16764092771336436, + "epoch": 0.5528924600958715, + "grad_norm": 0.4078678488731384, + "learning_rate": 2.4754456272314826e-05, + "loss": 0.1706, + "mean_token_accuracy": 0.946760842949152, + "num_tokens": 44651574.0, + "step": 9530 + }, + { + "entropy": 0.16137828938663007, + "epoch": 0.5534726200749857, + "grad_norm": 0.4917527139186859, + "learning_rate": 2.4703830865731143e-05, + "loss": 0.172, + "mean_token_accuracy": 0.946791821718216, + "num_tokens": 44698236.0, + "step": 9540 + }, + { + "entropy": 0.1610173671040684, + "epoch": 0.5540527800541, + "grad_norm": 0.47147828340530396, + "learning_rate": 2.4653206673788092e-05, + "loss": 0.1745, + "mean_token_accuracy": 0.9457192569971085, + "num_tokens": 44745827.0, + "step": 9550 + }, + { + "entropy": 0.1598843365907669, + "epoch": 0.5546329400332142, + "grad_norm": 0.4956726133823395, + "learning_rate": 2.460258390410421e-05, + "loss": 0.1756, + "mean_token_accuracy": 0.9448515512049198, + "num_tokens": 44792202.0, + "step": 9560 + }, + { + "entropy": 0.16266485573723913, + "epoch": 0.5552131000123284, + "grad_norm": 0.4368143379688263, + "learning_rate": 2.45519627642922e-05, + "loss": 0.1787, + "mean_token_accuracy": 0.9453688159584999, + "num_tokens": 44837519.0, + "step": 9570 + }, + { + "entropy": 0.1584016853943467, + "epoch": 0.5557932599914427, + "grad_norm": 0.4028737246990204, + "learning_rate": 2.4501343461958088e-05, + "loss": 0.1625, + "mean_token_accuracy": 0.9512647345662117, + "num_tokens": 44880912.0, + "step": 9580 + }, + { + "entropy": 0.16472243405878545, + "epoch": 0.5563734199705569, + "grad_norm": 0.3895881175994873, + "learning_rate": 2.445072620470033e-05, + "loss": 0.1849, + "mean_token_accuracy": 0.9463803850114345, + "num_tokens": 44929098.0, + "step": 9590 + }, + { + "entropy": 0.162673273216933, + "epoch": 0.5569535799496711, + "grad_norm": 0.4117458164691925, + "learning_rate": 2.4400111200109027e-05, + "loss": 0.1742, + "mean_token_accuracy": 0.9456972852349281, + "num_tokens": 44977839.0, + "step": 9600 + }, + { + "entropy": 0.1650606391020119, + "epoch": 0.5575337399287854, + "grad_norm": 0.3962087035179138, + "learning_rate": 2.4349498655765046e-05, + "loss": 0.1741, + "mean_token_accuracy": 0.9455499455332756, + "num_tokens": 45024616.0, + "step": 9610 + }, + { + "entropy": 0.1760987563058734, + "epoch": 0.5581138999078996, + "grad_norm": 0.35995471477508545, + "learning_rate": 2.429888877923912e-05, + "loss": 0.1859, + "mean_token_accuracy": 0.9443122819066048, + "num_tokens": 45077933.0, + "step": 9620 + }, + { + "entropy": 0.16862560766749085, + "epoch": 0.5586940598870138, + "grad_norm": 0.3599713146686554, + "learning_rate": 2.4248281778091104e-05, + "loss": 0.1806, + "mean_token_accuracy": 0.9461624316871167, + "num_tokens": 45127757.0, + "step": 9630 + }, + { + "entropy": 0.14704026598483325, + "epoch": 0.5592742198661281, + "grad_norm": 0.4208920896053314, + "learning_rate": 2.4197677859869007e-05, + "loss": 0.1604, + "mean_token_accuracy": 0.9497509479522706, + "num_tokens": 45170212.0, + "step": 9640 + }, + { + "entropy": 0.1647076427936554, + "epoch": 0.5598543798452423, + "grad_norm": 0.4202902317047119, + "learning_rate": 2.4147077232108215e-05, + "loss": 0.1787, + "mean_token_accuracy": 0.9448741696774959, + "num_tokens": 45217438.0, + "step": 9650 + }, + { + "entropy": 0.161786731146276, + "epoch": 0.5604345398243565, + "grad_norm": 0.478389173746109, + "learning_rate": 2.4096480102330637e-05, + "loss": 0.1733, + "mean_token_accuracy": 0.9477825678884984, + "num_tokens": 45262122.0, + "step": 9660 + }, + { + "entropy": 0.16723282411694526, + "epoch": 0.5610146998034709, + "grad_norm": 0.40748724341392517, + "learning_rate": 2.40458866780438e-05, + "loss": 0.1776, + "mean_token_accuracy": 0.9456728413701058, + "num_tokens": 45309446.0, + "step": 9670 + }, + { + "entropy": 0.17343690246343613, + "epoch": 0.561594859782585, + "grad_norm": 0.4418453574180603, + "learning_rate": 2.3995297166740054e-05, + "loss": 0.1862, + "mean_token_accuracy": 0.9451116546988487, + "num_tokens": 45358944.0, + "step": 9680 + }, + { + "entropy": 0.16433494798839093, + "epoch": 0.5621750197616993, + "grad_norm": 0.3471347987651825, + "learning_rate": 2.3944711775895713e-05, + "loss": 0.1722, + "mean_token_accuracy": 0.9467417791485786, + "num_tokens": 45407358.0, + "step": 9690 + }, + { + "entropy": 0.1607672914862633, + "epoch": 0.5627551797408136, + "grad_norm": 0.46852996945381165, + "learning_rate": 2.3894130712970174e-05, + "loss": 0.1768, + "mean_token_accuracy": 0.9470467485487462, + "num_tokens": 45452193.0, + "step": 9700 + }, + { + "entropy": 0.16984836841002107, + "epoch": 0.5633353397199278, + "grad_norm": 0.6044245362281799, + "learning_rate": 2.3843554185405077e-05, + "loss": 0.1801, + "mean_token_accuracy": 0.943142193555832, + "num_tokens": 45500814.0, + "step": 9710 + }, + { + "entropy": 0.15317465793341398, + "epoch": 0.563915499699042, + "grad_norm": 0.47155165672302246, + "learning_rate": 2.3792982400623485e-05, + "loss": 0.1514, + "mean_token_accuracy": 0.9507311105728149, + "num_tokens": 45544112.0, + "step": 9720 + }, + { + "entropy": 0.15244226586073636, + "epoch": 0.5644956596781563, + "grad_norm": 0.4036012887954712, + "learning_rate": 2.3742415566029004e-05, + "loss": 0.166, + "mean_token_accuracy": 0.9487246960401535, + "num_tokens": 45585791.0, + "step": 9730 + }, + { + "entropy": 0.16500356663018464, + "epoch": 0.5650758196572705, + "grad_norm": 0.4620867371559143, + "learning_rate": 2.3691853889004918e-05, + "loss": 0.1806, + "mean_token_accuracy": 0.9463424101471901, + "num_tokens": 45631234.0, + "step": 9740 + }, + { + "entropy": 0.1706990057602525, + "epoch": 0.5656559796363847, + "grad_norm": 0.4212273955345154, + "learning_rate": 2.3641297576913397e-05, + "loss": 0.1799, + "mean_token_accuracy": 0.9428709007799625, + "num_tokens": 45678988.0, + "step": 9750 + }, + { + "entropy": 0.16355942506343127, + "epoch": 0.566236139615499, + "grad_norm": 0.40911680459976196, + "learning_rate": 2.3590746837094568e-05, + "loss": 0.1707, + "mean_token_accuracy": 0.9454892829060555, + "num_tokens": 45721504.0, + "step": 9760 + }, + { + "entropy": 0.15760099291801452, + "epoch": 0.5668162995946132, + "grad_norm": 0.44408485293388367, + "learning_rate": 2.3540201876865724e-05, + "loss": 0.1736, + "mean_token_accuracy": 0.9477891601622105, + "num_tokens": 45768171.0, + "step": 9770 + }, + { + "entropy": 0.16570585509762167, + "epoch": 0.5673964595737274, + "grad_norm": 0.4317452907562256, + "learning_rate": 2.3489662903520462e-05, + "loss": 0.1855, + "mean_token_accuracy": 0.9452350549399853, + "num_tokens": 45814298.0, + "step": 9780 + }, + { + "entropy": 0.16221060371026397, + "epoch": 0.5679766195528417, + "grad_norm": 0.3857319951057434, + "learning_rate": 2.343913012432782e-05, + "loss": 0.166, + "mean_token_accuracy": 0.947121188044548, + "num_tokens": 45859499.0, + "step": 9790 + }, + { + "entropy": 0.16162889106199146, + "epoch": 0.5685567795319559, + "grad_norm": 0.3870103061199188, + "learning_rate": 2.3388603746531414e-05, + "loss": 0.1691, + "mean_token_accuracy": 0.9461629398167133, + "num_tokens": 45907260.0, + "step": 9800 + }, + { + "entropy": 0.15901830308139325, + "epoch": 0.5691369395110701, + "grad_norm": 0.36805129051208496, + "learning_rate": 2.3338083977348644e-05, + "loss": 0.1686, + "mean_token_accuracy": 0.9462184205651283, + "num_tokens": 45955673.0, + "step": 9810 + }, + { + "entropy": 0.1507730851881206, + "epoch": 0.5697170994901845, + "grad_norm": 0.42928171157836914, + "learning_rate": 2.328757102396978e-05, + "loss": 0.1588, + "mean_token_accuracy": 0.9518140807747841, + "num_tokens": 46000601.0, + "step": 9820 + }, + { + "entropy": 0.15890052262693644, + "epoch": 0.5702972594692987, + "grad_norm": 0.46329066157341003, + "learning_rate": 2.323706509355713e-05, + "loss": 0.1712, + "mean_token_accuracy": 0.9461792930960655, + "num_tokens": 46049120.0, + "step": 9830 + }, + { + "entropy": 0.15717183500528337, + "epoch": 0.5708774194484129, + "grad_norm": 0.47465068101882935, + "learning_rate": 2.3186566393244237e-05, + "loss": 0.1729, + "mean_token_accuracy": 0.9477498821914196, + "num_tokens": 46094154.0, + "step": 9840 + }, + { + "entropy": 0.16627834057435392, + "epoch": 0.5714575794275272, + "grad_norm": 0.4735212028026581, + "learning_rate": 2.313607513013496e-05, + "loss": 0.1776, + "mean_token_accuracy": 0.9466375716030597, + "num_tokens": 46139696.0, + "step": 9850 + }, + { + "entropy": 0.16911688968539237, + "epoch": 0.5720377394066414, + "grad_norm": 0.4000650942325592, + "learning_rate": 2.3085591511302663e-05, + "loss": 0.18, + "mean_token_accuracy": 0.944958183169365, + "num_tokens": 46188188.0, + "step": 9860 + }, + { + "entropy": 0.1595370253548026, + "epoch": 0.5726178993857556, + "grad_norm": 0.5020022988319397, + "learning_rate": 2.303511574378938e-05, + "loss": 0.1692, + "mean_token_accuracy": 0.94760707244277, + "num_tokens": 46233289.0, + "step": 9870 + }, + { + "entropy": 0.1608332647010684, + "epoch": 0.5731980593648699, + "grad_norm": 0.37173131108283997, + "learning_rate": 2.2984648034604912e-05, + "loss": 0.1728, + "mean_token_accuracy": 0.9467341110110283, + "num_tokens": 46281527.0, + "step": 9880 + }, + { + "entropy": 0.1633620430715382, + "epoch": 0.5737782193439841, + "grad_norm": 0.4745265543460846, + "learning_rate": 2.2934188590726024e-05, + "loss": 0.1854, + "mean_token_accuracy": 0.9452161870896816, + "num_tokens": 46332073.0, + "step": 9890 + }, + { + "entropy": 0.16734816702082753, + "epoch": 0.5743583793230983, + "grad_norm": 0.4301162660121918, + "learning_rate": 2.288373761909561e-05, + "loss": 0.1757, + "mean_token_accuracy": 0.9452953837811947, + "num_tokens": 46380183.0, + "step": 9900 + }, + { + "entropy": 0.16689033168368042, + "epoch": 0.5749385393022126, + "grad_norm": 0.4533033072948456, + "learning_rate": 2.2833295326621783e-05, + "loss": 0.1769, + "mean_token_accuracy": 0.9453410044312477, + "num_tokens": 46425791.0, + "step": 9910 + }, + { + "entropy": 0.16790217822417616, + "epoch": 0.5755186992813268, + "grad_norm": 0.38770753145217896, + "learning_rate": 2.2782861920177066e-05, + "loss": 0.1826, + "mean_token_accuracy": 0.9456121303141117, + "num_tokens": 46475607.0, + "step": 9920 + }, + { + "entropy": 0.153115772921592, + "epoch": 0.576098859260441, + "grad_norm": 0.4937645196914673, + "learning_rate": 2.273243760659756e-05, + "loss": 0.1694, + "mean_token_accuracy": 0.9490656092762947, + "num_tokens": 46519722.0, + "step": 9930 + }, + { + "entropy": 0.1714660581201315, + "epoch": 0.5766790192395553, + "grad_norm": 0.4736926853656769, + "learning_rate": 2.2682022592682056e-05, + "loss": 0.1871, + "mean_token_accuracy": 0.9439347296953201, + "num_tokens": 46572405.0, + "step": 9940 + }, + { + "entropy": 0.17164755035191775, + "epoch": 0.5772591792186695, + "grad_norm": 0.4643619656562805, + "learning_rate": 2.2631617085191208e-05, + "loss": 0.1822, + "mean_token_accuracy": 0.9423431426286697, + "num_tokens": 46621673.0, + "step": 9950 + }, + { + "entropy": 0.161504493560642, + "epoch": 0.5778393391977837, + "grad_norm": 0.5037738680839539, + "learning_rate": 2.25812212908467e-05, + "loss": 0.1743, + "mean_token_accuracy": 0.946816012263298, + "num_tokens": 46665599.0, + "step": 9960 + }, + { + "entropy": 0.15877979882061483, + "epoch": 0.578419499176898, + "grad_norm": 0.3718167245388031, + "learning_rate": 2.2530835416330348e-05, + "loss": 0.1668, + "mean_token_accuracy": 0.9489897556602955, + "num_tokens": 46713078.0, + "step": 9970 + }, + { + "entropy": 0.1589330260641873, + "epoch": 0.5789996591560123, + "grad_norm": 0.46272632479667664, + "learning_rate": 2.2480459668283307e-05, + "loss": 0.1657, + "mean_token_accuracy": 0.9507333621382713, + "num_tokens": 46758375.0, + "step": 9980 + }, + { + "entropy": 0.15388044295832515, + "epoch": 0.5795798191351265, + "grad_norm": 0.3971559703350067, + "learning_rate": 2.2430094253305207e-05, + "loss": 0.1743, + "mean_token_accuracy": 0.9474498547613621, + "num_tokens": 46802127.0, + "step": 9990 + }, + { + "entropy": 0.15459269220009447, + "epoch": 0.5801599791142408, + "grad_norm": 0.4099501073360443, + "learning_rate": 2.237973937795329e-05, + "loss": 0.1709, + "mean_token_accuracy": 0.9482626207172871, + "num_tokens": 46851397.0, + "step": 10000 + }, + { + "epoch": 0.5801599791142408, + "eval_entropy": 0.15852300578056003, + "eval_loss": 0.16012564301490784, + "eval_mean_token_accuracy": 0.9467058408798371, + "eval_num_tokens": 46851397.0, + "eval_runtime": 1679.11, + "eval_samples_per_second": 5.133, + "eval_steps_per_second": 5.133, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 17237, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.028759516368329e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}