{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 7839, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.0787046998739243, "epoch": 0.003827018752391887, "grad_norm": 0.37200024724006653, "learning_rate": 4.591836734693878e-06, "loss": 2.590205955505371, "mean_token_accuracy": 0.5478626236319541, "num_tokens": 43996.0, "step": 10 }, { "entropy": 1.1275236845016479, "epoch": 0.007654037504783774, "grad_norm": 0.4282406270503998, "learning_rate": 9.693877551020408e-06, "loss": 2.732739067077637, "mean_token_accuracy": 0.532574575394392, "num_tokens": 84448.0, "step": 20 }, { "entropy": 1.1098140180110931, "epoch": 0.011481056257175661, "grad_norm": 0.45254817605018616, "learning_rate": 1.479591836734694e-05, "loss": 2.595915603637695, "mean_token_accuracy": 0.5385765254497528, "num_tokens": 127136.0, "step": 30 }, { "entropy": 1.1594089552760125, "epoch": 0.015308075009567547, "grad_norm": 0.26816287636756897, "learning_rate": 1.989795918367347e-05, "loss": 2.3643749237060545, "mean_token_accuracy": 0.5597088657319546, "num_tokens": 172549.0, "step": 40 }, { "entropy": 1.2817068248987198, "epoch": 0.019135093761959432, "grad_norm": 0.19904343783855438, "learning_rate": 2.5e-05, "loss": 2.1694852828979494, "mean_token_accuracy": 0.5605865910649299, "num_tokens": 218317.0, "step": 50 }, { "entropy": 1.356394973397255, "epoch": 0.022962112514351322, "grad_norm": 0.21708081662654877, "learning_rate": 3.0102040816326533e-05, "loss": 2.0824514389038087, "mean_token_accuracy": 0.584179612249136, "num_tokens": 255107.0, "step": 60 }, { "entropy": 1.4309053242206573, "epoch": 0.026789131266743208, "grad_norm": 0.09860006719827652, "learning_rate": 3.520408163265306e-05, "loss": 1.777943229675293, "mean_token_accuracy": 0.6243460461497307, "num_tokens": 298973.0, "step": 70 }, { "entropy": 1.4572882741689681, "epoch": 0.030616150019135095, "grad_norm": 0.07813975214958191, "learning_rate": 4.0306122448979596e-05, "loss": 1.7085393905639648, "mean_token_accuracy": 0.641383134573698, "num_tokens": 343287.0, "step": 80 }, { "entropy": 1.5205285474658012, "epoch": 0.03444316877152698, "grad_norm": 0.08015387505292892, "learning_rate": 4.5408163265306124e-05, "loss": 1.680305290222168, "mean_token_accuracy": 0.6431211873888969, "num_tokens": 376882.0, "step": 90 }, { "entropy": 1.529197846353054, "epoch": 0.038270187523918864, "grad_norm": 0.1533895879983902, "learning_rate": 5.051020408163265e-05, "loss": 1.538798999786377, "mean_token_accuracy": 0.6604955434799195, "num_tokens": 414084.0, "step": 100 }, { "entropy": 1.454574093222618, "epoch": 0.04209720627631076, "grad_norm": 0.08893708884716034, "learning_rate": 5.561224489795919e-05, "loss": 1.482753372192383, "mean_token_accuracy": 0.6748222857713699, "num_tokens": 451489.0, "step": 110 }, { "entropy": 1.4169663548469544, "epoch": 0.045924225028702644, "grad_norm": 0.10994797945022583, "learning_rate": 6.0714285714285715e-05, "loss": 1.4192767143249512, "mean_token_accuracy": 0.6815833821892738, "num_tokens": 492001.0, "step": 120 }, { "entropy": 1.3825553365051746, "epoch": 0.04975124378109453, "grad_norm": 0.09565065056085587, "learning_rate": 6.581632653061225e-05, "loss": 1.4434242248535156, "mean_token_accuracy": 0.6830388471484184, "num_tokens": 534229.0, "step": 130 }, { "entropy": 1.3660246580839157, "epoch": 0.053578262533486416, "grad_norm": 0.09277962148189545, "learning_rate": 7.091836734693877e-05, "loss": 1.3881919860839844, "mean_token_accuracy": 0.6916770502924919, "num_tokens": 573333.0, "step": 140 }, { "entropy": 1.3041961744427681, "epoch": 0.0574052812858783, "grad_norm": 0.14179331064224243, "learning_rate": 7.60204081632653e-05, "loss": 1.3051923751831054, "mean_token_accuracy": 0.7079917460680007, "num_tokens": 612198.0, "step": 150 }, { "entropy": 1.3157715648412704, "epoch": 0.06123230003827019, "grad_norm": 0.11061020940542221, "learning_rate": 8.112244897959184e-05, "loss": 1.3127019882202149, "mean_token_accuracy": 0.6986684441566468, "num_tokens": 654309.0, "step": 160 }, { "entropy": 1.2894421368837357, "epoch": 0.06505931879066207, "grad_norm": 0.12903185188770294, "learning_rate": 8.622448979591838e-05, "loss": 1.3279677391052247, "mean_token_accuracy": 0.7028318449854851, "num_tokens": 694076.0, "step": 170 }, { "entropy": 1.2676123276352882, "epoch": 0.06888633754305395, "grad_norm": 0.10816285014152527, "learning_rate": 9.13265306122449e-05, "loss": 1.3254461288452148, "mean_token_accuracy": 0.7063754379749299, "num_tokens": 733806.0, "step": 180 }, { "entropy": 1.0988808318972587, "epoch": 0.07271335629544584, "grad_norm": 0.08472651243209839, "learning_rate": 9.642857142857143e-05, "loss": 1.1510659217834474, "mean_token_accuracy": 0.7411173984408379, "num_tokens": 771622.0, "step": 190 }, { "entropy": 1.2284281507134438, "epoch": 0.07654037504783773, "grad_norm": 0.10897475481033325, "learning_rate": 0.00010153061224489797, "loss": 1.2724005699157714, "mean_token_accuracy": 0.7178638219833374, "num_tokens": 813167.0, "step": 200 }, { "entropy": 1.2159623876214027, "epoch": 0.08036739380022963, "grad_norm": 0.12170197069644928, "learning_rate": 0.0001066326530612245, "loss": 1.26397647857666, "mean_token_accuracy": 0.7138236090540886, "num_tokens": 856210.0, "step": 210 }, { "entropy": 1.2309471271932124, "epoch": 0.08419441255262151, "grad_norm": 0.08406181633472443, "learning_rate": 0.00011173469387755102, "loss": 1.3110918998718262, "mean_token_accuracy": 0.7173333883285522, "num_tokens": 893432.0, "step": 220 }, { "entropy": 1.2228039711713792, "epoch": 0.0880214313050134, "grad_norm": 0.10588081181049347, "learning_rate": 0.00011683673469387754, "loss": 1.2445635795593262, "mean_token_accuracy": 0.7160170584917068, "num_tokens": 931919.0, "step": 230 }, { "entropy": 1.138296764343977, "epoch": 0.09184845005740529, "grad_norm": 0.110760398209095, "learning_rate": 0.00012193877551020409, "loss": 1.2083134651184082, "mean_token_accuracy": 0.7306654810905456, "num_tokens": 976039.0, "step": 240 }, { "entropy": 1.1915819495916367, "epoch": 0.09567546880979717, "grad_norm": 0.15018120408058167, "learning_rate": 0.00012704081632653063, "loss": 1.2230369567871093, "mean_token_accuracy": 0.719833716750145, "num_tokens": 1019312.0, "step": 250 }, { "entropy": 1.2996815636754036, "epoch": 0.09950248756218906, "grad_norm": 0.10838313400745392, "learning_rate": 0.00013214285714285715, "loss": 1.3142367362976075, "mean_token_accuracy": 0.7013067752122879, "num_tokens": 1062901.0, "step": 260 }, { "entropy": 1.149668525904417, "epoch": 0.10332950631458095, "grad_norm": 0.09911312907934189, "learning_rate": 0.00013724489795918367, "loss": 1.1573083877563477, "mean_token_accuracy": 0.728775355219841, "num_tokens": 1102630.0, "step": 270 }, { "entropy": 1.271340447664261, "epoch": 0.10715652506697283, "grad_norm": 0.09665267169475555, "learning_rate": 0.00014234693877551022, "loss": 1.341374111175537, "mean_token_accuracy": 0.7027333110570908, "num_tokens": 1142262.0, "step": 280 }, { "entropy": 1.225122657418251, "epoch": 0.11098354381936472, "grad_norm": 0.13240815699100494, "learning_rate": 0.00014744897959183674, "loss": 1.2614737510681153, "mean_token_accuracy": 0.7198452442884445, "num_tokens": 1182386.0, "step": 290 }, { "entropy": 1.2733432039618493, "epoch": 0.1148105625717566, "grad_norm": 0.10651895403862, "learning_rate": 0.00015255102040816326, "loss": 1.2933347702026368, "mean_token_accuracy": 0.7025195896625519, "num_tokens": 1222805.0, "step": 300 }, { "entropy": 1.1543171763420106, "epoch": 0.11863758132414849, "grad_norm": 0.08577804267406464, "learning_rate": 0.00015765306122448978, "loss": 1.197078323364258, "mean_token_accuracy": 0.7300360783934593, "num_tokens": 1263121.0, "step": 310 }, { "entropy": 1.1908820882439612, "epoch": 0.12246460007654038, "grad_norm": 0.11925600469112396, "learning_rate": 0.00016275510204081633, "loss": 1.2366827964782714, "mean_token_accuracy": 0.7277334719896317, "num_tokens": 1296346.0, "step": 320 }, { "entropy": 1.1711702406406403, "epoch": 0.12629161882893225, "grad_norm": 0.12476309388875961, "learning_rate": 0.00016785714285714288, "loss": 1.2408350944519042, "mean_token_accuracy": 0.7282937213778495, "num_tokens": 1335547.0, "step": 330 }, { "entropy": 1.1748667433857918, "epoch": 0.13011863758132414, "grad_norm": 0.08671289682388306, "learning_rate": 0.0001729591836734694, "loss": 1.199030303955078, "mean_token_accuracy": 0.7312668621540069, "num_tokens": 1377093.0, "step": 340 }, { "entropy": 1.153764547407627, "epoch": 0.13394565633371602, "grad_norm": 0.10536976903676987, "learning_rate": 0.00017806122448979592, "loss": 1.201906967163086, "mean_token_accuracy": 0.7266524419188499, "num_tokens": 1417236.0, "step": 350 }, { "entropy": 1.2303058430552483, "epoch": 0.1377726750861079, "grad_norm": 0.09069176018238068, "learning_rate": 0.00018316326530612247, "loss": 1.2867681503295898, "mean_token_accuracy": 0.7198954582214355, "num_tokens": 1460306.0, "step": 360 }, { "entropy": 1.1944106668233871, "epoch": 0.1415996938384998, "grad_norm": 0.08539925515651703, "learning_rate": 0.000188265306122449, "loss": 1.2451179504394532, "mean_token_accuracy": 0.7186401098966598, "num_tokens": 1505966.0, "step": 370 }, { "entropy": 1.2239032357931137, "epoch": 0.14542671259089168, "grad_norm": 0.08434446156024933, "learning_rate": 0.0001933673469387755, "loss": 1.2803629875183105, "mean_token_accuracy": 0.7178368359804154, "num_tokens": 1544132.0, "step": 380 }, { "entropy": 1.1901779979467393, "epoch": 0.14925373134328357, "grad_norm": 0.08662886172533035, "learning_rate": 0.00019846938775510203, "loss": 1.2282370567321776, "mean_token_accuracy": 0.7221132159233093, "num_tokens": 1587355.0, "step": 390 }, { "entropy": 1.106569343805313, "epoch": 0.15308075009567546, "grad_norm": 0.13149450719356537, "learning_rate": 0.00019981200483416142, "loss": 1.136556625366211, "mean_token_accuracy": 0.7384186327457428, "num_tokens": 1624638.0, "step": 400 }, { "entropy": 1.0393452920019626, "epoch": 0.15690776884806737, "grad_norm": 0.13831719756126404, "learning_rate": 0.00019954344031153484, "loss": 1.074817180633545, "mean_token_accuracy": 0.7567356958985328, "num_tokens": 1665215.0, "step": 410 }, { "entropy": 1.1298492863774299, "epoch": 0.16073478760045926, "grad_norm": 0.10244159400463104, "learning_rate": 0.0001992748757889083, "loss": 1.1741769790649415, "mean_token_accuracy": 0.7414689466357232, "num_tokens": 1701543.0, "step": 420 }, { "entropy": 1.1646860882639885, "epoch": 0.16456180635285114, "grad_norm": 0.09356453269720078, "learning_rate": 0.00019900631126628174, "loss": 1.2229989051818848, "mean_token_accuracy": 0.7278152450919151, "num_tokens": 1744719.0, "step": 430 }, { "entropy": 1.1580850452184677, "epoch": 0.16838882510524303, "grad_norm": 0.08699047565460205, "learning_rate": 0.00019873774674365518, "loss": 1.1999470710754394, "mean_token_accuracy": 0.7270642057061195, "num_tokens": 1787999.0, "step": 440 }, { "entropy": 1.105088683962822, "epoch": 0.17221584385763491, "grad_norm": 0.10489863902330399, "learning_rate": 0.0001984691822210286, "loss": 1.123628807067871, "mean_token_accuracy": 0.7375599846243859, "num_tokens": 1825171.0, "step": 450 }, { "entropy": 1.0744315460324287, "epoch": 0.1760428626100268, "grad_norm": 0.10170256346464157, "learning_rate": 0.00019820061769840205, "loss": 1.1449885368347168, "mean_token_accuracy": 0.7466330513358116, "num_tokens": 1863245.0, "step": 460 }, { "entropy": 1.021899376064539, "epoch": 0.1798698813624187, "grad_norm": 0.09046658873558044, "learning_rate": 0.0001979320531757755, "loss": 1.0087275505065918, "mean_token_accuracy": 0.7654190301895142, "num_tokens": 1902205.0, "step": 470 }, { "entropy": 1.1595304682850838, "epoch": 0.18369690011481057, "grad_norm": 0.09361740201711655, "learning_rate": 0.00019766348865314892, "loss": 1.2238757133483886, "mean_token_accuracy": 0.7275362908840179, "num_tokens": 1943827.0, "step": 480 }, { "entropy": 1.1023890599608421, "epoch": 0.18752391886720246, "grad_norm": 0.08471602201461792, "learning_rate": 0.00019739492413052236, "loss": 1.1567111015319824, "mean_token_accuracy": 0.742167092859745, "num_tokens": 1982630.0, "step": 490 }, { "entropy": 1.1427962884306908, "epoch": 0.19135093761959435, "grad_norm": 0.1170063391327858, "learning_rate": 0.0001971263596078958, "loss": 1.208080005645752, "mean_token_accuracy": 0.7308674260973931, "num_tokens": 2021311.0, "step": 500 }, { "entropy": 1.0002792343497275, "epoch": 0.19517795637198623, "grad_norm": 0.10567828267812729, "learning_rate": 0.00019685779508526926, "loss": 1.026076889038086, "mean_token_accuracy": 0.7641370877623558, "num_tokens": 2055396.0, "step": 510 }, { "entropy": 1.1096693962812423, "epoch": 0.19900497512437812, "grad_norm": 0.08597096055746078, "learning_rate": 0.00019658923056264268, "loss": 1.1631651878356934, "mean_token_accuracy": 0.7389342650771141, "num_tokens": 2097931.0, "step": 520 }, { "entropy": 1.04201333373785, "epoch": 0.20283199387677, "grad_norm": 0.1260094940662384, "learning_rate": 0.00019632066604001613, "loss": 1.0765873908996582, "mean_token_accuracy": 0.7539548426866531, "num_tokens": 2137657.0, "step": 530 }, { "entropy": 1.0748623803257942, "epoch": 0.2066590126291619, "grad_norm": 0.0845552608370781, "learning_rate": 0.00019605210151738955, "loss": 1.1419748306274413, "mean_token_accuracy": 0.7481048628687859, "num_tokens": 2177343.0, "step": 540 }, { "entropy": 1.0970528617501258, "epoch": 0.21048603138155378, "grad_norm": 0.07105763256549835, "learning_rate": 0.000195783536994763, "loss": 1.1284755706787108, "mean_token_accuracy": 0.747196614742279, "num_tokens": 2211423.0, "step": 550 }, { "entropy": 1.0551751986145974, "epoch": 0.21431305013394567, "grad_norm": 0.12569685280323029, "learning_rate": 0.00019551497247213644, "loss": 1.1170102119445802, "mean_token_accuracy": 0.7472440049052238, "num_tokens": 2249350.0, "step": 560 }, { "entropy": 1.0562219873070717, "epoch": 0.21814006888633755, "grad_norm": 0.08452208340167999, "learning_rate": 0.0001952464079495099, "loss": 1.0921730995178223, "mean_token_accuracy": 0.7492805704474449, "num_tokens": 2289564.0, "step": 570 }, { "entropy": 1.022915106266737, "epoch": 0.22196708763872944, "grad_norm": 0.08168510347604752, "learning_rate": 0.00019497784342688333, "loss": 1.064980697631836, "mean_token_accuracy": 0.7562290355563164, "num_tokens": 2332592.0, "step": 580 }, { "entropy": 1.1041950330138206, "epoch": 0.22579410639112132, "grad_norm": 0.07596516609191895, "learning_rate": 0.00019470927890425675, "loss": 1.1410536766052246, "mean_token_accuracy": 0.7350378915667534, "num_tokens": 2380215.0, "step": 590 }, { "entropy": 1.1252056039869784, "epoch": 0.2296211251435132, "grad_norm": 0.07240597158670425, "learning_rate": 0.0001944407143816302, "loss": 1.1348044395446777, "mean_token_accuracy": 0.7389790266752243, "num_tokens": 2417819.0, "step": 600 }, { "entropy": 1.0117394506931305, "epoch": 0.2334481438959051, "grad_norm": 0.08603253215551376, "learning_rate": 0.00019417214985900362, "loss": 1.0533303260803222, "mean_token_accuracy": 0.7578112691640854, "num_tokens": 2459072.0, "step": 610 }, { "entropy": 1.0193642653524875, "epoch": 0.23727516264829698, "grad_norm": 0.08400722593069077, "learning_rate": 0.00019390358533637707, "loss": 1.0955985069274903, "mean_token_accuracy": 0.7598358646035195, "num_tokens": 2497901.0, "step": 620 }, { "entropy": 1.0488237984478475, "epoch": 0.24110218140068887, "grad_norm": 0.07221511751413345, "learning_rate": 0.00019363502081375052, "loss": 1.151495361328125, "mean_token_accuracy": 0.7541669681668282, "num_tokens": 2536530.0, "step": 630 }, { "entropy": 1.0595403373241425, "epoch": 0.24492920015308076, "grad_norm": 0.10258961468935013, "learning_rate": 0.00019336645629112396, "loss": 1.0888574600219727, "mean_token_accuracy": 0.7474928990006446, "num_tokens": 2571599.0, "step": 640 }, { "entropy": 1.0924376487731933, "epoch": 0.24875621890547264, "grad_norm": 0.0751282125711441, "learning_rate": 0.0001930978917684974, "loss": 1.1468082427978517, "mean_token_accuracy": 0.7465573191642761, "num_tokens": 2612401.0, "step": 650 }, { "entropy": 0.9765479557216168, "epoch": 0.2525832376578645, "grad_norm": 0.09039046615362167, "learning_rate": 0.00019282932724587083, "loss": 1.054959201812744, "mean_token_accuracy": 0.7649626806378365, "num_tokens": 2652042.0, "step": 660 }, { "entropy": 1.0833388939499855, "epoch": 0.2564102564102564, "grad_norm": 0.08905521035194397, "learning_rate": 0.00019256076272324425, "loss": 1.0937233924865724, "mean_token_accuracy": 0.7510278865694999, "num_tokens": 2692211.0, "step": 670 }, { "entropy": 1.0478161230683327, "epoch": 0.2602372751626483, "grad_norm": 0.09634676575660706, "learning_rate": 0.0001922921982006177, "loss": 1.1077991485595704, "mean_token_accuracy": 0.7490576148033142, "num_tokens": 2734562.0, "step": 680 }, { "entropy": 1.13061283826828, "epoch": 0.26406429391504016, "grad_norm": 0.07757367938756943, "learning_rate": 0.00019202363367799114, "loss": 1.1330499649047852, "mean_token_accuracy": 0.7350772902369499, "num_tokens": 2778990.0, "step": 690 }, { "entropy": 1.0450415380299092, "epoch": 0.26789131266743205, "grad_norm": 0.06570328027009964, "learning_rate": 0.0001917550691553646, "loss": 1.1425201416015625, "mean_token_accuracy": 0.7551864832639694, "num_tokens": 2814475.0, "step": 700 }, { "entropy": 1.0114438571035862, "epoch": 0.27171833141982393, "grad_norm": 0.11020322889089584, "learning_rate": 0.00019148650463273804, "loss": 1.0655290603637695, "mean_token_accuracy": 0.7622251763939858, "num_tokens": 2847751.0, "step": 710 }, { "entropy": 1.1008106037974357, "epoch": 0.2755453501722158, "grad_norm": 0.07282241433858871, "learning_rate": 0.00019121794011011146, "loss": 1.139061450958252, "mean_token_accuracy": 0.7368278667330742, "num_tokens": 2889553.0, "step": 720 }, { "entropy": 1.0155851803719997, "epoch": 0.2793723689246077, "grad_norm": 0.11799076199531555, "learning_rate": 0.0001909493755874849, "loss": 1.0634971618652345, "mean_token_accuracy": 0.7578275159001351, "num_tokens": 2926860.0, "step": 730 }, { "entropy": 1.0430648550391197, "epoch": 0.2831993876769996, "grad_norm": 0.08702066540718079, "learning_rate": 0.00019068081106485832, "loss": 1.0851760864257813, "mean_token_accuracy": 0.7573095709085464, "num_tokens": 2964029.0, "step": 740 }, { "entropy": 1.0908455178141594, "epoch": 0.2870264064293915, "grad_norm": 0.06593967229127884, "learning_rate": 0.00019041224654223177, "loss": 1.0929256439208985, "mean_token_accuracy": 0.7440102145075798, "num_tokens": 3004528.0, "step": 750 }, { "entropy": 0.971546346694231, "epoch": 0.29085342518178336, "grad_norm": 0.08857332915067673, "learning_rate": 0.00019014368201960522, "loss": 1.0575197219848633, "mean_token_accuracy": 0.7712547823786735, "num_tokens": 3041203.0, "step": 760 }, { "entropy": 1.0496620319783687, "epoch": 0.29468044393417525, "grad_norm": 0.07172030210494995, "learning_rate": 0.00018987511749697867, "loss": 1.100113582611084, "mean_token_accuracy": 0.747462597489357, "num_tokens": 3086263.0, "step": 770 }, { "entropy": 1.0853942684829234, "epoch": 0.29850746268656714, "grad_norm": 0.0861373096704483, "learning_rate": 0.0001896065529743521, "loss": 1.1116449356079101, "mean_token_accuracy": 0.7485424548387527, "num_tokens": 3126741.0, "step": 780 }, { "entropy": 1.039051755145192, "epoch": 0.302334481438959, "grad_norm": 0.07344193756580353, "learning_rate": 0.00018933798845172553, "loss": 1.092859935760498, "mean_token_accuracy": 0.7524166733026505, "num_tokens": 3164039.0, "step": 790 }, { "entropy": 1.021885236352682, "epoch": 0.3061615001913509, "grad_norm": 0.09843221306800842, "learning_rate": 0.00018906942392909895, "loss": 1.0813950538635253, "mean_token_accuracy": 0.7612547591328621, "num_tokens": 3202455.0, "step": 800 }, { "entropy": 1.0329275727272034, "epoch": 0.3099885189437428, "grad_norm": 0.07059452682733536, "learning_rate": 0.0001888008594064724, "loss": 1.0516587257385255, "mean_token_accuracy": 0.7528441205620766, "num_tokens": 3239911.0, "step": 810 }, { "entropy": 1.0202949695289134, "epoch": 0.31381553769613474, "grad_norm": 0.07269048690795898, "learning_rate": 0.00018853229488384585, "loss": 1.0879244804382324, "mean_token_accuracy": 0.7542849883437157, "num_tokens": 3278682.0, "step": 820 }, { "entropy": 1.0690899170935153, "epoch": 0.3176425564485266, "grad_norm": 0.14370054006576538, "learning_rate": 0.0001882637303612193, "loss": 1.1063778877258301, "mean_token_accuracy": 0.75286915153265, "num_tokens": 3325465.0, "step": 830 }, { "entropy": 1.0819261983036994, "epoch": 0.3214695752009185, "grad_norm": 0.0973975881934166, "learning_rate": 0.00018799516583859274, "loss": 1.0978353500366211, "mean_token_accuracy": 0.749411192536354, "num_tokens": 3363479.0, "step": 840 }, { "entropy": 1.0502549454569816, "epoch": 0.3252965939533104, "grad_norm": 0.11021706461906433, "learning_rate": 0.0001877266013159662, "loss": 1.1489330291748048, "mean_token_accuracy": 0.7476440489292144, "num_tokens": 3405863.0, "step": 850 }, { "entropy": 1.1556663788855075, "epoch": 0.3291236127057023, "grad_norm": 0.06459799408912659, "learning_rate": 0.0001874580367933396, "loss": 1.1840539932250977, "mean_token_accuracy": 0.7288095027208328, "num_tokens": 3450829.0, "step": 860 }, { "entropy": 1.097336183488369, "epoch": 0.33295063145809417, "grad_norm": 0.06765513867139816, "learning_rate": 0.00018718947227071303, "loss": 1.1286226272583009, "mean_token_accuracy": 0.7439226225018502, "num_tokens": 3490640.0, "step": 870 }, { "entropy": 1.0772622771561147, "epoch": 0.33677765021048606, "grad_norm": 0.08126482367515564, "learning_rate": 0.00018692090774808648, "loss": 1.1434885025024415, "mean_token_accuracy": 0.7434220835566521, "num_tokens": 3529438.0, "step": 880 }, { "entropy": 1.0091869838535785, "epoch": 0.34060466896287794, "grad_norm": 0.0654602199792862, "learning_rate": 0.00018665234322545992, "loss": 1.0767542839050293, "mean_token_accuracy": 0.7644046351313591, "num_tokens": 3565217.0, "step": 890 }, { "entropy": 1.0432863399386405, "epoch": 0.34443168771526983, "grad_norm": 0.10025763511657715, "learning_rate": 0.00018638377870283337, "loss": 1.0826923370361328, "mean_token_accuracy": 0.7591656729578972, "num_tokens": 3603940.0, "step": 900 }, { "entropy": 1.007722695171833, "epoch": 0.3482587064676617, "grad_norm": 0.06779270619153976, "learning_rate": 0.00018611521418020682, "loss": 1.0158637046813965, "mean_token_accuracy": 0.763145099580288, "num_tokens": 3644547.0, "step": 910 }, { "entropy": 1.0556264080107212, "epoch": 0.3520857252200536, "grad_norm": 0.07834554463624954, "learning_rate": 0.00018584664965758026, "loss": 1.091851806640625, "mean_token_accuracy": 0.7483858004212379, "num_tokens": 3691979.0, "step": 920 }, { "entropy": 1.0730156242847442, "epoch": 0.3559127439724455, "grad_norm": 0.10772417485713959, "learning_rate": 0.00018557808513495368, "loss": 1.1370153427124023, "mean_token_accuracy": 0.7466916054487228, "num_tokens": 3728767.0, "step": 930 }, { "entropy": 1.081335111707449, "epoch": 0.3597397627248374, "grad_norm": 0.07669705897569656, "learning_rate": 0.0001853095206123271, "loss": 1.141366958618164, "mean_token_accuracy": 0.744081811606884, "num_tokens": 3772234.0, "step": 940 }, { "entropy": 0.9984517656266689, "epoch": 0.36356678147722926, "grad_norm": 0.0695272758603096, "learning_rate": 0.00018504095608970055, "loss": 1.0501303672790527, "mean_token_accuracy": 0.7590280339121819, "num_tokens": 3816970.0, "step": 950 }, { "entropy": 0.910194194689393, "epoch": 0.36739380022962115, "grad_norm": 0.06411932408809662, "learning_rate": 0.000184772391567074, "loss": 0.9656248092651367, "mean_token_accuracy": 0.7823562085628509, "num_tokens": 3853816.0, "step": 960 }, { "entropy": 0.9763211451470852, "epoch": 0.37122081898201303, "grad_norm": 0.08389662951231003, "learning_rate": 0.00018450382704444744, "loss": 1.0703671455383301, "mean_token_accuracy": 0.76742093116045, "num_tokens": 3896404.0, "step": 970 }, { "entropy": 1.076941692829132, "epoch": 0.3750478377344049, "grad_norm": 0.13239043951034546, "learning_rate": 0.0001842352625218209, "loss": 1.1353830337524413, "mean_token_accuracy": 0.7479456245899201, "num_tokens": 3934187.0, "step": 980 }, { "entropy": 1.077423833310604, "epoch": 0.3788748564867968, "grad_norm": 0.06203702092170715, "learning_rate": 0.00018396669799919434, "loss": 1.1363765716552734, "mean_token_accuracy": 0.7437105163931846, "num_tokens": 3975766.0, "step": 990 }, { "entropy": 1.009491826593876, "epoch": 0.3827018752391887, "grad_norm": 0.06740409135818481, "learning_rate": 0.00018369813347656776, "loss": 1.0752355575561523, "mean_token_accuracy": 0.7598015293478966, "num_tokens": 4018368.0, "step": 1000 }, { "entropy": 0.9744428530335426, "epoch": 0.3865288939915806, "grad_norm": 0.07750537246465683, "learning_rate": 0.00018342956895394118, "loss": 1.0554892539978027, "mean_token_accuracy": 0.7682438552379608, "num_tokens": 4057647.0, "step": 1010 }, { "entropy": 1.0335246473550797, "epoch": 0.39035591274397247, "grad_norm": 0.07627248764038086, "learning_rate": 0.00018316100443131463, "loss": 1.0600407600402832, "mean_token_accuracy": 0.7552958622574806, "num_tokens": 4098377.0, "step": 1020 }, { "entropy": 1.0256185740232469, "epoch": 0.39418293149636435, "grad_norm": 0.10117889940738678, "learning_rate": 0.00018289243990868807, "loss": 1.0706727027893066, "mean_token_accuracy": 0.75880047082901, "num_tokens": 4141633.0, "step": 1030 }, { "entropy": 0.9883378148078918, "epoch": 0.39800995024875624, "grad_norm": 0.064593605697155, "learning_rate": 0.00018262387538606152, "loss": 1.007016372680664, "mean_token_accuracy": 0.764974731206894, "num_tokens": 4181155.0, "step": 1040 }, { "entropy": 1.0692595109343528, "epoch": 0.4018369690011481, "grad_norm": 0.07493151724338531, "learning_rate": 0.00018235531086343497, "loss": 1.123647975921631, "mean_token_accuracy": 0.7452841177582741, "num_tokens": 4218175.0, "step": 1050 }, { "entropy": 0.990117172151804, "epoch": 0.40566398775354, "grad_norm": 0.06332839280366898, "learning_rate": 0.0001820867463408084, "loss": 1.0538661003112793, "mean_token_accuracy": 0.7637043848633767, "num_tokens": 4262326.0, "step": 1060 }, { "entropy": 1.002436650544405, "epoch": 0.4094910065059319, "grad_norm": 0.07898294180631638, "learning_rate": 0.00018181818181818183, "loss": 0.9973239898681641, "mean_token_accuracy": 0.7644759714603424, "num_tokens": 4300876.0, "step": 1070 }, { "entropy": 0.9635432817041873, "epoch": 0.4133180252583238, "grad_norm": 0.09760674089193344, "learning_rate": 0.00018154961729555525, "loss": 1.0411369323730468, "mean_token_accuracy": 0.7664693981409073, "num_tokens": 4338887.0, "step": 1080 }, { "entropy": 0.9780610945075751, "epoch": 0.41714504401071567, "grad_norm": 0.08076441287994385, "learning_rate": 0.0001812810527729287, "loss": 1.0544751167297364, "mean_token_accuracy": 0.76624975502491, "num_tokens": 4380678.0, "step": 1090 }, { "entropy": 1.0548741944134234, "epoch": 0.42097206276310756, "grad_norm": 0.0646439641714096, "learning_rate": 0.00018101248825030215, "loss": 1.128230667114258, "mean_token_accuracy": 0.7504925444722176, "num_tokens": 4422899.0, "step": 1100 }, { "entropy": 1.0767812803387642, "epoch": 0.42479908151549944, "grad_norm": 0.06994366645812988, "learning_rate": 0.0001807439237276756, "loss": 1.1209583282470703, "mean_token_accuracy": 0.7477620646357537, "num_tokens": 4461864.0, "step": 1110 }, { "entropy": 1.062944334745407, "epoch": 0.42862610026789133, "grad_norm": 0.11016593873500824, "learning_rate": 0.00018047535920504904, "loss": 1.0880105018615722, "mean_token_accuracy": 0.7455767750740051, "num_tokens": 4501378.0, "step": 1120 }, { "entropy": 1.0511136516928672, "epoch": 0.4324531190202832, "grad_norm": 0.08707646280527115, "learning_rate": 0.00018020679468242246, "loss": 1.0764313697814942, "mean_token_accuracy": 0.7519838035106658, "num_tokens": 4541448.0, "step": 1130 }, { "entropy": 0.9529998056590557, "epoch": 0.4362801377726751, "grad_norm": 0.07353853434324265, "learning_rate": 0.00017993823015979588, "loss": 1.0098756790161132, "mean_token_accuracy": 0.7720077604055404, "num_tokens": 4586147.0, "step": 1140 }, { "entropy": 1.143844011425972, "epoch": 0.440107156525067, "grad_norm": 0.06268489360809326, "learning_rate": 0.00017966966563716933, "loss": 1.1934361457824707, "mean_token_accuracy": 0.7281116575002671, "num_tokens": 4631906.0, "step": 1150 }, { "entropy": 1.0761573910713196, "epoch": 0.4439341752774589, "grad_norm": 0.07078517228364944, "learning_rate": 0.00017940110111454278, "loss": 1.1359615325927734, "mean_token_accuracy": 0.7409780561923981, "num_tokens": 4674626.0, "step": 1160 }, { "entropy": 0.9940036550164223, "epoch": 0.44776119402985076, "grad_norm": 0.08054502308368683, "learning_rate": 0.00017913253659191622, "loss": 1.033839225769043, "mean_token_accuracy": 0.7682256817817688, "num_tokens": 4715717.0, "step": 1170 }, { "entropy": 0.9752239182591438, "epoch": 0.45158821278224265, "grad_norm": 0.08600450307130814, "learning_rate": 0.00017886397206928967, "loss": 1.0254844665527343, "mean_token_accuracy": 0.7688430979847908, "num_tokens": 4747316.0, "step": 1180 }, { "entropy": 1.064694558084011, "epoch": 0.45541523153463453, "grad_norm": 0.07270248234272003, "learning_rate": 0.0001785954075466631, "loss": 1.0806646347045898, "mean_token_accuracy": 0.7534265503287315, "num_tokens": 4788606.0, "step": 1190 }, { "entropy": 0.958549628406763, "epoch": 0.4592422502870264, "grad_norm": 0.0644846111536026, "learning_rate": 0.00017832684302403654, "loss": 1.0015847206115722, "mean_token_accuracy": 0.7644492238759995, "num_tokens": 4831371.0, "step": 1200 }, { "entropy": 1.0885677203536033, "epoch": 0.4630692690394183, "grad_norm": 0.13487283885478973, "learning_rate": 0.00017805827850140996, "loss": 1.1495524406433106, "mean_token_accuracy": 0.7414823487401009, "num_tokens": 4871231.0, "step": 1210 }, { "entropy": 1.1117899976670742, "epoch": 0.4668962877918102, "grad_norm": 0.08015701174736023, "learning_rate": 0.0001777897139787834, "loss": 1.1366958618164062, "mean_token_accuracy": 0.7351289570331574, "num_tokens": 4911520.0, "step": 1220 }, { "entropy": 0.9722193017601967, "epoch": 0.4707233065442021, "grad_norm": 0.06839531660079956, "learning_rate": 0.00017752114945615685, "loss": 1.0259140968322753, "mean_token_accuracy": 0.7658233359456063, "num_tokens": 4950296.0, "step": 1230 }, { "entropy": 1.0021446757018566, "epoch": 0.47455032529659397, "grad_norm": 0.08231978863477707, "learning_rate": 0.0001772525849335303, "loss": 1.0437036514282227, "mean_token_accuracy": 0.7644398525357247, "num_tokens": 4989688.0, "step": 1240 }, { "entropy": 0.9640353135764599, "epoch": 0.47837734404898585, "grad_norm": 0.11587074398994446, "learning_rate": 0.00017698402041090375, "loss": 1.0072126388549805, "mean_token_accuracy": 0.7740294471383095, "num_tokens": 5029135.0, "step": 1250 }, { "entropy": 1.0122342824935913, "epoch": 0.48220436280137774, "grad_norm": 0.07646426558494568, "learning_rate": 0.00017671545588827717, "loss": 1.0733034133911132, "mean_token_accuracy": 0.7619047269225121, "num_tokens": 5066488.0, "step": 1260 }, { "entropy": 1.0465880073606968, "epoch": 0.4860313815537696, "grad_norm": 0.07594821602106094, "learning_rate": 0.0001764468913656506, "loss": 1.0953254699707031, "mean_token_accuracy": 0.7511951208114624, "num_tokens": 5102103.0, "step": 1270 }, { "entropy": 1.0104024082422256, "epoch": 0.4898584003061615, "grad_norm": 0.07695835083723068, "learning_rate": 0.00017617832684302403, "loss": 1.1025714874267578, "mean_token_accuracy": 0.7583977058529854, "num_tokens": 5141113.0, "step": 1280 }, { "entropy": 1.044089037179947, "epoch": 0.4936854190585534, "grad_norm": 0.07186906039714813, "learning_rate": 0.00017590976232039748, "loss": 1.0713683128356934, "mean_token_accuracy": 0.7546971932053566, "num_tokens": 5181454.0, "step": 1290 }, { "entropy": 0.9154780797660351, "epoch": 0.4975124378109453, "grad_norm": 0.08934911340475082, "learning_rate": 0.00017564119779777093, "loss": 0.9871469497680664, "mean_token_accuracy": 0.7756836161017417, "num_tokens": 5214143.0, "step": 1300 }, { "entropy": 1.05635926425457, "epoch": 0.5013394565633371, "grad_norm": 0.07880513370037079, "learning_rate": 0.00017537263327514437, "loss": 1.0985527038574219, "mean_token_accuracy": 0.7524559125304222, "num_tokens": 5258310.0, "step": 1310 }, { "entropy": 1.0448619149625302, "epoch": 0.505166475315729, "grad_norm": 0.10507462918758392, "learning_rate": 0.0001751040687525178, "loss": 1.1040778160095215, "mean_token_accuracy": 0.7477249845862388, "num_tokens": 5296865.0, "step": 1320 }, { "entropy": 1.0706947155296802, "epoch": 0.5089934940681209, "grad_norm": 0.09437765926122665, "learning_rate": 0.00017483550422989124, "loss": 1.1372867584228517, "mean_token_accuracy": 0.7502224639058113, "num_tokens": 5335301.0, "step": 1330 }, { "entropy": 0.9736435614526272, "epoch": 0.5128205128205128, "grad_norm": 0.07162626087665558, "learning_rate": 0.0001745669397072647, "loss": 1.0273897171020507, "mean_token_accuracy": 0.7711918234825135, "num_tokens": 5372533.0, "step": 1340 }, { "entropy": 0.9989161014556884, "epoch": 0.5166475315729047, "grad_norm": 0.08805254101753235, "learning_rate": 0.0001742983751846381, "loss": 1.0603778839111329, "mean_token_accuracy": 0.7600028276443481, "num_tokens": 5412651.0, "step": 1350 }, { "entropy": 1.063752220571041, "epoch": 0.5204745503252965, "grad_norm": 0.08056829869747162, "learning_rate": 0.00017402981066201156, "loss": 1.0876687049865723, "mean_token_accuracy": 0.7487231969833374, "num_tokens": 5454518.0, "step": 1360 }, { "entropy": 0.966426993906498, "epoch": 0.5243015690776884, "grad_norm": 0.06970727443695068, "learning_rate": 0.000173761246139385, "loss": 1.0302441596984864, "mean_token_accuracy": 0.7640074551105499, "num_tokens": 5495257.0, "step": 1370 }, { "entropy": 0.9727898858487606, "epoch": 0.5281285878300803, "grad_norm": 0.09694326668977737, "learning_rate": 0.00017349268161675842, "loss": 1.0327792167663574, "mean_token_accuracy": 0.7687779292464256, "num_tokens": 5527573.0, "step": 1380 }, { "entropy": 1.043993879854679, "epoch": 0.5319556065824722, "grad_norm": 0.05676735192537308, "learning_rate": 0.00017322411709413187, "loss": 1.1139988899230957, "mean_token_accuracy": 0.7597961351275444, "num_tokens": 5566542.0, "step": 1390 }, { "entropy": 0.9891408108174801, "epoch": 0.5357826253348641, "grad_norm": 0.08670998364686966, "learning_rate": 0.00017295555257150532, "loss": 1.0878351211547852, "mean_token_accuracy": 0.7640718072652817, "num_tokens": 5604986.0, "step": 1400 }, { "entropy": 1.0097288101911546, "epoch": 0.539609644087256, "grad_norm": 0.09190856665372849, "learning_rate": 0.00017268698804887876, "loss": 1.079444408416748, "mean_token_accuracy": 0.7590912491083145, "num_tokens": 5642224.0, "step": 1410 }, { "entropy": 0.9927844725549221, "epoch": 0.5434366628396479, "grad_norm": 0.08191007375717163, "learning_rate": 0.00017241842352625218, "loss": 1.0661033630371093, "mean_token_accuracy": 0.7664914444088936, "num_tokens": 5680912.0, "step": 1420 }, { "entropy": 0.973179691657424, "epoch": 0.5472636815920398, "grad_norm": 0.08161566406488419, "learning_rate": 0.00017214985900362563, "loss": 1.078667163848877, "mean_token_accuracy": 0.7686992704868316, "num_tokens": 5717171.0, "step": 1430 }, { "entropy": 1.0467095457017421, "epoch": 0.5510907003444316, "grad_norm": 0.09403429925441742, "learning_rate": 0.00017188129448099908, "loss": 1.0912303924560547, "mean_token_accuracy": 0.7550861686468124, "num_tokens": 5755956.0, "step": 1440 }, { "entropy": 1.0082954704761504, "epoch": 0.5549177190968235, "grad_norm": 0.09858231991529465, "learning_rate": 0.0001716127299583725, "loss": 1.0449023246765137, "mean_token_accuracy": 0.7586705282330513, "num_tokens": 5798765.0, "step": 1450 }, { "entropy": 1.0528397418558597, "epoch": 0.5587447378492154, "grad_norm": 0.06697855144739151, "learning_rate": 0.00017134416543574594, "loss": 1.0901053428649903, "mean_token_accuracy": 0.7517547190189362, "num_tokens": 5839833.0, "step": 1460 }, { "entropy": 1.009619940817356, "epoch": 0.5625717566016073, "grad_norm": 0.07271189987659454, "learning_rate": 0.0001710756009131194, "loss": 1.0171070098876953, "mean_token_accuracy": 0.7594649389386177, "num_tokens": 5880613.0, "step": 1470 }, { "entropy": 0.9699329622089863, "epoch": 0.5663987753539992, "grad_norm": 0.07800697535276413, "learning_rate": 0.0001708070363904928, "loss": 1.1077412605285644, "mean_token_accuracy": 0.7667307928204536, "num_tokens": 5918218.0, "step": 1480 }, { "entropy": 0.9389957278966904, "epoch": 0.5702257941063911, "grad_norm": 0.08150342851877213, "learning_rate": 0.00017053847186786626, "loss": 0.9634763717651367, "mean_token_accuracy": 0.7806087970733643, "num_tokens": 5960284.0, "step": 1490 }, { "entropy": 1.0140163496136665, "epoch": 0.574052812858783, "grad_norm": 0.06430503726005554, "learning_rate": 0.0001702699073452397, "loss": 1.0751851081848145, "mean_token_accuracy": 0.7564210310578346, "num_tokens": 6000562.0, "step": 1500 }, { "entropy": 1.047791599482298, "epoch": 0.5778798316111748, "grad_norm": 0.07922326028347015, "learning_rate": 0.00017000134282261313, "loss": 1.1397698402404786, "mean_token_accuracy": 0.7494736298918724, "num_tokens": 6045192.0, "step": 1510 }, { "entropy": 1.1085532158613205, "epoch": 0.5817068503635667, "grad_norm": 0.1093953400850296, "learning_rate": 0.00016973277829998657, "loss": 1.142368698120117, "mean_token_accuracy": 0.7414237394928932, "num_tokens": 6090148.0, "step": 1520 }, { "entropy": 0.9427969709038735, "epoch": 0.5855338691159586, "grad_norm": 0.09579843282699585, "learning_rate": 0.00016946421377736002, "loss": 0.9980224609375, "mean_token_accuracy": 0.7730853497982025, "num_tokens": 6129845.0, "step": 1530 }, { "entropy": 1.0571323171257974, "epoch": 0.5893608878683505, "grad_norm": 0.09482655674219131, "learning_rate": 0.00016919564925473347, "loss": 1.0665513038635255, "mean_token_accuracy": 0.749831511080265, "num_tokens": 6171961.0, "step": 1540 }, { "entropy": 0.978803563863039, "epoch": 0.5931879066207424, "grad_norm": 0.08609842509031296, "learning_rate": 0.0001689270847321069, "loss": 1.0541341781616211, "mean_token_accuracy": 0.769312071800232, "num_tokens": 6210126.0, "step": 1550 }, { "entropy": 1.0714900024235248, "epoch": 0.5970149253731343, "grad_norm": 0.05390879884362221, "learning_rate": 0.00016865852020948033, "loss": 1.1057682037353516, "mean_token_accuracy": 0.7402923837304115, "num_tokens": 6262394.0, "step": 1560 }, { "entropy": 0.9234071888029576, "epoch": 0.6008419441255262, "grad_norm": 0.09692159295082092, "learning_rate": 0.00016838995568685378, "loss": 0.9622941017150879, "mean_token_accuracy": 0.7832354381680489, "num_tokens": 6292140.0, "step": 1570 }, { "entropy": 0.9591108359396457, "epoch": 0.604668962877918, "grad_norm": 0.0720645934343338, "learning_rate": 0.0001681213911642272, "loss": 1.019169235229492, "mean_token_accuracy": 0.771478471159935, "num_tokens": 6332134.0, "step": 1580 }, { "entropy": 1.0945825845003128, "epoch": 0.6084959816303099, "grad_norm": 0.07380460202693939, "learning_rate": 0.00016785282664160065, "loss": 1.1463271141052247, "mean_token_accuracy": 0.7419712334871292, "num_tokens": 6373967.0, "step": 1590 }, { "entropy": 1.055589073896408, "epoch": 0.6123230003827018, "grad_norm": 0.07209772616624832, "learning_rate": 0.0001675842621189741, "loss": 1.1144783973693848, "mean_token_accuracy": 0.7461996227502823, "num_tokens": 6418236.0, "step": 1600 }, { "entropy": 1.1149650782346725, "epoch": 0.6161500191350937, "grad_norm": 0.07935164868831635, "learning_rate": 0.00016731569759634754, "loss": 1.1727729797363282, "mean_token_accuracy": 0.7365738078951836, "num_tokens": 6464589.0, "step": 1610 }, { "entropy": 1.0068970195949078, "epoch": 0.6199770378874856, "grad_norm": 0.0804886594414711, "learning_rate": 0.00016704713307372096, "loss": 1.0483062744140625, "mean_token_accuracy": 0.7632505163550377, "num_tokens": 6504385.0, "step": 1620 }, { "entropy": 0.9996877416968346, "epoch": 0.6238040566398775, "grad_norm": 0.0723455473780632, "learning_rate": 0.0001667785685510944, "loss": 1.0592655181884765, "mean_token_accuracy": 0.7597218692302704, "num_tokens": 6545928.0, "step": 1630 }, { "entropy": 0.9159566629678011, "epoch": 0.6276310753922695, "grad_norm": 0.08028513193130493, "learning_rate": 0.00016651000402846783, "loss": 0.9434080123901367, "mean_token_accuracy": 0.7826011970639228, "num_tokens": 6586166.0, "step": 1640 }, { "entropy": 1.0089009895920753, "epoch": 0.6314580941446614, "grad_norm": 0.09154181182384491, "learning_rate": 0.00016624143950584128, "loss": 1.0339744567871094, "mean_token_accuracy": 0.7604109585285187, "num_tokens": 6624706.0, "step": 1650 }, { "entropy": 1.0208431974053382, "epoch": 0.6352851128970533, "grad_norm": 0.08039630204439163, "learning_rate": 0.00016597287498321472, "loss": 1.0823830604553222, "mean_token_accuracy": 0.7548153042793274, "num_tokens": 6665626.0, "step": 1660 }, { "entropy": 0.9645413011312485, "epoch": 0.6391121316494451, "grad_norm": 0.08834270387887955, "learning_rate": 0.00016570431046058817, "loss": 1.0401340484619142, "mean_token_accuracy": 0.7703565835952759, "num_tokens": 6699532.0, "step": 1670 }, { "entropy": 1.0028597339987755, "epoch": 0.642939150401837, "grad_norm": 0.08974612504243851, "learning_rate": 0.00016543574593796162, "loss": 1.0680928230285645, "mean_token_accuracy": 0.7617413088679313, "num_tokens": 6740574.0, "step": 1680 }, { "entropy": 0.8952145710587501, "epoch": 0.6467661691542289, "grad_norm": 0.09289242327213287, "learning_rate": 0.00016516718141533504, "loss": 0.9696210861206055, "mean_token_accuracy": 0.7848611980676651, "num_tokens": 6782208.0, "step": 1690 }, { "entropy": 0.9520700328052044, "epoch": 0.6505931879066208, "grad_norm": 0.07298107445240021, "learning_rate": 0.00016489861689270848, "loss": 0.9891908645629883, "mean_token_accuracy": 0.7725306749343872, "num_tokens": 6818937.0, "step": 1700 }, { "entropy": 0.9734554067254066, "epoch": 0.6544202066590127, "grad_norm": 0.08233233541250229, "learning_rate": 0.0001646300523700819, "loss": 1.0311893463134765, "mean_token_accuracy": 0.7683428943157196, "num_tokens": 6851548.0, "step": 1710 }, { "entropy": 0.9947349905967713, "epoch": 0.6582472254114046, "grad_norm": 0.08351403474807739, "learning_rate": 0.00016436148784745535, "loss": 1.0382192611694336, "mean_token_accuracy": 0.7587384819984436, "num_tokens": 6891553.0, "step": 1720 }, { "entropy": 1.0383310310542584, "epoch": 0.6620742441637965, "grad_norm": 0.07240983843803406, "learning_rate": 0.0001640929233248288, "loss": 1.0978598594665527, "mean_token_accuracy": 0.7515711337327957, "num_tokens": 6930875.0, "step": 1730 }, { "entropy": 1.085533195734024, "epoch": 0.6659012629161883, "grad_norm": 0.06999973207712173, "learning_rate": 0.00016382435880220225, "loss": 1.1412755966186523, "mean_token_accuracy": 0.7471029132604599, "num_tokens": 6971721.0, "step": 1740 }, { "entropy": 0.978867219388485, "epoch": 0.6697282816685802, "grad_norm": 0.06091843172907829, "learning_rate": 0.0001635557942795757, "loss": 1.023170566558838, "mean_token_accuracy": 0.7686347916722298, "num_tokens": 7010340.0, "step": 1750 }, { "entropy": 1.0823599390685559, "epoch": 0.6735553004209721, "grad_norm": 0.07732617110013962, "learning_rate": 0.0001632872297569491, "loss": 1.1036816596984864, "mean_token_accuracy": 0.7404509574174881, "num_tokens": 7061189.0, "step": 1760 }, { "entropy": 0.9984334908425808, "epoch": 0.677382319173364, "grad_norm": 0.11516186594963074, "learning_rate": 0.00016301866523432253, "loss": 1.1070829391479493, "mean_token_accuracy": 0.7593477964401245, "num_tokens": 7098345.0, "step": 1770 }, { "entropy": 1.0023914370685816, "epoch": 0.6812093379257559, "grad_norm": 0.08624757081270218, "learning_rate": 0.00016275010071169598, "loss": 1.043964958190918, "mean_token_accuracy": 0.7635682225227356, "num_tokens": 7136873.0, "step": 1780 }, { "entropy": 1.0823404759168624, "epoch": 0.6850363566781478, "grad_norm": 0.0846925675868988, "learning_rate": 0.00016248153618906943, "loss": 1.1333115577697754, "mean_token_accuracy": 0.7394865393638611, "num_tokens": 7181553.0, "step": 1790 }, { "entropy": 1.0224985226988792, "epoch": 0.6888633754305397, "grad_norm": 0.060152288526296616, "learning_rate": 0.00016221297166644287, "loss": 1.0782301902770997, "mean_token_accuracy": 0.759938097000122, "num_tokens": 7223076.0, "step": 1800 }, { "entropy": 1.0880159534513951, "epoch": 0.6926903941829315, "grad_norm": 0.06577905267477036, "learning_rate": 0.00016194440714381632, "loss": 1.1103734016418456, "mean_token_accuracy": 0.7457254812121391, "num_tokens": 7265594.0, "step": 1810 }, { "entropy": 1.0144932381808758, "epoch": 0.6965174129353234, "grad_norm": 0.07276095449924469, "learning_rate": 0.00016167584262118974, "loss": 1.0733102798461913, "mean_token_accuracy": 0.759276558458805, "num_tokens": 7305359.0, "step": 1820 }, { "entropy": 1.0186967477202415, "epoch": 0.7003444316877153, "grad_norm": 0.08775337040424347, "learning_rate": 0.0001614072780985632, "loss": 1.0672088623046876, "mean_token_accuracy": 0.7567476496100426, "num_tokens": 7348060.0, "step": 1830 }, { "entropy": 0.9723582908511161, "epoch": 0.7041714504401072, "grad_norm": 0.09250030666589737, "learning_rate": 0.0001611387135759366, "loss": 1.0032880783081055, "mean_token_accuracy": 0.7693375036120415, "num_tokens": 7387110.0, "step": 1840 }, { "entropy": 0.9738463938236237, "epoch": 0.7079984691924991, "grad_norm": 0.09884033352136612, "learning_rate": 0.00016087014905331006, "loss": 1.0514408111572267, "mean_token_accuracy": 0.7625621780753136, "num_tokens": 7428511.0, "step": 1850 }, { "entropy": 1.0910252556204796, "epoch": 0.711825487944891, "grad_norm": 0.09194686263799667, "learning_rate": 0.0001606015845306835, "loss": 1.1226488113403321, "mean_token_accuracy": 0.7452121302485466, "num_tokens": 7474736.0, "step": 1860 }, { "entropy": 1.0159518368542195, "epoch": 0.7156525066972829, "grad_norm": 0.07921712845563889, "learning_rate": 0.00016033302000805695, "loss": 1.061795711517334, "mean_token_accuracy": 0.761272345483303, "num_tokens": 7516891.0, "step": 1870 }, { "entropy": 0.8903906352818012, "epoch": 0.7194795254496748, "grad_norm": 0.10288332402706146, "learning_rate": 0.0001600644554854304, "loss": 0.9471863746643067, "mean_token_accuracy": 0.7831206247210503, "num_tokens": 7551597.0, "step": 1880 }, { "entropy": 1.0759326584637166, "epoch": 0.7233065442020666, "grad_norm": 0.06488945335149765, "learning_rate": 0.00015979589096280382, "loss": 1.136262798309326, "mean_token_accuracy": 0.742707334458828, "num_tokens": 7597529.0, "step": 1890 }, { "entropy": 0.9951202683150768, "epoch": 0.7271335629544585, "grad_norm": 0.06628359109163284, "learning_rate": 0.00015952732644017724, "loss": 1.0448930740356446, "mean_token_accuracy": 0.7615623638033867, "num_tokens": 7634291.0, "step": 1900 }, { "entropy": 1.0593122780323028, "epoch": 0.7309605817068504, "grad_norm": 0.08212320506572723, "learning_rate": 0.00015925876191755068, "loss": 1.1099414825439453, "mean_token_accuracy": 0.7455122962594032, "num_tokens": 7677086.0, "step": 1910 }, { "entropy": 1.0122868783771992, "epoch": 0.7347876004592423, "grad_norm": 0.06458455324172974, "learning_rate": 0.00015899019739492413, "loss": 1.0447346687316894, "mean_token_accuracy": 0.7542453840374946, "num_tokens": 7721785.0, "step": 1920 }, { "entropy": 1.0801358975470066, "epoch": 0.7386146192116342, "grad_norm": 0.06971931457519531, "learning_rate": 0.00015872163287229758, "loss": 1.109630012512207, "mean_token_accuracy": 0.7450100436806679, "num_tokens": 7761887.0, "step": 1930 }, { "entropy": 0.9081138484179974, "epoch": 0.7424416379640261, "grad_norm": 0.06223156675696373, "learning_rate": 0.00015845306834967102, "loss": 1.0026588439941406, "mean_token_accuracy": 0.7772255912423134, "num_tokens": 7807072.0, "step": 1940 }, { "entropy": 1.0170219503343105, "epoch": 0.746268656716418, "grad_norm": 0.0685853511095047, "learning_rate": 0.00015818450382704447, "loss": 1.055277442932129, "mean_token_accuracy": 0.7580551549792289, "num_tokens": 7845791.0, "step": 1950 }, { "entropy": 1.0753178864717483, "epoch": 0.7500956754688098, "grad_norm": 0.08306553959846497, "learning_rate": 0.0001579159393044179, "loss": 1.1332826614379883, "mean_token_accuracy": 0.7421450033783913, "num_tokens": 7891091.0, "step": 1960 }, { "entropy": 0.9297005102038384, "epoch": 0.7539226942212017, "grad_norm": 0.08018683642148972, "learning_rate": 0.0001576473747817913, "loss": 1.000318431854248, "mean_token_accuracy": 0.7793557167053222, "num_tokens": 7928252.0, "step": 1970 }, { "entropy": 1.0840253300964833, "epoch": 0.7577497129735936, "grad_norm": 0.06487595289945602, "learning_rate": 0.00015737881025916476, "loss": 1.1166275024414063, "mean_token_accuracy": 0.7378593400120735, "num_tokens": 7972071.0, "step": 1980 }, { "entropy": 1.0406386695802212, "epoch": 0.7615767317259855, "grad_norm": 0.0615115687251091, "learning_rate": 0.0001571102457365382, "loss": 1.0869349479675292, "mean_token_accuracy": 0.7490768045186996, "num_tokens": 8016865.0, "step": 1990 }, { "entropy": 0.9573215276002884, "epoch": 0.7654037504783774, "grad_norm": 0.0715412124991417, "learning_rate": 0.00015684168121391165, "loss": 1.0404720306396484, "mean_token_accuracy": 0.7706617951393128, "num_tokens": 8055917.0, "step": 2000 }, { "entropy": 0.9201878193765879, "epoch": 0.7692307692307693, "grad_norm": 0.07988248765468597, "learning_rate": 0.0001565731166912851, "loss": 0.9380558967590332, "mean_token_accuracy": 0.782890722155571, "num_tokens": 8093252.0, "step": 2010 }, { "entropy": 1.0045961767435074, "epoch": 0.7730577879831612, "grad_norm": 0.061089444905519485, "learning_rate": 0.00015630455216865855, "loss": 1.0528027534484863, "mean_token_accuracy": 0.7598949059844017, "num_tokens": 8135244.0, "step": 2020 }, { "entropy": 0.9942824639379978, "epoch": 0.776884806735553, "grad_norm": 0.06443686783313751, "learning_rate": 0.00015603598764603197, "loss": 1.0168493270874024, "mean_token_accuracy": 0.7590687796473503, "num_tokens": 8178961.0, "step": 2030 }, { "entropy": 0.9773981764912605, "epoch": 0.7807118254879449, "grad_norm": 0.0818348303437233, "learning_rate": 0.0001557674231234054, "loss": 1.0193141937255858, "mean_token_accuracy": 0.7708378821611405, "num_tokens": 8217139.0, "step": 2040 }, { "entropy": 0.9836540646851063, "epoch": 0.7845388442403368, "grad_norm": 0.06240411475300789, "learning_rate": 0.00015549885860077883, "loss": 1.0662775993347169, "mean_token_accuracy": 0.7658124819397927, "num_tokens": 8252825.0, "step": 2050 }, { "entropy": 1.036501456052065, "epoch": 0.7883658629927287, "grad_norm": 0.09231610596179962, "learning_rate": 0.00015523029407815228, "loss": 1.112645435333252, "mean_token_accuracy": 0.7541953936219216, "num_tokens": 8295113.0, "step": 2060 }, { "entropy": 0.9800528183579444, "epoch": 0.7921928817451206, "grad_norm": 0.08806589245796204, "learning_rate": 0.00015496172955552573, "loss": 1.0401280403137207, "mean_token_accuracy": 0.7672899037599563, "num_tokens": 8335977.0, "step": 2070 }, { "entropy": 0.9678378522396087, "epoch": 0.7960199004975125, "grad_norm": 0.08777868002653122, "learning_rate": 0.00015469316503289918, "loss": 1.0509014129638672, "mean_token_accuracy": 0.7696513712406159, "num_tokens": 8374917.0, "step": 2080 }, { "entropy": 1.042826947569847, "epoch": 0.7998469192499044, "grad_norm": 0.09018490463495255, "learning_rate": 0.00015442460051027262, "loss": 1.0869378089904784, "mean_token_accuracy": 0.7507286682724953, "num_tokens": 8415614.0, "step": 2090 }, { "entropy": 1.0548966623842717, "epoch": 0.8036739380022963, "grad_norm": 0.07267605513334274, "learning_rate": 0.00015415603598764604, "loss": 1.0960289001464845, "mean_token_accuracy": 0.7545556098222732, "num_tokens": 8455059.0, "step": 2100 }, { "entropy": 1.044345210492611, "epoch": 0.8075009567546881, "grad_norm": 0.08414279669523239, "learning_rate": 0.00015388747146501946, "loss": 1.1200661659240723, "mean_token_accuracy": 0.7490431442856789, "num_tokens": 8493866.0, "step": 2110 }, { "entropy": 1.0317029684782029, "epoch": 0.81132797550708, "grad_norm": 0.06549747288227081, "learning_rate": 0.0001536189069423929, "loss": 1.0583623886108398, "mean_token_accuracy": 0.7555923700332642, "num_tokens": 8536147.0, "step": 2120 }, { "entropy": 0.9694572634994983, "epoch": 0.8151549942594719, "grad_norm": 0.08112777769565582, "learning_rate": 0.00015335034241976636, "loss": 1.0503274917602539, "mean_token_accuracy": 0.7646921187639236, "num_tokens": 8578007.0, "step": 2130 }, { "entropy": 0.9358880028128624, "epoch": 0.8189820130118638, "grad_norm": 0.07176466286182404, "learning_rate": 0.0001530817778971398, "loss": 1.000410270690918, "mean_token_accuracy": 0.773740467429161, "num_tokens": 8620999.0, "step": 2140 }, { "entropy": 1.0444137938320637, "epoch": 0.8228090317642557, "grad_norm": 0.06355756521224976, "learning_rate": 0.00015281321337451325, "loss": 1.0860448837280274, "mean_token_accuracy": 0.751850588619709, "num_tokens": 8663354.0, "step": 2150 }, { "entropy": 0.9044980220496655, "epoch": 0.8266360505166476, "grad_norm": 0.080223448574543, "learning_rate": 0.00015254464885188667, "loss": 0.9434403419494629, "mean_token_accuracy": 0.7828752338886261, "num_tokens": 8699748.0, "step": 2160 }, { "entropy": 1.0172922544181346, "epoch": 0.8304630692690395, "grad_norm": 0.06971501559019089, "learning_rate": 0.00015227608432926012, "loss": 1.0325962066650392, "mean_token_accuracy": 0.7651202365756035, "num_tokens": 8739901.0, "step": 2170 }, { "entropy": 0.9639742732048034, "epoch": 0.8342900880214313, "grad_norm": 0.06396778672933578, "learning_rate": 0.00015200751980663354, "loss": 1.0435317039489747, "mean_token_accuracy": 0.7667818054556846, "num_tokens": 8778980.0, "step": 2180 }, { "entropy": 0.8876220636069775, "epoch": 0.8381171067738232, "grad_norm": 0.09910868853330612, "learning_rate": 0.00015173895528400698, "loss": 0.9876300811767578, "mean_token_accuracy": 0.7865215808153152, "num_tokens": 8815525.0, "step": 2190 }, { "entropy": 1.0369405087083579, "epoch": 0.8419441255262151, "grad_norm": 0.08775259554386139, "learning_rate": 0.00015147039076138043, "loss": 1.1244413375854492, "mean_token_accuracy": 0.7550949841737747, "num_tokens": 8857085.0, "step": 2200 }, { "entropy": 0.9762422502040863, "epoch": 0.845771144278607, "grad_norm": 0.08659302443265915, "learning_rate": 0.00015120182623875388, "loss": 1.0164811134338378, "mean_token_accuracy": 0.771617329120636, "num_tokens": 8894271.0, "step": 2210 }, { "entropy": 0.9543228000402451, "epoch": 0.8495981630309989, "grad_norm": 0.09588434547185898, "learning_rate": 0.00015093326171612733, "loss": 1.0303520202636718, "mean_token_accuracy": 0.768992331624031, "num_tokens": 8934095.0, "step": 2220 }, { "entropy": 1.1307236567139625, "epoch": 0.8534251817833908, "grad_norm": 0.07016360014677048, "learning_rate": 0.00015066469719350075, "loss": 1.1526556968688966, "mean_token_accuracy": 0.7296861469745636, "num_tokens": 8982341.0, "step": 2230 }, { "entropy": 1.0867296956479549, "epoch": 0.8572522005357827, "grad_norm": 0.07838597148656845, "learning_rate": 0.00015039613267087417, "loss": 1.1031158447265625, "mean_token_accuracy": 0.7445572927594185, "num_tokens": 9027401.0, "step": 2240 }, { "entropy": 0.9492381684482097, "epoch": 0.8610792192881745, "grad_norm": 0.08416638523340225, "learning_rate": 0.0001501275681482476, "loss": 1.0079804420471192, "mean_token_accuracy": 0.7709973976016045, "num_tokens": 9069985.0, "step": 2250 }, { "entropy": 0.9767517909407616, "epoch": 0.8649062380405664, "grad_norm": 0.09798935055732727, "learning_rate": 0.00014985900362562106, "loss": 1.0394697189331055, "mean_token_accuracy": 0.7647709026932716, "num_tokens": 9108246.0, "step": 2260 }, { "entropy": 0.9779160171747208, "epoch": 0.8687332567929583, "grad_norm": 0.08669373393058777, "learning_rate": 0.0001495904391029945, "loss": 1.0398100852966308, "mean_token_accuracy": 0.7669417649507523, "num_tokens": 9147055.0, "step": 2270 }, { "entropy": 1.014696953445673, "epoch": 0.8725602755453502, "grad_norm": 0.07674991339445114, "learning_rate": 0.00014932187458036795, "loss": 1.0742408752441406, "mean_token_accuracy": 0.7583330690860748, "num_tokens": 9187727.0, "step": 2280 }, { "entropy": 0.9619584158062935, "epoch": 0.8763872942977421, "grad_norm": 0.09512930363416672, "learning_rate": 0.00014905331005774137, "loss": 1.01895112991333, "mean_token_accuracy": 0.7718996241688728, "num_tokens": 9228518.0, "step": 2290 }, { "entropy": 0.8759313493967056, "epoch": 0.880214313050134, "grad_norm": 0.06927543133497238, "learning_rate": 0.00014878474553511482, "loss": 0.9590776443481446, "mean_token_accuracy": 0.783099564909935, "num_tokens": 9269392.0, "step": 2300 }, { "entropy": 1.0930156745016575, "epoch": 0.8840413318025259, "grad_norm": 0.07149595022201538, "learning_rate": 0.00014851618101248824, "loss": 1.132398796081543, "mean_token_accuracy": 0.7445679202675819, "num_tokens": 9310993.0, "step": 2310 }, { "entropy": 0.9991384916007519, "epoch": 0.8878683505549178, "grad_norm": 0.100126251578331, "learning_rate": 0.0001482476164898617, "loss": 1.0395862579345703, "mean_token_accuracy": 0.7618231356143952, "num_tokens": 9349210.0, "step": 2320 }, { "entropy": 0.9891969002783298, "epoch": 0.8916953693073096, "grad_norm": 0.07942050695419312, "learning_rate": 0.00014797905196723514, "loss": 1.0403067588806152, "mean_token_accuracy": 0.7636258214712143, "num_tokens": 9386251.0, "step": 2330 }, { "entropy": 1.034816125780344, "epoch": 0.8955223880597015, "grad_norm": 0.07803855836391449, "learning_rate": 0.00014771048744460858, "loss": 1.088371467590332, "mean_token_accuracy": 0.7585563778877258, "num_tokens": 9425492.0, "step": 2340 }, { "entropy": 0.998091223090887, "epoch": 0.8993494068120934, "grad_norm": 0.06696243584156036, "learning_rate": 0.00014744192292198203, "loss": 1.0410521507263184, "mean_token_accuracy": 0.7595112159848213, "num_tokens": 9466862.0, "step": 2350 }, { "entropy": 0.9615898832678795, "epoch": 0.9031764255644853, "grad_norm": 0.07813845574855804, "learning_rate": 0.00014717335839935545, "loss": 1.0265610694885254, "mean_token_accuracy": 0.7707905381917953, "num_tokens": 9503827.0, "step": 2360 }, { "entropy": 0.8776158876717091, "epoch": 0.9070034443168772, "grad_norm": 0.10287057608366013, "learning_rate": 0.0001469047938767289, "loss": 0.9231206893920898, "mean_token_accuracy": 0.7909859612584114, "num_tokens": 9536194.0, "step": 2370 }, { "entropy": 0.980732673406601, "epoch": 0.9108304630692691, "grad_norm": 0.06174289435148239, "learning_rate": 0.00014663622935410232, "loss": 1.0316704750061034, "mean_token_accuracy": 0.7596900418400765, "num_tokens": 9577621.0, "step": 2380 }, { "entropy": 1.0083129487931728, "epoch": 0.914657481821661, "grad_norm": 0.08805451542139053, "learning_rate": 0.00014636766483147576, "loss": 1.0296180725097657, "mean_token_accuracy": 0.7577597886323929, "num_tokens": 9616522.0, "step": 2390 }, { "entropy": 1.0002505116164684, "epoch": 0.9184845005740528, "grad_norm": 0.07697928696870804, "learning_rate": 0.0001460991003088492, "loss": 1.0411831855773925, "mean_token_accuracy": 0.7589930936694145, "num_tokens": 9659217.0, "step": 2400 }, { "entropy": 0.971958789229393, "epoch": 0.9223115193264447, "grad_norm": 0.08504882454872131, "learning_rate": 0.00014583053578622266, "loss": 1.015835952758789, "mean_token_accuracy": 0.7664303690195083, "num_tokens": 9694120.0, "step": 2410 }, { "entropy": 0.9250703640282154, "epoch": 0.9261385380788366, "grad_norm": 0.06279303133487701, "learning_rate": 0.00014556197126359608, "loss": 0.9673631668090821, "mean_token_accuracy": 0.782692727446556, "num_tokens": 9732460.0, "step": 2420 }, { "entropy": 1.0777716524899006, "epoch": 0.9299655568312285, "grad_norm": 0.06884833425283432, "learning_rate": 0.00014529340674096952, "loss": 1.1415311813354492, "mean_token_accuracy": 0.7447684407234192, "num_tokens": 9773760.0, "step": 2430 }, { "entropy": 1.0116477236151695, "epoch": 0.9337925755836204, "grad_norm": 0.06346814334392548, "learning_rate": 0.00014502484221834297, "loss": 1.0904932975769044, "mean_token_accuracy": 0.7616935014724732, "num_tokens": 9808910.0, "step": 2440 }, { "entropy": 0.9434679664671421, "epoch": 0.9376195943360123, "grad_norm": 0.09843038022518158, "learning_rate": 0.0001447562776957164, "loss": 1.0111047744750976, "mean_token_accuracy": 0.774254959821701, "num_tokens": 9846472.0, "step": 2450 }, { "entropy": 1.035598163306713, "epoch": 0.9414466130884042, "grad_norm": 0.08025770634412766, "learning_rate": 0.00014448771317308984, "loss": 1.1550275802612304, "mean_token_accuracy": 0.7497850373387337, "num_tokens": 9885082.0, "step": 2460 }, { "entropy": 1.057615876197815, "epoch": 0.945273631840796, "grad_norm": 0.07916443794965744, "learning_rate": 0.00014421914865046329, "loss": 1.114585781097412, "mean_token_accuracy": 0.7495191320776939, "num_tokens": 9924849.0, "step": 2470 }, { "entropy": 0.9576205931603908, "epoch": 0.9491006505931879, "grad_norm": 0.10745597630739212, "learning_rate": 0.00014395058412783673, "loss": 1.0471231460571289, "mean_token_accuracy": 0.7697127804160118, "num_tokens": 9969210.0, "step": 2480 }, { "entropy": 1.012363300472498, "epoch": 0.9529276693455798, "grad_norm": 0.09448845684528351, "learning_rate": 0.00014368201960521015, "loss": 1.0322566986083985, "mean_token_accuracy": 0.7568502962589264, "num_tokens": 10009532.0, "step": 2490 }, { "entropy": 0.9387446999549866, "epoch": 0.9567546880979717, "grad_norm": 0.08835543692111969, "learning_rate": 0.0001434134550825836, "loss": 0.9836790084838867, "mean_token_accuracy": 0.7740270137786865, "num_tokens": 10051767.0, "step": 2500 }, { "entropy": 1.043863268941641, "epoch": 0.9605817068503636, "grad_norm": 0.0590866394340992, "learning_rate": 0.00014314489055995705, "loss": 1.1286373138427734, "mean_token_accuracy": 0.755294018983841, "num_tokens": 10093518.0, "step": 2510 }, { "entropy": 1.068480123579502, "epoch": 0.9644087256027555, "grad_norm": 0.06240773946046829, "learning_rate": 0.00014287632603733047, "loss": 1.1243531227111816, "mean_token_accuracy": 0.7457959160208703, "num_tokens": 10137842.0, "step": 2520 }, { "entropy": 0.9648511357605457, "epoch": 0.9682357443551474, "grad_norm": 0.07577214390039444, "learning_rate": 0.00014260776151470391, "loss": 1.0646875381469727, "mean_token_accuracy": 0.7689151406288147, "num_tokens": 10177541.0, "step": 2530 }, { "entropy": 1.0034234993159772, "epoch": 0.9720627631075393, "grad_norm": 0.06887607276439667, "learning_rate": 0.00014233919699207736, "loss": 1.0736650466918944, "mean_token_accuracy": 0.7580653995275497, "num_tokens": 10217056.0, "step": 2540 }, { "entropy": 0.9054977536201477, "epoch": 0.9758897818599311, "grad_norm": 0.12731540203094482, "learning_rate": 0.00014207063246945078, "loss": 0.9581779479980469, "mean_token_accuracy": 0.7800818130373954, "num_tokens": 10249622.0, "step": 2550 }, { "entropy": 1.0892111197113992, "epoch": 0.979716800612323, "grad_norm": 0.08707671612501144, "learning_rate": 0.00014180206794682423, "loss": 1.1551457405090333, "mean_token_accuracy": 0.7434241071343421, "num_tokens": 10287483.0, "step": 2560 }, { "entropy": 0.9462251186370849, "epoch": 0.9835438193647149, "grad_norm": 0.10457631945610046, "learning_rate": 0.00014153350342419768, "loss": 0.9859563827514648, "mean_token_accuracy": 0.7729493409395218, "num_tokens": 10324562.0, "step": 2570 }, { "entropy": 0.9609014384448529, "epoch": 0.9873708381171068, "grad_norm": 0.1095169261097908, "learning_rate": 0.0001412649389015711, "loss": 1.00408992767334, "mean_token_accuracy": 0.769461353123188, "num_tokens": 10368482.0, "step": 2580 }, { "entropy": 0.9500531531870365, "epoch": 0.9911978568694987, "grad_norm": 0.12787973880767822, "learning_rate": 0.00014099637437894454, "loss": 1.0082733154296875, "mean_token_accuracy": 0.7726384818553924, "num_tokens": 10407666.0, "step": 2590 }, { "entropy": 0.9639500208199024, "epoch": 0.9950248756218906, "grad_norm": 0.08555731922388077, "learning_rate": 0.000140727809856318, "loss": 0.9910324096679688, "mean_token_accuracy": 0.7700270056724549, "num_tokens": 10445419.0, "step": 2600 }, { "entropy": 0.9984636768698693, "epoch": 0.9988518943742825, "grad_norm": 0.10294629633426666, "learning_rate": 0.00014045924533369144, "loss": 1.0837631225585938, "mean_token_accuracy": 0.7655858203768731, "num_tokens": 10483287.0, "step": 2610 }, { "entropy": 0.940229170024395, "epoch": 1.0026789131266742, "grad_norm": 0.10580310225486755, "learning_rate": 0.00014019068081106486, "loss": 0.9650541305541992, "mean_token_accuracy": 0.7728109017014504, "num_tokens": 10523841.0, "step": 2620 }, { "entropy": 0.9358184114098549, "epoch": 1.0065059318790661, "grad_norm": 0.12460961192846298, "learning_rate": 0.0001399221162884383, "loss": 0.9570166587829589, "mean_token_accuracy": 0.7772100657224655, "num_tokens": 10561636.0, "step": 2630 }, { "entropy": 1.010379894077778, "epoch": 1.010332950631458, "grad_norm": 0.0781383365392685, "learning_rate": 0.00013965355176581175, "loss": 1.0524909019470214, "mean_token_accuracy": 0.7589353621006012, "num_tokens": 10605899.0, "step": 2640 }, { "entropy": 0.977487600594759, "epoch": 1.01415996938385, "grad_norm": 0.0902724489569664, "learning_rate": 0.00013938498724318517, "loss": 1.0475889205932618, "mean_token_accuracy": 0.7629667386412621, "num_tokens": 10642372.0, "step": 2650 }, { "entropy": 0.9681369736790657, "epoch": 1.0179869881362418, "grad_norm": 0.06344746798276901, "learning_rate": 0.00013911642272055862, "loss": 1.0268775939941406, "mean_token_accuracy": 0.7677509978413581, "num_tokens": 10682308.0, "step": 2660 }, { "entropy": 0.9013996437191963, "epoch": 1.0218140068886337, "grad_norm": 0.09890369325876236, "learning_rate": 0.00013884785819793206, "loss": 0.969085693359375, "mean_token_accuracy": 0.7815661624073982, "num_tokens": 10720755.0, "step": 2670 }, { "entropy": 0.9415140472352505, "epoch": 1.0256410256410255, "grad_norm": 0.08691754937171936, "learning_rate": 0.00013857929367530548, "loss": 0.9783688545227051, "mean_token_accuracy": 0.7722749456763267, "num_tokens": 10759842.0, "step": 2680 }, { "entropy": 0.9437286920845509, "epoch": 1.0294680443934174, "grad_norm": 0.06577731668949127, "learning_rate": 0.00013831072915267893, "loss": 0.9904938697814941, "mean_token_accuracy": 0.7716649904847145, "num_tokens": 10803740.0, "step": 2690 }, { "entropy": 0.9657303221523762, "epoch": 1.0332950631458093, "grad_norm": 0.07847272604703903, "learning_rate": 0.00013804216463005238, "loss": 1.0073646545410155, "mean_token_accuracy": 0.7678608119487762, "num_tokens": 10841808.0, "step": 2700 }, { "entropy": 0.881027878075838, "epoch": 1.0371220818982012, "grad_norm": 0.12755495309829712, "learning_rate": 0.00013777360010742583, "loss": 0.955751895904541, "mean_token_accuracy": 0.7835927039384842, "num_tokens": 10880108.0, "step": 2710 }, { "entropy": 0.8458237417042256, "epoch": 1.040949100650593, "grad_norm": 0.07641884684562683, "learning_rate": 0.00013750503558479925, "loss": 0.9140083312988281, "mean_token_accuracy": 0.7939343526959419, "num_tokens": 10916272.0, "step": 2720 }, { "entropy": 0.8845301080495119, "epoch": 1.044776119402985, "grad_norm": 0.08896184712648392, "learning_rate": 0.0001372364710621727, "loss": 0.9332797050476074, "mean_token_accuracy": 0.7884662911295891, "num_tokens": 10951932.0, "step": 2730 }, { "entropy": 0.963884600251913, "epoch": 1.0486031381553769, "grad_norm": 0.10196536034345627, "learning_rate": 0.00013696790653954614, "loss": 1.0123867988586426, "mean_token_accuracy": 0.7659088596701622, "num_tokens": 10991548.0, "step": 2740 }, { "entropy": 0.9720129862427711, "epoch": 1.0524301569077688, "grad_norm": 0.07552212476730347, "learning_rate": 0.00013669934201691956, "loss": 1.015409564971924, "mean_token_accuracy": 0.7689290955662728, "num_tokens": 11028749.0, "step": 2750 }, { "entropy": 0.9871743015944958, "epoch": 1.0562571756601606, "grad_norm": 0.09255808591842651, "learning_rate": 0.000136430777494293, "loss": 1.0351217269897461, "mean_token_accuracy": 0.7620491668581962, "num_tokens": 11071336.0, "step": 2760 }, { "entropy": 0.809666246920824, "epoch": 1.0600841944125525, "grad_norm": 0.08891233056783676, "learning_rate": 0.00013616221297166645, "loss": 0.8595174789428711, "mean_token_accuracy": 0.8053640425205231, "num_tokens": 11107708.0, "step": 2770 }, { "entropy": 0.9220615286380053, "epoch": 1.0639112131649444, "grad_norm": 0.0731620192527771, "learning_rate": 0.0001358936484490399, "loss": 0.9694333076477051, "mean_token_accuracy": 0.7767527863383293, "num_tokens": 11149005.0, "step": 2780 }, { "entropy": 0.8744502332061529, "epoch": 1.0677382319173363, "grad_norm": 0.0865791067481041, "learning_rate": 0.00013562508392641332, "loss": 0.9401009559631348, "mean_token_accuracy": 0.7854847684502602, "num_tokens": 11189214.0, "step": 2790 }, { "entropy": 0.989877526462078, "epoch": 1.0715652506697282, "grad_norm": 0.09394430369138718, "learning_rate": 0.00013535651940378677, "loss": 1.0487696647644043, "mean_token_accuracy": 0.7607394486665726, "num_tokens": 11225161.0, "step": 2800 }, { "entropy": 0.8656694941222668, "epoch": 1.07539226942212, "grad_norm": 0.10940351337194443, "learning_rate": 0.0001350879548811602, "loss": 0.9236039161682129, "mean_token_accuracy": 0.7919901207089424, "num_tokens": 11261274.0, "step": 2810 }, { "entropy": 1.063130483776331, "epoch": 1.079219288174512, "grad_norm": 0.06853083521127701, "learning_rate": 0.00013481939035853364, "loss": 1.0725152015686035, "mean_token_accuracy": 0.7454188778996468, "num_tokens": 11302522.0, "step": 2820 }, { "entropy": 0.92764787748456, "epoch": 1.0830463069269038, "grad_norm": 0.10344231128692627, "learning_rate": 0.00013455082583590708, "loss": 0.9725144386291504, "mean_token_accuracy": 0.7810687303543091, "num_tokens": 11339898.0, "step": 2830 }, { "entropy": 0.9415482886135578, "epoch": 1.0868733256792957, "grad_norm": 0.12117484956979752, "learning_rate": 0.00013428226131328053, "loss": 1.0216625213623047, "mean_token_accuracy": 0.7713929772377014, "num_tokens": 11380187.0, "step": 2840 }, { "entropy": 0.9300718136131764, "epoch": 1.0907003444316876, "grad_norm": 0.09950343519449234, "learning_rate": 0.00013401369679065398, "loss": 0.9862215042114257, "mean_token_accuracy": 0.7748491272330285, "num_tokens": 11417351.0, "step": 2850 }, { "entropy": 0.9016943011432886, "epoch": 1.0945273631840795, "grad_norm": 0.10104110836982727, "learning_rate": 0.0001337451322680274, "loss": 0.9565576553344727, "mean_token_accuracy": 0.7823473244905472, "num_tokens": 11455566.0, "step": 2860 }, { "entropy": 1.0184541821479798, "epoch": 1.0983543819364714, "grad_norm": 0.07055146247148514, "learning_rate": 0.00013347656774540084, "loss": 1.0644380569458007, "mean_token_accuracy": 0.7551941126585007, "num_tokens": 11499960.0, "step": 2870 }, { "entropy": 0.9143499568104744, "epoch": 1.1021814006888633, "grad_norm": 0.09798481315374374, "learning_rate": 0.00013320800322277426, "loss": 0.9477805137634278, "mean_token_accuracy": 0.778240317106247, "num_tokens": 11536434.0, "step": 2880 }, { "entropy": 0.8803758375346661, "epoch": 1.1060084194412552, "grad_norm": 0.09720771014690399, "learning_rate": 0.0001329394387001477, "loss": 0.9369168281555176, "mean_token_accuracy": 0.786097663640976, "num_tokens": 11572420.0, "step": 2890 }, { "entropy": 0.9127089619636536, "epoch": 1.109835438193647, "grad_norm": 0.07493265718221664, "learning_rate": 0.00013267087417752116, "loss": 0.9610566139221192, "mean_token_accuracy": 0.7780416712164879, "num_tokens": 11607494.0, "step": 2900 }, { "entropy": 0.9359945230185985, "epoch": 1.113662456946039, "grad_norm": 0.09086300432682037, "learning_rate": 0.0001324023096548946, "loss": 0.9519670486450196, "mean_token_accuracy": 0.7745376393198967, "num_tokens": 11647057.0, "step": 2910 }, { "entropy": 0.9206651791930198, "epoch": 1.1174894756984308, "grad_norm": 0.10007902979850769, "learning_rate": 0.00013213374513226805, "loss": 0.9783179283142089, "mean_token_accuracy": 0.778519794344902, "num_tokens": 11685762.0, "step": 2920 }, { "entropy": 0.9937357418239117, "epoch": 1.1213164944508227, "grad_norm": 0.0993100181221962, "learning_rate": 0.00013186518060964147, "loss": 1.0440019607543944, "mean_token_accuracy": 0.7590440228581429, "num_tokens": 11727379.0, "step": 2930 }, { "entropy": 1.048055526614189, "epoch": 1.1251435132032146, "grad_norm": 0.11140380054712296, "learning_rate": 0.0001315966160870149, "loss": 1.1046284675598144, "mean_token_accuracy": 0.7413847833871842, "num_tokens": 11770734.0, "step": 2940 }, { "entropy": 0.9562077779322863, "epoch": 1.1289705319556065, "grad_norm": 0.11506770551204681, "learning_rate": 0.00013132805156438834, "loss": 0.9946146011352539, "mean_token_accuracy": 0.7750585973262787, "num_tokens": 11806270.0, "step": 2950 }, { "entropy": 0.9747304327785968, "epoch": 1.1327975507079984, "grad_norm": 0.1126897856593132, "learning_rate": 0.00013105948704176179, "loss": 1.061129093170166, "mean_token_accuracy": 0.7613553464412689, "num_tokens": 11852779.0, "step": 2960 }, { "entropy": 1.0132145062088966, "epoch": 1.1366245694603903, "grad_norm": 0.08260762691497803, "learning_rate": 0.00013079092251913523, "loss": 1.0199948310852052, "mean_token_accuracy": 0.7617463275790215, "num_tokens": 11897084.0, "step": 2970 }, { "entropy": 0.9878915682435035, "epoch": 1.1404515882127821, "grad_norm": 0.08098926395177841, "learning_rate": 0.00013052235799650868, "loss": 1.0480783462524415, "mean_token_accuracy": 0.763205036520958, "num_tokens": 11938987.0, "step": 2980 }, { "entropy": 1.0176467482000588, "epoch": 1.144278606965174, "grad_norm": 0.0966029092669487, "learning_rate": 0.0001302537934738821, "loss": 1.093599796295166, "mean_token_accuracy": 0.7526282608509064, "num_tokens": 11981156.0, "step": 2990 }, { "entropy": 1.0054687768220902, "epoch": 1.148105625717566, "grad_norm": 0.09327300637960434, "learning_rate": 0.00012998522895125555, "loss": 1.039564609527588, "mean_token_accuracy": 0.7592228040099144, "num_tokens": 12025389.0, "step": 3000 }, { "entropy": 0.9626951858401298, "epoch": 1.1519326444699578, "grad_norm": 0.06154703348875046, "learning_rate": 0.00012971666442862897, "loss": 0.9993762016296387, "mean_token_accuracy": 0.769777101278305, "num_tokens": 12069545.0, "step": 3010 }, { "entropy": 0.9221224367618561, "epoch": 1.1557596632223497, "grad_norm": 0.1140643060207367, "learning_rate": 0.00012944809990600241, "loss": 0.9887493133544922, "mean_token_accuracy": 0.7754134178161621, "num_tokens": 12113892.0, "step": 3020 }, { "entropy": 1.011741641908884, "epoch": 1.1595866819747416, "grad_norm": 0.08721659332513809, "learning_rate": 0.00012917953538337586, "loss": 1.068478488922119, "mean_token_accuracy": 0.7615607067942619, "num_tokens": 12153746.0, "step": 3030 }, { "entropy": 0.9926261432468891, "epoch": 1.1634137007271335, "grad_norm": 0.07577186822891235, "learning_rate": 0.0001289109708607493, "loss": 1.047102451324463, "mean_token_accuracy": 0.7669480383396149, "num_tokens": 12199067.0, "step": 3040 }, { "entropy": 0.945004402846098, "epoch": 1.1672407194795253, "grad_norm": 0.08443465083837509, "learning_rate": 0.00012864240633812276, "loss": 0.9891506195068359, "mean_token_accuracy": 0.7756656989455223, "num_tokens": 12243766.0, "step": 3050 }, { "entropy": 0.9602406993508339, "epoch": 1.1710677382319172, "grad_norm": 0.07647141069173813, "learning_rate": 0.00012837384181549618, "loss": 1.0091946601867676, "mean_token_accuracy": 0.7702717915177345, "num_tokens": 12279555.0, "step": 3060 }, { "entropy": 0.9430582121014595, "epoch": 1.1748947569843091, "grad_norm": 0.10050038248300552, "learning_rate": 0.0001281052772928696, "loss": 1.0251899719238282, "mean_token_accuracy": 0.7759435445070266, "num_tokens": 12316974.0, "step": 3070 }, { "entropy": 1.0339640237390995, "epoch": 1.178721775736701, "grad_norm": 0.09026551991701126, "learning_rate": 0.00012783671277024304, "loss": 1.0652464866638183, "mean_token_accuracy": 0.7533303231000901, "num_tokens": 12358111.0, "step": 3080 }, { "entropy": 0.9808862328529357, "epoch": 1.182548794489093, "grad_norm": 0.08769362419843674, "learning_rate": 0.0001275681482476165, "loss": 1.0068347930908204, "mean_token_accuracy": 0.7660810023546218, "num_tokens": 12401669.0, "step": 3090 }, { "entropy": 0.9436531282961369, "epoch": 1.1863758132414848, "grad_norm": 0.09366963803768158, "learning_rate": 0.00012729958372498994, "loss": 1.0298351287841796, "mean_token_accuracy": 0.7704201564192772, "num_tokens": 12442005.0, "step": 3100 }, { "entropy": 0.8712134130299092, "epoch": 1.1902028319938767, "grad_norm": 0.14041900634765625, "learning_rate": 0.00012703101920236338, "loss": 0.9094470977783203, "mean_token_accuracy": 0.7861496224999428, "num_tokens": 12484476.0, "step": 3110 }, { "entropy": 0.9474696554243565, "epoch": 1.1940298507462686, "grad_norm": 0.10449594259262085, "learning_rate": 0.00012676245467973683, "loss": 0.9729720115661621, "mean_token_accuracy": 0.7746587276458741, "num_tokens": 12521351.0, "step": 3120 }, { "entropy": 0.9215874671936035, "epoch": 1.1978568694986604, "grad_norm": 0.07733117789030075, "learning_rate": 0.00012649389015711025, "loss": 0.992548942565918, "mean_token_accuracy": 0.7789316549897194, "num_tokens": 12564603.0, "step": 3130 }, { "entropy": 0.9349980562925339, "epoch": 1.2016838882510523, "grad_norm": 0.06924714148044586, "learning_rate": 0.00012622532563448367, "loss": 1.010727596282959, "mean_token_accuracy": 0.7728876963257789, "num_tokens": 12606025.0, "step": 3140 }, { "entropy": 0.9719727545976639, "epoch": 1.2055109070034442, "grad_norm": 0.07646770775318146, "learning_rate": 0.00012595676111185712, "loss": 1.0482423782348633, "mean_token_accuracy": 0.7659243881702423, "num_tokens": 12647703.0, "step": 3150 }, { "entropy": 1.0236301876604557, "epoch": 1.209337925755836, "grad_norm": 0.08547945320606232, "learning_rate": 0.00012568819658923056, "loss": 1.0771334648132325, "mean_token_accuracy": 0.7551302567124367, "num_tokens": 12692347.0, "step": 3160 }, { "entropy": 0.9277745552361012, "epoch": 1.213164944508228, "grad_norm": 0.10816850513219833, "learning_rate": 0.000125419632066604, "loss": 0.9680308341979981, "mean_token_accuracy": 0.7722468450665474, "num_tokens": 12729671.0, "step": 3170 }, { "entropy": 0.9760092988610267, "epoch": 1.2169919632606199, "grad_norm": 0.08950033783912659, "learning_rate": 0.00012515106754397746, "loss": 1.000643539428711, "mean_token_accuracy": 0.7665232941508293, "num_tokens": 12768100.0, "step": 3180 }, { "entropy": 0.9292771026492119, "epoch": 1.2208189820130118, "grad_norm": 0.08686704933643341, "learning_rate": 0.0001248825030213509, "loss": 1.019674015045166, "mean_token_accuracy": 0.7758068069815636, "num_tokens": 12801323.0, "step": 3190 }, { "entropy": 0.8500060614198446, "epoch": 1.2246460007654036, "grad_norm": 0.07462778687477112, "learning_rate": 0.00012461393849872433, "loss": 0.9042973518371582, "mean_token_accuracy": 0.7897424980998039, "num_tokens": 12839880.0, "step": 3200 }, { "entropy": 0.9205234386026859, "epoch": 1.2284730195177955, "grad_norm": 0.07027672231197357, "learning_rate": 0.00012434537397609775, "loss": 0.9424190521240234, "mean_token_accuracy": 0.7767854332923889, "num_tokens": 12878349.0, "step": 3210 }, { "entropy": 0.9074239492416382, "epoch": 1.2323000382701874, "grad_norm": 0.09741132706403732, "learning_rate": 0.0001240768094534712, "loss": 0.9651589393615723, "mean_token_accuracy": 0.7790584430098534, "num_tokens": 12917588.0, "step": 3220 }, { "entropy": 0.8874296098947525, "epoch": 1.2361270570225793, "grad_norm": 0.08608463406562805, "learning_rate": 0.00012380824493084464, "loss": 0.9437139511108399, "mean_token_accuracy": 0.7854243695735932, "num_tokens": 12956199.0, "step": 3230 }, { "entropy": 0.9470510125160218, "epoch": 1.2399540757749712, "grad_norm": 0.09247037768363953, "learning_rate": 0.0001235396804082181, "loss": 1.032781982421875, "mean_token_accuracy": 0.7712572082877159, "num_tokens": 13000822.0, "step": 3240 }, { "entropy": 0.8850176699459553, "epoch": 1.243781094527363, "grad_norm": 0.08397585898637772, "learning_rate": 0.00012327111588559153, "loss": 0.9292671203613281, "mean_token_accuracy": 0.787578609585762, "num_tokens": 13043532.0, "step": 3250 }, { "entropy": 0.8605544999241829, "epoch": 1.247608113279755, "grad_norm": 0.0952179804444313, "learning_rate": 0.00012300255136296498, "loss": 0.8990240097045898, "mean_token_accuracy": 0.7919793605804444, "num_tokens": 13081376.0, "step": 3260 }, { "entropy": 1.003395075351, "epoch": 1.2514351320321468, "grad_norm": 0.08914512395858765, "learning_rate": 0.0001227339868403384, "loss": 1.1446642875671387, "mean_token_accuracy": 0.7565032340586185, "num_tokens": 13119474.0, "step": 3270 }, { "entropy": 0.9566417217254639, "epoch": 1.2552621507845387, "grad_norm": 0.13220350444316864, "learning_rate": 0.00012246542231771182, "loss": 0.9976698875427246, "mean_token_accuracy": 0.7722181305289268, "num_tokens": 13162637.0, "step": 3280 }, { "entropy": 0.888442064449191, "epoch": 1.2590891695369306, "grad_norm": 0.10493922978639603, "learning_rate": 0.00012219685779508527, "loss": 0.916744613647461, "mean_token_accuracy": 0.7896391779184342, "num_tokens": 13199412.0, "step": 3290 }, { "entropy": 0.9262259535491466, "epoch": 1.2629161882893225, "grad_norm": 0.09022962301969528, "learning_rate": 0.00012192829327245872, "loss": 0.9885137557983399, "mean_token_accuracy": 0.778158649802208, "num_tokens": 13240292.0, "step": 3300 }, { "entropy": 0.9356066003441811, "epoch": 1.2667432070417144, "grad_norm": 0.09693239629268646, "learning_rate": 0.00012165972874983216, "loss": 0.9731400489807129, "mean_token_accuracy": 0.7748182758688926, "num_tokens": 13275876.0, "step": 3310 }, { "entropy": 0.868951104208827, "epoch": 1.2705702257941063, "grad_norm": 0.09237370640039444, "learning_rate": 0.0001213911642272056, "loss": 0.9127277374267578, "mean_token_accuracy": 0.7890144631266593, "num_tokens": 13314857.0, "step": 3320 }, { "entropy": 0.9311054348945618, "epoch": 1.2743972445464982, "grad_norm": 0.08701436221599579, "learning_rate": 0.00012112259970457902, "loss": 0.9666108131408692, "mean_token_accuracy": 0.7752738267183303, "num_tokens": 13357039.0, "step": 3330 }, { "entropy": 0.9256260149180889, "epoch": 1.27822426329889, "grad_norm": 0.08751461654901505, "learning_rate": 0.00012085403518195246, "loss": 0.9926286697387695, "mean_token_accuracy": 0.7750931903719902, "num_tokens": 13397058.0, "step": 3340 }, { "entropy": 1.0074332721531392, "epoch": 1.282051282051282, "grad_norm": 0.07409587502479553, "learning_rate": 0.00012058547065932591, "loss": 1.062586498260498, "mean_token_accuracy": 0.7546869352459907, "num_tokens": 13441381.0, "step": 3350 }, { "entropy": 0.9596263833343983, "epoch": 1.2858783008036738, "grad_norm": 0.09343665838241577, "learning_rate": 0.00012031690613669934, "loss": 1.0023324012756347, "mean_token_accuracy": 0.7719831839203835, "num_tokens": 13481914.0, "step": 3360 }, { "entropy": 0.9313522674143314, "epoch": 1.2897053195560657, "grad_norm": 0.0879049226641655, "learning_rate": 0.00012004834161407279, "loss": 0.9833806991577149, "mean_token_accuracy": 0.7737741976976394, "num_tokens": 13519831.0, "step": 3370 }, { "entropy": 0.8369917057454586, "epoch": 1.2935323383084576, "grad_norm": 0.14339204132556915, "learning_rate": 0.00011977977709144624, "loss": 0.9147489547729493, "mean_token_accuracy": 0.7984762340784073, "num_tokens": 13559768.0, "step": 3380 }, { "entropy": 0.9055653363466263, "epoch": 1.2973593570608495, "grad_norm": 0.1441742479801178, "learning_rate": 0.00011951121256881967, "loss": 0.9521515846252442, "mean_token_accuracy": 0.7834478095173836, "num_tokens": 13595966.0, "step": 3390 }, { "entropy": 0.9677796266973019, "epoch": 1.3011863758132414, "grad_norm": 0.11233013868331909, "learning_rate": 0.00011924264804619309, "loss": 1.0522055625915527, "mean_token_accuracy": 0.7664702609181404, "num_tokens": 13638463.0, "step": 3400 }, { "entropy": 0.9398517791181803, "epoch": 1.3050133945656333, "grad_norm": 0.088468998670578, "learning_rate": 0.00011897408352356654, "loss": 0.9618704795837403, "mean_token_accuracy": 0.7755557060241699, "num_tokens": 13677769.0, "step": 3410 }, { "entropy": 0.8900398269295693, "epoch": 1.3088404133180251, "grad_norm": 0.09742283076047897, "learning_rate": 0.00011870551900093999, "loss": 0.9422917366027832, "mean_token_accuracy": 0.7865706130862236, "num_tokens": 13713374.0, "step": 3420 }, { "entropy": 0.9008657015860081, "epoch": 1.312667432070417, "grad_norm": 0.09111864864826202, "learning_rate": 0.00011843695447831342, "loss": 0.9726786613464355, "mean_token_accuracy": 0.7835188135504723, "num_tokens": 13753165.0, "step": 3430 }, { "entropy": 0.954158465564251, "epoch": 1.316494450822809, "grad_norm": 0.0949985608458519, "learning_rate": 0.00011816838995568687, "loss": 1.0072153091430665, "mean_token_accuracy": 0.7668681025505066, "num_tokens": 13790265.0, "step": 3440 }, { "entropy": 0.9259054005146027, "epoch": 1.3203214695752008, "grad_norm": 0.09144506603479385, "learning_rate": 0.00011789982543306031, "loss": 1.0319811820983886, "mean_token_accuracy": 0.77575224339962, "num_tokens": 13830720.0, "step": 3450 }, { "entropy": 0.9554400585591794, "epoch": 1.3241484883275927, "grad_norm": 0.05986972153186798, "learning_rate": 0.00011763126091043373, "loss": 0.9840157508850098, "mean_token_accuracy": 0.7714304268360138, "num_tokens": 13874024.0, "step": 3460 }, { "entropy": 0.9618137650191784, "epoch": 1.3279755070799846, "grad_norm": 0.08746087551116943, "learning_rate": 0.00011736269638780717, "loss": 1.0280908584594726, "mean_token_accuracy": 0.7679046332836151, "num_tokens": 13916099.0, "step": 3470 }, { "entropy": 1.02601458132267, "epoch": 1.3318025258323765, "grad_norm": 0.09883694350719452, "learning_rate": 0.00011709413186518061, "loss": 1.0893220901489258, "mean_token_accuracy": 0.7487106472253799, "num_tokens": 13955163.0, "step": 3480 }, { "entropy": 1.025067638605833, "epoch": 1.3356295445847683, "grad_norm": 0.07656730711460114, "learning_rate": 0.00011682556734255406, "loss": 1.0527194023132325, "mean_token_accuracy": 0.7569629296660423, "num_tokens": 13996990.0, "step": 3490 }, { "entropy": 0.8709930831566453, "epoch": 1.3394565633371602, "grad_norm": 0.1119026467204094, "learning_rate": 0.0001165570028199275, "loss": 0.9183405876159668, "mean_token_accuracy": 0.784464044868946, "num_tokens": 14040315.0, "step": 3500 }, { "entropy": 0.9783565014600754, "epoch": 1.3432835820895521, "grad_norm": 0.09997576475143433, "learning_rate": 0.00011628843829730094, "loss": 1.0318940162658692, "mean_token_accuracy": 0.7614112690091133, "num_tokens": 14083204.0, "step": 3510 }, { "entropy": 0.9975252889096737, "epoch": 1.347110600841944, "grad_norm": 0.10046812891960144, "learning_rate": 0.00011601987377467437, "loss": 1.0214290618896484, "mean_token_accuracy": 0.7584437146782875, "num_tokens": 14127039.0, "step": 3520 }, { "entropy": 0.8959422588348389, "epoch": 1.350937619594336, "grad_norm": 0.09512703120708466, "learning_rate": 0.0001157513092520478, "loss": 0.9528075218200683, "mean_token_accuracy": 0.7823959946632385, "num_tokens": 14163989.0, "step": 3530 }, { "entropy": 0.8903120748698712, "epoch": 1.3547646383467278, "grad_norm": 0.10500185191631317, "learning_rate": 0.00011548274472942124, "loss": 0.9784683227539063, "mean_token_accuracy": 0.7854589730501175, "num_tokens": 14198562.0, "step": 3540 }, { "entropy": 0.8580869071185588, "epoch": 1.3585916570991197, "grad_norm": 0.08716659992933273, "learning_rate": 0.00011521418020679469, "loss": 0.9078399658203125, "mean_token_accuracy": 0.7894850671291351, "num_tokens": 14236952.0, "step": 3550 }, { "entropy": 0.9841447554528713, "epoch": 1.3624186758515116, "grad_norm": 0.08638570457696915, "learning_rate": 0.00011494561568416812, "loss": 1.0438207626342773, "mean_token_accuracy": 0.7629329964518548, "num_tokens": 14278208.0, "step": 3560 }, { "entropy": 0.9100395441055298, "epoch": 1.3662456946039034, "grad_norm": 0.09058145433664322, "learning_rate": 0.00011467705116154157, "loss": 0.9560261726379394, "mean_token_accuracy": 0.7807327762246132, "num_tokens": 14314076.0, "step": 3570 }, { "entropy": 0.8529263667762279, "epoch": 1.3700727133562953, "grad_norm": 0.08847236633300781, "learning_rate": 0.00011440848663891502, "loss": 0.9192025184631347, "mean_token_accuracy": 0.7945622354745865, "num_tokens": 14349740.0, "step": 3580 }, { "entropy": 0.8977530397474766, "epoch": 1.3738997321086872, "grad_norm": 0.09535886347293854, "learning_rate": 0.00011413992211628844, "loss": 0.9331538200378418, "mean_token_accuracy": 0.7803975984454155, "num_tokens": 14392492.0, "step": 3590 }, { "entropy": 1.0430821359157563, "epoch": 1.377726750861079, "grad_norm": 0.08564139902591705, "learning_rate": 0.00011387135759366187, "loss": 1.0767670631408692, "mean_token_accuracy": 0.7479040876030922, "num_tokens": 14436961.0, "step": 3600 }, { "entropy": 0.8358541168272495, "epoch": 1.381553769613471, "grad_norm": 0.09847365319728851, "learning_rate": 0.00011360279307103532, "loss": 0.8758580207824707, "mean_token_accuracy": 0.7964837267994881, "num_tokens": 14472251.0, "step": 3610 }, { "entropy": 0.8302674755454064, "epoch": 1.3853807883658629, "grad_norm": 0.08570406585931778, "learning_rate": 0.00011333422854840876, "loss": 0.9068514823913574, "mean_token_accuracy": 0.7943103745579719, "num_tokens": 14509818.0, "step": 3620 }, { "entropy": 0.9825982883572578, "epoch": 1.3892078071182548, "grad_norm": 0.10844281315803528, "learning_rate": 0.0001130656640257822, "loss": 1.0484787940979003, "mean_token_accuracy": 0.7600376740097999, "num_tokens": 14553567.0, "step": 3630 }, { "entropy": 1.0431513242423534, "epoch": 1.3930348258706466, "grad_norm": 0.0750717744231224, "learning_rate": 0.00011279709950315564, "loss": 1.0337225914001464, "mean_token_accuracy": 0.7504511162638664, "num_tokens": 14598239.0, "step": 3640 }, { "entropy": 0.9319969929754734, "epoch": 1.3968618446230385, "grad_norm": 0.08307385444641113, "learning_rate": 0.00011252853498052909, "loss": 0.9771868705749511, "mean_token_accuracy": 0.7778135031461716, "num_tokens": 14638064.0, "step": 3650 }, { "entropy": 0.9992426164448261, "epoch": 1.4006888633754304, "grad_norm": 0.09222020208835602, "learning_rate": 0.00011225997045790251, "loss": 1.0516475677490233, "mean_token_accuracy": 0.7587143570184708, "num_tokens": 14682012.0, "step": 3660 }, { "entropy": 0.9670721650123596, "epoch": 1.4045158821278223, "grad_norm": 0.09432315081357956, "learning_rate": 0.00011199140593527595, "loss": 1.0164658546447753, "mean_token_accuracy": 0.7670722231268883, "num_tokens": 14722922.0, "step": 3670 }, { "entropy": 0.9808389253914356, "epoch": 1.4083429008802142, "grad_norm": 0.08502112329006195, "learning_rate": 0.00011172284141264939, "loss": 1.0553858757019043, "mean_token_accuracy": 0.76065753698349, "num_tokens": 14765083.0, "step": 3680 }, { "entropy": 1.011240091174841, "epoch": 1.412169919632606, "grad_norm": 0.07948844134807587, "learning_rate": 0.00011145427689002284, "loss": 1.0446209907531738, "mean_token_accuracy": 0.75536377876997, "num_tokens": 14806465.0, "step": 3690 }, { "entropy": 0.911352240294218, "epoch": 1.415996938384998, "grad_norm": 0.08382374793291092, "learning_rate": 0.00011118571236739627, "loss": 0.9388965606689453, "mean_token_accuracy": 0.7807439729571343, "num_tokens": 14850133.0, "step": 3700 }, { "entropy": 0.9055514119565486, "epoch": 1.4198239571373898, "grad_norm": 0.10713934898376465, "learning_rate": 0.00011091714784476972, "loss": 0.9727254867553711, "mean_token_accuracy": 0.7801795959472656, "num_tokens": 14887327.0, "step": 3710 }, { "entropy": 0.9338000696152449, "epoch": 1.4236509758897817, "grad_norm": 0.11418487876653671, "learning_rate": 0.00011064858332214314, "loss": 0.9989487648010253, "mean_token_accuracy": 0.7747065275907516, "num_tokens": 14927730.0, "step": 3720 }, { "entropy": 0.869029226526618, "epoch": 1.4274779946421736, "grad_norm": 0.10778038948774338, "learning_rate": 0.00011038001879951659, "loss": 0.9393071174621582, "mean_token_accuracy": 0.7909289851784707, "num_tokens": 14964847.0, "step": 3730 }, { "entropy": 0.8993408516049385, "epoch": 1.4313050133945655, "grad_norm": 0.08339972048997879, "learning_rate": 0.00011011145427689002, "loss": 0.9511364936828614, "mean_token_accuracy": 0.7844893127679825, "num_tokens": 15003449.0, "step": 3740 }, { "entropy": 0.9478372372686863, "epoch": 1.4351320321469574, "grad_norm": 0.07547847181558609, "learning_rate": 0.00010984288975426347, "loss": 0.9942925453186036, "mean_token_accuracy": 0.772410535812378, "num_tokens": 15046091.0, "step": 3750 }, { "entropy": 0.8367562972009182, "epoch": 1.4389590508993493, "grad_norm": 0.06902482360601425, "learning_rate": 0.00010957432523163691, "loss": 0.8951096534729004, "mean_token_accuracy": 0.7985799089074135, "num_tokens": 15091826.0, "step": 3760 }, { "entropy": 0.9437298484146595, "epoch": 1.4427860696517412, "grad_norm": 0.10231524705886841, "learning_rate": 0.00010930576070901035, "loss": 0.9919009208679199, "mean_token_accuracy": 0.7663119360804558, "num_tokens": 15133719.0, "step": 3770 }, { "entropy": 1.0057852260768414, "epoch": 1.446613088404133, "grad_norm": 0.09349844604730606, "learning_rate": 0.0001090371961863838, "loss": 1.0667811393737794, "mean_token_accuracy": 0.757930365204811, "num_tokens": 15173670.0, "step": 3780 }, { "entropy": 0.9152357578277588, "epoch": 1.450440107156525, "grad_norm": 0.09612533450126648, "learning_rate": 0.00010876863166375722, "loss": 0.9641363143920898, "mean_token_accuracy": 0.7791497871279717, "num_tokens": 15215154.0, "step": 3790 }, { "entropy": 0.849637558311224, "epoch": 1.4542671259089168, "grad_norm": 0.07079404592514038, "learning_rate": 0.00010850006714113066, "loss": 0.8924535751342774, "mean_token_accuracy": 0.7958060145378113, "num_tokens": 15261773.0, "step": 3800 }, { "entropy": 0.9689324770122767, "epoch": 1.4580941446613087, "grad_norm": 0.10107272863388062, "learning_rate": 0.0001082315026185041, "loss": 1.000623607635498, "mean_token_accuracy": 0.7690365821123123, "num_tokens": 15295693.0, "step": 3810 }, { "entropy": 0.8926774315536022, "epoch": 1.4619211634137006, "grad_norm": 0.0883372351527214, "learning_rate": 0.00010796293809587754, "loss": 0.9312380790710449, "mean_token_accuracy": 0.7839185446500778, "num_tokens": 15332324.0, "step": 3820 }, { "entropy": 0.9962236389517785, "epoch": 1.4657481821660925, "grad_norm": 0.09174945950508118, "learning_rate": 0.00010769437357325099, "loss": 1.0419865608215333, "mean_token_accuracy": 0.7592507138848305, "num_tokens": 15370812.0, "step": 3830 }, { "entropy": 1.0249286435544491, "epoch": 1.4695752009184844, "grad_norm": 0.07152284681797028, "learning_rate": 0.00010742580905062442, "loss": 1.0437363624572753, "mean_token_accuracy": 0.7567671984434128, "num_tokens": 15417719.0, "step": 3840 }, { "entropy": 0.903605168312788, "epoch": 1.4734022196708763, "grad_norm": 0.09400783479213715, "learning_rate": 0.00010715724452799784, "loss": 0.9410040855407715, "mean_token_accuracy": 0.7839412048459053, "num_tokens": 15455856.0, "step": 3850 }, { "entropy": 1.0259956195950508, "epoch": 1.4772292384232681, "grad_norm": 0.08671914041042328, "learning_rate": 0.00010688868000537129, "loss": 1.1025453567504884, "mean_token_accuracy": 0.7507242172956466, "num_tokens": 15492109.0, "step": 3860 }, { "entropy": 0.9178053669631481, "epoch": 1.48105625717566, "grad_norm": 0.07717446982860565, "learning_rate": 0.00010662011548274474, "loss": 0.96353178024292, "mean_token_accuracy": 0.7797438561916351, "num_tokens": 15532130.0, "step": 3870 }, { "entropy": 0.9423278756439686, "epoch": 1.484883275928052, "grad_norm": 0.11039029061794281, "learning_rate": 0.00010635155096011817, "loss": 0.979669189453125, "mean_token_accuracy": 0.7755513936281204, "num_tokens": 15575609.0, "step": 3880 }, { "entropy": 0.8999218411743641, "epoch": 1.4887102946804438, "grad_norm": 0.08974706381559372, "learning_rate": 0.00010608298643749162, "loss": 0.9477033615112305, "mean_token_accuracy": 0.7822227850556374, "num_tokens": 15621264.0, "step": 3890 }, { "entropy": 0.8756623603403568, "epoch": 1.4925373134328357, "grad_norm": 0.10864510387182236, "learning_rate": 0.00010581442191486505, "loss": 0.9711783409118653, "mean_token_accuracy": 0.7893951386213303, "num_tokens": 15656959.0, "step": 3900 }, { "entropy": 0.951158057898283, "epoch": 1.4963643321852276, "grad_norm": 0.09398993104696274, "learning_rate": 0.0001055458573922385, "loss": 1.0387070655822754, "mean_token_accuracy": 0.7698590591549873, "num_tokens": 15700293.0, "step": 3910 }, { "entropy": 0.9240442231297493, "epoch": 1.5001913509376195, "grad_norm": 0.09761729091405869, "learning_rate": 0.00010527729286961192, "loss": 0.9758125305175781, "mean_token_accuracy": 0.7737968236207962, "num_tokens": 15739304.0, "step": 3920 }, { "entropy": 0.9025500647723674, "epoch": 1.5040183696900113, "grad_norm": 0.08816131204366684, "learning_rate": 0.00010500872834698537, "loss": 0.913144302368164, "mean_token_accuracy": 0.7775477200746537, "num_tokens": 15785086.0, "step": 3930 }, { "entropy": 0.8958883471786976, "epoch": 1.5078453884424032, "grad_norm": 0.09690563380718231, "learning_rate": 0.0001047401638243588, "loss": 0.9484706878662109, "mean_token_accuracy": 0.7867727875709534, "num_tokens": 15822631.0, "step": 3940 }, { "entropy": 0.8738761503249407, "epoch": 1.5116724071947951, "grad_norm": 0.08325833082199097, "learning_rate": 0.00010447159930173225, "loss": 0.9258977890014648, "mean_token_accuracy": 0.7862061053514481, "num_tokens": 15863533.0, "step": 3950 }, { "entropy": 0.952784775942564, "epoch": 1.515499425947187, "grad_norm": 0.09089304506778717, "learning_rate": 0.0001042030347791057, "loss": 0.9893428802490234, "mean_token_accuracy": 0.769037912786007, "num_tokens": 15903798.0, "step": 3960 }, { "entropy": 0.9974973328411579, "epoch": 1.519326444699579, "grad_norm": 0.06594393402338028, "learning_rate": 0.00010393447025647913, "loss": 0.9982621192932128, "mean_token_accuracy": 0.7653156638145446, "num_tokens": 15947894.0, "step": 3970 }, { "entropy": 1.042479208856821, "epoch": 1.5231534634519708, "grad_norm": 0.09250905364751816, "learning_rate": 0.00010366590573385255, "loss": 1.0862640380859374, "mean_token_accuracy": 0.7515693128108978, "num_tokens": 15985609.0, "step": 3980 }, { "entropy": 0.869631578028202, "epoch": 1.5269804822043627, "grad_norm": 0.10154584795236588, "learning_rate": 0.000103397341211226, "loss": 0.9275701522827149, "mean_token_accuracy": 0.7910413116216659, "num_tokens": 16022339.0, "step": 3990 }, { "entropy": 0.9228729590773582, "epoch": 1.5308075009567546, "grad_norm": 0.08860265463590622, "learning_rate": 0.00010312877668859944, "loss": 1.0074289321899415, "mean_token_accuracy": 0.7770419105887413, "num_tokens": 16063778.0, "step": 4000 }, { "entropy": 0.9469372771680356, "epoch": 1.5346345197091464, "grad_norm": 0.08613952249288559, "learning_rate": 0.00010286021216597287, "loss": 1.0328418731689453, "mean_token_accuracy": 0.7784339845180511, "num_tokens": 16103389.0, "step": 4010 }, { "entropy": 0.9240258730947971, "epoch": 1.5384615384615383, "grad_norm": 0.09255630522966385, "learning_rate": 0.00010259164764334632, "loss": 0.9813838958740234, "mean_token_accuracy": 0.779928731918335, "num_tokens": 16141739.0, "step": 4020 }, { "entropy": 0.8300335463136435, "epoch": 1.5422885572139302, "grad_norm": 0.11173315346240997, "learning_rate": 0.00010232308312071977, "loss": 0.8650222778320312, "mean_token_accuracy": 0.8004546627402306, "num_tokens": 16179442.0, "step": 4030 }, { "entropy": 0.970530441403389, "epoch": 1.546115575966322, "grad_norm": 0.08758437633514404, "learning_rate": 0.0001020545185980932, "loss": 1.029263401031494, "mean_token_accuracy": 0.7669389978051185, "num_tokens": 16220502.0, "step": 4040 }, { "entropy": 0.8929917253553867, "epoch": 1.549942594718714, "grad_norm": 0.0840209424495697, "learning_rate": 0.00010178595407546662, "loss": 0.9574555397033692, "mean_token_accuracy": 0.7882895812392234, "num_tokens": 16263944.0, "step": 4050 }, { "entropy": 0.9571633011102676, "epoch": 1.5537696134711059, "grad_norm": 0.07731885462999344, "learning_rate": 0.00010151738955284007, "loss": 1.014600658416748, "mean_token_accuracy": 0.7691228404641152, "num_tokens": 16307508.0, "step": 4060 }, { "entropy": 0.9627384431660175, "epoch": 1.5575966322234978, "grad_norm": 0.09968744218349457, "learning_rate": 0.00010124882503021352, "loss": 1.0220794677734375, "mean_token_accuracy": 0.7685489565134048, "num_tokens": 16349178.0, "step": 4070 }, { "entropy": 0.8696753971278668, "epoch": 1.5614236509758896, "grad_norm": 0.08411276340484619, "learning_rate": 0.00010098026050758695, "loss": 0.9325771331787109, "mean_token_accuracy": 0.7903442814946174, "num_tokens": 16390375.0, "step": 4080 }, { "entropy": 0.8790203854441643, "epoch": 1.5652506697282815, "grad_norm": 0.0969686210155487, "learning_rate": 0.0001007116959849604, "loss": 0.9325167655944824, "mean_token_accuracy": 0.7890133559703827, "num_tokens": 16429198.0, "step": 4090 }, { "entropy": 0.9447548128664494, "epoch": 1.5690776884806734, "grad_norm": 0.07992373406887054, "learning_rate": 0.00010044313146233384, "loss": 0.9708291053771972, "mean_token_accuracy": 0.7737105548381805, "num_tokens": 16472336.0, "step": 4100 }, { "entropy": 0.974559249728918, "epoch": 1.5729047072330653, "grad_norm": 0.09685226529836655, "learning_rate": 0.00010017456693970726, "loss": 1.0289334297180175, "mean_token_accuracy": 0.7674296617507934, "num_tokens": 16511109.0, "step": 4110 }, { "entropy": 0.8575489681214095, "epoch": 1.5767317259854572, "grad_norm": 0.09298260509967804, "learning_rate": 9.990600241708071e-05, "loss": 0.8897696495056152, "mean_token_accuracy": 0.7952411189675331, "num_tokens": 16552802.0, "step": 4120 }, { "entropy": 0.869475956633687, "epoch": 1.580558744737849, "grad_norm": 0.129170760512352, "learning_rate": 9.963743789445414e-05, "loss": 0.9408356666564941, "mean_token_accuracy": 0.7868246123194694, "num_tokens": 16592603.0, "step": 4130 }, { "entropy": 0.9167623318731785, "epoch": 1.584385763490241, "grad_norm": 0.08131655305624008, "learning_rate": 9.936887337182759e-05, "loss": 1.005775260925293, "mean_token_accuracy": 0.7779423877596855, "num_tokens": 16633674.0, "step": 4140 }, { "entropy": 0.9069061763584614, "epoch": 1.5882127822426328, "grad_norm": 0.07485036551952362, "learning_rate": 9.910030884920103e-05, "loss": 0.9540878295898437, "mean_token_accuracy": 0.7809736356139183, "num_tokens": 16669966.0, "step": 4150 }, { "entropy": 1.0095594763755797, "epoch": 1.5920398009950247, "grad_norm": 0.11678522825241089, "learning_rate": 9.883174432657446e-05, "loss": 1.0742655754089356, "mean_token_accuracy": 0.7636483564972878, "num_tokens": 16711538.0, "step": 4160 }, { "entropy": 0.8342153321951628, "epoch": 1.5958668197474166, "grad_norm": 0.09654127061367035, "learning_rate": 9.85631798039479e-05, "loss": 0.8637946128845215, "mean_token_accuracy": 0.7977021634578705, "num_tokens": 16746947.0, "step": 4170 }, { "entropy": 0.9147222273051738, "epoch": 1.5996938384998085, "grad_norm": 0.10032576322555542, "learning_rate": 9.829461528132134e-05, "loss": 0.9848580360412598, "mean_token_accuracy": 0.7794791385531425, "num_tokens": 16792089.0, "step": 4180 }, { "entropy": 0.9350447114557028, "epoch": 1.6035208572522004, "grad_norm": 0.11322317272424698, "learning_rate": 9.802605075869477e-05, "loss": 0.9632351875305176, "mean_token_accuracy": 0.7710213780403137, "num_tokens": 16831782.0, "step": 4190 }, { "entropy": 0.8924577154219151, "epoch": 1.6073478760045923, "grad_norm": 0.08842343091964722, "learning_rate": 9.775748623606822e-05, "loss": 0.9661048889160156, "mean_token_accuracy": 0.7863042891025543, "num_tokens": 16867851.0, "step": 4200 }, { "entropy": 0.9452814936637879, "epoch": 1.6111748947569842, "grad_norm": 0.10469862073659897, "learning_rate": 9.748892171344167e-05, "loss": 1.0315632820129395, "mean_token_accuracy": 0.769272243976593, "num_tokens": 16909819.0, "step": 4210 }, { "entropy": 0.8794655621051788, "epoch": 1.615001913509376, "grad_norm": 0.08528223633766174, "learning_rate": 9.72203571908151e-05, "loss": 0.9158189773559571, "mean_token_accuracy": 0.791112196445465, "num_tokens": 16945241.0, "step": 4220 }, { "entropy": 0.9216304633766412, "epoch": 1.618828932261768, "grad_norm": 0.07684458047151566, "learning_rate": 9.695179266818853e-05, "loss": 1.0047569274902344, "mean_token_accuracy": 0.7764274105429649, "num_tokens": 16986516.0, "step": 4230 }, { "entropy": 0.8806056842207909, "epoch": 1.6226559510141598, "grad_norm": 0.09925177693367004, "learning_rate": 9.668322814556198e-05, "loss": 0.9321705818176269, "mean_token_accuracy": 0.7873435765504837, "num_tokens": 17026974.0, "step": 4240 }, { "entropy": 1.0260133132338525, "epoch": 1.6264829697665517, "grad_norm": 0.07781514525413513, "learning_rate": 9.641466362293541e-05, "loss": 1.0732348442077637, "mean_token_accuracy": 0.755302457511425, "num_tokens": 17063628.0, "step": 4250 }, { "entropy": 0.8771878894418478, "epoch": 1.6303099885189436, "grad_norm": 0.12377400696277618, "learning_rate": 9.614609910030885e-05, "loss": 0.9051324844360351, "mean_token_accuracy": 0.7877693608403206, "num_tokens": 17102243.0, "step": 4260 }, { "entropy": 0.9575911372900009, "epoch": 1.6341370072713355, "grad_norm": 0.07953961193561554, "learning_rate": 9.58775345776823e-05, "loss": 1.0206258773803711, "mean_token_accuracy": 0.770101509988308, "num_tokens": 17143256.0, "step": 4270 }, { "entropy": 0.9909125387668609, "epoch": 1.6379640260237274, "grad_norm": 0.09304741024971008, "learning_rate": 9.560897005505573e-05, "loss": 1.043109130859375, "mean_token_accuracy": 0.7598145559430123, "num_tokens": 17188878.0, "step": 4280 }, { "entropy": 0.8626054737716913, "epoch": 1.6417910447761193, "grad_norm": 0.08982561528682709, "learning_rate": 9.534040553242916e-05, "loss": 0.9062054634094239, "mean_token_accuracy": 0.790125061571598, "num_tokens": 17224537.0, "step": 4290 }, { "entropy": 0.919727610051632, "epoch": 1.6456180635285111, "grad_norm": 0.11226653307676315, "learning_rate": 9.507184100980261e-05, "loss": 0.970013427734375, "mean_token_accuracy": 0.7747739493846894, "num_tokens": 17262347.0, "step": 4300 }, { "entropy": 1.032866196334362, "epoch": 1.649445082280903, "grad_norm": 0.09440238773822784, "learning_rate": 9.480327648717606e-05, "loss": 1.0287545204162598, "mean_token_accuracy": 0.7550160124897957, "num_tokens": 17307578.0, "step": 4310 }, { "entropy": 0.907962580025196, "epoch": 1.653272101033295, "grad_norm": 0.11395370960235596, "learning_rate": 9.453471196454948e-05, "loss": 0.9705679893493653, "mean_token_accuracy": 0.7807198286056518, "num_tokens": 17342943.0, "step": 4320 }, { "entropy": 0.8495472550392151, "epoch": 1.6570991197856868, "grad_norm": 0.07685171812772751, "learning_rate": 9.426614744192292e-05, "loss": 0.9079866409301758, "mean_token_accuracy": 0.7923004642128945, "num_tokens": 17378158.0, "step": 4330 }, { "entropy": 0.8389323726296425, "epoch": 1.6609261385380787, "grad_norm": 0.09541229903697968, "learning_rate": 9.399758291929637e-05, "loss": 0.9092423439025878, "mean_token_accuracy": 0.7946408927440644, "num_tokens": 17412703.0, "step": 4340 }, { "entropy": 0.9035130314528942, "epoch": 1.6647531572904706, "grad_norm": 0.08291888236999512, "learning_rate": 9.37290183966698e-05, "loss": 0.9255120277404785, "mean_token_accuracy": 0.7840688213706016, "num_tokens": 17456250.0, "step": 4350 }, { "entropy": 0.8917031817138195, "epoch": 1.6685801760428625, "grad_norm": 0.08787538856267929, "learning_rate": 9.346045387404324e-05, "loss": 0.9318277359008789, "mean_token_accuracy": 0.7854569494724274, "num_tokens": 17492566.0, "step": 4360 }, { "entropy": 0.8860244527459145, "epoch": 1.6724071947952543, "grad_norm": 0.10287550836801529, "learning_rate": 9.319188935141668e-05, "loss": 0.9169553756713867, "mean_token_accuracy": 0.7801365301012992, "num_tokens": 17530267.0, "step": 4370 }, { "entropy": 0.8470614090561867, "epoch": 1.6762342135476462, "grad_norm": 0.13052308559417725, "learning_rate": 9.292332482879013e-05, "loss": 0.9004100799560547, "mean_token_accuracy": 0.791596457362175, "num_tokens": 17566336.0, "step": 4380 }, { "entropy": 0.9627884522080421, "epoch": 1.6800612323000381, "grad_norm": 0.09305555373430252, "learning_rate": 9.265476030616355e-05, "loss": 0.9837147712707519, "mean_token_accuracy": 0.7687505498528481, "num_tokens": 17609294.0, "step": 4390 }, { "entropy": 0.9614691123366356, "epoch": 1.68388825105243, "grad_norm": 0.08118042349815369, "learning_rate": 9.2386195783537e-05, "loss": 1.0093948364257812, "mean_token_accuracy": 0.7686966329813003, "num_tokens": 17653408.0, "step": 4400 }, { "entropy": 0.8255576498806476, "epoch": 1.687715269804822, "grad_norm": 0.07197146117687225, "learning_rate": 9.211763126091045e-05, "loss": 0.9013225555419921, "mean_token_accuracy": 0.7994248151779175, "num_tokens": 17693303.0, "step": 4410 }, { "entropy": 0.9197361193597317, "epoch": 1.6915422885572138, "grad_norm": 0.10147208720445633, "learning_rate": 9.184906673828388e-05, "loss": 0.966912841796875, "mean_token_accuracy": 0.774210800230503, "num_tokens": 17734446.0, "step": 4420 }, { "entropy": 0.8828513637185097, "epoch": 1.6953693073096057, "grad_norm": 0.08126919716596603, "learning_rate": 9.158050221565731e-05, "loss": 0.9237348556518554, "mean_token_accuracy": 0.788974218070507, "num_tokens": 17776056.0, "step": 4430 }, { "entropy": 0.8538446951657533, "epoch": 1.6991963260619976, "grad_norm": 0.08602278679609299, "learning_rate": 9.131193769303076e-05, "loss": 0.9384878158569336, "mean_token_accuracy": 0.7924654617905617, "num_tokens": 17814560.0, "step": 4440 }, { "entropy": 0.9160130321979523, "epoch": 1.7030233448143894, "grad_norm": 0.10127890110015869, "learning_rate": 9.10433731704042e-05, "loss": 0.9924029350280762, "mean_token_accuracy": 0.7764815479516983, "num_tokens": 17852872.0, "step": 4450 }, { "entropy": 0.8855723738670349, "epoch": 1.7068503635667813, "grad_norm": 0.09295201301574707, "learning_rate": 9.077480864777763e-05, "loss": 0.9131739616394043, "mean_token_accuracy": 0.7876585990190506, "num_tokens": 17893403.0, "step": 4460 }, { "entropy": 0.8825645297765732, "epoch": 1.7106773823191732, "grad_norm": 0.1038793995976448, "learning_rate": 9.050624412515107e-05, "loss": 0.9621119499206543, "mean_token_accuracy": 0.7840767920017242, "num_tokens": 17933069.0, "step": 4470 }, { "entropy": 0.9438045337796211, "epoch": 1.714504401071565, "grad_norm": 0.08998332172632217, "learning_rate": 9.023767960252452e-05, "loss": 1.0042546272277832, "mean_token_accuracy": 0.7710625112056733, "num_tokens": 17978081.0, "step": 4480 }, { "entropy": 0.9605814971029758, "epoch": 1.718331419823957, "grad_norm": 0.0936085507273674, "learning_rate": 8.996911507989794e-05, "loss": 1.0731863021850585, "mean_token_accuracy": 0.7675610318779945, "num_tokens": 18026355.0, "step": 4490 }, { "entropy": 0.9412197135388851, "epoch": 1.7221584385763489, "grad_norm": 0.11693151295185089, "learning_rate": 8.970055055727139e-05, "loss": 1.0271482467651367, "mean_token_accuracy": 0.7749442532658577, "num_tokens": 18064648.0, "step": 4500 }, { "entropy": 0.9840309470891953, "epoch": 1.7259854573287408, "grad_norm": 0.07721691578626633, "learning_rate": 8.943198603464484e-05, "loss": 0.9978925704956054, "mean_token_accuracy": 0.7662706628441811, "num_tokens": 18104352.0, "step": 4510 }, { "entropy": 0.9122109733521938, "epoch": 1.7298124760811326, "grad_norm": 0.10790548473596573, "learning_rate": 8.916342151201827e-05, "loss": 0.9912397384643554, "mean_token_accuracy": 0.774566973745823, "num_tokens": 18145632.0, "step": 4520 }, { "entropy": 0.8214024558663369, "epoch": 1.7336394948335245, "grad_norm": 0.0873790979385376, "learning_rate": 8.88948569893917e-05, "loss": 0.9174188613891602, "mean_token_accuracy": 0.797317324578762, "num_tokens": 18182216.0, "step": 4530 }, { "entropy": 0.8851194910705089, "epoch": 1.7374665135859164, "grad_norm": 0.08441472053527832, "learning_rate": 8.862629246676515e-05, "loss": 0.9345614433288574, "mean_token_accuracy": 0.7909206628799439, "num_tokens": 18220152.0, "step": 4540 }, { "entropy": 0.9045546390116215, "epoch": 1.7412935323383083, "grad_norm": 0.09491857141256332, "learning_rate": 8.835772794413858e-05, "loss": 1.0261774063110352, "mean_token_accuracy": 0.7792002618312835, "num_tokens": 18253615.0, "step": 4550 }, { "entropy": 0.8957971200346947, "epoch": 1.7451205510907002, "grad_norm": 0.07239943742752075, "learning_rate": 8.808916342151202e-05, "loss": 0.9220120429992675, "mean_token_accuracy": 0.7812404081225395, "num_tokens": 18292565.0, "step": 4560 }, { "entropy": 0.9733762003481388, "epoch": 1.748947569843092, "grad_norm": 0.07816951721906662, "learning_rate": 8.782059889888546e-05, "loss": 1.0176166534423827, "mean_token_accuracy": 0.7636090680956841, "num_tokens": 18337951.0, "step": 4570 }, { "entropy": 0.9952755816280842, "epoch": 1.752774588595484, "grad_norm": 0.09595679491758347, "learning_rate": 8.75520343762589e-05, "loss": 1.0484466552734375, "mean_token_accuracy": 0.7624279737472535, "num_tokens": 18378541.0, "step": 4580 }, { "entropy": 0.9325974151492119, "epoch": 1.7566016073478758, "grad_norm": 0.1425638496875763, "learning_rate": 8.728346985363234e-05, "loss": 1.007568645477295, "mean_token_accuracy": 0.7753438904881478, "num_tokens": 18416147.0, "step": 4590 }, { "entropy": 0.8879670143127442, "epoch": 1.7604286261002677, "grad_norm": 0.08936052024364471, "learning_rate": 8.701490533100578e-05, "loss": 0.9574133872985839, "mean_token_accuracy": 0.7878236457705498, "num_tokens": 18452658.0, "step": 4600 }, { "entropy": 0.9596087213605642, "epoch": 1.7642556448526596, "grad_norm": 0.08222804218530655, "learning_rate": 8.674634080837921e-05, "loss": 1.0134518623352051, "mean_token_accuracy": 0.7686690568923951, "num_tokens": 18493806.0, "step": 4610 }, { "entropy": 0.9412514306604862, "epoch": 1.7680826636050515, "grad_norm": 0.08482176810503006, "learning_rate": 8.647777628575266e-05, "loss": 0.9830768585205079, "mean_token_accuracy": 0.7789766594767571, "num_tokens": 18538740.0, "step": 4620 }, { "entropy": 0.8279416210949421, "epoch": 1.7719096823574434, "grad_norm": 0.12101086974143982, "learning_rate": 8.620921176312609e-05, "loss": 0.8422709465026855, "mean_token_accuracy": 0.8009032368659973, "num_tokens": 18579991.0, "step": 4630 }, { "entropy": 0.8889543637633324, "epoch": 1.7757367011098353, "grad_norm": 0.09586559236049652, "learning_rate": 8.594064724049954e-05, "loss": 0.9579720497131348, "mean_token_accuracy": 0.7857530102133751, "num_tokens": 18616195.0, "step": 4640 }, { "entropy": 0.9021936893463135, "epoch": 1.7795637198622272, "grad_norm": 0.0920713022351265, "learning_rate": 8.567208271787297e-05, "loss": 0.9568814277648926, "mean_token_accuracy": 0.7835437625646591, "num_tokens": 18650396.0, "step": 4650 }, { "entropy": 0.9605553701519967, "epoch": 1.783390738614619, "grad_norm": 0.0752284824848175, "learning_rate": 8.54035181952464e-05, "loss": 1.0107332229614259, "mean_token_accuracy": 0.7712536633014679, "num_tokens": 18692519.0, "step": 4660 }, { "entropy": 0.8929145928472281, "epoch": 1.787217757367011, "grad_norm": 0.08124406635761261, "learning_rate": 8.513495367261985e-05, "loss": 0.9392594337463379, "mean_token_accuracy": 0.7837390914559365, "num_tokens": 18730865.0, "step": 4670 }, { "entropy": 0.8995866551995277, "epoch": 1.7910447761194028, "grad_norm": 0.07306879013776779, "learning_rate": 8.486638914999329e-05, "loss": 0.9512563705444336, "mean_token_accuracy": 0.7803288042545319, "num_tokens": 18774851.0, "step": 4680 }, { "entropy": 0.9283428456634283, "epoch": 1.7948717948717947, "grad_norm": 0.06833672523498535, "learning_rate": 8.459782462736673e-05, "loss": 0.9614426612854003, "mean_token_accuracy": 0.7776324123144149, "num_tokens": 18815273.0, "step": 4690 }, { "entropy": 0.8980611331760884, "epoch": 1.7986988136241866, "grad_norm": 0.09426148980855942, "learning_rate": 8.432926010474017e-05, "loss": 0.9397372245788574, "mean_token_accuracy": 0.7818324938416481, "num_tokens": 18854806.0, "step": 4700 }, { "entropy": 0.9534067753702402, "epoch": 1.8025258323765785, "grad_norm": 0.11984719336032867, "learning_rate": 8.40606955821136e-05, "loss": 1.0058012008666992, "mean_token_accuracy": 0.7710662186145782, "num_tokens": 18893820.0, "step": 4710 }, { "entropy": 0.9396863542497158, "epoch": 1.8063528511289704, "grad_norm": 0.1126495823264122, "learning_rate": 8.379213105948705e-05, "loss": 0.9968315124511719, "mean_token_accuracy": 0.7748499393463135, "num_tokens": 18932078.0, "step": 4720 }, { "entropy": 1.0531147465109825, "epoch": 1.8101798698813623, "grad_norm": 0.0951380655169487, "learning_rate": 8.352356653686048e-05, "loss": 1.1058255195617677, "mean_token_accuracy": 0.7495945364236831, "num_tokens": 18974602.0, "step": 4730 }, { "entropy": 0.8520326256752014, "epoch": 1.8140068886337541, "grad_norm": 0.08623862266540527, "learning_rate": 8.325500201423391e-05, "loss": 0.8825064659118652, "mean_token_accuracy": 0.7940022364258766, "num_tokens": 19014261.0, "step": 4740 }, { "entropy": 0.979587784409523, "epoch": 1.817833907386146, "grad_norm": 0.11787699162960052, "learning_rate": 8.298643749160736e-05, "loss": 1.070664405822754, "mean_token_accuracy": 0.7620177045464516, "num_tokens": 19058223.0, "step": 4750 }, { "entropy": 0.8753061652183532, "epoch": 1.821660926138538, "grad_norm": 0.130862757563591, "learning_rate": 8.271787296898081e-05, "loss": 0.9366108894348144, "mean_token_accuracy": 0.7874863654375076, "num_tokens": 19100204.0, "step": 4760 }, { "entropy": 0.8777839131653309, "epoch": 1.8254879448909298, "grad_norm": 0.09261229634284973, "learning_rate": 8.244930844635424e-05, "loss": 0.9104420661926269, "mean_token_accuracy": 0.7895808070898056, "num_tokens": 19141588.0, "step": 4770 }, { "entropy": 0.9170226149260998, "epoch": 1.8293149636433217, "grad_norm": 0.06741383671760559, "learning_rate": 8.218074392372768e-05, "loss": 0.9543824195861816, "mean_token_accuracy": 0.7765088111162186, "num_tokens": 19182818.0, "step": 4780 }, { "entropy": 0.8602717489004135, "epoch": 1.8331419823957136, "grad_norm": 0.12861686944961548, "learning_rate": 8.191217940110112e-05, "loss": 0.926014518737793, "mean_token_accuracy": 0.7917162731289864, "num_tokens": 19217919.0, "step": 4790 }, { "entropy": 0.9471398882567883, "epoch": 1.8369690011481055, "grad_norm": 0.0744423121213913, "learning_rate": 8.164361487847456e-05, "loss": 0.9777775764465332, "mean_token_accuracy": 0.7716371163725853, "num_tokens": 19263991.0, "step": 4800 }, { "entropy": 0.8363759070634842, "epoch": 1.8407960199004973, "grad_norm": 0.08627327531576157, "learning_rate": 8.137505035584799e-05, "loss": 0.887846565246582, "mean_token_accuracy": 0.7938979223370553, "num_tokens": 19305441.0, "step": 4810 }, { "entropy": 0.9200571574270725, "epoch": 1.8446230386528892, "grad_norm": 0.08358518034219742, "learning_rate": 8.110648583322144e-05, "loss": 0.9703543663024903, "mean_token_accuracy": 0.777827826142311, "num_tokens": 19342309.0, "step": 4820 }, { "entropy": 0.9295372806489468, "epoch": 1.8484500574052811, "grad_norm": 0.0970570370554924, "learning_rate": 8.083792131059487e-05, "loss": 0.9934672355651856, "mean_token_accuracy": 0.7739364430308342, "num_tokens": 19387707.0, "step": 4830 }, { "entropy": 0.9003355488181114, "epoch": 1.852277076157673, "grad_norm": 0.09357219189405441, "learning_rate": 8.05693567879683e-05, "loss": 0.9544237136840821, "mean_token_accuracy": 0.7824135825037957, "num_tokens": 19428449.0, "step": 4840 }, { "entropy": 1.0107353992760182, "epoch": 1.856104094910065, "grad_norm": 0.08587910234928131, "learning_rate": 8.030079226534175e-05, "loss": 1.0694228172302247, "mean_token_accuracy": 0.7585012704133988, "num_tokens": 19469625.0, "step": 4850 }, { "entropy": 0.9866157718002796, "epoch": 1.8599311136624568, "grad_norm": 0.11663772910833359, "learning_rate": 8.00322277427152e-05, "loss": 1.0394890785217286, "mean_token_accuracy": 0.7632519364356994, "num_tokens": 19510480.0, "step": 4860 }, { "entropy": 0.881837759912014, "epoch": 1.8637581324148487, "grad_norm": 0.13599033653736115, "learning_rate": 7.976366322008862e-05, "loss": 0.9441938400268555, "mean_token_accuracy": 0.7848336577415467, "num_tokens": 19550305.0, "step": 4870 }, { "entropy": 0.9193019077181817, "epoch": 1.8675851511672406, "grad_norm": 0.09272989630699158, "learning_rate": 7.949509869746207e-05, "loss": 0.9862917900085449, "mean_token_accuracy": 0.7736792579293251, "num_tokens": 19588278.0, "step": 4880 }, { "entropy": 0.9219990812242032, "epoch": 1.8714121699196324, "grad_norm": 0.10006739944219589, "learning_rate": 7.922653417483551e-05, "loss": 0.9700265884399414, "mean_token_accuracy": 0.780723437666893, "num_tokens": 19625749.0, "step": 4890 }, { "entropy": 0.8564371943473816, "epoch": 1.8752391886720245, "grad_norm": 0.08216696232557297, "learning_rate": 7.895796965220895e-05, "loss": 0.9064787864685059, "mean_token_accuracy": 0.7962334454059601, "num_tokens": 19663935.0, "step": 4900 }, { "entropy": 0.9482992745935916, "epoch": 1.8790662074244164, "grad_norm": 0.06782303750514984, "learning_rate": 7.868940512958238e-05, "loss": 0.9920551300048828, "mean_token_accuracy": 0.7696940049529075, "num_tokens": 19704663.0, "step": 4910 }, { "entropy": 0.884397204965353, "epoch": 1.8828932261768083, "grad_norm": 0.06414399296045303, "learning_rate": 7.842084060695583e-05, "loss": 0.9083518981933594, "mean_token_accuracy": 0.7858098462224007, "num_tokens": 19753438.0, "step": 4920 }, { "entropy": 0.8019696604460478, "epoch": 1.8867202449292002, "grad_norm": 0.08456243574619293, "learning_rate": 7.815227608432927e-05, "loss": 0.8896969795227051, "mean_token_accuracy": 0.8053153440356254, "num_tokens": 19790404.0, "step": 4930 }, { "entropy": 0.8008564852178097, "epoch": 1.890547263681592, "grad_norm": 0.10543688386678696, "learning_rate": 7.78837115617027e-05, "loss": 0.8535223007202148, "mean_token_accuracy": 0.8059684678912162, "num_tokens": 19825645.0, "step": 4940 }, { "entropy": 0.8714719720184803, "epoch": 1.894374282433984, "grad_norm": 0.09498755633831024, "learning_rate": 7.761514703907614e-05, "loss": 0.9063860893249511, "mean_token_accuracy": 0.7914781123399734, "num_tokens": 19866092.0, "step": 4950 }, { "entropy": 0.9202240366488695, "epoch": 1.8982013011863759, "grad_norm": 0.07342597842216492, "learning_rate": 7.734658251644959e-05, "loss": 0.9767581939697265, "mean_token_accuracy": 0.7755973920226097, "num_tokens": 19907356.0, "step": 4960 }, { "entropy": 0.9477262906730175, "epoch": 1.9020283199387678, "grad_norm": 0.08742880076169968, "learning_rate": 7.707801799382302e-05, "loss": 1.0063783645629882, "mean_token_accuracy": 0.7687567621469498, "num_tokens": 19952869.0, "step": 4970 }, { "entropy": 0.977492806315422, "epoch": 1.9058553386911596, "grad_norm": 0.10321515798568726, "learning_rate": 7.680945347119645e-05, "loss": 1.0323823928833007, "mean_token_accuracy": 0.7646988987922668, "num_tokens": 19991372.0, "step": 4980 }, { "entropy": 0.7999268680810928, "epoch": 1.9096823574435515, "grad_norm": 0.08925452828407288, "learning_rate": 7.65408889485699e-05, "loss": 0.8391226768493653, "mean_token_accuracy": 0.8017501994967461, "num_tokens": 20029189.0, "step": 4990 }, { "entropy": 0.8757653787732125, "epoch": 1.9135093761959434, "grad_norm": 0.1915360540151596, "learning_rate": 7.627232442594334e-05, "loss": 0.9243562698364258, "mean_token_accuracy": 0.7841411307454109, "num_tokens": 20070611.0, "step": 5000 }, { "entropy": 0.9357082359492779, "epoch": 1.9173363949483353, "grad_norm": 0.08219558745622635, "learning_rate": 7.600375990331677e-05, "loss": 0.9772232055664063, "mean_token_accuracy": 0.7725088000297546, "num_tokens": 20110392.0, "step": 5010 }, { "entropy": 0.9191611532121897, "epoch": 1.9211634137007272, "grad_norm": 0.07629676163196564, "learning_rate": 7.573519538069022e-05, "loss": 0.9754646301269532, "mean_token_accuracy": 0.7830281540751457, "num_tokens": 20150683.0, "step": 5020 }, { "entropy": 0.9279548175632953, "epoch": 1.924990432453119, "grad_norm": 0.09845773130655289, "learning_rate": 7.546663085806366e-05, "loss": 0.9818471908569336, "mean_token_accuracy": 0.7738550245761872, "num_tokens": 20190521.0, "step": 5030 }, { "entropy": 0.9281142316758633, "epoch": 1.928817451205511, "grad_norm": 0.10571245104074478, "learning_rate": 7.519806633543708e-05, "loss": 0.999634075164795, "mean_token_accuracy": 0.7708285465836525, "num_tokens": 20230615.0, "step": 5040 }, { "entropy": 0.8793018095195293, "epoch": 1.9326444699579028, "grad_norm": 0.11255183815956116, "learning_rate": 7.492950181281053e-05, "loss": 0.9399495124816895, "mean_token_accuracy": 0.7893765285611153, "num_tokens": 20269332.0, "step": 5050 }, { "entropy": 0.8188632413744926, "epoch": 1.9364714887102947, "grad_norm": 0.08683498203754425, "learning_rate": 7.466093729018398e-05, "loss": 0.8760917663574219, "mean_token_accuracy": 0.800470444560051, "num_tokens": 20316849.0, "step": 5060 }, { "entropy": 0.9165158126503229, "epoch": 1.9402985074626866, "grad_norm": 0.12123431265354156, "learning_rate": 7.439237276755741e-05, "loss": 0.9515151023864746, "mean_token_accuracy": 0.7772148326039314, "num_tokens": 20354641.0, "step": 5070 }, { "entropy": 0.8890400048345327, "epoch": 1.9441255262150785, "grad_norm": 0.09551843255758286, "learning_rate": 7.412380824493084e-05, "loss": 0.9720385551452637, "mean_token_accuracy": 0.7855533555150032, "num_tokens": 20400703.0, "step": 5080 }, { "entropy": 0.9226945102214813, "epoch": 1.9479525449674704, "grad_norm": 0.11462504416704178, "learning_rate": 7.385524372230429e-05, "loss": 0.9757321357727051, "mean_token_accuracy": 0.7739654749631881, "num_tokens": 20442145.0, "step": 5090 }, { "entropy": 0.8108384694904089, "epoch": 1.9517795637198623, "grad_norm": 0.13017524778842926, "learning_rate": 7.358667919967772e-05, "loss": 0.8620017051696778, "mean_token_accuracy": 0.8028488114476204, "num_tokens": 20472714.0, "step": 5100 }, { "entropy": 0.9563053950667382, "epoch": 1.9556065824722542, "grad_norm": 0.10588496923446655, "learning_rate": 7.331811467705116e-05, "loss": 0.9805202484130859, "mean_token_accuracy": 0.7729632049798966, "num_tokens": 20518593.0, "step": 5110 }, { "entropy": 0.9307407476007938, "epoch": 1.959433601224646, "grad_norm": 0.09899015724658966, "learning_rate": 7.30495501544246e-05, "loss": 0.998748779296875, "mean_token_accuracy": 0.7733172833919525, "num_tokens": 20558008.0, "step": 5120 }, { "entropy": 0.9505821786820888, "epoch": 1.963260619977038, "grad_norm": 0.0943673700094223, "learning_rate": 7.278098563179804e-05, "loss": 1.0047925949096679, "mean_token_accuracy": 0.7691358909010887, "num_tokens": 20603741.0, "step": 5130 }, { "entropy": 1.04148171544075, "epoch": 1.9670876387294298, "grad_norm": 0.08869694918394089, "learning_rate": 7.251242110917149e-05, "loss": 1.0801177024841309, "mean_token_accuracy": 0.7499634683132171, "num_tokens": 20645827.0, "step": 5140 }, { "entropy": 0.7822969853878021, "epoch": 1.9709146574818217, "grad_norm": 0.0994991883635521, "learning_rate": 7.224385658654492e-05, "loss": 0.8042619705200196, "mean_token_accuracy": 0.8097834318876267, "num_tokens": 20684019.0, "step": 5150 }, { "entropy": 0.918664800748229, "epoch": 1.9747416762342136, "grad_norm": 0.11157739907503128, "learning_rate": 7.197529206391837e-05, "loss": 0.983153247833252, "mean_token_accuracy": 0.7776870116591453, "num_tokens": 20726278.0, "step": 5160 }, { "entropy": 0.911195681989193, "epoch": 1.9785686949866055, "grad_norm": 0.13472694158554077, "learning_rate": 7.17067275412918e-05, "loss": 0.9662351608276367, "mean_token_accuracy": 0.7743990138173104, "num_tokens": 20759927.0, "step": 5170 }, { "entropy": 0.8238823972642422, "epoch": 1.9823957137389974, "grad_norm": 0.08864834159612656, "learning_rate": 7.143816301866523e-05, "loss": 0.8870213508605957, "mean_token_accuracy": 0.7989589869976044, "num_tokens": 20798325.0, "step": 5180 }, { "entropy": 0.9405660286545754, "epoch": 1.9862227324913893, "grad_norm": 0.08372621983289719, "learning_rate": 7.116959849603868e-05, "loss": 0.9449873924255371, "mean_token_accuracy": 0.7792889401316643, "num_tokens": 20837136.0, "step": 5190 }, { "entropy": 0.8287422813475132, "epoch": 1.9900497512437811, "grad_norm": 0.0968240275979042, "learning_rate": 7.090103397341211e-05, "loss": 0.8873905181884766, "mean_token_accuracy": 0.7976622357964516, "num_tokens": 20877693.0, "step": 5200 }, { "entropy": 0.9188660819083452, "epoch": 1.993876769996173, "grad_norm": 0.09275626391172409, "learning_rate": 7.063246945078555e-05, "loss": 0.989016342163086, "mean_token_accuracy": 0.7755422025918961, "num_tokens": 20924885.0, "step": 5210 }, { "entropy": 0.9058490604162216, "epoch": 1.997703788748565, "grad_norm": 0.08644875138998032, "learning_rate": 7.0363904928159e-05, "loss": 0.9660470008850097, "mean_token_accuracy": 0.7761533245444298, "num_tokens": 20966342.0, "step": 5220 }, { "entropy": 0.7741431064903737, "epoch": 2.0015308075009566, "grad_norm": 0.07492107152938843, "learning_rate": 7.009534040553243e-05, "loss": 0.8241374015808105, "mean_token_accuracy": 0.8149536207318306, "num_tokens": 21004798.0, "step": 5230 }, { "entropy": 0.8813200116157531, "epoch": 2.0053578262533485, "grad_norm": 0.07805436849594116, "learning_rate": 6.982677588290588e-05, "loss": 0.921663761138916, "mean_token_accuracy": 0.7912002876400948, "num_tokens": 21049021.0, "step": 5240 }, { "entropy": 0.8896506872028113, "epoch": 2.0091848450057403, "grad_norm": 0.13928763568401337, "learning_rate": 6.955821136027931e-05, "loss": 0.9278170585632324, "mean_token_accuracy": 0.7765205070376396, "num_tokens": 21086531.0, "step": 5250 }, { "entropy": 0.9149777121841908, "epoch": 2.0130118637581322, "grad_norm": 0.06992843002080917, "learning_rate": 6.928964683765274e-05, "loss": 0.9667098045349121, "mean_token_accuracy": 0.7750229969620704, "num_tokens": 21127453.0, "step": 5260 }, { "entropy": 0.8076952576637269, "epoch": 2.016838882510524, "grad_norm": 0.12632791697978973, "learning_rate": 6.902108231502619e-05, "loss": 0.8237466812133789, "mean_token_accuracy": 0.804887568950653, "num_tokens": 21165297.0, "step": 5270 }, { "entropy": 0.8818444184958935, "epoch": 2.020665901262916, "grad_norm": 0.08924616128206253, "learning_rate": 6.875251779239962e-05, "loss": 0.9049506187438965, "mean_token_accuracy": 0.7822276562452316, "num_tokens": 21206219.0, "step": 5280 }, { "entropy": 0.7953705489635468, "epoch": 2.024492920015308, "grad_norm": 0.1111336424946785, "learning_rate": 6.848395326977307e-05, "loss": 0.8433744430541992, "mean_token_accuracy": 0.8049945279955864, "num_tokens": 21249239.0, "step": 5290 }, { "entropy": 0.904665675573051, "epoch": 2.0283199387677, "grad_norm": 0.09494993835687637, "learning_rate": 6.82153887471465e-05, "loss": 0.9693451881408691, "mean_token_accuracy": 0.779350683093071, "num_tokens": 21289639.0, "step": 5300 }, { "entropy": 0.7958274722099304, "epoch": 2.0321469575200917, "grad_norm": 0.10396509617567062, "learning_rate": 6.794682422451995e-05, "loss": 0.8559811592102051, "mean_token_accuracy": 0.8057383120059967, "num_tokens": 21329136.0, "step": 5310 }, { "entropy": 0.9416906848549843, "epoch": 2.0359739762724836, "grad_norm": 0.08166563510894775, "learning_rate": 6.767825970189338e-05, "loss": 0.9891387939453125, "mean_token_accuracy": 0.7737650781869888, "num_tokens": 21371300.0, "step": 5320 }, { "entropy": 0.9342201549559832, "epoch": 2.0398009950248754, "grad_norm": 0.09459090232849121, "learning_rate": 6.740969517926682e-05, "loss": 0.9509946823120117, "mean_token_accuracy": 0.7751364663243294, "num_tokens": 21412268.0, "step": 5330 }, { "entropy": 0.8397190041840077, "epoch": 2.0436280137772673, "grad_norm": 0.10005268454551697, "learning_rate": 6.714113065664026e-05, "loss": 0.9056560516357421, "mean_token_accuracy": 0.79336898624897, "num_tokens": 21451975.0, "step": 5340 }, { "entropy": 0.9148454248905182, "epoch": 2.047455032529659, "grad_norm": 0.10257065296173096, "learning_rate": 6.68725661340137e-05, "loss": 0.9611604690551758, "mean_token_accuracy": 0.7737416908144951, "num_tokens": 21491818.0, "step": 5350 }, { "entropy": 0.9010646104812622, "epoch": 2.051282051282051, "grad_norm": 0.11826229095458984, "learning_rate": 6.660400161138713e-05, "loss": 0.9446893692016601, "mean_token_accuracy": 0.7851994633674622, "num_tokens": 21528066.0, "step": 5360 }, { "entropy": 0.8987722039222718, "epoch": 2.055109070034443, "grad_norm": 0.10371451824903488, "learning_rate": 6.633543708876058e-05, "loss": 0.9595455169677735, "mean_token_accuracy": 0.7833559066057205, "num_tokens": 21562883.0, "step": 5370 }, { "entropy": 0.8856854721903801, "epoch": 2.058936088786835, "grad_norm": 0.1089499220252037, "learning_rate": 6.606687256613403e-05, "loss": 0.9219722747802734, "mean_token_accuracy": 0.7822227373719215, "num_tokens": 21600910.0, "step": 5380 }, { "entropy": 0.8720096081495285, "epoch": 2.0627631075392268, "grad_norm": 0.09962328523397446, "learning_rate": 6.579830804350745e-05, "loss": 0.9654089927673339, "mean_token_accuracy": 0.7856920391321183, "num_tokens": 21640445.0, "step": 5390 }, { "entropy": 0.9440382812172174, "epoch": 2.0665901262916186, "grad_norm": 0.08670477569103241, "learning_rate": 6.552974352088089e-05, "loss": 0.9934238433837891, "mean_token_accuracy": 0.7687147289514542, "num_tokens": 21682432.0, "step": 5400 }, { "entropy": 0.774172055721283, "epoch": 2.0704171450440105, "grad_norm": 0.11862040311098099, "learning_rate": 6.526117899825434e-05, "loss": 0.8106603622436523, "mean_token_accuracy": 0.8135839134454728, "num_tokens": 21721359.0, "step": 5410 }, { "entropy": 0.9194908868521452, "epoch": 2.0742441637964024, "grad_norm": 0.10227365791797638, "learning_rate": 6.499261447562777e-05, "loss": 0.9410523414611817, "mean_token_accuracy": 0.7788734346628189, "num_tokens": 21763700.0, "step": 5420 }, { "entropy": 0.7955736435949803, "epoch": 2.0780711825487943, "grad_norm": 0.09657785296440125, "learning_rate": 6.472404995300121e-05, "loss": 0.8665301322937011, "mean_token_accuracy": 0.8067882195115089, "num_tokens": 21804190.0, "step": 5430 }, { "entropy": 0.8065498791635036, "epoch": 2.081898201301186, "grad_norm": 0.11568085849285126, "learning_rate": 6.445548543037465e-05, "loss": 0.8515932083129882, "mean_token_accuracy": 0.8035058185458184, "num_tokens": 21839801.0, "step": 5440 }, { "entropy": 0.9087674509733915, "epoch": 2.085725220053578, "grad_norm": 0.09318574517965317, "learning_rate": 6.418692090774809e-05, "loss": 0.9387861251831054, "mean_token_accuracy": 0.77939523011446, "num_tokens": 21877125.0, "step": 5450 }, { "entropy": 0.86418566852808, "epoch": 2.08955223880597, "grad_norm": 0.08796729892492294, "learning_rate": 6.391835638512152e-05, "loss": 0.9152085304260253, "mean_token_accuracy": 0.7899368211627007, "num_tokens": 21921493.0, "step": 5460 }, { "entropy": 0.8593201294541359, "epoch": 2.093379257558362, "grad_norm": 0.14465564489364624, "learning_rate": 6.364979186249497e-05, "loss": 0.8955412864685058, "mean_token_accuracy": 0.7898772984743119, "num_tokens": 21961188.0, "step": 5470 }, { "entropy": 0.8998314358294011, "epoch": 2.0972062763107537, "grad_norm": 0.11634784191846848, "learning_rate": 6.338122733986842e-05, "loss": 0.9114861488342285, "mean_token_accuracy": 0.7838647082448006, "num_tokens": 22001738.0, "step": 5480 }, { "entropy": 0.8693659231066704, "epoch": 2.1010332950631456, "grad_norm": 0.11536803841590881, "learning_rate": 6.311266281724184e-05, "loss": 0.9232154846191406, "mean_token_accuracy": 0.7881089702248574, "num_tokens": 22039626.0, "step": 5490 }, { "entropy": 0.9556272588670254, "epoch": 2.1048603138155375, "grad_norm": 0.09614596515893936, "learning_rate": 6.284409829461528e-05, "loss": 1.0266177177429199, "mean_token_accuracy": 0.7646962344646454, "num_tokens": 22081971.0, "step": 5500 }, { "entropy": 0.7735307298600673, "epoch": 2.1086873325679294, "grad_norm": 0.10002073645591736, "learning_rate": 6.257553377198873e-05, "loss": 0.8011887550354004, "mean_token_accuracy": 0.8088100135326386, "num_tokens": 22117897.0, "step": 5510 }, { "entropy": 0.8981072999536991, "epoch": 2.1125143513203213, "grad_norm": 0.10524707287549973, "learning_rate": 6.230696924936216e-05, "loss": 0.9659936904907227, "mean_token_accuracy": 0.7843907788395882, "num_tokens": 22161049.0, "step": 5520 }, { "entropy": 0.8891891561448574, "epoch": 2.116341370072713, "grad_norm": 0.10095740854740143, "learning_rate": 6.20384047267356e-05, "loss": 0.9199987411499023, "mean_token_accuracy": 0.7833669245243072, "num_tokens": 22201183.0, "step": 5530 }, { "entropy": 0.9359986830502749, "epoch": 2.120168388825105, "grad_norm": 0.08723930269479752, "learning_rate": 6.176984020410904e-05, "loss": 0.9635790824890137, "mean_token_accuracy": 0.7724878415465355, "num_tokens": 22240779.0, "step": 5540 }, { "entropy": 0.8017430886626243, "epoch": 2.123995407577497, "grad_norm": 0.10579924285411835, "learning_rate": 6.150127568148249e-05, "loss": 0.842125129699707, "mean_token_accuracy": 0.8020379558205605, "num_tokens": 22279289.0, "step": 5550 }, { "entropy": 0.7666160762310028, "epoch": 2.127822426329889, "grad_norm": 0.09871628880500793, "learning_rate": 6.123271115885591e-05, "loss": 0.8378163337707519, "mean_token_accuracy": 0.8119754999876022, "num_tokens": 22316715.0, "step": 5560 }, { "entropy": 0.9505756117403508, "epoch": 2.1316494450822807, "grad_norm": 0.11093632131814957, "learning_rate": 6.096414663622936e-05, "loss": 0.9677371025085449, "mean_token_accuracy": 0.7698320209980011, "num_tokens": 22360112.0, "step": 5570 }, { "entropy": 0.7982158973813057, "epoch": 2.1354764638346726, "grad_norm": 0.11260368674993515, "learning_rate": 6.06955821136028e-05, "loss": 0.8571239471435547, "mean_token_accuracy": 0.804571321606636, "num_tokens": 22399114.0, "step": 5580 }, { "entropy": 0.8869463637471199, "epoch": 2.1393034825870645, "grad_norm": 0.08550643920898438, "learning_rate": 6.042701759097623e-05, "loss": 0.9476675033569336, "mean_token_accuracy": 0.7807673364877701, "num_tokens": 22440187.0, "step": 5590 }, { "entropy": 0.9491269618272782, "epoch": 2.1431305013394564, "grad_norm": 0.09019884467124939, "learning_rate": 6.015845306834967e-05, "loss": 1.0232599258422852, "mean_token_accuracy": 0.7681664958596229, "num_tokens": 22479682.0, "step": 5600 }, { "entropy": 0.8861779697239399, "epoch": 2.1469575200918483, "grad_norm": 0.11756031215190887, "learning_rate": 5.988988854572312e-05, "loss": 0.9251557350158691, "mean_token_accuracy": 0.7849425792694091, "num_tokens": 22520352.0, "step": 5610 }, { "entropy": 0.8735060147941113, "epoch": 2.15078453884424, "grad_norm": 0.0996679812669754, "learning_rate": 5.9621324023096546e-05, "loss": 0.9677264213562011, "mean_token_accuracy": 0.7881714150309562, "num_tokens": 22561677.0, "step": 5620 }, { "entropy": 0.991636025160551, "epoch": 2.154611557596632, "grad_norm": 0.10682649165391922, "learning_rate": 5.935275950046999e-05, "loss": 1.050811195373535, "mean_token_accuracy": 0.7574850931763649, "num_tokens": 22609671.0, "step": 5630 }, { "entropy": 0.9028345100581646, "epoch": 2.158438576349024, "grad_norm": 0.11249802261590958, "learning_rate": 5.908419497784343e-05, "loss": 0.9876343727111816, "mean_token_accuracy": 0.783162035048008, "num_tokens": 22650924.0, "step": 5640 }, { "entropy": 0.868353420495987, "epoch": 2.162265595101416, "grad_norm": 0.08846433460712433, "learning_rate": 5.8815630455216867e-05, "loss": 0.9271388053894043, "mean_token_accuracy": 0.7898381799459457, "num_tokens": 22691550.0, "step": 5650 }, { "entropy": 0.9247912406921387, "epoch": 2.1660926138538077, "grad_norm": 0.10013602674007416, "learning_rate": 5.854706593259031e-05, "loss": 1.0093653678894043, "mean_token_accuracy": 0.7723490744829178, "num_tokens": 22728956.0, "step": 5660 }, { "entropy": 0.82930968105793, "epoch": 2.1699196326061996, "grad_norm": 0.11004043370485306, "learning_rate": 5.827850140996375e-05, "loss": 0.8801467895507813, "mean_token_accuracy": 0.798722094297409, "num_tokens": 22765064.0, "step": 5670 }, { "entropy": 0.8950945638120175, "epoch": 2.1737466513585915, "grad_norm": 0.09994686394929886, "learning_rate": 5.800993688733719e-05, "loss": 0.9781051635742187, "mean_token_accuracy": 0.7849533364176751, "num_tokens": 22802213.0, "step": 5680 }, { "entropy": 0.8847132481634616, "epoch": 2.1775736701109834, "grad_norm": 0.09891512989997864, "learning_rate": 5.774137236471062e-05, "loss": 0.9338027954101562, "mean_token_accuracy": 0.7867394030094147, "num_tokens": 22839400.0, "step": 5690 }, { "entropy": 0.8212509788572788, "epoch": 2.1814006888633752, "grad_norm": 0.10451705008745193, "learning_rate": 5.747280784208406e-05, "loss": 0.8740688323974609, "mean_token_accuracy": 0.7968196496367455, "num_tokens": 22877771.0, "step": 5700 }, { "entropy": 0.7856742814183235, "epoch": 2.185227707615767, "grad_norm": 0.09351614862680435, "learning_rate": 5.720424331945751e-05, "loss": 0.8385543823242188, "mean_token_accuracy": 0.8064358577132225, "num_tokens": 22916159.0, "step": 5710 }, { "entropy": 0.9431014984846116, "epoch": 2.189054726368159, "grad_norm": 0.09432144463062286, "learning_rate": 5.6935678796830935e-05, "loss": 1.0021851539611817, "mean_token_accuracy": 0.7693860113620759, "num_tokens": 22958014.0, "step": 5720 }, { "entropy": 0.9080683786422015, "epoch": 2.192881745120551, "grad_norm": 0.08724278956651688, "learning_rate": 5.666711427420438e-05, "loss": 0.9878963470458985, "mean_token_accuracy": 0.7802156403660774, "num_tokens": 23003222.0, "step": 5730 }, { "entropy": 0.8772326201200485, "epoch": 2.196708763872943, "grad_norm": 0.1096489354968071, "learning_rate": 5.639854975157782e-05, "loss": 0.9326786041259766, "mean_token_accuracy": 0.7881689593195915, "num_tokens": 23039512.0, "step": 5740 }, { "entropy": 0.9084336057305336, "epoch": 2.2005357826253347, "grad_norm": 0.11137977987527847, "learning_rate": 5.6129985228951256e-05, "loss": 0.9574773788452149, "mean_token_accuracy": 0.7860094889998436, "num_tokens": 23078238.0, "step": 5750 }, { "entropy": 0.836103780195117, "epoch": 2.2043628013777266, "grad_norm": 0.11038387566804886, "learning_rate": 5.5861420706324696e-05, "loss": 0.88037109375, "mean_token_accuracy": 0.7916925936937332, "num_tokens": 23121089.0, "step": 5760 }, { "entropy": 0.9425606489181518, "epoch": 2.2081898201301184, "grad_norm": 0.10270453989505768, "learning_rate": 5.5592856183698137e-05, "loss": 0.983431339263916, "mean_token_accuracy": 0.7715479463338852, "num_tokens": 23158047.0, "step": 5770 }, { "entropy": 0.8212515480816365, "epoch": 2.2120168388825103, "grad_norm": 0.0880119651556015, "learning_rate": 5.532429166107157e-05, "loss": 0.887947940826416, "mean_token_accuracy": 0.7997770145535469, "num_tokens": 23204019.0, "step": 5780 }, { "entropy": 0.8668085850775242, "epoch": 2.215843857634902, "grad_norm": 0.11390146613121033, "learning_rate": 5.505572713844501e-05, "loss": 0.9010316848754882, "mean_token_accuracy": 0.7880747586488723, "num_tokens": 23241922.0, "step": 5790 }, { "entropy": 0.7907863073050976, "epoch": 2.219670876387294, "grad_norm": 0.11713080108165741, "learning_rate": 5.478716261581846e-05, "loss": 0.8595284461975098, "mean_token_accuracy": 0.8068661123514176, "num_tokens": 23280534.0, "step": 5800 }, { "entropy": 0.8358560226857662, "epoch": 2.223497895139686, "grad_norm": 0.11117064207792282, "learning_rate": 5.45185980931919e-05, "loss": 0.8745571136474609, "mean_token_accuracy": 0.793362820148468, "num_tokens": 23323119.0, "step": 5810 }, { "entropy": 0.8238232973963022, "epoch": 2.227324913892078, "grad_norm": 0.13185663521289825, "learning_rate": 5.425003357056533e-05, "loss": 0.8659845352172851, "mean_token_accuracy": 0.8025152862071991, "num_tokens": 23363749.0, "step": 5820 }, { "entropy": 0.8596846207976341, "epoch": 2.2311519326444698, "grad_norm": 0.09360291808843613, "learning_rate": 5.398146904793877e-05, "loss": 0.9118245124816895, "mean_token_accuracy": 0.7882251426577568, "num_tokens": 23402886.0, "step": 5830 }, { "entropy": 0.8035648860037327, "epoch": 2.2349789513968616, "grad_norm": 0.09347285330295563, "learning_rate": 5.371290452531221e-05, "loss": 0.8725827217102051, "mean_token_accuracy": 0.8045972406864166, "num_tokens": 23442339.0, "step": 5840 }, { "entropy": 0.9175308585166931, "epoch": 2.2388059701492535, "grad_norm": 0.12336985766887665, "learning_rate": 5.3444340002685645e-05, "loss": 0.9388077735900879, "mean_token_accuracy": 0.7768721342086792, "num_tokens": 23481344.0, "step": 5850 }, { "entropy": 0.868817687779665, "epoch": 2.2426329889016454, "grad_norm": 0.10311949998140335, "learning_rate": 5.3175775480059086e-05, "loss": 0.9337680816650391, "mean_token_accuracy": 0.7877210825681686, "num_tokens": 23520637.0, "step": 5860 }, { "entropy": 0.854228886961937, "epoch": 2.2464600076540373, "grad_norm": 0.10659918189048767, "learning_rate": 5.2907210957432526e-05, "loss": 0.9077530860900879, "mean_token_accuracy": 0.7909654468297959, "num_tokens": 23559877.0, "step": 5870 }, { "entropy": 0.8457217663526535, "epoch": 2.250287026406429, "grad_norm": 0.09633689373731613, "learning_rate": 5.263864643480596e-05, "loss": 0.8785475730895996, "mean_token_accuracy": 0.7941769883036613, "num_tokens": 23597033.0, "step": 5880 }, { "entropy": 0.8822055049240589, "epoch": 2.254114045158821, "grad_norm": 0.09562286734580994, "learning_rate": 5.23700819121794e-05, "loss": 0.8851138114929199, "mean_token_accuracy": 0.7860250055789948, "num_tokens": 23634788.0, "step": 5890 }, { "entropy": 0.8556318368762732, "epoch": 2.257941063911213, "grad_norm": 0.08814764767885208, "learning_rate": 5.210151738955285e-05, "loss": 0.8866415977478027, "mean_token_accuracy": 0.7966004252433777, "num_tokens": 23673283.0, "step": 5900 }, { "entropy": 0.7395530994981527, "epoch": 2.261768082663605, "grad_norm": 0.07671936601400375, "learning_rate": 5.1832952866926274e-05, "loss": 0.7680532455444335, "mean_token_accuracy": 0.8190904691815376, "num_tokens": 23711540.0, "step": 5910 }, { "entropy": 0.8898126773536206, "epoch": 2.2655951014159967, "grad_norm": 0.06960798799991608, "learning_rate": 5.156438834429972e-05, "loss": 1.026920700073242, "mean_token_accuracy": 0.7816770374774933, "num_tokens": 23756178.0, "step": 5920 }, { "entropy": 0.8902945756912232, "epoch": 2.2694221201683886, "grad_norm": 0.1114925891160965, "learning_rate": 5.129582382167316e-05, "loss": 0.9598423957824707, "mean_token_accuracy": 0.784630736708641, "num_tokens": 23792151.0, "step": 5930 }, { "entropy": 0.8439918398857117, "epoch": 2.2732491389207805, "grad_norm": 0.16730423271656036, "learning_rate": 5.10272592990466e-05, "loss": 0.851725959777832, "mean_token_accuracy": 0.7940610617399215, "num_tokens": 23830309.0, "step": 5940 }, { "entropy": 0.9178552135825158, "epoch": 2.2770761576731724, "grad_norm": 0.16359879076480865, "learning_rate": 5.0758694776420035e-05, "loss": 0.9417426109313964, "mean_token_accuracy": 0.7781487166881561, "num_tokens": 23874638.0, "step": 5950 }, { "entropy": 0.9053961969912052, "epoch": 2.2809031764255643, "grad_norm": 0.08877693116664886, "learning_rate": 5.0490130253793475e-05, "loss": 0.9975083351135254, "mean_token_accuracy": 0.7837231978774071, "num_tokens": 23918641.0, "step": 5960 }, { "entropy": 0.8590337552130223, "epoch": 2.284730195177956, "grad_norm": 0.1032002717256546, "learning_rate": 5.022156573116692e-05, "loss": 0.8895168304443359, "mean_token_accuracy": 0.7937395930290222, "num_tokens": 23964403.0, "step": 5970 }, { "entropy": 0.8678315542638302, "epoch": 2.288557213930348, "grad_norm": 0.12054577469825745, "learning_rate": 4.9953001208540356e-05, "loss": 0.9571179389953614, "mean_token_accuracy": 0.7875312000513077, "num_tokens": 24001736.0, "step": 5980 }, { "entropy": 0.8353918489068747, "epoch": 2.29238423268274, "grad_norm": 0.1126277968287468, "learning_rate": 4.9684436685913796e-05, "loss": 0.927174186706543, "mean_token_accuracy": 0.7998543947935104, "num_tokens": 24038494.0, "step": 5990 }, { "entropy": 0.7281714532524347, "epoch": 2.296211251435132, "grad_norm": 0.09404657036066055, "learning_rate": 4.941587216328723e-05, "loss": 0.7814407825469971, "mean_token_accuracy": 0.8194777265191078, "num_tokens": 24077404.0, "step": 6000 }, { "entropy": 0.8627386562526226, "epoch": 2.3000382701875237, "grad_norm": 0.07272294908761978, "learning_rate": 4.914730764066067e-05, "loss": 0.8920239448547364, "mean_token_accuracy": 0.7905093863606453, "num_tokens": 24123483.0, "step": 6010 }, { "entropy": 0.8679380901157856, "epoch": 2.3038652889399156, "grad_norm": 0.09443669021129608, "learning_rate": 4.887874311803411e-05, "loss": 0.874543571472168, "mean_token_accuracy": 0.7891486629843711, "num_tokens": 24165215.0, "step": 6020 }, { "entropy": 0.8942526787519455, "epoch": 2.3076923076923075, "grad_norm": 0.0953405573964119, "learning_rate": 4.861017859540755e-05, "loss": 0.9304584503173828, "mean_token_accuracy": 0.7855148240923882, "num_tokens": 24204454.0, "step": 6030 }, { "entropy": 0.7896301347762347, "epoch": 2.3115193264446994, "grad_norm": 0.11093971133232117, "learning_rate": 4.834161407278099e-05, "loss": 0.8957646369934082, "mean_token_accuracy": 0.8066290900111198, "num_tokens": 24245578.0, "step": 6040 }, { "entropy": 0.9012999664992094, "epoch": 2.3153463451970913, "grad_norm": 0.09953141212463379, "learning_rate": 4.8073049550154424e-05, "loss": 0.9699124336242676, "mean_token_accuracy": 0.7792607560753823, "num_tokens": 24286627.0, "step": 6050 }, { "entropy": 0.8553815156221389, "epoch": 2.319173363949483, "grad_norm": 0.09737669676542282, "learning_rate": 4.7804485027527864e-05, "loss": 0.9319831848144531, "mean_token_accuracy": 0.7943563163280487, "num_tokens": 24326050.0, "step": 6060 }, { "entropy": 0.8088245622813701, "epoch": 2.323000382701875, "grad_norm": 0.11754145473241806, "learning_rate": 4.7535920504901305e-05, "loss": 0.8612746238708496, "mean_token_accuracy": 0.7998821645975113, "num_tokens": 24365505.0, "step": 6070 }, { "entropy": 0.8720655493438244, "epoch": 2.326827401454267, "grad_norm": 0.10582665354013443, "learning_rate": 4.726735598227474e-05, "loss": 0.9663046836853028, "mean_token_accuracy": 0.78773233294487, "num_tokens": 24403619.0, "step": 6080 }, { "entropy": 0.814146314561367, "epoch": 2.330654420206659, "grad_norm": 0.10099766403436661, "learning_rate": 4.6998791459648185e-05, "loss": 0.8403602600097656, "mean_token_accuracy": 0.8022790655493737, "num_tokens": 24441133.0, "step": 6090 }, { "entropy": 0.8325122386217118, "epoch": 2.3344814389590507, "grad_norm": 0.0968555137515068, "learning_rate": 4.673022693702162e-05, "loss": 0.8952775955200195, "mean_token_accuracy": 0.7972952157258988, "num_tokens": 24487908.0, "step": 6100 }, { "entropy": 0.8313679326325655, "epoch": 2.3383084577114426, "grad_norm": 0.09856109321117401, "learning_rate": 4.6461662414395066e-05, "loss": 0.8740328788757324, "mean_token_accuracy": 0.7973453208804131, "num_tokens": 24528859.0, "step": 6110 }, { "entropy": 0.9734285809099674, "epoch": 2.3421354764638345, "grad_norm": 0.08564373850822449, "learning_rate": 4.61930978917685e-05, "loss": 1.0028407096862793, "mean_token_accuracy": 0.761284664273262, "num_tokens": 24574604.0, "step": 6120 }, { "entropy": 0.9015337243676186, "epoch": 2.3459624952162264, "grad_norm": 0.09626568853855133, "learning_rate": 4.592453336914194e-05, "loss": 0.9965445518493652, "mean_token_accuracy": 0.7804829552769661, "num_tokens": 24615926.0, "step": 6130 }, { "entropy": 0.8764280565083027, "epoch": 2.3497895139686182, "grad_norm": 0.09104456007480621, "learning_rate": 4.565596884651538e-05, "loss": 0.9158814430236817, "mean_token_accuracy": 0.7859255224466324, "num_tokens": 24656662.0, "step": 6140 }, { "entropy": 0.8626538865268231, "epoch": 2.35361653272101, "grad_norm": 0.10454346984624863, "learning_rate": 4.5387404323888814e-05, "loss": 0.9093445777893067, "mean_token_accuracy": 0.7909897804260254, "num_tokens": 24696048.0, "step": 6150 }, { "entropy": 0.9042750746011734, "epoch": 2.357443551473402, "grad_norm": 0.09976542741060257, "learning_rate": 4.511883980126226e-05, "loss": 0.9527711868286133, "mean_token_accuracy": 0.7807446241378784, "num_tokens": 24738856.0, "step": 6160 }, { "entropy": 0.892713101953268, "epoch": 2.361270570225794, "grad_norm": 0.09778838604688644, "learning_rate": 4.4850275278635694e-05, "loss": 0.9142132759094238, "mean_token_accuracy": 0.7793798848986626, "num_tokens": 24781940.0, "step": 6170 }, { "entropy": 0.8652282394468784, "epoch": 2.365097588978186, "grad_norm": 0.13737474381923676, "learning_rate": 4.4581710756009134e-05, "loss": 0.9030959129333496, "mean_token_accuracy": 0.7882118329405785, "num_tokens": 24818476.0, "step": 6180 }, { "entropy": 0.880942365527153, "epoch": 2.3689246077305777, "grad_norm": 0.09460416436195374, "learning_rate": 4.4313146233382575e-05, "loss": 0.9684123992919922, "mean_token_accuracy": 0.7829654842615128, "num_tokens": 24856950.0, "step": 6190 }, { "entropy": 0.9563789039850235, "epoch": 2.3727516264829696, "grad_norm": 0.10954713076353073, "learning_rate": 4.404458171075601e-05, "loss": 1.029030704498291, "mean_token_accuracy": 0.7727080956101418, "num_tokens": 24895606.0, "step": 6200 }, { "entropy": 0.827500730752945, "epoch": 2.3765786452353614, "grad_norm": 0.1212112084031105, "learning_rate": 4.377601718812945e-05, "loss": 0.8650990486145019, "mean_token_accuracy": 0.7993797525763512, "num_tokens": 24932482.0, "step": 6210 }, { "entropy": 0.8221234314143657, "epoch": 2.3804056639877533, "grad_norm": 0.10023710876703262, "learning_rate": 4.350745266550289e-05, "loss": 0.8777777671813964, "mean_token_accuracy": 0.7987013593316078, "num_tokens": 24975109.0, "step": 6220 }, { "entropy": 0.8734230428934098, "epoch": 2.384232682740145, "grad_norm": 0.09403553605079651, "learning_rate": 4.323888814287633e-05, "loss": 0.8978803634643555, "mean_token_accuracy": 0.7872134670615196, "num_tokens": 25020916.0, "step": 6230 }, { "entropy": 0.9003870271146297, "epoch": 2.388059701492537, "grad_norm": 0.09854581952095032, "learning_rate": 4.297032362024977e-05, "loss": 0.9225659370422363, "mean_token_accuracy": 0.7807397484779358, "num_tokens": 25061018.0, "step": 6240 }, { "entropy": 0.8118300527334213, "epoch": 2.391886720244929, "grad_norm": 0.11139514297246933, "learning_rate": 4.27017590976232e-05, "loss": 0.8876243591308594, "mean_token_accuracy": 0.800039604306221, "num_tokens": 25097954.0, "step": 6250 }, { "entropy": 0.8419897515326739, "epoch": 2.395713738997321, "grad_norm": 0.09123879671096802, "learning_rate": 4.243319457499664e-05, "loss": 0.86744384765625, "mean_token_accuracy": 0.7919191718101501, "num_tokens": 25134260.0, "step": 6260 }, { "entropy": 0.9123246632516384, "epoch": 2.3995407577497128, "grad_norm": 0.10300562530755997, "learning_rate": 4.2164630052370084e-05, "loss": 0.9368386268615723, "mean_token_accuracy": 0.7797829449176789, "num_tokens": 25176001.0, "step": 6270 }, { "entropy": 0.9066010326147079, "epoch": 2.4033677765021046, "grad_norm": 0.10231593996286392, "learning_rate": 4.1896065529743524e-05, "loss": 0.9637252807617187, "mean_token_accuracy": 0.7807635113596916, "num_tokens": 25214450.0, "step": 6280 }, { "entropy": 0.8680018067359925, "epoch": 2.4071947952544965, "grad_norm": 0.09813899546861649, "learning_rate": 4.162750100711696e-05, "loss": 0.9405930519104004, "mean_token_accuracy": 0.7862071350216866, "num_tokens": 25249019.0, "step": 6290 }, { "entropy": 0.8444254245609045, "epoch": 2.4110218140068884, "grad_norm": 0.09815159440040588, "learning_rate": 4.1358936484490404e-05, "loss": 0.9015726089477539, "mean_token_accuracy": 0.7970604464411736, "num_tokens": 25287466.0, "step": 6300 }, { "entropy": 0.9179269846528768, "epoch": 2.4148488327592803, "grad_norm": 0.1013285368680954, "learning_rate": 4.109037196186384e-05, "loss": 0.9629206657409668, "mean_token_accuracy": 0.7756785362958908, "num_tokens": 25325488.0, "step": 6310 }, { "entropy": 0.8627055402845144, "epoch": 2.418675851511672, "grad_norm": 0.09085863828659058, "learning_rate": 4.082180743923728e-05, "loss": 0.8825644493103028, "mean_token_accuracy": 0.7927587017416954, "num_tokens": 25362470.0, "step": 6320 }, { "entropy": 0.8909512132406234, "epoch": 2.422502870264064, "grad_norm": 0.12609654664993286, "learning_rate": 4.055324291661072e-05, "loss": 0.9005517959594727, "mean_token_accuracy": 0.784729179739952, "num_tokens": 25405399.0, "step": 6330 }, { "entropy": 0.8371693149209023, "epoch": 2.426329889016456, "grad_norm": 0.09511356055736542, "learning_rate": 4.028467839398415e-05, "loss": 0.8819235801696778, "mean_token_accuracy": 0.7933985084295273, "num_tokens": 25443537.0, "step": 6340 }, { "entropy": 0.8452706336975098, "epoch": 2.430156907768848, "grad_norm": 0.08440756797790527, "learning_rate": 4.00161138713576e-05, "loss": 0.9220956802368164, "mean_token_accuracy": 0.791832709312439, "num_tokens": 25482874.0, "step": 6350 }, { "entropy": 0.8533206440508365, "epoch": 2.4339839265212397, "grad_norm": 0.10529948770999908, "learning_rate": 3.974754934873103e-05, "loss": 0.8976041793823242, "mean_token_accuracy": 0.7917203813791275, "num_tokens": 25523091.0, "step": 6360 }, { "entropy": 0.8192368470132351, "epoch": 2.4378109452736316, "grad_norm": 0.08338342607021332, "learning_rate": 3.947898482610447e-05, "loss": 0.8657890319824219, "mean_token_accuracy": 0.8002077579498291, "num_tokens": 25566050.0, "step": 6370 }, { "entropy": 0.9303523369133473, "epoch": 2.4416379640260235, "grad_norm": 0.09010683745145798, "learning_rate": 3.921042030347791e-05, "loss": 0.9760264396667481, "mean_token_accuracy": 0.7748634815216064, "num_tokens": 25608936.0, "step": 6380 }, { "entropy": 0.7555282160639762, "epoch": 2.4454649827784154, "grad_norm": 0.11948851495981216, "learning_rate": 3.894185578085135e-05, "loss": 0.8005829811096191, "mean_token_accuracy": 0.8136610746383667, "num_tokens": 25647408.0, "step": 6390 }, { "entropy": 0.8959879912436008, "epoch": 2.4492920015308073, "grad_norm": 0.09189214557409286, "learning_rate": 3.8673291258224794e-05, "loss": 0.9070920944213867, "mean_token_accuracy": 0.7838554188609124, "num_tokens": 25690271.0, "step": 6400 }, { "entropy": 0.7601668298244476, "epoch": 2.453119020283199, "grad_norm": 0.11115460842847824, "learning_rate": 3.840472673559823e-05, "loss": 0.837701416015625, "mean_token_accuracy": 0.8158529132604599, "num_tokens": 25730098.0, "step": 6410 }, { "entropy": 0.9026189528405666, "epoch": 2.456946039035591, "grad_norm": 0.0951504036784172, "learning_rate": 3.813616221297167e-05, "loss": 0.9555998802185058, "mean_token_accuracy": 0.7768774792551995, "num_tokens": 25769649.0, "step": 6420 }, { "entropy": 0.8566267982125282, "epoch": 2.460773057787983, "grad_norm": 0.1477993279695511, "learning_rate": 3.786759769034511e-05, "loss": 0.901324462890625, "mean_token_accuracy": 0.7918707326054573, "num_tokens": 25805906.0, "step": 6430 }, { "entropy": 0.8576595298945904, "epoch": 2.464600076540375, "grad_norm": 0.08643563091754913, "learning_rate": 3.759903316771854e-05, "loss": 0.9027094841003418, "mean_token_accuracy": 0.7925754263997078, "num_tokens": 25847270.0, "step": 6440 }, { "entropy": 0.8848195761442185, "epoch": 2.4684270952927667, "grad_norm": 0.1148499846458435, "learning_rate": 3.733046864509199e-05, "loss": 0.9222222328186035, "mean_token_accuracy": 0.7866752982139588, "num_tokens": 25890454.0, "step": 6450 }, { "entropy": 0.8222585029900074, "epoch": 2.4722541140451586, "grad_norm": 0.1051439717411995, "learning_rate": 3.706190412246542e-05, "loss": 0.8674264907836914, "mean_token_accuracy": 0.8014690011739731, "num_tokens": 25927176.0, "step": 6460 }, { "entropy": 0.7895723138004541, "epoch": 2.4760811327975505, "grad_norm": 0.08904940634965897, "learning_rate": 3.679333959983886e-05, "loss": 0.8720718383789062, "mean_token_accuracy": 0.8032544136047364, "num_tokens": 25969008.0, "step": 6470 }, { "entropy": 0.8449521534144878, "epoch": 2.4799081515499424, "grad_norm": 0.09109736979007721, "learning_rate": 3.65247750772123e-05, "loss": 0.8994977951049805, "mean_token_accuracy": 0.7939551532268524, "num_tokens": 26008671.0, "step": 6480 }, { "entropy": 0.8769714809954167, "epoch": 2.4837351703023343, "grad_norm": 0.09221527725458145, "learning_rate": 3.625621055458574e-05, "loss": 0.9647493362426758, "mean_token_accuracy": 0.7877351269125938, "num_tokens": 26047583.0, "step": 6490 }, { "entropy": 0.840660959109664, "epoch": 2.487562189054726, "grad_norm": 0.0888860896229744, "learning_rate": 3.598764603195918e-05, "loss": 0.872824764251709, "mean_token_accuracy": 0.7932088255882264, "num_tokens": 26090690.0, "step": 6500 }, { "entropy": 0.9435165245085955, "epoch": 2.491389207807118, "grad_norm": 0.10055243968963623, "learning_rate": 3.571908150933262e-05, "loss": 1.008607769012451, "mean_token_accuracy": 0.7684792190790176, "num_tokens": 26134620.0, "step": 6510 }, { "entropy": 0.9596942149102687, "epoch": 2.49521622655951, "grad_norm": 0.11321604251861572, "learning_rate": 3.545051698670606e-05, "loss": 1.021597957611084, "mean_token_accuracy": 0.7706323087215423, "num_tokens": 26176850.0, "step": 6520 }, { "entropy": 0.9805667255073786, "epoch": 2.499043245311902, "grad_norm": 0.13084010779857635, "learning_rate": 3.51819524640795e-05, "loss": 1.0418537139892579, "mean_token_accuracy": 0.763472905755043, "num_tokens": 26220943.0, "step": 6530 }, { "entropy": 0.9104986634105444, "epoch": 2.5028702640642937, "grad_norm": 0.09176472574472427, "learning_rate": 3.491338794145294e-05, "loss": 0.972693920135498, "mean_token_accuracy": 0.7809211134910583, "num_tokens": 26262084.0, "step": 6540 }, { "entropy": 0.8316202580928802, "epoch": 2.5066972828166856, "grad_norm": 0.11009900271892548, "learning_rate": 3.464482341882637e-05, "loss": 0.8581557273864746, "mean_token_accuracy": 0.7978575736284256, "num_tokens": 26302790.0, "step": 6550 }, { "entropy": 0.9041007287800312, "epoch": 2.5105243015690775, "grad_norm": 0.12103740125894547, "learning_rate": 3.437625889619981e-05, "loss": 0.9546697616577149, "mean_token_accuracy": 0.7800753250718117, "num_tokens": 26347959.0, "step": 6560 }, { "entropy": 0.8139931574463845, "epoch": 2.5143513203214694, "grad_norm": 0.08679619431495667, "learning_rate": 3.410769437357325e-05, "loss": 0.8982272148132324, "mean_token_accuracy": 0.8002956256270408, "num_tokens": 26388946.0, "step": 6570 }, { "entropy": 0.838017127290368, "epoch": 2.5181783390738612, "grad_norm": 0.12066033482551575, "learning_rate": 3.383912985094669e-05, "loss": 0.8589006423950195, "mean_token_accuracy": 0.7943052783608436, "num_tokens": 26431191.0, "step": 6580 }, { "entropy": 0.8299121838063002, "epoch": 2.522005357826253, "grad_norm": 0.08988375216722488, "learning_rate": 3.357056532832013e-05, "loss": 0.9106943130493164, "mean_token_accuracy": 0.7972570925951004, "num_tokens": 26468346.0, "step": 6590 }, { "entropy": 1.0362544253468513, "epoch": 2.525832376578645, "grad_norm": 0.10034547746181488, "learning_rate": 3.3302000805693566e-05, "loss": 1.0991132736206055, "mean_token_accuracy": 0.7502188056707382, "num_tokens": 26508029.0, "step": 6600 }, { "entropy": 0.9098232574760914, "epoch": 2.529659395331037, "grad_norm": 0.12513861060142517, "learning_rate": 3.303343628306701e-05, "loss": 0.9807866096496582, "mean_token_accuracy": 0.7815383434295654, "num_tokens": 26549321.0, "step": 6610 }, { "entropy": 0.8234303712844848, "epoch": 2.533486414083429, "grad_norm": 0.08378947526216507, "learning_rate": 3.2764871760440446e-05, "loss": 0.8650754928588867, "mean_token_accuracy": 0.7995569303631782, "num_tokens": 26589472.0, "step": 6620 }, { "entropy": 0.769949347153306, "epoch": 2.5373134328358207, "grad_norm": 0.12056911736726761, "learning_rate": 3.249630723781389e-05, "loss": 0.8480927467346191, "mean_token_accuracy": 0.818176555633545, "num_tokens": 26627566.0, "step": 6630 }, { "entropy": 0.8099306054413319, "epoch": 2.5411404515882126, "grad_norm": 0.09869939833879471, "learning_rate": 3.222774271518733e-05, "loss": 0.8649662017822266, "mean_token_accuracy": 0.7981634557247161, "num_tokens": 26662566.0, "step": 6640 }, { "entropy": 0.8528701025992632, "epoch": 2.5449674703406044, "grad_norm": 0.10336704552173615, "learning_rate": 3.195917819256076e-05, "loss": 0.9127251625061035, "mean_token_accuracy": 0.7928516089916229, "num_tokens": 26705768.0, "step": 6650 }, { "entropy": 0.8498493686318398, "epoch": 2.5487944890929963, "grad_norm": 0.10704471170902252, "learning_rate": 3.169061366993421e-05, "loss": 0.863565731048584, "mean_token_accuracy": 0.7932710304856301, "num_tokens": 26743574.0, "step": 6660 }, { "entropy": 0.8566017836332321, "epoch": 2.552621507845388, "grad_norm": 0.12135261297225952, "learning_rate": 3.142204914730764e-05, "loss": 0.9187004089355468, "mean_token_accuracy": 0.7913481816649437, "num_tokens": 26784127.0, "step": 6670 }, { "entropy": 0.8302055161446333, "epoch": 2.55644852659778, "grad_norm": 0.1430647373199463, "learning_rate": 3.115348462468108e-05, "loss": 0.8857596397399903, "mean_token_accuracy": 0.7965412393212319, "num_tokens": 26823189.0, "step": 6680 }, { "entropy": 0.8327139757573605, "epoch": 2.560275545350172, "grad_norm": 0.09538804739713669, "learning_rate": 3.088492010205452e-05, "loss": 0.9255412101745606, "mean_token_accuracy": 0.7939359977841377, "num_tokens": 26861599.0, "step": 6690 }, { "entropy": 0.8530606523156166, "epoch": 2.564102564102564, "grad_norm": 0.09193538129329681, "learning_rate": 3.0616355579427955e-05, "loss": 0.9151040077209472, "mean_token_accuracy": 0.7901859179139137, "num_tokens": 26901064.0, "step": 6700 }, { "entropy": 0.794033832848072, "epoch": 2.5679295828549558, "grad_norm": 0.1283407062292099, "learning_rate": 3.03477910568014e-05, "loss": 0.8441056251525879, "mean_token_accuracy": 0.8033816903829575, "num_tokens": 26942161.0, "step": 6710 }, { "entropy": 0.9340717010200024, "epoch": 2.5717566016073476, "grad_norm": 0.09237734973430634, "learning_rate": 3.0079226534174836e-05, "loss": 0.9747485160827637, "mean_token_accuracy": 0.7732965379953385, "num_tokens": 26982759.0, "step": 6720 }, { "entropy": 0.8746799558401108, "epoch": 2.5755836203597395, "grad_norm": 0.1391710638999939, "learning_rate": 2.9810662011548273e-05, "loss": 0.9311764717102051, "mean_token_accuracy": 0.7883311554789543, "num_tokens": 27022926.0, "step": 6730 }, { "entropy": 0.8290158938616514, "epoch": 2.5794106391121314, "grad_norm": 0.10442391782999039, "learning_rate": 2.9542097488921716e-05, "loss": 0.8346040725708008, "mean_token_accuracy": 0.7985544398427009, "num_tokens": 27065028.0, "step": 6740 }, { "entropy": 0.8574424415826798, "epoch": 2.5832376578645233, "grad_norm": 0.13001689314842224, "learning_rate": 2.9273532966295153e-05, "loss": 0.906099510192871, "mean_token_accuracy": 0.790866918861866, "num_tokens": 27100867.0, "step": 6750 }, { "entropy": 0.840974472463131, "epoch": 2.587064676616915, "grad_norm": 0.1224556565284729, "learning_rate": 2.9004968443668594e-05, "loss": 0.8969048500061035, "mean_token_accuracy": 0.7953185483813285, "num_tokens": 27137338.0, "step": 6760 }, { "entropy": 0.8477607406675816, "epoch": 2.590891695369307, "grad_norm": 0.09641005098819733, "learning_rate": 2.873640392104203e-05, "loss": 0.9569526672363281, "mean_token_accuracy": 0.7941199511289596, "num_tokens": 27178308.0, "step": 6770 }, { "entropy": 0.8317056275904179, "epoch": 2.594718714121699, "grad_norm": 0.11853990703821182, "learning_rate": 2.8467839398415468e-05, "loss": 0.9125295639038086, "mean_token_accuracy": 0.7960822626948356, "num_tokens": 27216898.0, "step": 6780 }, { "entropy": 0.8558823302388191, "epoch": 2.598545732874091, "grad_norm": 0.10477570444345474, "learning_rate": 2.819927487578891e-05, "loss": 0.8844131469726563, "mean_token_accuracy": 0.7940610870718956, "num_tokens": 27254443.0, "step": 6790 }, { "entropy": 0.8210954669862985, "epoch": 2.6023727516264827, "grad_norm": 0.14100609719753265, "learning_rate": 2.7930710353162348e-05, "loss": 0.8684535980224609, "mean_token_accuracy": 0.7988820597529411, "num_tokens": 27290079.0, "step": 6800 }, { "entropy": 0.8657392464578152, "epoch": 2.6061997703788746, "grad_norm": 0.09813658148050308, "learning_rate": 2.7662145830535785e-05, "loss": 0.9158803939819335, "mean_token_accuracy": 0.7908033922314643, "num_tokens": 27328190.0, "step": 6810 }, { "entropy": 0.8866597019135952, "epoch": 2.6100267891312665, "grad_norm": 0.11115613579750061, "learning_rate": 2.739358130790923e-05, "loss": 0.9120420455932617, "mean_token_accuracy": 0.7854148596525192, "num_tokens": 27369945.0, "step": 6820 }, { "entropy": 0.7982962183654309, "epoch": 2.6138538078836584, "grad_norm": 0.1377696692943573, "learning_rate": 2.7125016785282666e-05, "loss": 0.8332090377807617, "mean_token_accuracy": 0.8022376418113708, "num_tokens": 27406302.0, "step": 6830 }, { "entropy": 0.8424798093736172, "epoch": 2.6176808266360503, "grad_norm": 0.11442425101995468, "learning_rate": 2.6856452262656106e-05, "loss": 0.8876424789428711, "mean_token_accuracy": 0.7893706291913987, "num_tokens": 27449733.0, "step": 6840 }, { "entropy": 0.9239407800137996, "epoch": 2.621507845388442, "grad_norm": 0.0799759030342102, "learning_rate": 2.6587887740029543e-05, "loss": 0.9658034324645997, "mean_token_accuracy": 0.7757296651601792, "num_tokens": 27492884.0, "step": 6850 }, { "entropy": 0.8720928959548473, "epoch": 2.625334864140834, "grad_norm": 0.11632338911294937, "learning_rate": 2.631932321740298e-05, "loss": 0.9089359283447266, "mean_token_accuracy": 0.7913818553090095, "num_tokens": 27531878.0, "step": 6860 }, { "entropy": 0.9302754916250706, "epoch": 2.629161882893226, "grad_norm": 0.11215951293706894, "learning_rate": 2.6050758694776423e-05, "loss": 1.0027677536010742, "mean_token_accuracy": 0.7739164605736732, "num_tokens": 27567970.0, "step": 6870 }, { "entropy": 0.9016003269702196, "epoch": 2.632988901645618, "grad_norm": 0.11951353400945663, "learning_rate": 2.578219417214986e-05, "loss": 0.9493217468261719, "mean_token_accuracy": 0.7779877439141274, "num_tokens": 27609840.0, "step": 6880 }, { "entropy": 0.8870487026870251, "epoch": 2.6368159203980097, "grad_norm": 0.1124744564294815, "learning_rate": 2.55136296495233e-05, "loss": 1.0031387329101562, "mean_token_accuracy": 0.7866110280156136, "num_tokens": 27649655.0, "step": 6890 }, { "entropy": 0.9296976864337921, "epoch": 2.6406429391504016, "grad_norm": 0.1161704882979393, "learning_rate": 2.5245065126896738e-05, "loss": 1.012251853942871, "mean_token_accuracy": 0.7726465791463852, "num_tokens": 27694105.0, "step": 6900 }, { "entropy": 0.8415393102914095, "epoch": 2.6444699579027935, "grad_norm": 0.0987096056342125, "learning_rate": 2.4976500604270178e-05, "loss": 0.9147520065307617, "mean_token_accuracy": 0.7973951831459999, "num_tokens": 27730663.0, "step": 6910 }, { "entropy": 0.8274203538894653, "epoch": 2.6482969766551854, "grad_norm": 0.1101188212633133, "learning_rate": 2.4707936081643615e-05, "loss": 0.8873770713806153, "mean_token_accuracy": 0.7974746853113175, "num_tokens": 27772881.0, "step": 6920 }, { "entropy": 0.7984559834003448, "epoch": 2.6521239954075773, "grad_norm": 0.10185439884662628, "learning_rate": 2.4439371559017055e-05, "loss": 0.8775921821594238, "mean_token_accuracy": 0.807880648970604, "num_tokens": 27809534.0, "step": 6930 }, { "entropy": 0.887981615960598, "epoch": 2.655951014159969, "grad_norm": 0.08309295773506165, "learning_rate": 2.4170807036390495e-05, "loss": 0.9466443061828613, "mean_token_accuracy": 0.7859978228807449, "num_tokens": 27852591.0, "step": 6940 }, { "entropy": 0.9378888584673405, "epoch": 2.659778032912361, "grad_norm": 0.136076882481575, "learning_rate": 2.3902242513763932e-05, "loss": 1.0269956588745117, "mean_token_accuracy": 0.7709244459867477, "num_tokens": 27892120.0, "step": 6950 }, { "entropy": 0.9220107842236758, "epoch": 2.663605051664753, "grad_norm": 0.08248933404684067, "learning_rate": 2.363367799113737e-05, "loss": 0.9726594924926758, "mean_token_accuracy": 0.7753236919641495, "num_tokens": 27935380.0, "step": 6960 }, { "entropy": 0.7793348811566829, "epoch": 2.667432070417145, "grad_norm": 0.08308061957359314, "learning_rate": 2.336511346851081e-05, "loss": 0.7947993278503418, "mean_token_accuracy": 0.8088447406888009, "num_tokens": 27973020.0, "step": 6970 }, { "entropy": 0.9587450519204139, "epoch": 2.6712590891695367, "grad_norm": 0.10263237357139587, "learning_rate": 2.309654894588425e-05, "loss": 0.9791707038879395, "mean_token_accuracy": 0.7663016110658646, "num_tokens": 28016389.0, "step": 6980 }, { "entropy": 0.8766636185348033, "epoch": 2.6750861079219286, "grad_norm": 0.09917714446783066, "learning_rate": 2.282798442325769e-05, "loss": 0.9187355041503906, "mean_token_accuracy": 0.7864622801542283, "num_tokens": 28058100.0, "step": 6990 }, { "entropy": 0.8623256701976061, "epoch": 2.6789131266743205, "grad_norm": 0.08802894502878189, "learning_rate": 2.255941990063113e-05, "loss": 0.9108509063720703, "mean_token_accuracy": 0.7891170993447304, "num_tokens": 28095166.0, "step": 7000 }, { "entropy": 0.919238954409957, "epoch": 2.6827401454267124, "grad_norm": 0.11916540563106537, "learning_rate": 2.2290855378004567e-05, "loss": 0.9972674369812011, "mean_token_accuracy": 0.7765705808997154, "num_tokens": 28137533.0, "step": 7010 }, { "entropy": 0.918128065392375, "epoch": 2.6865671641791042, "grad_norm": 0.09536208212375641, "learning_rate": 2.2022290855378004e-05, "loss": 0.9865476608276367, "mean_token_accuracy": 0.7736267536878586, "num_tokens": 28179301.0, "step": 7020 }, { "entropy": 0.8265572734177112, "epoch": 2.690394182931496, "grad_norm": 0.09432680904865265, "learning_rate": 2.1753726332751444e-05, "loss": 0.8995939254760742, "mean_token_accuracy": 0.7947996065020562, "num_tokens": 28223849.0, "step": 7030 }, { "entropy": 0.8321899034082889, "epoch": 2.694221201683888, "grad_norm": 0.1223755031824112, "learning_rate": 2.1485161810124885e-05, "loss": 0.9003139495849609, "mean_token_accuracy": 0.7975824415683747, "num_tokens": 28268485.0, "step": 7040 }, { "entropy": 0.9064472205936909, "epoch": 2.69804822043628, "grad_norm": 0.13409113883972168, "learning_rate": 2.121659728749832e-05, "loss": 0.9323970794677734, "mean_token_accuracy": 0.7808707699179649, "num_tokens": 28307792.0, "step": 7050 }, { "entropy": 0.9527742668986321, "epoch": 2.701875239188672, "grad_norm": 0.09863030910491943, "learning_rate": 2.0948032764871762e-05, "loss": 1.0056820869445802, "mean_token_accuracy": 0.7673134744167328, "num_tokens": 28355447.0, "step": 7060 }, { "entropy": 0.8202732041478157, "epoch": 2.7057022579410637, "grad_norm": 0.10251973569393158, "learning_rate": 2.0679468242245202e-05, "loss": 0.8743599891662598, "mean_token_accuracy": 0.7957186102867126, "num_tokens": 28397195.0, "step": 7070 }, { "entropy": 0.9328485410660505, "epoch": 2.7095292766934556, "grad_norm": 0.09044504910707474, "learning_rate": 2.041090371961864e-05, "loss": 0.9707870483398438, "mean_token_accuracy": 0.7739486545324326, "num_tokens": 28440070.0, "step": 7080 }, { "entropy": 0.9110265091061592, "epoch": 2.7133562954458474, "grad_norm": 0.10417858511209488, "learning_rate": 2.0142339196992076e-05, "loss": 0.9495024681091309, "mean_token_accuracy": 0.7784481555223465, "num_tokens": 28483039.0, "step": 7090 }, { "entropy": 0.907703897356987, "epoch": 2.7171833141982393, "grad_norm": 0.10365665704011917, "learning_rate": 1.9873774674365516e-05, "loss": 0.9539920806884765, "mean_token_accuracy": 0.7803053423762322, "num_tokens": 28524922.0, "step": 7100 }, { "entropy": 0.8090648584067821, "epoch": 2.721010332950631, "grad_norm": 0.13015250861644745, "learning_rate": 1.9605210151738957e-05, "loss": 0.8559967994689941, "mean_token_accuracy": 0.7999090999364853, "num_tokens": 28565638.0, "step": 7110 }, { "entropy": 0.832624789327383, "epoch": 2.724837351703023, "grad_norm": 0.12992241978645325, "learning_rate": 1.9336645629112397e-05, "loss": 0.886108112335205, "mean_token_accuracy": 0.7986625626683235, "num_tokens": 28603666.0, "step": 7120 }, { "entropy": 0.8167526118457318, "epoch": 2.728664370455415, "grad_norm": 0.0879233330488205, "learning_rate": 1.9068081106485834e-05, "loss": 0.8744274139404297, "mean_token_accuracy": 0.8013173520565033, "num_tokens": 28647331.0, "step": 7130 }, { "entropy": 0.8693740144371986, "epoch": 2.732491389207807, "grad_norm": 0.11505398899316788, "learning_rate": 1.879951658385927e-05, "loss": 0.9142866134643555, "mean_token_accuracy": 0.7936322972178459, "num_tokens": 28683073.0, "step": 7140 }, { "entropy": 0.7896613411605358, "epoch": 2.7363184079601988, "grad_norm": 0.10490158945322037, "learning_rate": 1.853095206123271e-05, "loss": 0.8762624740600586, "mean_token_accuracy": 0.8044975116848946, "num_tokens": 28722340.0, "step": 7150 }, { "entropy": 0.8261051677167416, "epoch": 2.7401454267125906, "grad_norm": 0.10280875116586685, "learning_rate": 1.826238753860615e-05, "loss": 0.888590145111084, "mean_token_accuracy": 0.7989666223526001, "num_tokens": 28757940.0, "step": 7160 }, { "entropy": 0.8630577899515629, "epoch": 2.7439724454649825, "grad_norm": 0.12757791578769684, "learning_rate": 1.799382301597959e-05, "loss": 0.9082697868347168, "mean_token_accuracy": 0.7890564352273941, "num_tokens": 28796985.0, "step": 7170 }, { "entropy": 0.8979216992855072, "epoch": 2.7477994642173744, "grad_norm": 0.13048897683620453, "learning_rate": 1.772525849335303e-05, "loss": 0.9468406677246094, "mean_token_accuracy": 0.7829687342047691, "num_tokens": 28838091.0, "step": 7180 }, { "entropy": 0.9002114910632372, "epoch": 2.7516264829697663, "grad_norm": 0.130500927567482, "learning_rate": 1.745669397072647e-05, "loss": 0.9897032737731933, "mean_token_accuracy": 0.7817048847675323, "num_tokens": 28879084.0, "step": 7190 }, { "entropy": 0.861878028512001, "epoch": 2.755453501722158, "grad_norm": 0.10523588210344315, "learning_rate": 1.7188129448099906e-05, "loss": 0.9628341674804688, "mean_token_accuracy": 0.7882343173027039, "num_tokens": 28918018.0, "step": 7200 }, { "entropy": 0.7814029835164547, "epoch": 2.75928052047455, "grad_norm": 0.14345957338809967, "learning_rate": 1.6919564925473346e-05, "loss": 0.8377615928649902, "mean_token_accuracy": 0.8100636526942253, "num_tokens": 28953674.0, "step": 7210 }, { "entropy": 0.8798072785139084, "epoch": 2.763107539226942, "grad_norm": 0.10911094397306442, "learning_rate": 1.6651000402846783e-05, "loss": 0.9405971527099609, "mean_token_accuracy": 0.7843327835202217, "num_tokens": 28995212.0, "step": 7220 }, { "entropy": 0.7432700909674168, "epoch": 2.766934557979334, "grad_norm": 0.09271088242530823, "learning_rate": 1.6382435880220223e-05, "loss": 0.7987990856170655, "mean_token_accuracy": 0.8185402989387512, "num_tokens": 29034878.0, "step": 7230 }, { "entropy": 0.7937459200620651, "epoch": 2.7707615767317257, "grad_norm": 0.11122163385152817, "learning_rate": 1.6113871357593664e-05, "loss": 0.8469036102294922, "mean_token_accuracy": 0.8031805381178856, "num_tokens": 29074372.0, "step": 7240 }, { "entropy": 0.8456454008817673, "epoch": 2.7745885954841176, "grad_norm": 0.11189702153205872, "learning_rate": 1.5845306834967104e-05, "loss": 0.8942484855651855, "mean_token_accuracy": 0.7923400938510895, "num_tokens": 29117619.0, "step": 7250 }, { "entropy": 0.885396859049797, "epoch": 2.7784156142365095, "grad_norm": 0.10170719027519226, "learning_rate": 1.557674231234054e-05, "loss": 0.9175837516784668, "mean_token_accuracy": 0.7860854491591454, "num_tokens": 29156601.0, "step": 7260 }, { "entropy": 0.8742636401206255, "epoch": 2.7822426329889014, "grad_norm": 0.11130956560373306, "learning_rate": 1.5308177789713978e-05, "loss": 0.9322646141052247, "mean_token_accuracy": 0.7902692511677742, "num_tokens": 29200295.0, "step": 7270 }, { "entropy": 0.8523757141083479, "epoch": 2.7860696517412933, "grad_norm": 0.08611233532428741, "learning_rate": 1.5039613267087418e-05, "loss": 0.9210372924804687, "mean_token_accuracy": 0.7912763133645058, "num_tokens": 29235323.0, "step": 7280 }, { "entropy": 0.7804547689855099, "epoch": 2.789896670493685, "grad_norm": 0.08091949671506882, "learning_rate": 1.4771048744460858e-05, "loss": 0.8202395439147949, "mean_token_accuracy": 0.8117679923772811, "num_tokens": 29270182.0, "step": 7290 }, { "entropy": 0.8199648998677731, "epoch": 2.793723689246077, "grad_norm": 0.07486634701490402, "learning_rate": 1.4502484221834297e-05, "loss": 0.8396285057067872, "mean_token_accuracy": 0.8032143607735633, "num_tokens": 29311588.0, "step": 7300 }, { "entropy": 0.9650515951216221, "epoch": 2.797550707998469, "grad_norm": 0.10391585528850555, "learning_rate": 1.4233919699207734e-05, "loss": 1.047046184539795, "mean_token_accuracy": 0.7648886650800705, "num_tokens": 29353979.0, "step": 7310 }, { "entropy": 0.7674700990319252, "epoch": 2.801377726750861, "grad_norm": 0.09043332189321518, "learning_rate": 1.3965355176581174e-05, "loss": 0.8154891014099122, "mean_token_accuracy": 0.8105725541710853, "num_tokens": 29393298.0, "step": 7320 }, { "entropy": 0.7795201197266579, "epoch": 2.8052047455032527, "grad_norm": 0.14624197781085968, "learning_rate": 1.3696790653954614e-05, "loss": 0.7968831062316895, "mean_token_accuracy": 0.808569261431694, "num_tokens": 29423547.0, "step": 7330 }, { "entropy": 0.9187626458704472, "epoch": 2.8090317642556446, "grad_norm": 0.1368781179189682, "learning_rate": 1.3428226131328053e-05, "loss": 0.9583258628845215, "mean_token_accuracy": 0.7731027945876121, "num_tokens": 29465593.0, "step": 7340 }, { "entropy": 0.9403511643409729, "epoch": 2.8128587830080365, "grad_norm": 0.10892713069915771, "learning_rate": 1.315966160870149e-05, "loss": 0.9621626853942871, "mean_token_accuracy": 0.767315211892128, "num_tokens": 29506888.0, "step": 7350 }, { "entropy": 0.842640140466392, "epoch": 2.8166858017604284, "grad_norm": 0.08862321823835373, "learning_rate": 1.289109708607493e-05, "loss": 0.9031145095825195, "mean_token_accuracy": 0.7967306047677993, "num_tokens": 29550811.0, "step": 7360 }, { "entropy": 0.8931968793272972, "epoch": 2.8205128205128203, "grad_norm": 0.0979296937584877, "learning_rate": 1.2622532563448369e-05, "loss": 0.9369117736816406, "mean_token_accuracy": 0.785995215177536, "num_tokens": 29587036.0, "step": 7370 }, { "entropy": 0.8621913805603981, "epoch": 2.824339839265212, "grad_norm": 0.08778136223554611, "learning_rate": 1.2353968040821807e-05, "loss": 0.884724235534668, "mean_token_accuracy": 0.790358729660511, "num_tokens": 29627992.0, "step": 7380 }, { "entropy": 0.8695362661033869, "epoch": 2.828166858017604, "grad_norm": 0.09141552448272705, "learning_rate": 1.2085403518195248e-05, "loss": 0.9539263725280762, "mean_token_accuracy": 0.78631162494421, "num_tokens": 29668509.0, "step": 7390 }, { "entropy": 0.8454725466668606, "epoch": 2.831993876769996, "grad_norm": 0.10090988874435425, "learning_rate": 1.1816838995568685e-05, "loss": 0.9256816864013672, "mean_token_accuracy": 0.7941092774271965, "num_tokens": 29706794.0, "step": 7400 }, { "entropy": 0.8406473740935325, "epoch": 2.835820895522388, "grad_norm": 0.12991519272327423, "learning_rate": 1.1548274472942125e-05, "loss": 0.8969921112060547, "mean_token_accuracy": 0.7950825378298759, "num_tokens": 29745883.0, "step": 7410 }, { "entropy": 0.8951507560908795, "epoch": 2.8396479142747797, "grad_norm": 0.14208164811134338, "learning_rate": 1.1279709950315565e-05, "loss": 0.9443653106689454, "mean_token_accuracy": 0.7820898026227951, "num_tokens": 29788428.0, "step": 7420 }, { "entropy": 0.859702505543828, "epoch": 2.8434749330271716, "grad_norm": 0.10485101491212845, "learning_rate": 1.1011145427689002e-05, "loss": 0.9106481552124024, "mean_token_accuracy": 0.7910059571266175, "num_tokens": 29829552.0, "step": 7430 }, { "entropy": 0.838575328886509, "epoch": 2.8473019517795635, "grad_norm": 0.09105801582336426, "learning_rate": 1.0742580905062442e-05, "loss": 0.9367799758911133, "mean_token_accuracy": 0.7953649654984474, "num_tokens": 29869380.0, "step": 7440 }, { "entropy": 0.9112126015126705, "epoch": 2.8511289705319554, "grad_norm": 0.09724974632263184, "learning_rate": 1.0474016382435881e-05, "loss": 0.9621581077575684, "mean_token_accuracy": 0.7795066565275193, "num_tokens": 29913977.0, "step": 7450 }, { "entropy": 0.7964273016899824, "epoch": 2.8549559892843472, "grad_norm": 0.09481512755155563, "learning_rate": 1.020545185980932e-05, "loss": 0.8208577156066894, "mean_token_accuracy": 0.8045729547739029, "num_tokens": 29949229.0, "step": 7460 }, { "entropy": 0.9103045649826527, "epoch": 2.858783008036739, "grad_norm": 0.08678591996431351, "learning_rate": 9.936887337182758e-06, "loss": 0.9599167823791503, "mean_token_accuracy": 0.7792657531797886, "num_tokens": 29999070.0, "step": 7470 }, { "entropy": 0.8333844318985939, "epoch": 2.862610026789131, "grad_norm": 0.07823742181062698, "learning_rate": 9.668322814556198e-06, "loss": 0.8832645416259766, "mean_token_accuracy": 0.7986885383725166, "num_tokens": 30041797.0, "step": 7480 }, { "entropy": 0.8970901295542717, "epoch": 2.866437045541523, "grad_norm": 0.11852974444627762, "learning_rate": 9.399758291929635e-06, "loss": 0.9755334854125977, "mean_token_accuracy": 0.7814395651221275, "num_tokens": 30080534.0, "step": 7490 }, { "entropy": 0.8733609687536955, "epoch": 2.870264064293915, "grad_norm": 0.08307944238185883, "learning_rate": 9.131193769303076e-06, "loss": 0.9116435050964355, "mean_token_accuracy": 0.786429825425148, "num_tokens": 30123488.0, "step": 7500 }, { "entropy": 0.7967244807630778, "epoch": 2.8740910830463067, "grad_norm": 0.121941938996315, "learning_rate": 8.862629246676514e-06, "loss": 0.8209601402282715, "mean_token_accuracy": 0.8040247783064842, "num_tokens": 30158076.0, "step": 7510 }, { "entropy": 0.8655086796730757, "epoch": 2.8779181017986986, "grad_norm": 0.10017320513725281, "learning_rate": 8.594064724049953e-06, "loss": 0.9246477127075196, "mean_token_accuracy": 0.7905631095170975, "num_tokens": 30198329.0, "step": 7520 }, { "entropy": 0.7916971929371357, "epoch": 2.8817451205510904, "grad_norm": 0.08822990953922272, "learning_rate": 8.325500201423391e-06, "loss": 0.8695680618286132, "mean_token_accuracy": 0.8063599601387977, "num_tokens": 30239990.0, "step": 7530 }, { "entropy": 0.7693583916872739, "epoch": 2.8855721393034823, "grad_norm": 0.1178632378578186, "learning_rate": 8.056935678796832e-06, "loss": 0.8029808044433594, "mean_token_accuracy": 0.808713173866272, "num_tokens": 30272583.0, "step": 7540 }, { "entropy": 0.9072235215455293, "epoch": 2.889399158055874, "grad_norm": 0.11368006467819214, "learning_rate": 7.78837115617027e-06, "loss": 0.9859001159667968, "mean_token_accuracy": 0.7825366839766502, "num_tokens": 30314370.0, "step": 7550 }, { "entropy": 0.909162075817585, "epoch": 2.893226176808266, "grad_norm": 0.10643935948610306, "learning_rate": 7.519806633543709e-06, "loss": 0.9263824462890625, "mean_token_accuracy": 0.7813168540596962, "num_tokens": 30362103.0, "step": 7560 }, { "entropy": 0.8779693342745304, "epoch": 2.897053195560658, "grad_norm": 0.12511365115642548, "learning_rate": 7.2512421109171484e-06, "loss": 0.9283166885375976, "mean_token_accuracy": 0.7876154363155365, "num_tokens": 30400468.0, "step": 7570 }, { "entropy": 0.9308112382888794, "epoch": 2.90088021431305, "grad_norm": 0.08942066878080368, "learning_rate": 6.982677588290587e-06, "loss": 0.9894198417663574, "mean_token_accuracy": 0.7739586725831031, "num_tokens": 30444628.0, "step": 7580 }, { "entropy": 0.8830183774232865, "epoch": 2.9047072330654418, "grad_norm": 0.08949998021125793, "learning_rate": 6.7141130656640265e-06, "loss": 0.9515928268432617, "mean_token_accuracy": 0.7846902176737786, "num_tokens": 30485845.0, "step": 7590 }, { "entropy": 0.8058773010969162, "epoch": 2.9085342518178336, "grad_norm": 0.1035229042172432, "learning_rate": 6.445548543037465e-06, "loss": 0.846186637878418, "mean_token_accuracy": 0.8066700398921967, "num_tokens": 30523979.0, "step": 7600 }, { "entropy": 0.9146121144294739, "epoch": 2.9123612705702255, "grad_norm": 0.09379884600639343, "learning_rate": 6.176984020410904e-06, "loss": 0.9735233306884765, "mean_token_accuracy": 0.7774886921048164, "num_tokens": 30564775.0, "step": 7610 }, { "entropy": 0.8396586284041405, "epoch": 2.9161882893226174, "grad_norm": 0.11920839548110962, "learning_rate": 5.908419497784342e-06, "loss": 0.9061779022216797, "mean_token_accuracy": 0.7974281132221221, "num_tokens": 30609113.0, "step": 7620 }, { "entropy": 0.8665836162865161, "epoch": 2.9200153080750093, "grad_norm": 0.10214731842279434, "learning_rate": 5.639854975157783e-06, "loss": 0.9333956718444825, "mean_token_accuracy": 0.7912585958838463, "num_tokens": 30652409.0, "step": 7630 }, { "entropy": 0.8082432024180889, "epoch": 2.923842326827401, "grad_norm": 0.09191566705703735, "learning_rate": 5.371290452531221e-06, "loss": 0.8443769454956055, "mean_token_accuracy": 0.797667445242405, "num_tokens": 30689299.0, "step": 7640 }, { "entropy": 0.8395522754639387, "epoch": 2.927669345579793, "grad_norm": 0.08281564712524414, "learning_rate": 5.10272592990466e-06, "loss": 0.8710539817810059, "mean_token_accuracy": 0.7973509266972542, "num_tokens": 30724619.0, "step": 7650 }, { "entropy": 0.8130493897944688, "epoch": 2.931496364332185, "grad_norm": 0.0996284931898117, "learning_rate": 4.834161407278099e-06, "loss": 0.8514342308044434, "mean_token_accuracy": 0.800888329744339, "num_tokens": 30764224.0, "step": 7660 }, { "entropy": 0.7793916609138251, "epoch": 2.935323383084577, "grad_norm": 0.09503267705440521, "learning_rate": 4.565596884651538e-06, "loss": 0.8305204391479493, "mean_token_accuracy": 0.8106261268258095, "num_tokens": 30800800.0, "step": 7670 }, { "entropy": 0.817446855083108, "epoch": 2.9391504018369687, "grad_norm": 0.13637053966522217, "learning_rate": 4.2970323620249764e-06, "loss": 0.839473819732666, "mean_token_accuracy": 0.8018909886479377, "num_tokens": 30841481.0, "step": 7680 }, { "entropy": 0.8140060313045978, "epoch": 2.9429774205893606, "grad_norm": 0.13390128314495087, "learning_rate": 4.028467839398416e-06, "loss": 0.8653444290161133, "mean_token_accuracy": 0.8000675857067108, "num_tokens": 30880001.0, "step": 7690 }, { "entropy": 0.7898532018065453, "epoch": 2.9468044393417525, "grad_norm": 0.11585478484630585, "learning_rate": 3.7599033167718545e-06, "loss": 0.8074365615844726, "mean_token_accuracy": 0.8053972944617271, "num_tokens": 30915563.0, "step": 7700 }, { "entropy": 0.8091453645378351, "epoch": 2.9506314580941444, "grad_norm": 0.09755035489797592, "learning_rate": 3.4913387941452935e-06, "loss": 0.8457134246826172, "mean_token_accuracy": 0.8031114682555198, "num_tokens": 30955410.0, "step": 7710 }, { "entropy": 0.8444364190101623, "epoch": 2.9544584768465363, "grad_norm": 0.1297679990530014, "learning_rate": 3.2227742715187325e-06, "loss": 0.910922622680664, "mean_token_accuracy": 0.7976488128304482, "num_tokens": 30997246.0, "step": 7720 }, { "entropy": 0.8454434804618358, "epoch": 2.958285495598928, "grad_norm": 0.15091662108898163, "learning_rate": 2.954209748892171e-06, "loss": 0.8977128982543945, "mean_token_accuracy": 0.7951600447297096, "num_tokens": 31042192.0, "step": 7730 }, { "entropy": 0.838621474429965, "epoch": 2.96211251435132, "grad_norm": 0.10101021081209183, "learning_rate": 2.6856452262656106e-06, "loss": 0.9142851829528809, "mean_token_accuracy": 0.7966463148593903, "num_tokens": 31082777.0, "step": 7740 }, { "entropy": 0.8021124713122845, "epoch": 2.965939533103712, "grad_norm": 0.11373798549175262, "learning_rate": 2.4170807036390496e-06, "loss": 0.845030403137207, "mean_token_accuracy": 0.8039181783795357, "num_tokens": 31122973.0, "step": 7750 }, { "entropy": 0.8570070005953312, "epoch": 2.969766551856104, "grad_norm": 0.0995812863111496, "learning_rate": 2.1485161810124882e-06, "loss": 0.8876262664794922, "mean_token_accuracy": 0.7908932328224182, "num_tokens": 31166313.0, "step": 7760 }, { "entropy": 0.9019658699631691, "epoch": 2.9735935706084957, "grad_norm": 0.10546575486660004, "learning_rate": 1.8799516583859272e-06, "loss": 0.9777070999145507, "mean_token_accuracy": 0.7821963891386986, "num_tokens": 31202060.0, "step": 7770 }, { "entropy": 0.9346055820584297, "epoch": 2.9774205893608876, "grad_norm": 0.11632298678159714, "learning_rate": 1.6113871357593663e-06, "loss": 1.017040729522705, "mean_token_accuracy": 0.7751505836844444, "num_tokens": 31241536.0, "step": 7780 }, { "entropy": 0.8882534563541412, "epoch": 2.9812476081132795, "grad_norm": 0.13064302504062653, "learning_rate": 1.3428226131328053e-06, "loss": 0.9505605697631836, "mean_token_accuracy": 0.7848831593990326, "num_tokens": 31278060.0, "step": 7790 }, { "entropy": 0.8854026839137077, "epoch": 2.9850746268656714, "grad_norm": 0.0977831557393074, "learning_rate": 1.0742580905062441e-06, "loss": 0.9311306953430176, "mean_token_accuracy": 0.7847100362181664, "num_tokens": 31325802.0, "step": 7800 }, { "entropy": 0.9448695838451385, "epoch": 2.9889016456180633, "grad_norm": 0.11724492162466049, "learning_rate": 8.056935678796831e-07, "loss": 0.983949089050293, "mean_token_accuracy": 0.7636413291096688, "num_tokens": 31367954.0, "step": 7810 }, { "entropy": 0.8787743166089058, "epoch": 2.992728664370455, "grad_norm": 0.09530383348464966, "learning_rate": 5.371290452531221e-07, "loss": 0.9605165481567383, "mean_token_accuracy": 0.7847816556692123, "num_tokens": 31410151.0, "step": 7820 }, { "entropy": 0.810061177611351, "epoch": 2.996555683122847, "grad_norm": 0.09042539447546005, "learning_rate": 2.6856452262656103e-07, "loss": 0.8766719818115234, "mean_token_accuracy": 0.8047587737441063, "num_tokens": 31451314.0, "step": 7830 } ], "logging_steps": 10, "max_steps": 7839, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.17346463002948e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }